From ce9ec5366ab200193140bbd060b948f62a10286b Mon Sep 17 00:00:00 2001 From: Neil Date: Wed, 28 Feb 2024 11:44:50 +1100 Subject: Add variant of UTF8Classify that takes a char* so that client code does not have to reinterpret_cast. Make functions in header constexpr. Prefer .data() to &[] since safer. Avoid else when not needed. --- src/Document.cxx | 2 +- src/UniConversion.cxx | 32 +++++++++++++++++--------------- src/UniConversion.h | 7 ++++--- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/Document.cxx b/src/Document.cxx index 5f77ec2de..df4e570d0 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -2235,7 +2235,7 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con for (int b = 1; b < widthCharBytes; b++) { bytes[b] = cbView.CharAt(posIndexDocument + b); } - widthChar = UTF8Classify(reinterpret_cast<const unsigned char *>(bytes), widthCharBytes) & UTF8MaskWidth; + widthChar = UTF8Classify(bytes, widthCharBytes) & UTF8MaskWidth; if (!indexSearch) { // First character widthFirstCharacter = widthChar; } diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 3f3bc5904..eadac8915 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -241,12 +241,12 @@ std::wstring WStringFromUTF8(std::string_view svu8) { if constexpr (sizeof(wchar_t) == 2) { const size_t len16 = UTF16Length(svu8); std::wstring ws(len16, 0); - UTF16FromUTF8(svu8, &ws[0], len16); + UTF16FromUTF8(svu8, ws.data(), len16); return ws; } else { const size_t len32 = UTF32Length(svu8); std::wstring ws(len32, 0); - UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32); + UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(ws.data()), len32); return ws; } } @@ -255,11 +255,10 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept { if (val < SUPPLEMENTAL_PLANE_FIRST) { tbuf[0] = static_cast<wchar_t>(val); return 1; - } else { - tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST); - tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + 
SURROGATE_TRAIL_FIRST); - return 2; } + tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST); + tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); + return 2; } const unsigned char UTF8BytesOfLead[256] = { @@ -358,25 +357,28 @@ int UTF8Classify(const unsigned char *us, size_t len) noexcept { return UTF8MaskInvalid | 1; } +int UTF8Classify(const char *s, size_t len) noexcept { + return UTF8Classify(reinterpret_cast<const unsigned char *>(s), len); +} + int UTF8DrawBytes(const char *s, size_t len) noexcept { - const int utf8StatusNext = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len); + const int utf8StatusNext = UTF8Classify(s, len); return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth); } bool UTF8IsValid(std::string_view svu8) noexcept { - const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data()); + const char *s = svu8.data(); size_t remaining = svu8.length(); while (remaining > 0) { - const int utf8Status = UTF8Classify(us, remaining); + const int utf8Status = UTF8Classify(s, remaining); if (utf8Status & UTF8MaskInvalid) { return false; - } else { - const int lenChar = utf8Status & UTF8MaskWidth; - us += lenChar; - remaining -= lenChar; } + const int lenChar = utf8Status & UTF8MaskWidth; + s += lenChar; + remaining -= lenChar; } - return remaining == 0; + return true; } // Replace invalid bytes in UTF-8 with the replacement character @@ -385,7 +387,7 @@ std::string FixInvalidUTF8(const std::string &text) { const char *s = text.c_str(); size_t remaining = text.size(); while (remaining > 0) { - const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining); + const int utf8Status = UTF8Classify(s, remaining); if (utf8Status & UTF8MaskInvalid) { // Replacement character 0xFFFD = UTF8:"efbfbd". 
result.append("\xef\xbf\xbd"); diff --git a/src/UniConversion.h b/src/UniConversion.h index b4f4c89f6..657e3eca7 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -59,8 +59,9 @@ constexpr bool UTF8IsAscii(char ch) noexcept { enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 }; int UTF8Classify(const unsigned char *us, size_t len) noexcept; +int UTF8Classify(const char *s, size_t len) noexcept; inline int UTF8Classify(std::string_view sv) noexcept { - return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length()); + return UTF8Classify(sv.data(), sv.length()); } // Similar to UTF8Classify but returns a length of 1 for invalid bytes @@ -70,13 +71,13 @@ int UTF8DrawBytes(const char *s, size_t len) noexcept; // Line separator is U+2028 \xe2\x80\xa8 // Paragraph separator is U+2029 \xe2\x80\xa9 constexpr int UTF8SeparatorLength = 3; -inline bool UTF8IsSeparator(const unsigned char *us) noexcept { +constexpr bool UTF8IsSeparator(const unsigned char *us) noexcept { return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9)); } // NEL is U+0085 \xc2\x85 constexpr int UTF8NELLength = 2; -inline bool UTF8IsNEL(const unsigned char *us) noexcept { +constexpr bool UTF8IsNEL(const unsigned char *us) noexcept { return (us[0] == 0xc2) && (us[1] == 0x85); } -- cgit v1.2.3