diff options
author | Neil <nyamatongwe@gmail.com> | 2024-02-28 11:44:50 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2024-02-28 11:44:50 +1100 |
commit | ce9ec5366ab200193140bbd060b948f62a10286b (patch) | |
tree | 99043aca968b7a9dd7776bcacf28f626593aed5d | |
parent | d021884ab62bb339427cb01bebb2660f47417019 (diff) | |
download | scintilla-mirror-ce9ec5366ab200193140bbd060b948f62a10286b.tar.gz |
Add a variant of UTF8Classify that takes a const char * so that client code does not have
to reinterpret_cast to const unsigned char *.
Make the UTF8IsSeparator and UTF8IsNEL functions in the header constexpr instead of inline.
Prefer .data() to &ws[0] since it is safer.
Avoid else after a branch that returns, since it is not needed.
-rw-r--r-- | src/Document.cxx | 2 | ||||
-rw-r--r-- | src/UniConversion.cxx | 32 | ||||
-rw-r--r-- | src/UniConversion.h | 7 |
3 files changed, 22 insertions, 19 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index 5f77ec2de..df4e570d0 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -2235,7 +2235,7 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con for (int b = 1; b < widthCharBytes; b++) { bytes[b] = cbView.CharAt(posIndexDocument + b); } - widthChar = UTF8Classify(reinterpret_cast<const unsigned char *>(bytes), widthCharBytes) & UTF8MaskWidth; + widthChar = UTF8Classify(bytes, widthCharBytes) & UTF8MaskWidth; if (!indexSearch) { // First character widthFirstCharacter = widthChar; } diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 3f3bc5904..eadac8915 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -241,12 +241,12 @@ std::wstring WStringFromUTF8(std::string_view svu8) { if constexpr (sizeof(wchar_t) == 2) { const size_t len16 = UTF16Length(svu8); std::wstring ws(len16, 0); - UTF16FromUTF8(svu8, &ws[0], len16); + UTF16FromUTF8(svu8, ws.data(), len16); return ws; } else { const size_t len32 = UTF32Length(svu8); std::wstring ws(len32, 0); - UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32); + UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(ws.data()), len32); return ws; } } @@ -255,11 +255,10 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept { if (val < SUPPLEMENTAL_PLANE_FIRST) { tbuf[0] = static_cast<wchar_t>(val); return 1; - } else { - tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST); - tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); - return 2; } + tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST); + tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); + return 2; } const unsigned char UTF8BytesOfLead[256] = { @@ -358,25 +357,28 @@ int UTF8Classify(const unsigned char *us, size_t len) noexcept { return UTF8MaskInvalid | 1; } +int UTF8Classify(const char *s, 
size_t len) noexcept { + return UTF8Classify(reinterpret_cast<const unsigned char *>(s), len); +} + int UTF8DrawBytes(const char *s, size_t len) noexcept { - const int utf8StatusNext = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len); + const int utf8StatusNext = UTF8Classify(s, len); return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth); } bool UTF8IsValid(std::string_view svu8) noexcept { - const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data()); + const char *s = svu8.data(); size_t remaining = svu8.length(); while (remaining > 0) { - const int utf8Status = UTF8Classify(us, remaining); + const int utf8Status = UTF8Classify(s, remaining); if (utf8Status & UTF8MaskInvalid) { return false; - } else { - const int lenChar = utf8Status & UTF8MaskWidth; - us += lenChar; - remaining -= lenChar; } + const int lenChar = utf8Status & UTF8MaskWidth; + s += lenChar; + remaining -= lenChar; } - return remaining == 0; + return true; } // Replace invalid bytes in UTF-8 with the replacement character @@ -385,7 +387,7 @@ std::string FixInvalidUTF8(const std::string &text) { const char *s = text.c_str(); size_t remaining = text.size(); while (remaining > 0) { - const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining); + const int utf8Status = UTF8Classify(s, remaining); if (utf8Status & UTF8MaskInvalid) { // Replacement character 0xFFFD = UTF8:"efbfbd". 
result.append("\xef\xbf\xbd"); diff --git a/src/UniConversion.h b/src/UniConversion.h index b4f4c89f6..657e3eca7 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -59,8 +59,9 @@ constexpr bool UTF8IsAscii(char ch) noexcept { enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 }; int UTF8Classify(const unsigned char *us, size_t len) noexcept; +int UTF8Classify(const char *s, size_t len) noexcept; inline int UTF8Classify(std::string_view sv) noexcept { - return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length()); + return UTF8Classify(sv.data(), sv.length()); } // Similar to UTF8Classify but returns a length of 1 for invalid bytes @@ -70,13 +71,13 @@ int UTF8DrawBytes(const char *s, size_t len) noexcept; // Line separator is U+2028 \xe2\x80\xa8 // Paragraph separator is U+2029 \xe2\x80\xa9 constexpr int UTF8SeparatorLength = 3; -inline bool UTF8IsSeparator(const unsigned char *us) noexcept { +constexpr bool UTF8IsSeparator(const unsigned char *us) noexcept { return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9)); } // NEL is U+0085 \xc2\x85 constexpr int UTF8NELLength = 2; -inline bool UTF8IsNEL(const unsigned char *us) noexcept { +constexpr bool UTF8IsNEL(const unsigned char *us) noexcept { return (us[0] == 0xc2) && (us[1] == 0x85); } |