diff options
-rw-r--r-- | src/CaseConvert.cxx | 2 | ||||
-rw-r--r-- | src/UniConversion.cxx | 12 | ||||
-rw-r--r-- | src/UniConversion.h | 1 | ||||
-rw-r--r-- | test/unit/testUniConversion.cxx | 5 |
4 files changed, 19 insertions, 1 deletions
diff --git a/src/CaseConvert.cxx b/src/CaseConvert.cxx index 53824a987..752fd54e0 100644 --- a/src/CaseConvert.cxx +++ b/src/CaseConvert.cxx @@ -749,7 +749,7 @@ void CaseConverter::SetupConversions(CaseConversion conversion) { break; } if (!converted.empty()) { - const int character = UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(originUTF8.data())); + const int character = UnicodeFromUTF8(originUTF8); Add(character, converted); } } diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index eadac8915..868fbacf5 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -261,6 +261,18 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept { return 2; } +int UnicodeFromUTF8(std::string_view sv) noexcept { + if (!sv.empty()) { + const unsigned char uch = sv.front(); + const unsigned int byteCount = UTF8BytesOfLead[uch]; + if (sv.length() >= byteCount) { + return UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(sv.data())); + } + } + // Failure so let the caller know + return unicodeReplacementChar; +} + const unsigned char UTF8BytesOfLead[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F diff --git a/src/UniConversion.h b/src/UniConversion.h index 657e3eca7..7a51b2d08 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -43,6 +43,7 @@ inline int UnicodeFromUTF8(const unsigned char *us) noexcept { return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F); } } +int UnicodeFromUTF8(std::string_view sv) noexcept; constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept { return (ch >= 0x80) && (ch < 0xc0); diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx index 0327db7e1..a3b15d84c 100644 --- a/test/unit/testUniConversion.cxx +++ b/test/unit/testUniConversion.cxx @@ -108,6 +108,11 @@ TEST_CASE("UniConversion") { REQUIRE(UnicodeFromUTF8(s) == 0x10348); } + SECTION("UnicodeFromUTF8 StringView") { + const unsigned char s[]="\xF0\x90\x8D\x88"; + REQUIRE(UnicodeFromUTF8(s) == 0x10348); + } + // UTF16FromUTF8 SECTION("UTF16FromUTF8 ASCII") { |