aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--src/CaseConvert.cxx2
-rw-r--r--src/UniConversion.cxx12
-rw-r--r--src/UniConversion.h1
-rw-r--r--test/unit/testUniConversion.cxx5
4 files changed, 19 insertions, 1 deletions
diff --git a/src/CaseConvert.cxx b/src/CaseConvert.cxx
index 53824a987..752fd54e0 100644
--- a/src/CaseConvert.cxx
+++ b/src/CaseConvert.cxx
@@ -749,7 +749,7 @@ void CaseConverter::SetupConversions(CaseConversion conversion) {
break;
}
if (!converted.empty()) {
- const int character = UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(originUTF8.data()));
+ const int character = UnicodeFromUTF8(originUTF8);
Add(character, converted);
}
}
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index eadac8915..868fbacf5 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -261,6 +261,18 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
return 2;
}
+int UnicodeFromUTF8(std::string_view sv) noexcept {
+	// Decode the character at the start of sv; unicodeReplacementChar reports failure
+	if (sv.empty()) {
+		return unicodeReplacementChar;
+	}
+	const unsigned char lead = sv.front();
+	if (sv.length() < UTF8BytesOfLead[lead]) {	// fewer bytes than the lead byte calls for
+		return unicodeReplacementChar;
+	}
+	return UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(sv.data()));
+}
+
const unsigned char UTF8BytesOfLead[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 657e3eca7..7a51b2d08 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -43,6 +43,7 @@ inline int UnicodeFromUTF8(const unsigned char *us) noexcept {
return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
}
}
+int UnicodeFromUTF8(std::string_view sv) noexcept;	// yields unicodeReplacementChar when sv is empty or shorter than its lead byte requires
constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
return (ch >= 0x80) && (ch < 0xc0);
diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx
index 0327db7e1..a3b15d84c 100644
--- a/test/unit/testUniConversion.cxx
+++ b/test/unit/testUniConversion.cxx
@@ -108,6 +108,11 @@ TEST_CASE("UniConversion") {
REQUIRE(UnicodeFromUTF8(s) == 0x10348);
}
+	SECTION("UnicodeFromUTF8 StringView") {
+		const std::string_view s = "\xF0\x90\x8D\x88";	// a string_view argument is needed to select the string_view overload
+		REQUIRE(UnicodeFromUTF8(s) == 0x10348);
+	}
+
// UTF16FromUTF8
SECTION("UTF16FromUTF8 ASCII") {