diff options
-rw-r--r-- | src/UniConversion.cxx | 46 | ||||
-rw-r--r-- | src/UniConversion.h | 2 |
2 files changed, 23 insertions, 25 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 64599afaf..58e899faa 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -105,9 +105,8 @@ constexpr unsigned char TrailByteValue(unsigned char c) { size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { size_t ui = 0; - const unsigned char *us = reinterpret_cast<const unsigned char *>(s); for (size_t i = 0; i < len;) { - unsigned char ch = us[i]; + unsigned char ch = s[i]; const unsigned int byteCount = UTF8BytesOfLead[ch]; unsigned int value; @@ -132,26 +131,26 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { break; case 2: value = (ch & 0x1F) << 6; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch); tbuf[ui] = static_cast<wchar_t>(value); break; case 3: value = (ch & 0xF) << 12; - ch = us[i++]; + ch = s[i++]; value += (TrailByteValue(ch) << 6); - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch); tbuf[ui] = static_cast<wchar_t>(value); break; default: // Outside the BMP so need two surrogates value = (ch & 0x7) << 18; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch) << 12; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch) << 6; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch); tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); ui++; @@ -165,9 +164,8 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) { size_t ui = 0; - const unsigned char *us = reinterpret_cast<const unsigned char *>(s); for (size_t i = 0; i < len;) { - unsigned char ch = us[i]; + unsigned char ch = s[i]; const unsigned int byteCount = UTF8BytesOfLead[ch]; unsigned int value; @@ -191,23 +189,23 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) break; case 2: value = (ch & 0x1F) << 6; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch); break; case 3: value = (ch & 0xF) << 12; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch) << 6; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch); break; default: value = (ch & 0x7) << 18; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch) << 12; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch) << 6; - ch = us[i++]; + ch = s[i++]; value += TrailByteValue(ch); break; } @@ -253,14 +251,14 @@ const unsigned char UTF8BytesOfLead[256] = { // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be // reasonably treated as code points in some circumstances. They will, however, // not have associated glyphs. -int UTF8Classify(const unsigned char *us, int len) noexcept { +int UTF8Classify(const unsigned char *us, size_t len) noexcept { // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 if (us[0] < 0x80) { // ASCII return 1; } - const int byteCount = UTF8BytesOfLead[us[0]]; + const size_t byteCount = UTF8BytesOfLead[us[0]]; if (byteCount == 1 || byteCount > len) { // Invalid lead byte return UTF8MaskInvalid | 1; @@ -332,19 +330,19 @@ int UTF8DrawBytes(const unsigned char *us, int len) noexcept { // Replace invalid bytes in UTF-8 with the replacement character std::string FixInvalidUTF8(const std::string &text) { std::string result; - const unsigned char *us = reinterpret_cast<const unsigned char *>(text.c_str()); + const char *s = text.c_str(); size_t remaining = text.size(); while (remaining > 0) { - const int utf8Status = UTF8Classify(us, static_cast<int>(remaining)); + const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining); if (utf8Status & UTF8MaskInvalid) { // Replacement character 0xFFFD = UTF8:"efbfbd". result.append("\xef\xbf\xbd"); - us++; + s++; remaining--; } else { - const int len = utf8Status&UTF8MaskWidth; - result.append(reinterpret_cast<const char *>(us), len); - us += len; + const size_t len = utf8Status & UTF8MaskWidth; + result.append(s, len); + s += len; remaining -= len; } } diff --git a/src/UniConversion.h b/src/UniConversion.h index bece53f4d..4cdfe1fac 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -47,7 +47,7 @@ inline bool UTF8IsAscii(int ch) noexcept { } enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 }; -int UTF8Classify(const unsigned char *us, int len) noexcept; +int UTF8Classify(const unsigned char *us, size_t len) noexcept; // Similar to UTF8Classify but returns a length of 1 for invalid bytes // instead of setting the invalid flag |