diff options
Diffstat (limited to 'src/UniConversion.cxx')
-rw-r--r-- | src/UniConversion.cxx | 180 |
1 files changed, 96 insertions, 84 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 8e537c689..19b968932 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -65,35 +65,15 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) { putf[k] = '\0'; } -unsigned int UTF8CharLength(unsigned char ch) { - if (ch < 0x80) { - return 1; - } else if (ch < 0x80 + 0x40 + 0x20) { - return 2; - } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { - return 3; - } else { - return 4; - } -} - size_t UTF16Length(const char *s, size_t len) { size_t ulen = 0; - size_t charLen; - for (size_t i = 0; i<len;) { - const unsigned char ch = static_cast<unsigned char>(s[i]); - if (ch < 0x80) { - charLen = 1; - } else if (ch < 0x80 + 0x40 + 0x20) { - charLen = 2; - } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { - charLen = 3; - } else { - charLen = 4; - ulen++; - } - i += charLen; - ulen++; + const unsigned char *us = reinterpret_cast<const unsigned char *>(s); + for (size_t i = 0; i < len;) { + const unsigned char ch = us[i]; + const unsigned int byteCount = UTF8BytesOfLead[ch]; + const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount); + i += byteCount; + ulen += (i > len) ? 1 : utf16Len; } return ulen; } @@ -104,39 +84,60 @@ constexpr unsigned char TrailByteValue(unsigned char c) { return c & 0b0011'1111; } -const unsigned char utf8Start3 = 0b1110'0000; -const unsigned char utf8Start4 = 0b1111'0000; - size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { size_t ui = 0; const unsigned char *us = reinterpret_cast<const unsigned char *>(s); - size_t i = 0; - while ((i<len) && (ui<tlen)) { - unsigned char ch = us[i++]; - if (ch < 0x80) { + for (size_t i = 0; i < len;) { + unsigned char ch = us[i]; + const unsigned int byteCount = UTF8BytesOfLead[ch]; + unsigned int value; + + if (i + byteCount > len) { + // Trying to read past end but still have space to write + if (ui < tlen) { + tbuf[ui] = ch; + ui++; + } + break; + } + + const size_t outLen = (byteCount < 4) ? 1 : 2; + if (ui + outLen > tlen) { + throw std::runtime_error("UTF16FromUTF8: attempted write beyond end"); + } + + i++; + switch (byteCount) { + case 1: tbuf[ui] = ch; - } else if (ch < utf8Start3) { - tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6); + break; + case 2: + value = (ch & 0x1F) << 6; ch = us[i++]; - tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch)); - } else if (ch < utf8Start4) { - tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12); + value += TrailByteValue(ch); + tbuf[ui] = static_cast<wchar_t>(value); + break; + case 3: + value = (ch & 0xF) << 12; ch = us[i++]; - tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (TrailByteValue(ch) << 6)); + value += (TrailByteValue(ch) << 6); ch = us[i++]; - tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch)); - } else { + value += TrailByteValue(ch); + tbuf[ui] = static_cast<wchar_t>(value); + break; + default: // Outside the BMP so need two surrogates - int val = (ch & 0x7) << 18; + value = (ch & 0x7) << 18; ch = us[i++]; - val += TrailByteValue(ch) << 12; + value += TrailByteValue(ch) << 12; ch = us[i++]; - val += TrailByteValue(ch) << 6; + value += TrailByteValue(ch) << 6; ch = us[i++]; - val += TrailByteValue(ch); - tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); + value += TrailByteValue(ch); + tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); ui++; - tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); + tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST); + break; } ui++; } @@ -144,25 +145,44 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { } size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) { - size_t ui=0; + size_t ui = 0; const unsigned char *us = reinterpret_cast<const unsigned char *>(s); - size_t i=0; - while ((i<len) && (ui<tlen)) { - unsigned char ch = us[i++]; - unsigned int value = 0; - if (ch < 0x80) { + for (size_t i = 0; i < len;) { + unsigned char ch = us[i]; + const unsigned int byteCount = UTF8BytesOfLead[ch]; + unsigned int value; + + if (i + byteCount > len) { + // Trying to read past end but still have space to write + if (ui < tlen) { + tbuf[ui] = ch; + ui++; + } + break; + } + + if (ui == tlen) { + throw std::runtime_error("UTF32FromUTF8: attempted write beyond end"); + } + + i++; + switch (byteCount) { + case 1: value = ch; - } else if (((len-i) >= 1) && (ch < utf8Start3)) { + break; + case 2: value = (ch & 0x1F) << 6; ch = us[i++]; value += TrailByteValue(ch); - } else if (((len-i) >= 2) && (ch < utf8Start4)) { + break; + case 3: value = (ch & 0xF) << 12; ch = us[i++]; value += TrailByteValue(ch) << 6; ch = us[i++]; value += TrailByteValue(ch); - } else if ((len-i) >= 3) { + break; + default: value = (ch & 0x7) << 18; ch = us[i++]; value += TrailByteValue(ch) << 12; @@ -170,6 +190,7 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) value += TrailByteValue(ch) << 6; ch = us[i++]; value += TrailByteValue(ch); + break; } tbuf[ui] = value; ui++; @@ -188,33 +209,24 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) { } } -int UTF8BytesOfLead[256]; -static bool initialisedBytesOfLead = false; - -static int BytesFromLead(int leadByte) { - if (leadByte < 0xC2) { - // Single byte or invalid - return 1; - } else if (leadByte < 0xE0) { - return 2; - } else if (leadByte < 0xF0) { - return 3; - } else if (leadByte < 0xF5) { - return 4; - } else { - // Characters longer than 4 bytes not possible in current UTF-8 - return 1; - } -} - -void UTF8BytesOfLeadInitialise() { - if (!initialisedBytesOfLead) { - for (int i=0; i<256; i++) { - UTF8BytesOfLead[i] = BytesFromLead(i); - } - initialisedBytesOfLead = true; - } -} +const unsigned char UTF8BytesOfLead[256] = { +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF +1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF +2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF +3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF +4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF +}; // Return both the width of the first character in the string and a status // saying whether it is valid or invalid. |