diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/UniConversion.cxx | 87 |
1 files changed, 42 insertions, 45 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 16a9cef8a..39f553bc2 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -229,6 +229,7 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) { } } +// generated by scripts/GenerateCharTable.py const unsigned char UTF8BytesOfLead[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F @@ -256,39 +257,28 @@ const unsigned char UTF8BytesOfLead[256] = { // not have associated glyphs. int UTF8Classify(const unsigned char *us, int len) { // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 - if (*us < 0x80) { - // Single bytes easy + if (us[0] < 0x80) { + // ASCII return 1; - } else if (*us > 0xf4) { - // Characters longer than 4 bytes not possible in current UTF-8 + } + + const int byteCount = UTF8BytesOfLead[us[0]]; + if (byteCount == 1 || byteCount > len) { + // Invalid lead byte return UTF8MaskInvalid | 1; - } else if (*us >= 0xf0) { - // 4 bytes - if (len < 4) - return UTF8MaskInvalid | 1; - if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { - if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { - // *FFFE or *FFFF non-character - return UTF8MaskInvalid | 4; - } - if (*us == 0xf4) { - // Check if encoding a value beyond the last Unicode character 10FFFF - if (us[1] > 0x8f) { - return UTF8MaskInvalid | 1; - } - } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { - // Overlong - return UTF8MaskInvalid | 1; - } - return 4; - } else { - return UTF8MaskInvalid | 1; - } - } else if (*us >= 0xe0) { - // 3 bytes - if (len < 3) - return UTF8MaskInvalid | 1; - if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) { + } + + if (!UTF8IsTrailByte(us[1])) { + // Invalid trail byte + return UTF8MaskInvalid | 1; + } + + switch (byteCount) { + case 2: + return 2; + + case 3: + if (UTF8IsTrailByte(us[2])) { if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) { // Overlong return UTF8MaskInvalid | 1; @@ -310,23 +300,30 @@ int UTF8Classify(const unsigned char *us, int len) { return UTF8MaskInvalid | 3; } return 3; - } else { - return UTF8MaskInvalid | 1; } - } else if (*us >= 0xc2) { - // 2 bytes - if (len < 2) - return UTF8MaskInvalid | 1; - if (UTF8IsTrailByte(us[1])) { - return 2; - } else { - return UTF8MaskInvalid | 1; + break; + + default: + if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { + if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { + // *FFFE or *FFFF non-character + return UTF8MaskInvalid | 4; + } + if (*us == 0xf4) { + // Check if encoding a value beyond the last Unicode character 10FFFF + if (us[1] > 0x8f) { + return UTF8MaskInvalid | 1; + } + } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { + // Overlong + return UTF8MaskInvalid | 1; + } + return 4; } - } else { - // 0xc0 .. 0xc1 is overlong encoding - // 0x80 .. 0xbf is trail byte - return UTF8MaskInvalid | 1; + break; } + + return UTF8MaskInvalid | 1; } int UTF8DrawBytes(const unsigned char *us, int len) { |