diff options
-rw-r--r-- | src/UniConversion.cxx | 87 | ||||
-rw-r--r-- | test/unit/testUniConversion.cxx | 79 |
2 files changed, 90 insertions, 76 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 16a9cef8a..39f553bc2 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -229,6 +229,7 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) { } } +// generated by scripts/GenerateCharTable.py const unsigned char UTF8BytesOfLead[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F @@ -256,39 +257,28 @@ const unsigned char UTF8BytesOfLead[256] = { // not have associated glyphs. int UTF8Classify(const unsigned char *us, int len) { // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 - if (*us < 0x80) { - // Single bytes easy + if (us[0] < 0x80) { + // ASCII return 1; - } else if (*us > 0xf4) { - // Characters longer than 4 bytes not possible in current UTF-8 + } + + const int byteCount = UTF8BytesOfLead[us[0]]; + if (byteCount == 1 || byteCount > len) { + // Invalid lead byte return UTF8MaskInvalid | 1; - } else if (*us >= 0xf0) { - // 4 bytes - if (len < 4) - return UTF8MaskInvalid | 1; - if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { - if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { - // *FFFE or *FFFF non-character - return UTF8MaskInvalid | 4; - } - if (*us == 0xf4) { - // Check if encoding a value beyond the last Unicode character 10FFFF - if (us[1] > 0x8f) { - return UTF8MaskInvalid | 1; - } - } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { - // Overlong - return UTF8MaskInvalid | 1; - } - return 4; - } else { - return UTF8MaskInvalid | 1; - } - } else if (*us >= 0xe0) { - // 3 bytes - if (len < 3) - return UTF8MaskInvalid | 1; - if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) { + } + + if (!UTF8IsTrailByte(us[1])) { + // Invalid trail byte + return UTF8MaskInvalid | 1; + } + + switch (byteCount) { + case 2: + return 2; + + case 3: + if (UTF8IsTrailByte(us[2])) { if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) { // Overlong return UTF8MaskInvalid | 1; @@ -310,23 +300,30 @@ int UTF8Classify(const unsigned char *us, int len) { return UTF8MaskInvalid | 3; } return 3; - } else { - return UTF8MaskInvalid | 1; } - } else if (*us >= 0xc2) { - // 2 bytes - if (len < 2) - return UTF8MaskInvalid | 1; - if (UTF8IsTrailByte(us[1])) { - return 2; - } else { - return UTF8MaskInvalid | 1; + break; + + default: + if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { + if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { + // *FFFE or *FFFF non-character + return UTF8MaskInvalid | 4; + } + if (*us == 0xf4) { + // Check if encoding a value beyond the last Unicode character 10FFFF + if (us[1] > 0x8f) { + return UTF8MaskInvalid | 1; + } + } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { + // Overlong + return UTF8MaskInvalid | 1; + } + return 4; } - } else { - // 0xc0 .. 0xc1 is overlong encoding - // 0x80 .. 0xbf is trail byte - return UTF8MaskInvalid | 1; + break; } + + return UTF8MaskInvalid | 1; } int UTF8DrawBytes(const unsigned char *us, int len) { diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx index 4bb7d361a..63b56c426 100644 --- a/test/unit/testUniConversion.cxx +++ b/test/unit/testUniConversion.cxx @@ -257,8 +257,8 @@ int UTFClass(const char *s) { TEST_CASE("UTF8Classify") { - // These tests are supposed to hit every return statement in UTF8Classify once in order - // except the last which is hit twice. + // These tests are supposed to hit every return statement in UTF8Classify in order + // with some hit multiple times. // Single byte @@ -268,34 +268,45 @@ TEST_CASE("UTF8Classify") { SECTION("UTF8Classify Invalid Too large lead") { REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid)); } + SECTION("UTF8Classify Overlong") { + REQUIRE(UTFClass("\xC0\x80") == (1 | UTF8MaskInvalid)); + } + SECTION("UTF8Classify single trail byte") { + REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid)); + } - // 4 byte lead + // Invalid length tests - SECTION("UTF8Classify 4 byte lead, string less than 4 long") { - REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid)); + SECTION("UTF8Classify 2 byte lead, string less than 2 long") { + REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 1FFFF non-character") { - REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid)); + SECTION("UTF8Classify 3 byte lead, string less than 3 long") { + REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 1 Greater than max Unicode 110000") { - // Maximum Unicode value is 10FFFF so 110000 is out of range - REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid)); + SECTION("UTF8Classify 4 byte lead, string less than 4 long") { + REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 4 byte overlong") { - REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid)); + + // Invalid first trail byte tests + + SECTION("UTF8Classify 2 byte lead trail is invalid") { + REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 4 byte valid character") { - REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4); + SECTION("UTF8Classify 3 byte lead invalid trails") { + REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid)); } SECTION("UTF8Classify 4 byte bad trails") { REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid)); } - // 3 byte lead + // 2 byte lead - SECTION("UTF8Classify 3 byte lead, string less than 3 long") { - REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid)); + SECTION("UTF8Classify 2 byte valid character") { + REQUIRE(UTFClass("\xD0\x80") == 2); } + + // 3 byte lead + SECTION("UTF8Classify 3 byte lead, overlong") { REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid)); } @@ -314,25 +325,31 @@ TEST_CASE("UTF8Classify") { SECTION("UTF8Classify 3 byte valid character") { REQUIRE(UTFClass("\xE2\x82\xAC") == 3); } - SECTION("UTF8Classify 3 byte bad trails") { - REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid)); - } - // 2 byte lead + // 4 byte lead - SECTION("UTF8Classify 2 byte lead, string less than 2 long") { - REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid)); + SECTION("UTF8Classify 1FFFF non-character") { + REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 2 byte valid character") { - REQUIRE(UTFClass("\xD0\x80") == 2); + SECTION("UTF8Classify 1 Greater than max Unicode 110000") { + // Maximum Unicode value is 10FFFF so 110000 is out of range + REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 2 byte lead trail is invalid") { - REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid)); + SECTION("UTF8Classify 4 byte overlong") { + REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify Overlong") { - REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid)); + SECTION("UTF8Classify 4 byte valid character") { + REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4); } - SECTION("UTF8Classify single trail byte") { - REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid)); + + // Invalid 2nd or 3rd continuation bytes + SECTION("UTF8Classify 3 byte lead invalid 2nd trail") { + REQUIRE(UTFClass("\xE2\x82q") == (1 | UTF8MaskInvalid)); + } + SECTION("UTF8Classify 4 byte lead invalid 2nd trail") { + REQUIRE(UTFClass("\xF0\x9Fq\x9F") == (1 | UTF8MaskInvalid)); + } + SECTION("UTF8Classify 4 byte lead invalid 3rd trail") { + REQUIRE(UTFClass("\xF0\x9F\x9Fq") == (1 | UTF8MaskInvalid)); } } |