2 files changed, 90 insertions, 76 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 16a9cef8a..39f553bc2 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -229,6 +229,7 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
 	}
 }
 
+// generated by scripts/GenerateCharTable.py
 const unsigned char UTF8BytesOfLead[256] = {
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
@@ -256,39 +257,28 @@ const unsigned char UTF8BytesOfLead[256] = {
 // not have associated glyphs.
 int UTF8Classify(const unsigned char *us, int len) {
 	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
-	if (*us < 0x80) {
-		// Single bytes easy
+	if (us[0] < 0x80) {
+		// ASCII
 		return 1;
-	} else if (*us > 0xf4) {
-		// Characters longer than 4 bytes not possible in current UTF-8
+	}
+
+	const int byteCount = UTF8BytesOfLead[us[0]];
+	if (byteCount == 1 || byteCount > len) {
+		// Invalid lead byte, or sequence truncated by the end of the input
 		return UTF8MaskInvalid | 1;
-	} else if (*us >= 0xf0) {
-		// 4 bytes
-		if (len < 4)
-			return UTF8MaskInvalid | 1;
-		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
-			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
-				// *FFFE or *FFFF non-character
-				return UTF8MaskInvalid | 4;
-			}
-			if (*us == 0xf4) {
-				// Check if encoding a value beyond the last Unicode character 10FFFF
-				if (us[1] > 0x8f) {
-					return UTF8MaskInvalid | 1;
-				}
-			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
-				// Overlong
-				return UTF8MaskInvalid | 1;
-			}
-			return 4;
-		} else {
-			return UTF8MaskInvalid | 1;
-		}
-	} else if (*us >= 0xe0) {
-		// 3 bytes
-		if (len < 3)
-			return UTF8MaskInvalid | 1;
-		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
+	}
+
+	if (!UTF8IsTrailByte(us[1])) {
+		// Invalid trail byte
+		return UTF8MaskInvalid | 1;
+	}
+
+	switch (byteCount) {
+	case 2:
+		return 2;
+
+	case 3:
+		if (UTF8IsTrailByte(us[2])) {
 			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
 				// Overlong
 				return UTF8MaskInvalid | 1;
@@ -310,23 +300,30 @@ int UTF8Classify(const unsigned char *us, int len) {
 				return UTF8MaskInvalid | 3;
 			}
 			return 3;
-		} else {
-			return UTF8MaskInvalid | 1;
 		}
-	} else if (*us >= 0xc2) {
-		// 2 bytes
-		if (len < 2)
-			return UTF8MaskInvalid | 1;
-		if (UTF8IsTrailByte(us[1])) {
-			return 2;
-		} else {
-			return UTF8MaskInvalid | 1;
+		break;
+
+	default:
+		if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
+			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
+				// *FFFE or *FFFF non-character
+				return UTF8MaskInvalid | 4;
+			}
+			if (*us == 0xf4) {
+				// Check if encoding a value beyond the last Unicode character 10FFFF
+				if (us[1] > 0x8f) {
+					return UTF8MaskInvalid | 1;
+				}
+			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
+				// Overlong
+				return UTF8MaskInvalid | 1;
+			}
+			return 4;
 		}
-	} else {
-		// 0xc0 .. 0xc1 is overlong encoding
-		// 0x80 .. 0xbf is trail byte
-		return UTF8MaskInvalid | 1;
+		break;
 	}
+
+	return UTF8MaskInvalid | 1;
 }
 
 int UTF8DrawBytes(const unsigned char *us, int len) {
diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx
index 4bb7d361a..63b56c426 100644
--- a/test/unit/testUniConversion.cxx
+++ b/test/unit/testUniConversion.cxx
@@ -257,8 +257,8 @@ int UTFClass(const char *s) {
 
 TEST_CASE("UTF8Classify") {
 
-	// These tests are supposed to hit every return statement in UTF8Classify once in order
-	// except the last which is hit twice.
+	// These tests are supposed to hit every return statement in UTF8Classify in order
+	// with some hit multiple times.
 
 	// Single byte
 
@@ -268,34 +268,45 @@ TEST_CASE("UTF8Classify") {
 	SECTION("UTF8Classify Invalid Too large lead") {
 		REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));
 	}
+	SECTION("UTF8Classify Overlong") {
+		REQUIRE(UTFClass("\xC0\x80") == (1 | UTF8MaskInvalid));
+	}
+	SECTION("UTF8Classify single trail byte") {
+		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
+	}
 
-	// 4 byte lead
+	// Invalid length tests
 
-	SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
-		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
+		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 1FFFF non-character") {
-		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
+		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
-		// Maximum Unicode value is 10FFFF so 110000 is out of range
-		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
+		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 4 byte overlong") {
-		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
+
+	// Invalid first trail byte tests
+
+	SECTION("UTF8Classify 2 byte lead trail is invalid") {
+		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 4 byte valid character") {
-		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
+	SECTION("UTF8Classify 3 byte lead invalid trails") {
+		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
 	}
 	SECTION("UTF8Classify 4 byte bad trails") {
 		REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));
 	}
 
-	// 3 byte lead
+	// 2 byte lead
 
-	SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
-		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 2 byte valid character") {
+		REQUIRE(UTFClass("\xD0\x80") == 2);
 	}
+
+	// 3 byte lead
+
 	SECTION("UTF8Classify 3 byte lead, overlong") {
 		REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));
 	}
@@ -314,25 +325,31 @@ TEST_CASE("UTF8Classify") {
 	SECTION("UTF8Classify 3 byte valid character") {
 		REQUIRE(UTFClass("\xE2\x82\xAC") == 3);
 	}
-	SECTION("UTF8Classify 3 byte bad trails") {
-		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
-	}
 
-	// 2 byte lead
+	// 4 byte lead
 
-	SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
-		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 1FFFF non-character") {
+		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 2 byte valid character") {
-		REQUIRE(UTFClass("\xD0\x80") == 2);
+	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
+		// Maximum Unicode value is 10FFFF so 110000 is out of range
+		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 2 byte lead trail is invalid") {
-		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 4 byte overlong") {
+		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify Overlong") {
-		REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 4 byte valid character") {
+		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
 	}
-	SECTION("UTF8Classify single trail byte") {
-		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
+
+	// Invalid 2nd or 3rd continuation bytes
+	SECTION("UTF8Classify 3 byte lead invalid 2nd trail") {
+		REQUIRE(UTFClass("\xE2\x82q") == (1 | UTF8MaskInvalid));
+	}
+	SECTION("UTF8Classify 4 byte lead invalid 2nd trail") {
+		REQUIRE(UTFClass("\xF0\x9Fq\x9F") == (1 | UTF8MaskInvalid));
+	}
+	SECTION("UTF8Classify 4 byte lead invalid 3rd trail") {
+		REQUIRE(UTFClass("\xF0\x9F\x9Fq") == (1 | UTF8MaskInvalid));
 	}
 }