Feature [feature-requests:#1213]. Clarify UTF8Classify.

Use UTF8BytesOfLead to determine the expected length early in the function, so that an argument that is not long enough, an invalid single byte, and an invalid first trail byte are detected quickly; then branch on the length for the more detailed checks.
author: Zufu Liu <unknown> 2018-03-25 10:51:16 +1100
committer: Zufu Liu <unknown> 2018-03-25 10:51:16 +1100
commit: aa5481a2fbd394e059ffdd7a8cd677abd55227d5 (patch)
tree: 759febaf4312d5f210196b26feb250751fd3c220
parent: 4692de654d0e973f6ceed4b00d7859ef9b6af254 (diff)
download: scintilla-mirror-aa5481a2fbd394e059ffdd7a8cd677abd55227d5.tar.gz
2 files changed, 90 insertions, 76 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 16a9cef8a..39f553bc2 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -229,6 +229,7 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
 	}
 }
 
+// generated by scripts/GenerateCharTable.py
 const unsigned char UTF8BytesOfLead[256] = {
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
@@ -256,39 +257,28 @@ const unsigned char UTF8BytesOfLead[256] = {
 // not have associated glyphs.
 int UTF8Classify(const unsigned char *us, int len) {
 	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
-	if (*us < 0x80) {
-		// Single bytes easy
+	if (us[0] < 0x80) {
+		// ASCII
 		return 1;
-	} else if (*us > 0xf4) {
-		// Characters longer than 4 bytes not possible in current UTF-8
+	}
+
+	const int byteCount = UTF8BytesOfLead[us[0]];
+	if (byteCount == 1 || byteCount > len) {
+		// Invalid lead byte
 		return UTF8MaskInvalid | 1;
-	} else if (*us >= 0xf0) {
-		// 4 bytes
-		if (len < 4)
-			return UTF8MaskInvalid | 1;
-		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
-			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
-				// *FFFE or *FFFF non-character
-				return UTF8MaskInvalid | 4;
-			}
-			if (*us == 0xf4) {
-				// Check if encoding a value beyond the last Unicode character 10FFFF
-				if (us[1] > 0x8f) {
-					return UTF8MaskInvalid | 1;
-				}
-			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
-				// Overlong
-				return UTF8MaskInvalid | 1;
-			}
-			return 4;
-		} else {
-			return UTF8MaskInvalid | 1;
-		}
-	} else if (*us >= 0xe0) {
-		// 3 bytes
-		if (len < 3)
-			return UTF8MaskInvalid | 1;
-		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
+	}
+
+	if (!UTF8IsTrailByte(us[1])) {
+		// Invalid trail byte
+		return UTF8MaskInvalid | 1;
+	}
+
+	switch (byteCount) {
+	case 2:
+		return 2;
+
+	case 3:
+		if (UTF8IsTrailByte(us[2])) {
 			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
 				// Overlong
 				return UTF8MaskInvalid | 1;
@@ -310,23 +300,30 @@ int UTF8Classify(const unsigned char *us, int len) {
 				return UTF8MaskInvalid | 3;
 			}
 			return 3;
-		} else {
-			return UTF8MaskInvalid | 1;
 		}
-	} else if (*us >= 0xc2) {
-		// 2 bytes
-		if (len < 2)
-			return UTF8MaskInvalid | 1;
-		if (UTF8IsTrailByte(us[1])) {
-			return 2;
-		} else {
-			return UTF8MaskInvalid | 1;
+		break;
+
+	default:
+		if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
+			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
+				// *FFFE or *FFFF non-character
+				return UTF8MaskInvalid | 4;
+			}
+			if (*us == 0xf4) {
+				// Check if encoding a value beyond the last Unicode character 10FFFF
+				if (us[1] > 0x8f) {
+					return UTF8MaskInvalid | 1;
+				}
+			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
+				// Overlong
+				return UTF8MaskInvalid | 1;
+			}
+			return 4;
 		}
-	} else {
-		// 0xc0 .. 0xc1 is overlong encoding
-		// 0x80 .. 0xbf is trail byte
-		return UTF8MaskInvalid | 1;
+		break;
 	}
+
+	return UTF8MaskInvalid | 1;
 }
 
 int UTF8DrawBytes(const unsigned char *us, int len) {
diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx
index 4bb7d361a..63b56c426 100644
--- a/test/unit/testUniConversion.cxx
+++ b/test/unit/testUniConversion.cxx
@@ -257,8 +257,8 @@ int UTFClass(const char *s) {
 
 TEST_CASE("UTF8Classify") {
 
-	// These tests are supposed to hit every return statement in UTF8Classify once in order
-	// except the last which is hit twice.
+	// These tests are supposed to hit every return statement in UTF8Classify in order
+	// with some hit multiple times.
 
 	// Single byte
 
@@ -268,34 +268,45 @@ TEST_CASE("UTF8Classify") {
 	SECTION("UTF8Classify Invalid Too large lead") {
 		REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));
 	}
+	SECTION("UTF8Classify Overlong") {
+		REQUIRE(UTFClass("\xC0\x80") == (1 | UTF8MaskInvalid));
+	}
+	SECTION("UTF8Classify single trail byte") {
+		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
+	}
 
-	// 4 byte lead
+	// Invalid length tests
 
-	SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
-		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
+		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 1FFFF non-character") {
-		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
+		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
-		// Maximum Unicode value is 10FFFF so 110000 is out of range
-		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
+		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 4 byte overlong") {
-		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
+
+	// Invalid first trail byte tests
+
+	SECTION("UTF8Classify 2 byte lead trail is invalid") {
+		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 4 byte valid character") {
-		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
+	SECTION("UTF8Classify 3 byte lead invalid trails") {
+		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
 	}
 	SECTION("UTF8Classify 4 byte bad trails") {
 		REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));
 	}
 
-	// 3 byte lead
+	// 2 byte lead
 
-	SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
-		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 2 byte valid character") {
+		REQUIRE(UTFClass("\xD0\x80") == 2);
 	}
+
+	// 3 byte lead
+
 	SECTION("UTF8Classify 3 byte lead, overlong") {
 		REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));
 	}
@@ -314,25 +325,31 @@ TEST_CASE("UTF8Classify") {
 	SECTION("UTF8Classify 3 byte valid character") {
 		REQUIRE(UTFClass("\xE2\x82\xAC") == 3);
 	}
-	SECTION("UTF8Classify 3 byte bad trails") {
-		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
-	}
 
-	// 2 byte lead
+	// 4 byte lead
 
-	SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
-		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 1FFFF non-character") {
+		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 2 byte valid character") {
-		REQUIRE(UTFClass("\xD0\x80") == 2);
+	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
+		// Maximum Unicode value is 10FFFF so 110000 is out of range
+		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify 2 byte lead trail is invalid") {
-		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 4 byte overlong") {
+		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
 	}
-	SECTION("UTF8Classify Overlong") {
-		REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid));
+	SECTION("UTF8Classify 4 byte valid character") {
+		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
 	}
-	SECTION("UTF8Classify single trail byte") {
-		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
+
+	// Invalid 2nd or 3rd continuation bytes
+	SECTION("UTF8Classify 3 byte lead invalid 2nd trail") {
+		REQUIRE(UTFClass("\xE2\x82q") == (1 | UTF8MaskInvalid));
+	}
+	SECTION("UTF8Classify 4 byte lead invalid 2nd trail") {
+		REQUIRE(UTFClass("\xF0\x9Fq\x9F") == (1 | UTF8MaskInvalid));
+	}
+	SECTION("UTF8Classify 4 byte lead invalid 3rd trail") {
+		REQUIRE(UTFClass("\xF0\x9F\x9Fq") == (1 | UTF8MaskInvalid));
 	}
 }
author	Zufu Liu <unknown>	2018-03-25 10:51:16 +1100
committer	Zufu Liu <unknown>	2018-03-25 10:51:16 +1100
commit	aa5481a2fbd394e059ffdd7a8cd677abd55227d5 (patch)
tree	759febaf4312d5f210196b26feb250751fd3c220
parent	4692de654d0e973f6ceed4b00d7859ef9b6af254 (diff)
download	scintilla-mirror-aa5481a2fbd394e059ffdd7a8cd677abd55227d5.tar.gz