diff options
| -rw-r--r-- | src/UniConversion.cxx | 87 | ||||
| -rw-r--r-- | test/unit/testUniConversion.cxx | 79 | 
2 files changed, 90 insertions, 76 deletions
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 16a9cef8a..39f553bc2 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -229,6 +229,7 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {  	}  } +// generated by scripts/GenerateCharTable.py  const unsigned char UTF8BytesOfLead[256] = {  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F @@ -256,39 +257,28 @@ const unsigned char UTF8BytesOfLead[256] = {  // not have associated glyphs.  int UTF8Classify(const unsigned char *us, int len) {  	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 -	if (*us < 0x80) { -		// Single bytes easy +	if (us[0] < 0x80) { +		// ASCII  		return 1; -	} else if (*us > 0xf4) { -		// Characters longer than 4 bytes not possible in current UTF-8 +	} + +	const int byteCount = UTF8BytesOfLead[us[0]]; +	if (byteCount == 1 || byteCount > len) { +		// Invalid lead byte  		return UTF8MaskInvalid | 1; -	} else if (*us >= 0xf0) { -		// 4 bytes -		if (len < 4) -			return UTF8MaskInvalid | 1; -		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { -			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { -				// *FFFE or *FFFF non-character -				return UTF8MaskInvalid | 4; -			} -			if (*us == 0xf4) { -				// Check if encoding a value beyond the last Unicode character 10FFFF -				if (us[1] > 0x8f) { -					return UTF8MaskInvalid | 1; -				} -			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { -				// Overlong -				return UTF8MaskInvalid | 1; -			} -			return 4; -		} else { -			return UTF8MaskInvalid | 1; -		} -	} else if (*us >= 0xe0) { -		// 3 bytes -		if (len < 3) -			return UTF8MaskInvalid | 1; -		if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) { +	} + +	if (!UTF8IsTrailByte(us[1])) { +		// Invalid trail byte +		return UTF8MaskInvalid | 1; +	} + +	switch (byteCount) { +	case 2: +		return 2; + +	case 3: +		if (UTF8IsTrailByte(us[2])) {  			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {  				// Overlong  				return UTF8MaskInvalid | 1; @@ -310,23 +300,30 @@ int UTF8Classify(const unsigned char *us, int len) {  				return UTF8MaskInvalid | 3;  			}  			return 3; -		} else { -			return UTF8MaskInvalid | 1;  		} -	} else if (*us >= 0xc2) { -		// 2 bytes -		if (len < 2) -			return UTF8MaskInvalid | 1; -		if (UTF8IsTrailByte(us[1])) { -			return 2; -		} else { -			return UTF8MaskInvalid | 1; +		break; + +	default: +		if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { +			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { +				// *FFFE or *FFFF non-character +				return UTF8MaskInvalid | 4; +			} +			if (*us == 0xf4) { +				// Check if encoding a value beyond the last Unicode character 10FFFF +				if (us[1] > 0x8f) { +					return UTF8MaskInvalid | 1; +				} +			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { +				// Overlong +				return UTF8MaskInvalid | 1; +			} +			return 4;  		} -	} else { -		// 0xc0 .. 0xc1 is overlong encoding -		// 0x80 .. 0xbf is trail byte -		return UTF8MaskInvalid | 1; +		break;  	} + +	return UTF8MaskInvalid | 1;  }  int UTF8DrawBytes(const unsigned char *us, int len) { diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx index 4bb7d361a..63b56c426 100644 --- a/test/unit/testUniConversion.cxx +++ b/test/unit/testUniConversion.cxx @@ -257,8 +257,8 @@ int UTFClass(const char *s) {  TEST_CASE("UTF8Classify") { -	// These tests are supposed to hit every return statement in UTF8Classify once in order -	// except the last which is hit twice. +	// These tests are supposed to hit every return statement in UTF8Classify in order +	// with some hit multiple times.  	// Single byte @@ -268,34 +268,45 @@ TEST_CASE("UTF8Classify") {  	SECTION("UTF8Classify Invalid Too large lead") {  		REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));  	} +	SECTION("UTF8Classify Overlong") { +		REQUIRE(UTFClass("\xC0\x80") == (1 | UTF8MaskInvalid)); +	} +	SECTION("UTF8Classify single trail byte") { +		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid)); +	} -	// 4 byte lead +	// Invalid length tests -	SECTION("UTF8Classify 4 byte lead, string less than 4 long") { -		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid)); +	SECTION("UTF8Classify 2 byte lead, string less than 2 long") { +		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));  	} -	SECTION("UTF8Classify 1FFFF non-character") { -		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid)); +	SECTION("UTF8Classify 3 byte lead, string less than 3 long") { +		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));  	} -	SECTION("UTF8Classify 1 Greater than max Unicode 110000") { -		// Maximum Unicode value is 10FFFF so 110000 is out of range -		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid)); +	SECTION("UTF8Classify 4 byte lead, string less than 4 long") { +		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));  	} -	SECTION("UTF8Classify 4 byte overlong") { -		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid)); + +	// Invalid first trail byte tests + +	SECTION("UTF8Classify 2 byte lead trail is invalid") { +		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));  	} -	SECTION("UTF8Classify 4 byte valid character") { -		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4); +	SECTION("UTF8Classify 3 byte lead invalid trails") { +		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));  	}  	SECTION("UTF8Classify 4 byte bad trails") {  		REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));  	} -	// 3 byte lead +	// 2 byte lead -	SECTION("UTF8Classify 3 byte lead, string less than 3 long") { -		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid)); +	SECTION("UTF8Classify 2 byte valid character") { +		REQUIRE(UTFClass("\xD0\x80") == 2);  	} + +	// 3 byte lead +  	SECTION("UTF8Classify 3 byte lead, overlong") {  		REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));  	} @@ -314,25 +325,31 @@ TEST_CASE("UTF8Classify") {  	SECTION("UTF8Classify 3 byte valid character") {  		REQUIRE(UTFClass("\xE2\x82\xAC") == 3);  	} -	SECTION("UTF8Classify 3 byte bad trails") { -		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid)); -	} -	// 2 byte lead +	// 4 byte lead -	SECTION("UTF8Classify 2 byte lead, string less than 2 long") { -		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid)); +	SECTION("UTF8Classify 1FFFF non-character") { +		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));  	} -	SECTION("UTF8Classify 2 byte valid character") { -		REQUIRE(UTFClass("\xD0\x80") == 2); +	SECTION("UTF8Classify 1 Greater than max Unicode 110000") { +		// Maximum Unicode value is 10FFFF so 110000 is out of range +		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));  	} -	SECTION("UTF8Classify 2 byte lead trail is invalid") { -		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid)); +	SECTION("UTF8Classify 4 byte overlong") { +		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));  	} -	SECTION("UTF8Classify Overlong") { -		REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid)); +	SECTION("UTF8Classify 4 byte valid character") { +		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);  	} -	SECTION("UTF8Classify single trail byte") { -		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid)); + +	// Invalid 2nd or 3rd continuation bytes +	SECTION("UTF8Classify 3 byte lead invalid 2nd trail") { +		REQUIRE(UTFClass("\xE2\x82q") == (1 | UTF8MaskInvalid)); +	} +	SECTION("UTF8Classify 4 byte lead invalid 2nd trail") { +		REQUIRE(UTFClass("\xF0\x9Fq\x9F") == (1 | UTF8MaskInvalid)); +	} +	SECTION("UTF8Classify 4 byte lead invalid 3rd trail") { +		REQUIRE(UTFClass("\xF0\x9F\x9Fq") == (1 | UTF8MaskInvalid));  	}  }  | 
