Feature [feature-requests:#1211]. Use pre-computed table for UTF8BytesOfLead.

Friendlier treatment of invalid UTF-8. Add tests for UniConversion handling invalid UTF-8. Simplify UTF8Classify tests.
author: Zufu Liu <unknown> 2018-03-22 15:02:38 +1100
committer: Zufu Liu <unknown> 2018-03-22 15:02:38 +1100
commit: ff707f0fe276677a4d89633ae4964e8b94712ca3 (patch)
tree: 103d8741341108a8dc04ef59923e19da6f4a64e4 /test/unit/testUniConversion.cxx
parent: 9e4cdff7752304fff978ab7f606b64ea85310baf (diff)
download: scintilla-mirror-ff707f0fe276677a4d89633ae4964e8b94712ca3.tar.gz
1 file changed, 107 insertions, 60 deletions
diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx
index 16ea1d974..4d34abd60 100644
--- a/test/unit/testUniConversion.cxx
+++ b/test/unit/testUniConversion.cxx
@@ -53,6 +53,24 @@ TEST_CASE("UTF16Length") {
 		size_t len = UTF16Length(s, strlen(s));
 		REQUIRE(len == 2U);
 	}
+
+	SECTION("UTF16Length Invalid Trail byte in lead position") {
+		const char *s = "a\xB5yz";
+		size_t len = UTF16Length(s, strlen(s));
+		REQUIRE(len == 4U);
+	}
+
+	SECTION("UTF16Length Invalid Lead byte at end") {
+		const char *s = "a\xC2";
+		size_t len = UTF16Length(s, strlen(s));
+		REQUIRE(len == 2U);
+	}
+
+	SECTION("UTF16Length Invalid Lead byte implies 3 trails but only 2") {
+		const char *s = "a\xF1yz";
+		size_t len = UTF16Length(s, strlen(s));
+		REQUIRE(len == 2U);
+	}
 }
 
 TEST_CASE("UniConversion") {
@@ -100,6 +118,35 @@ TEST_CASE("UniConversion") {
 		REQUIRE(tbuf[1] == 0xDF48);
 	}
 
+	SECTION("UTF16FromUTF8 Invalid Trail byte in lead position") {
+		const char s[] = "a\xB5yz";
+		wchar_t tbuf[4] = {};
+		size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4);
+		REQUIRE(tlen == 4U);
+		REQUIRE(tbuf[0] == 'a');
+		REQUIRE(tbuf[1] == 0xB5);
+		REQUIRE(tbuf[2] == 'y');
+		REQUIRE(tbuf[3] == 'z');
+	}
+
+	SECTION("UTF16FromUTF8 Invalid Lead byte at end") {
+		const char s[] = "a\xC2";
+		wchar_t tbuf[2] = {};
+		size_t tlen = UTF16FromUTF8(s, 2, tbuf, 2);
+		REQUIRE(tlen == 2U);
+		REQUIRE(tbuf[0] == 'a');
+		REQUIRE(tbuf[1] == 0xC2);
+	}
+
+	SECTION("UTF16FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
+		const char *s = "a\xF1yz";
+		wchar_t tbuf[4] = {};
+		size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4);
+		REQUIRE(tlen == 2U);
+		REQUIRE(tbuf[0] == 'a');
+		REQUIRE(tbuf[1] == 0xF1);
+	}
+
 	// UTF32FromUTF8
 
 	SECTION("UTF32FromUTF8 ASCII") {
@@ -141,6 +188,44 @@ TEST_CASE("UniConversion") {
 		REQUIRE(tlen == 1U);
 		REQUIRE(tbuf[0] == 0x10348);
 	}
+
+	SECTION("UTF32FromUTF8 Invalid Trail byte in lead position") {
+		const char s[] = "a\xB5yz";
+		unsigned int tbuf[4] = {};
+		size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4);
+		REQUIRE(tlen == 4U);
+		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+		REQUIRE(tbuf[1] == 0xB5);
+		REQUIRE(tbuf[2] == static_cast<unsigned int>('y'));
+		REQUIRE(tbuf[3] == static_cast<unsigned int>('z'));
+	}
+
+	SECTION("UTF32FromUTF8 Invalid Lead byte at end") {
+		const char s[] = "a\xC2";
+		unsigned int tbuf[2] = {};
+		size_t tlen = UTF32FromUTF8(s, 2, tbuf, 2);
+		REQUIRE(tlen == 2U);
+		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+		REQUIRE(tbuf[1] == 0xC2);
+	}
+
+	SECTION("UTF32FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
+		const char *s = "a\xF1yz";
+		unsigned int tbuf[4] = {};
+		size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4);
+		REQUIRE(tlen == 2U);
+		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+		REQUIRE(tbuf[1] == 0xF1);
+	}
+}
+
+namespace {
+
+// Simple adapter to avoid casting
+int UTFClass(const char *s) {
+	return UTF8Classify(reinterpret_cast<const unsigned char *>(s), static_cast<int>(strlen(s)));
+}
+
 }
 
 TEST_CASE("UTF8Classify") {
@@ -151,114 +236,76 @@ TEST_CASE("UTF8Classify") {
 	// Single byte
 
 	SECTION("UTF8Classify Simple ASCII") {
-		const char *s = "a";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 1);
+		REQUIRE(UTFClass("a") == 1);
 	}
-
 	SECTION("UTF8Classify Invalid Too large lead") {
-		const char *s = "\xF5";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1|UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));
 	}
 
 	// 4 byte lead
 
 	SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
-		const char *s = "\xF0";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 1FFFF non-character") {
-		const char *s = "\xF0\x9F\xBF\xBF";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (4 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
 		// Maximum Unicode value is 10FFFF so 110000 is out of range
-		const char *s = "\xF4\x90\x80\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 4 byte overlong") {
-		const char *s = "\xF0\x80\x80\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 4 byte valid character") {
-		const char *s = "\xF0\x9F\x8C\x90";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 4);
+		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
 	}
-
 	SECTION("UTF8Classify 4 byte bad trails") {
-		const char *s = "\xF0xyz";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));
 	}
 
 	// 3 byte lead
 
 	SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
-		const char *s = "\xEF";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 3 byte lead, overlong") {
-		const char *s = "\xE0\x80\xAF";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 3 byte lead, surrogate") {
-		const char *s = "\xED\xA0\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xED\xA0\x80") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify FFFE non-character") {
-		const char *s = "\xEF\xBF\xBE";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xEF\xBF\xBE") == (3 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify FFFF non-character") {
-		const char *s = "\xEF\xBF\xBF";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xEF\xBF\xBF") == (3 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify FDD0 non-character") {
-		const char *s = "\xEF\xB7\x90";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xEF\xB7\x90") == (3 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 3 byte valid character") {
-		const char *s = "\xE2\x82\xAC";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 3);
+		REQUIRE(UTFClass("\xE2\x82\xAC") == 3);
 	}
-
 	SECTION("UTF8Classify 3 byte bad trails") {
-		const char *s = "\xE2qq";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
 	}
 
 	// 2 byte lead
 
 	SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
-		const char *s = "\xD0";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 2 byte valid character") {
-		const char *s = "\xD0\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 2);
+		REQUIRE(UTFClass("\xD0\x80") == 2);
 	}
-
 	SECTION("UTF8Classify 2 byte lead trail is invalid") {
-		const char *s = "\xD0q";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify Overlong") {
-		const char *s = "\xC0";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify single trail byte") {
-		const char *s = "\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
 	}
-}
-\ No newline at end of file
+}
author	Zufu Liu <unknown>	2018-03-22 15:02:38 +1100
committer	Zufu Liu <unknown>	2018-03-22 15:02:38 +1100
commit	ff707f0fe276677a4d89633ae4964e8b94712ca3 (patch)
tree	103d8741341108a8dc04ef59923e19da6f4a64e4 /test/unit/testUniConversion.cxx
parent	9e4cdff7752304fff978ab7f606b64ea85310baf (diff)
download	scintilla-mirror-ff707f0fe276677a4d89633ae4964e8b94712ca3.tar.gz