Feature [feature-requests:#1211]. Use pre-computed table for UTF8BytesOfLead.

Friendlier treatment of invalid UTF-8. Add tests for UniConversion handling invalid UTF-8. Simplify UTF8Classify tests.
author: Zufu Liu <unknown> 2018-03-22 15:02:38 +1100
committer: Zufu Liu <unknown> 2018-03-22 15:02:38 +1100
commit: ff707f0fe276677a4d89633ae4964e8b94712ca3 (patch)
tree: 103d8741341108a8dc04ef59923e19da6f4a64e4
parent: 9e4cdff7752304fff978ab7f606b64ea85310baf (diff)
download: scintilla-mirror-ff707f0fe276677a4d89633ae4964e8b94712ca3.tar.gz
11 files changed, 235 insertions, 194 deletions
diff --git a/.hgignore b/.hgignore
index 9fc409a8b..73365de42 100644
--- a/.hgignore
+++ b/.hgignore
@@ -1,6 +1,7 @@
 syntax: glob
 *.o
 *.a
+*.asm
 *.lib
 *.obj
 *.iobj
@@ -9,6 +10,7 @@ syntax: glob
 *.dylib
 *.framework
 *.pyd
+*.exe
 *.exp
 *.lib
 *.pdb
diff --git a/cocoa/PlatCocoa.mm b/cocoa/PlatCocoa.mm
index 5f9d788cd..c89a6f3aa 100644
--- a/cocoa/PlatCocoa.mm
+++ b/cocoa/PlatCocoa.mm
@@ -31,6 +31,7 @@
 
 #include "StringCopy.h"
 #include "XPM.h"
+#include "UniConversion.h"
 
 #import "ScintillaView.h"
 #import "ScintillaCocoa.h"
@@ -864,18 +865,6 @@ void SurfaceImpl::DrawTextTransparent(PRectangle rc, Font &font_, XYPOSITION yba
 	textLayout->draw(rc.left, ybase);
 }
 
-static size_t utf8LengthFromLead(unsigned char uch) {
-	if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) {
-		return 4;
-	} else if (uch >= (0x80 + 0x40 + 0x20)) {
-		return 3;
-	} else if (uch >= (0x80)) {
-		return 2;
-	} else {
-		return 1;
-	}
-}
-
 //--------------------------------------------------------------------------------------------------
 
 void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *positions) {
@@ -892,10 +881,10 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION
 		const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 		int i=0;
 		while (ui<fit) {
-			size_t lenChar = utf8LengthFromLead(us[i]);
-			size_t codeUnits = (lenChar < 4) ? 1 : 2;
+			const unsigned int byteCount = UTF8BytesOfLead[us[i]];
+			const int codeUnits = UTF16LengthFromUTF8ByteCount(byteCount);
 			CGFloat xPosition = CTLineGetOffsetForStringIndex(mLine, ui+codeUnits, NULL);
-			for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) {
+			for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {
 				positions[i++] = static_cast<XYPOSITION>(xPosition);
 			}
 			ui += codeUnits;
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 7b9a51dab..222b29a2e 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -546,6 +546,12 @@
 	The statically linked version of SciTE, Sc1, links to this static library.
 	</li>
 	<li>
+	Conversion of invalid UTF-8 to UTF-16 and UTF-32 is friendlier: a byte that is not a valid lead byte
+	is converted as a single character. For example, when copying to the clipboard on Windows, an invalid
+	lead byte will be copied as the equivalent ISO 8859-1 character and will not hide the following byte.
+	<a href="http://sourceforge.net/p/scintilla/feature-requests/1211/">Feature #1211.</a>
+	</li>
+	<li>
 	Lexer added for the Maxima computer algebra language.
 	<a href="http://sourceforge.net/p/scintilla/feature-requests/1210/">Feature #1210.</a>
 	</li>
diff --git a/gtk/PlatGTK.cxx b/gtk/PlatGTK.cxx
index ebedc6e93..da04bae61 100644
--- a/gtk/PlatGTK.cxx
+++ b/gtk/PlatGTK.cxx
@@ -781,7 +781,7 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION
 									positions[i++] = iti.position - (places - place) * iti.distance / places;
 									positionsCalculated++;
 								}
-								clusterStart += UTF8CharLength(static_cast<unsigned char>(utfForm.c_str()[clusterStart]));
+								clusterStart += UTF8BytesOfLead[static_cast<unsigned char>(utfForm.c_str()[clusterStart])];
 								place++;
 							}
 						}
diff --git a/qt/ScintillaEditBase/PlatQt.cpp b/qt/ScintillaEditBase/PlatQt.cpp
index 713f4c46f..87496a191 100644
--- a/qt/ScintillaEditBase/PlatQt.cpp
+++ b/qt/ScintillaEditBase/PlatQt.cpp
@@ -10,6 +10,7 @@
 
 #include "PlatQt.h"
 #include "Scintilla.h"
+#include "UniConversion.h"
 #include "DBCS.h"
 #include "FontQuality.h"
 
@@ -438,19 +439,6 @@ void SurfaceImpl::SetClip(PRectangle rc)
 	GetPainter()->setClipRect(QRectFFromPRect(rc));
 }
 
-static size_t utf8LengthFromLead(unsigned char uch)
-{
-	if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) {
-		return 4;
-	} else if (uch >= (0x80 + 0x40 + 0x20)) {
-		return 3;
-	} else if (uch >= (0x80)) {
-		return 2;
-	} else {
-		return 1;
-	}
-}
-
 void SurfaceImpl::MeasureWidths(Font &font,
                                 const char *s,
                                 int len,
@@ -470,10 +458,10 @@ void SurfaceImpl::MeasureWidths(Font &font,
 		const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 		int i=0;
 		while (ui<fit) {
-			size_t lenChar = utf8LengthFromLead(us[i]);
-			int codeUnits = (lenChar < 4) ? 1 : 2;
+			const unsigned int byteCount = UTF8BytesOfLead[us[i]];
+			const int codeUnits = UTF16LengthFromUTF8ByteCount(byteCount);
 			qreal xPosition = tl.cursorToX(ui+codeUnits);
-			for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) {
+			for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {
 				positions[i++] = xPosition;
 			}
 			ui += codeUnits;
diff --git a/src/Document.cxx b/src/Document.cxx
index 412798def..48913a16c 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -116,8 +116,6 @@ Document::Document(int options) :
 
 	matchesValid = false;
 
-	UTF8BytesOfLeadInitialise();
-
 	perLineData[ldMarkers] = std::make_unique<LineMarkers>();
 	perLineData[ldLevels] = std::make_unique<LineLevels>();
 	perLineData[ldState] = std::make_unique<LineState>();
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 8e537c689..19b968932 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -65,35 +65,15 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {
 		putf[k] = '\0';
 }
 
-unsigned int UTF8CharLength(unsigned char ch) {
-	if (ch < 0x80) {
-		return 1;
-	} else if (ch < 0x80 + 0x40 + 0x20) {
-		return 2;
-	} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
-		return 3;
-	} else {
-		return 4;
-	}
-}
-
 size_t UTF16Length(const char *s, size_t len) {
 	size_t ulen = 0;
-	size_t charLen;
-	for (size_t i = 0; i<len;) {
-		const unsigned char ch = static_cast<unsigned char>(s[i]);
-		if (ch < 0x80) {
-			charLen = 1;
-		} else if (ch < 0x80 + 0x40 + 0x20) {
-			charLen = 2;
-		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
-			charLen = 3;
-		} else {
-			charLen = 4;
-			ulen++;
-		}
-		i += charLen;
-		ulen++;
+	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
+	for (size_t i = 0; i < len;) {
+		const unsigned char ch = us[i];
+		const unsigned int byteCount = UTF8BytesOfLead[ch];
+		const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
+		i += byteCount;
+		ulen += (i > len) ? 1 : utf16Len;
 	}
 	return ulen;
 }
@@ -104,39 +84,60 @@ constexpr unsigned char TrailByteValue(unsigned char c) {
 	return c & 0b0011'1111;
 }
 
-const unsigned char utf8Start3 = 0b1110'0000;
-const unsigned char utf8Start4 = 0b1111'0000;
-
 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
 	size_t ui = 0;
 	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
-	size_t i = 0;
-	while ((i<len) && (ui<tlen)) {
-		unsigned char ch = us[i++];
-		if (ch < 0x80) {
+	for (size_t i = 0; i < len;) {
+		unsigned char ch = us[i];
+		const unsigned int byteCount = UTF8BytesOfLead[ch];
+		unsigned int value;
+
+		if (i + byteCount > len) {
+			// Lead byte implies more bytes than remain: copy it as-is if there is space to write, then stop
+			if (ui < tlen) {
+				tbuf[ui] = ch;
+				ui++;
+			}
+			break;
+		}
+
+		const size_t outLen = (byteCount < 4) ? 1 : 2;
+		if (ui + outLen > tlen) {
+			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
+		}
+
+		i++;
+		switch (byteCount) {
+		case 1:
 			tbuf[ui] = ch;
-		} else if (ch < utf8Start3) {
-			tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
+			break;
+		case 2:
+			value = (ch & 0x1F) << 6;
 			ch = us[i++];
-			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch));
-		} else if (ch < utf8Start4) {
-			tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
+			value += TrailByteValue(ch);
+			tbuf[ui] = static_cast<wchar_t>(value);
+			break;
+		case 3:
+			value = (ch & 0xF) << 12;
 			ch = us[i++];
-			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (TrailByteValue(ch) << 6));
+			value += (TrailByteValue(ch) << 6);
 			ch = us[i++];
-			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch));
-		} else {
+			value += TrailByteValue(ch);
+			tbuf[ui] = static_cast<wchar_t>(value);
+			break;
+		default:
 			// Outside the BMP so need two surrogates
-			int val = (ch & 0x7) << 18;
+			value = (ch & 0x7) << 18;
 			ch = us[i++];
-			val += TrailByteValue(ch) << 12;
+			value += TrailByteValue(ch) << 12;
 			ch = us[i++];
-			val += TrailByteValue(ch) << 6;
+			value += TrailByteValue(ch) << 6;
 			ch = us[i++];
-			val += TrailByteValue(ch);
-			tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
+			value += TrailByteValue(ch);
+			tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
 			ui++;
-			tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
+			tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
+			break;
 		}
 		ui++;
 	}
@@ -144,25 +145,44 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
 }
 
 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
-	size_t ui=0;
+	size_t ui = 0;
 	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
-	size_t i=0;
-	while ((i<len) && (ui<tlen)) {
-		unsigned char ch = us[i++];
-		unsigned int value = 0;
-		if (ch < 0x80) {
+	for (size_t i = 0; i < len;) {
+		unsigned char ch = us[i];
+		const unsigned int byteCount = UTF8BytesOfLead[ch];
+		unsigned int value;
+
+		if (i + byteCount > len) {
+			// Lead byte implies more bytes than remain: copy it as-is if there is space to write, then stop
+			if (ui < tlen) {
+				tbuf[ui] = ch;
+				ui++;
+			}
+			break;
+		}
+
+		if (ui == tlen) {
+			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
+		}
+
+		i++;
+		switch (byteCount) {
+		case 1:
 			value = ch;
-		} else if (((len-i) >= 1) && (ch < utf8Start3)) {
+			break;
+		case 2:
 			value = (ch & 0x1F) << 6;
 			ch = us[i++];
 			value += TrailByteValue(ch);
-		} else if (((len-i) >= 2) && (ch < utf8Start4)) {
+			break;
+		case 3:
 			value = (ch & 0xF) << 12;
 			ch = us[i++];
 			value += TrailByteValue(ch) << 6;
 			ch = us[i++];
 			value += TrailByteValue(ch);
-		} else if ((len-i) >= 3) {
+			break;
+		default:
 			value = (ch & 0x7) << 18;
 			ch = us[i++];
 			value += TrailByteValue(ch) << 12;
@@ -170,6 +190,7 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen)
 			value += TrailByteValue(ch) << 6;
 			ch = us[i++];
 			value += TrailByteValue(ch);
+			break;
 		}
 		tbuf[ui] = value;
 		ui++;
@@ -188,33 +209,24 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
 	}
 }
 
-int UTF8BytesOfLead[256];
-static bool initialisedBytesOfLead = false;
-
-static int BytesFromLead(int leadByte) {
-	if (leadByte < 0xC2) {
-		// Single byte or invalid
-		return 1;
-	} else if (leadByte < 0xE0) {
-		return 2;
-	} else if (leadByte < 0xF0) {
-		return 3;
-	} else if (leadByte < 0xF5) {
-		return 4;
-	} else {
-		// Characters longer than 4 bytes not possible in current UTF-8
-		return 1;
-	}
-}
-
-void UTF8BytesOfLeadInitialise() {
-	if (!initialisedBytesOfLead) {
-		for (int i=0; i<256; i++) {
-			UTF8BytesOfLead[i] = BytesFromLead(i);
-		}
-		initialisedBytesOfLead = true;
-	}
-}
+const unsigned char UTF8BytesOfLead[256] = {
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
+1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
+3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
+4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
+};
 
 // Return both the width of the first character in the string and a status
 // saying whether it is valid or invalid.
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 2f358c9c5..0f22c06e6 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -16,17 +16,15 @@ const int unicodeReplacementChar = 0xFFFD;
 
 size_t UTF8Length(const wchar_t *uptr, size_t tlen);
 void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len);
-unsigned int UTF8CharLength(unsigned char ch);
 size_t UTF16Length(const char *s, size_t len);
 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen);
 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen);
 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf);
 std::string FixInvalidUTF8(const std::string &text);
 
-extern int UTF8BytesOfLead[256];
-void UTF8BytesOfLeadInitialise();
+extern const unsigned char UTF8BytesOfLead[256];
 
-inline bool UTF8IsTrailByte(int ch) {
+inline bool UTF8IsTrailByte(unsigned char ch) {
 	return (ch >= 0x80) && (ch < 0xc0);
 }
 
@@ -64,6 +62,10 @@ inline unsigned int UTF16CharLength(wchar_t uch) {
 	return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
 }
 
+inline unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) {
+    return (byteCount < 4) ? 1 : 2;
+}
+
 }
 
 #endif
diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx
index 16ea1d974..4d34abd60 100644
--- a/test/unit/testUniConversion.cxx
+++ b/test/unit/testUniConversion.cxx
@@ -53,6 +53,24 @@ TEST_CASE("UTF16Length") {
 		size_t len = UTF16Length(s, strlen(s));
 		REQUIRE(len == 2U);
 	}
+
+	SECTION("UTF16Length Invalid Trail byte in lead position") {
+		const char *s = "a\xB5yz";
+		size_t len = UTF16Length(s, strlen(s));
+		REQUIRE(len == 4U);
+	}
+
+	SECTION("UTF16Length Invalid Lead byte at end") {
+		const char *s = "a\xC2";
+		size_t len = UTF16Length(s, strlen(s));
+		REQUIRE(len == 2U);
+	}
+
+	SECTION("UTF16Length Invalid Lead byte implies 3 trails but only 2") {
+		const char *s = "a\xF1yz";
+		size_t len = UTF16Length(s, strlen(s));
+		REQUIRE(len == 2U);
+	}
 }
 
 TEST_CASE("UniConversion") {
@@ -100,6 +118,35 @@ TEST_CASE("UniConversion") {
 		REQUIRE(tbuf[1] == 0xDF48);
 	}
 
+	SECTION("UTF16FromUTF8 Invalid Trail byte in lead position") {
+		const char s[] = "a\xB5yz";
+		wchar_t tbuf[4] = {};
+		size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4);
+		REQUIRE(tlen == 4U);
+		REQUIRE(tbuf[0] == 'a');
+		REQUIRE(tbuf[1] == 0xB5);
+		REQUIRE(tbuf[2] == 'y');
+		REQUIRE(tbuf[3] == 'z');
+	}
+
+	SECTION("UTF16FromUTF8 Invalid Lead byte at end") {
+		const char s[] = "a\xC2";
+		wchar_t tbuf[2] = {};
+		size_t tlen = UTF16FromUTF8(s, 2, tbuf, 2);
+		REQUIRE(tlen == 2U);
+		REQUIRE(tbuf[0] == 'a');
+		REQUIRE(tbuf[1] == 0xC2);
+	}
+
+	SECTION("UTF16FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
+		const char *s = "a\xF1yz";
+		wchar_t tbuf[4] = {};
+		size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4);
+		REQUIRE(tlen == 2U);
+		REQUIRE(tbuf[0] == 'a');
+		REQUIRE(tbuf[1] == 0xF1);
+	}
+
 	// UTF32FromUTF8
 
 	SECTION("UTF32FromUTF8 ASCII") {
@@ -141,6 +188,44 @@ TEST_CASE("UniConversion") {
 		REQUIRE(tlen == 1U);
 		REQUIRE(tbuf[0] == 0x10348);
 	}
+
+	SECTION("UTF32FromUTF8 Invalid Trail byte in lead position") {
+		const char s[] = "a\xB5yz";
+		unsigned int tbuf[4] = {};
+		size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4);
+		REQUIRE(tlen == 4U);
+		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+		REQUIRE(tbuf[1] == 0xB5);
+		REQUIRE(tbuf[2] == static_cast<unsigned int>('y'));
+		REQUIRE(tbuf[3] == static_cast<unsigned int>('z'));
+	}
+
+	SECTION("UTF32FromUTF8 Invalid Lead byte at end") {
+		const char s[] = "a\xC2";
+		unsigned int tbuf[2] = {};
+		size_t tlen = UTF32FromUTF8(s, 2, tbuf, 2);
+		REQUIRE(tlen == 2U);
+		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+		REQUIRE(tbuf[1] == 0xC2);
+	}
+
+	SECTION("UTF32FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
+		const char *s = "a\xF1yz";
+		unsigned int tbuf[4] = {};
+		size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4);
+		REQUIRE(tlen == 2U);
+		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+		REQUIRE(tbuf[1] == 0xF1);
+	}
+}
+
+namespace {
+
+// Simple adapter to avoid casting
+int UTFClass(const char *s) {
+	return UTF8Classify(reinterpret_cast<const unsigned char *>(s), static_cast<int>(strlen(s)));
+}
+
 }
 
 TEST_CASE("UTF8Classify") {
@@ -151,114 +236,76 @@ TEST_CASE("UTF8Classify") {
 	// Single byte
 
 	SECTION("UTF8Classify Simple ASCII") {
-		const char *s = "a";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 1);
+		REQUIRE(UTFClass("a") == 1);
 	}
-
 	SECTION("UTF8Classify Invalid Too large lead") {
-		const char *s = "\xF5";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1|UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));
 	}
 
 	// 4 byte lead
 
 	SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
-		const char *s = "\xF0";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 1FFFF non-character") {
-		const char *s = "\xF0\x9F\xBF\xBF";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (4 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
 		// Maximum Unicode value is 10FFFF so 110000 is out of range
-		const char *s = "\xF4\x90\x80\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 4 byte overlong") {
-		const char *s = "\xF0\x80\x80\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 4 byte valid character") {
-		const char *s = "\xF0\x9F\x8C\x90";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 4);
+		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
 	}
-
 	SECTION("UTF8Classify 4 byte bad trails") {
-		const char *s = "\xF0xyz";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));
 	}
 
 	// 3 byte lead
 
 	SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
-		const char *s = "\xEF";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 3 byte lead, overlong") {
-		const char *s = "\xE0\x80\xAF";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 3 byte lead, surrogate") {
-		const char *s = "\xED\xA0\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xED\xA0\x80") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify FFFE non-character") {
-		const char *s = "\xEF\xBF\xBE";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xEF\xBF\xBE") == (3 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify FFFF non-character") {
-		const char *s = "\xEF\xBF\xBF";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xEF\xBF\xBF") == (3 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify FDD0 non-character") {
-		const char *s = "\xEF\xB7\x90";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xEF\xB7\x90") == (3 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 3 byte valid character") {
-		const char *s = "\xE2\x82\xAC";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 3);
+		REQUIRE(UTFClass("\xE2\x82\xAC") == 3);
 	}
-
 	SECTION("UTF8Classify 3 byte bad trails") {
-		const char *s = "\xE2qq";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
 	}
 
 	// 2 byte lead
 
 	SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
-		const char *s = "\xD0";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify 2 byte valid character") {
-		const char *s = "\xD0\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 2);
+		REQUIRE(UTFClass("\xD0\x80") == 2);
 	}
-
 	SECTION("UTF8Classify 2 byte lead trail is invalid") {
-		const char *s = "\xD0q";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify Overlong") {
-		const char *s = "\xC0";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid));
 	}
-
 	SECTION("UTF8Classify single trail byte") {
-		const char *s = "\x80";
-		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
 	}
-}
-\ No newline at end of file
+}
diff --git a/win32/PlatWin.cxx b/win32/PlatWin.cxx
index 9e89e2f84..79970a969 100644
--- a/win32/PlatWin.cxx
+++ b/win32/PlatWin.cxx
@@ -951,12 +951,14 @@ void SurfaceGDI::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *
 			return;
 		}
 		// Map the widths given for UTF-16 characters back onto the UTF-8 input string
+		const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 		for (int ui = 0; ui < fit; ui++) {
-			const unsigned int lenChar = UTF8BytesOfLead[static_cast<unsigned char>(s[i])];
-			if (lenChar == 4) {	// Non-BMP
+			const unsigned char uch = us[i];
+			const unsigned int byteCount = UTF8BytesOfLead[uch];
+			if (byteCount == 4) {	// Non-BMP
 				ui++;
 			}
-			for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) {
+			for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {
 				positions[i++] = static_cast<XYPOSITION>(poses.buffer[ui]);
 			}
 		}
@@ -1623,16 +1625,11 @@ void SurfaceD2D::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *
 		int i=0;
 		while (ui<tbuf.tlen) {
 			const unsigned char uch = us[i];
-			unsigned int lenChar = 1;
-			if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) {
-				lenChar = 4;
+			const unsigned int byteCount = UTF8BytesOfLead[uch];
+			if (byteCount == 4) {	// Non-BMP
 				ui++;
-			} else if (uch >= (0x80 + 0x40 + 0x20)) {
-				lenChar = 3;
-			} else if (uch >= (0x80)) {
-				lenChar = 2;
 			}
-			for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) {
+			for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {
 				positions[i++] = poses.buffer[ui];
 			}
 			ui++;
diff --git a/win32/scintilla.mak b/win32/scintilla.mak
index 29f882032..7f943474b 100644
--- a/win32/scintilla.mak
+++ b/win32/scintilla.mak
@@ -66,7 +66,7 @@ CXXFLAGS=$(CXXFLAGS) $(INCLUDEDIRS)
 all:	$(COMPONENT) $(LEXCOMPONENT) $(LIBSCI)
 
 clean:
-	-del /q $(DIR_O)\*.obj $(DIR_O)\*.pdb $(COMPONENT) $(LEXCOMPONENT) \
+	-del /q $(DIR_O)\*.obj $(DIR_O)\*.pdb $(DIR_O)\*.asm $(COMPONENT) $(LEXCOMPONENT) \
 	$(DIR_O)\*.res $(DIR_BIN)\*.map $(DIR_BIN)\*.exp $(DIR_BIN)\*.pdb $(DIR_BIN)\*.lib
 
 # Required for base Scintilla
author	Zufu Liu <unknown>	2018-03-22 15:02:38 +1100
committer	Zufu Liu <unknown>	2018-03-22 15:02:38 +1100
commit	ff707f0fe276677a4d89633ae4964e8b94712ca3 (patch)
tree	103d8741341108a8dc04ef59923e19da6f4a64e4
parent	9e4cdff7752304fff978ab7f606b64ea85310baf (diff)
download	scintilla-mirror-ff707f0fe276677a4d89633ae4964e8b94712ca3.tar.gz