diff options
| -rw-r--r-- | .hgignore | 2 | ||||
| -rw-r--r-- | cocoa/PlatCocoa.mm | 19 | ||||
| -rw-r--r-- | doc/ScintillaHistory.html | 6 | ||||
| -rw-r--r-- | gtk/PlatGTK.cxx | 2 | ||||
| -rw-r--r-- | qt/ScintillaEditBase/PlatQt.cpp | 20 | ||||
| -rw-r--r-- | src/Document.cxx | 2 | ||||
| -rw-r--r-- | src/UniConversion.cxx | 180 | ||||
| -rw-r--r-- | src/UniConversion.h | 10 | ||||
| -rw-r--r-- | test/unit/testUniConversion.cxx | 167 | ||||
| -rw-r--r-- | win32/PlatWin.cxx | 19 | ||||
| -rw-r--r-- | win32/scintilla.mak | 2 | 
11 files changed, 235 insertions, 194 deletions
@@ -1,6 +1,7 @@  syntax: glob  *.o  *.a +*.asm  *.lib  *.obj  *.iobj @@ -9,6 +10,7 @@ syntax: glob  *.dylib  *.framework  *.pyd +*.exe  *.exp  *.lib  *.pdb diff --git a/cocoa/PlatCocoa.mm b/cocoa/PlatCocoa.mm index 5f9d788cd..c89a6f3aa 100644 --- a/cocoa/PlatCocoa.mm +++ b/cocoa/PlatCocoa.mm @@ -31,6 +31,7 @@  #include "StringCopy.h"  #include "XPM.h" +#include "UniConversion.h"  #import "ScintillaView.h"  #import "ScintillaCocoa.h" @@ -864,18 +865,6 @@ void SurfaceImpl::DrawTextTransparent(PRectangle rc, Font &font_, XYPOSITION yba  	textLayout->draw(rc.left, ybase);  } -static size_t utf8LengthFromLead(unsigned char uch) { -	if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) { -		return 4; -	} else if (uch >= (0x80 + 0x40 + 0x20)) { -		return 3; -	} else if (uch >= (0x80)) { -		return 2; -	} else { -		return 1; -	} -} -  //--------------------------------------------------------------------------------------------------  void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *positions) { @@ -892,10 +881,10 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION  		const unsigned char *us = reinterpret_cast<const unsigned char *>(s);  		int i=0;  		while (ui<fit) { -			size_t lenChar = utf8LengthFromLead(us[i]); -			size_t codeUnits = (lenChar < 4) ? 1 : 2; +			const unsigned int byteCount = UTF8BytesOfLead[us[i]]; +			const int codeUnits = UTF16LengthFromUTF8ByteCount(byteCount);  			CGFloat xPosition = CTLineGetOffsetForStringIndex(mLine, ui+codeUnits, NULL); -			for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) { +			for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {  				positions[i++] = static_cast<XYPOSITION>(xPosition);  			}  			ui += codeUnits; diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index 7b9a51dab..222b29a2e 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -546,6 +546,12 @@  	The statically linked version of SciTE, Sc1, links to this static library.  	</li>  	<li> +	In some cases, invalid UTF-8 is handled in a way that is a little friendlier. +	For example, when copying to the clipboard on Windows, an invalid lead byte will be copied as the +	equivalent ISO 8859-1 character and will not hide the following byte. +	<a href="http://sourceforge.net/p/scintilla/feature-requests/1211/">Feature #1211.</a> +	</li> +	<li>  	Lexer added for the Maxima computer algebra language.  	<a href="http://sourceforge.net/p/scintilla/feature-requests/1210/">Feature #1210.</a>  	</li> diff --git a/gtk/PlatGTK.cxx b/gtk/PlatGTK.cxx index ebedc6e93..da04bae61 100644 --- a/gtk/PlatGTK.cxx +++ b/gtk/PlatGTK.cxx @@ -781,7 +781,7 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION  									positions[i++] = iti.position - (places - place) * iti.distance / places;  									positionsCalculated++;  								} -								clusterStart += UTF8CharLength(static_cast<unsigned char>(utfForm.c_str()[clusterStart])); +								clusterStart += UTF8BytesOfLead[static_cast<unsigned char>(utfForm.c_str()[clusterStart])];  								place++;  							}  						} diff --git a/qt/ScintillaEditBase/PlatQt.cpp b/qt/ScintillaEditBase/PlatQt.cpp index 713f4c46f..87496a191 100644 --- a/qt/ScintillaEditBase/PlatQt.cpp +++ b/qt/ScintillaEditBase/PlatQt.cpp @@ -10,6 +10,7 @@  #include "PlatQt.h"  #include "Scintilla.h" +#include "UniConversion.h"  #include "DBCS.h"  #include "FontQuality.h" @@ -438,19 +439,6 @@ void SurfaceImpl::SetClip(PRectangle rc)  	GetPainter()->setClipRect(QRectFFromPRect(rc));  } -static size_t utf8LengthFromLead(unsigned char uch) -{ -	if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) { -		return 4; -	} else if (uch >= (0x80 + 0x40 + 0x20)) { -		return 3; -	} else if (uch >= (0x80)) { -		return 2; -	} else { -		return 1; -	} -} -  void SurfaceImpl::MeasureWidths(Font &font,                                  const char *s,                                  int len, @@ -470,10 +458,10 @@ void SurfaceImpl::MeasureWidths(Font &font,  		const unsigned char *us = reinterpret_cast<const unsigned char *>(s);  		int i=0;  		while (ui<fit) { -			size_t lenChar = utf8LengthFromLead(us[i]); -			int codeUnits = (lenChar < 4) ? 1 : 2; +			const unsigned int byteCount = UTF8BytesOfLead[us[i]]; +			const int codeUnits = UTF16LengthFromUTF8ByteCount(byteCount);  			qreal xPosition = tl.cursorToX(ui+codeUnits); -			for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) { +			for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {  				positions[i++] = xPosition;  			}  			ui += codeUnits; diff --git a/src/Document.cxx b/src/Document.cxx index 412798def..48913a16c 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -116,8 +116,6 @@ Document::Document(int options) :  	matchesValid = false; -	UTF8BytesOfLeadInitialise(); -  	perLineData[ldMarkers] = std::make_unique<LineMarkers>();  	perLineData[ldLevels] = std::make_unique<LineLevels>();  	perLineData[ldState] = std::make_unique<LineState>(); diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 8e537c689..19b968932 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -65,35 +65,15 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {  		putf[k] = '\0';  } -unsigned int UTF8CharLength(unsigned char ch) { -	if (ch < 0x80) { -		return 1; -	} else if (ch < 0x80 + 0x40 + 0x20) { -		return 2; -	} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { -		return 3; -	} else { -		return 4; -	} -} -  size_t UTF16Length(const char *s, size_t len) {  	size_t ulen = 0; -	size_t charLen; -	for (size_t i = 0; i<len;) { -		const unsigned char ch = static_cast<unsigned char>(s[i]); -		if (ch < 0x80) { -			charLen = 1; -		} else if (ch < 0x80 + 0x40 + 0x20) { -			charLen = 2; -		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { -			charLen = 3; -		} else { -			charLen = 4; -			ulen++; -		} -		i += charLen; -		ulen++; +	const unsigned char *us = reinterpret_cast<const unsigned char *>(s); +	for (size_t i = 0; i < len;) { +		const unsigned char ch = us[i]; +		const unsigned int byteCount = UTF8BytesOfLead[ch]; +		const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount); +		i += byteCount; +		ulen += (i > len) ? 1 : utf16Len;  	}  	return ulen;  } @@ -104,39 +84,60 @@ constexpr unsigned char TrailByteValue(unsigned char c) {  	return c & 0b0011'1111;  } -const unsigned char utf8Start3 = 0b1110'0000; -const unsigned char utf8Start4 = 0b1111'0000; -  size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {  	size_t ui = 0;  	const unsigned char *us = reinterpret_cast<const unsigned char *>(s); -	size_t i = 0; -	while ((i<len) && (ui<tlen)) { -		unsigned char ch = us[i++]; -		if (ch < 0x80) { +	for (size_t i = 0; i < len;) { +		unsigned char ch = us[i]; +		const unsigned int byteCount = UTF8BytesOfLead[ch]; +		unsigned int value; + +		if (i + byteCount > len) { +			// Trying to read past end but still have space to write +			if (ui < tlen) { +				tbuf[ui] = ch; +				ui++; +			} +			break; +		} + +		const size_t outLen = (byteCount < 4) ? 1 : 2; +		if (ui + outLen > tlen) { +			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end"); +		} + +		i++; +		switch (byteCount) { +		case 1:  			tbuf[ui] = ch; -		} else if (ch < utf8Start3) { -			tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6); +			break; +		case 2: +			value = (ch & 0x1F) << 6;  			ch = us[i++]; -			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch)); -		} else if (ch < utf8Start4) { -			tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12); +			value += TrailByteValue(ch); +			tbuf[ui] = static_cast<wchar_t>(value); +			break; +		case 3: +			value = (ch & 0xF) << 12;  			ch = us[i++]; -			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (TrailByteValue(ch) << 6)); +			value += (TrailByteValue(ch) << 6);  			ch = us[i++]; -			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch)); -		} else { +			value += TrailByteValue(ch); +			tbuf[ui] = static_cast<wchar_t>(value); +			break; +		default:  			// Outside the BMP so need two surrogates -			int val = (ch & 0x7) << 18; +			value = (ch & 0x7) << 18;  			ch = us[i++]; -			val += TrailByteValue(ch) << 12; +			value += TrailByteValue(ch) << 12;  			ch = us[i++]; -			val += TrailByteValue(ch) << 6; +			value += TrailByteValue(ch) << 6;  			ch = us[i++]; -			val += TrailByteValue(ch); -			tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); +			value += TrailByteValue(ch); +			tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);  			ui++; -			tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); +			tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST); +			break;  		}  		ui++;  	} @@ -144,25 +145,44 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {  }  size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) { -	size_t ui=0; +	size_t ui = 0;  	const unsigned char *us = reinterpret_cast<const unsigned char *>(s); -	size_t i=0; -	while ((i<len) && (ui<tlen)) { -		unsigned char ch = us[i++]; -		unsigned int value = 0; -		if (ch < 0x80) { +	for (size_t i = 0; i < len;) { +		unsigned char ch = us[i]; +		const unsigned int byteCount = UTF8BytesOfLead[ch]; +		unsigned int value; + +		if (i + byteCount > len) { +			// Trying to read past end but still have space to write +			if (ui < tlen) { +				tbuf[ui] = ch; +				ui++; +			} +			break; +		} + +		if (ui == tlen) { +			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end"); +		} + +		i++; +		switch (byteCount) { +		case 1:  			value = ch; -		} else if (((len-i) >= 1) && (ch < utf8Start3)) { +			break; +		case 2:  			value = (ch & 0x1F) << 6;  			ch = us[i++];  			value += TrailByteValue(ch); -		} else if (((len-i) >= 2) && (ch < utf8Start4)) { +			break; +		case 3:  			value = (ch & 0xF) << 12;  			ch = us[i++];  			value += TrailByteValue(ch) << 6;  			ch = us[i++];  			value += TrailByteValue(ch); -		} else if ((len-i) >= 3) { +			break; +		default:  			value = (ch & 0x7) << 18;  			ch = us[i++];  			value += TrailByteValue(ch) << 12; @@ -170,6 +190,7 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen)  			value += TrailByteValue(ch) << 6;  			ch = us[i++];  			value += TrailByteValue(ch); +			break;  		}  		tbuf[ui] = value;  		ui++; @@ -188,33 +209,24 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {  	}  } -int UTF8BytesOfLead[256]; -static bool initialisedBytesOfLead = false; - -static int BytesFromLead(int leadByte) { -	if (leadByte < 0xC2) { -		// Single byte or invalid -		return 1; -	} else if (leadByte < 0xE0) { -		return 2; -	} else if (leadByte < 0xF0) { -		return 3; -	} else if (leadByte < 0xF5) { -		return 4; -	} else { -		// Characters longer than 4 bytes not possible in current UTF-8 -		return 1; -	} -} - -void UTF8BytesOfLeadInitialise() { -	if (!initialisedBytesOfLead) { -		for (int i=0; i<256; i++) { -			UTF8BytesOfLead[i] = BytesFromLead(i); -		} -		initialisedBytesOfLead = true; -	} -} +const unsigned char UTF8BytesOfLead[256] = { +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF +1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF +2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF +3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF +4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF +};  // Return both the width of the first character in the string and a status  // saying whether it is valid or invalid. diff --git a/src/UniConversion.h b/src/UniConversion.h index 2f358c9c5..0f22c06e6 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -16,17 +16,15 @@ const int unicodeReplacementChar = 0xFFFD;  size_t UTF8Length(const wchar_t *uptr, size_t tlen);  void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len); -unsigned int UTF8CharLength(unsigned char ch);  size_t UTF16Length(const char *s, size_t len);  size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen);  size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen);  unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf);  std::string FixInvalidUTF8(const std::string &text); -extern int UTF8BytesOfLead[256]; -void UTF8BytesOfLeadInitialise(); +extern const unsigned char UTF8BytesOfLead[256]; -inline bool UTF8IsTrailByte(int ch) { +inline bool UTF8IsTrailByte(unsigned char ch) {  	return (ch >= 0x80) && (ch < 0xc0);  } @@ -64,6 +62,10 @@ inline unsigned int UTF16CharLength(wchar_t uch) {  	return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;  } +inline unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) { +    return (byteCount < 4) ? 1 : 2; +} +  }  #endif diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx index 16ea1d974..4d34abd60 100644 --- a/test/unit/testUniConversion.cxx +++ b/test/unit/testUniConversion.cxx @@ -53,6 +53,24 @@ TEST_CASE("UTF16Length") {  		size_t len = UTF16Length(s, strlen(s));  		REQUIRE(len == 2U);  	} + +	SECTION("UTF16Length Invalid Trail byte in lead position") { +		const char *s = "a\xB5yz"; +		size_t len = UTF16Length(s, strlen(s)); +		REQUIRE(len == 4U); +	} + +	SECTION("UTF16Length Invalid Lead byte at end") { +		const char *s = "a\xC2"; +		size_t len = UTF16Length(s, strlen(s)); +		REQUIRE(len == 2U); +	} + +	SECTION("UTF16Length Invalid Lead byte implies 3 trails but only 2") { +		const char *s = "a\xF1yz"; +		size_t len = UTF16Length(s, strlen(s)); +		REQUIRE(len == 2U); +	}  }  TEST_CASE("UniConversion") { @@ -100,6 +118,35 @@ TEST_CASE("UniConversion") {  		REQUIRE(tbuf[1] == 0xDF48);  	} +	SECTION("UTF16FromUTF8 Invalid Trail byte in lead position") { +		const char s[] = "a\xB5yz"; +		wchar_t tbuf[4] = {}; +		size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4); +		REQUIRE(tlen == 4U); +		REQUIRE(tbuf[0] == 'a'); +		REQUIRE(tbuf[1] == 0xB5); +		REQUIRE(tbuf[2] == 'y'); +		REQUIRE(tbuf[3] == 'z'); +	} + +	SECTION("UTF16FromUTF8 Invalid Lead byte at end") { +		const char s[] = "a\xC2"; +		wchar_t tbuf[2] = {}; +		size_t tlen = UTF16FromUTF8(s, 2, tbuf, 2); +		REQUIRE(tlen == 2U); +		REQUIRE(tbuf[0] == 'a'); +		REQUIRE(tbuf[1] == 0xC2); +	} + +	SECTION("UTF16FromUTF8 Invalid Lead byte implies 3 trails but only 2") { +		const char *s = "a\xF1yz"; +		wchar_t tbuf[4] = {}; +		size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4); +		REQUIRE(tlen == 2U); +		REQUIRE(tbuf[0] == 'a'); +		REQUIRE(tbuf[1] == 0xF1); +	} +  	// UTF32FromUTF8  	SECTION("UTF32FromUTF8 ASCII") { @@ -141,6 +188,44 @@ TEST_CASE("UniConversion") {  		REQUIRE(tlen == 1U);  		REQUIRE(tbuf[0] == 0x10348);  	} + +	SECTION("UTF32FromUTF8 Invalid Trail byte in lead position") { +		const char s[] = "a\xB5yz"; +		unsigned int tbuf[4] = {}; +		size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4); +		REQUIRE(tlen == 4U); +		REQUIRE(tbuf[0] == static_cast<unsigned int>('a')); +		REQUIRE(tbuf[1] == 0xB5); +		REQUIRE(tbuf[2] == static_cast<unsigned int>('y')); +		REQUIRE(tbuf[3] == static_cast<unsigned int>('z')); +	} + +	SECTION("UTF32FromUTF8 Invalid Lead byte at end") { +		const char s[] = "a\xC2"; +		unsigned int tbuf[2] = {}; +		size_t tlen = UTF32FromUTF8(s, 2, tbuf, 2); +		REQUIRE(tlen == 2U); +		REQUIRE(tbuf[0] == static_cast<unsigned int>('a')); +		REQUIRE(tbuf[1] == 0xC2); +	} + +	SECTION("UTF32FromUTF8 Invalid Lead byte implies 3 trails but only 2") { +		const char *s = "a\xF1yz"; +		unsigned int tbuf[4] = {}; +		size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4); +		REQUIRE(tlen == 2U); +		REQUIRE(tbuf[0] == static_cast<unsigned int>('a')); +		REQUIRE(tbuf[1] == 0xF1); +	} +} + +namespace { + +// Simple adapter to avoid casting +int UTFClass(const char *s) { +	return UTF8Classify(reinterpret_cast<const unsigned char *>(s), static_cast<int>(strlen(s))); +} +  }  TEST_CASE("UTF8Classify") { @@ -151,114 +236,76 @@ TEST_CASE("UTF8Classify") {  	// Single byte  	SECTION("UTF8Classify Simple ASCII") { -		const char *s = "a"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 1); +		REQUIRE(UTFClass("a") == 1);  	} -  	SECTION("UTF8Classify Invalid Too large lead") { -		const char *s = "\xF5"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1|UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));  	}  	// 4 byte lead  	SECTION("UTF8Classify 4 byte lead, string less than 4 long") { -		const char *s = "\xF0"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify 1FFFF non-character") { -		const char *s = "\xF0\x9F\xBF\xBF"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (4 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {  		// Maximum Unicode value is 10FFFF so 110000 is out of range -		const char *s = "\xF4\x90\x80\x80"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify 4 byte overlong") { -		const char *s = "\xF0\x80\x80\x80"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify 4 byte valid character") { -		const char *s = "\xF0\x9F\x8C\x90"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 4); +		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);  	} -  	SECTION("UTF8Classify 4 byte bad trails") { -		const char *s = "\xF0xyz"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));  	}  	// 3 byte lead  	SECTION("UTF8Classify 3 byte lead, string less than 3 long") { -		const char *s = "\xEF"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify 3 byte lead, overlong") { -		const char *s = "\xE0\x80\xAF"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify 3 byte lead, surrogate") { -		const char *s = "\xED\xA0\x80"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xED\xA0\x80") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify FFFE non-character") { -		const char *s = "\xEF\xBF\xBE"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xEF\xBF\xBE") == (3 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify FFFF non-character") { -		const char *s = "\xEF\xBF\xBF"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xEF\xBF\xBF") == (3 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify FDD0 non-character") { -		const char *s = "\xEF\xB7\x90"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xEF\xB7\x90") == (3 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify 3 byte valid character") { -		const char *s = "\xE2\x82\xAC"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 3); +		REQUIRE(UTFClass("\xE2\x82\xAC") == 3);  	} -  	SECTION("UTF8Classify 3 byte bad trails") { -		const char *s = "\xE2qq"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));  	}  	// 2 byte lead  	SECTION("UTF8Classify 2 byte lead, string less than 2 long") { -		const char *s = "\xD0"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify 2 byte valid character") { -		const char *s = "\xD0\x80"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 2); +		REQUIRE(UTFClass("\xD0\x80") == 2);  	} -  	SECTION("UTF8Classify 2 byte lead trail is invalid") { -		const char *s = "\xD0q"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify Overlong") { -		const char *s = "\xC0"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid));  	} -  	SECTION("UTF8Classify single trail byte") { -		const char *s = "\x80"; -		REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); +		REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));  	} -}
\ No newline at end of file +} diff --git a/win32/PlatWin.cxx b/win32/PlatWin.cxx index 9e89e2f84..79970a969 100644 --- a/win32/PlatWin.cxx +++ b/win32/PlatWin.cxx @@ -951,12 +951,14 @@ void SurfaceGDI::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *  			return;  		}  		// Map the widths given for UTF-16 characters back onto the UTF-8 input string +		const unsigned char *us = reinterpret_cast<const unsigned char *>(s);  		for (int ui = 0; ui < fit; ui++) { -			const unsigned int lenChar = UTF8BytesOfLead[static_cast<unsigned char>(s[i])]; -			if (lenChar == 4) {	// Non-BMP +			const unsigned char uch = us[i]; +			const unsigned int byteCount = UTF8BytesOfLead[uch]; +			if (byteCount == 4) {	// Non-BMP  				ui++;  			} -			for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) { +			for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {  				positions[i++] = static_cast<XYPOSITION>(poses.buffer[ui]);  			}  		} @@ -1623,16 +1625,11 @@ void SurfaceD2D::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *  		int i=0;  		while (ui<tbuf.tlen) {  			const unsigned char uch = us[i]; -			unsigned int lenChar = 1; -			if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) { -				lenChar = 4; +			const unsigned int byteCount = UTF8BytesOfLead[uch]; +			if (byteCount == 4) {	// Non-BMP  				ui++; -			} else if (uch >= (0x80 + 0x40 + 0x20)) { -				lenChar = 3; -			} else if (uch >= (0x80)) { -				lenChar = 2;  			} -			for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) { +			for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {  				positions[i++] = poses.buffer[ui];  			}  			ui++; diff --git a/win32/scintilla.mak b/win32/scintilla.mak index 29f882032..7f943474b 100644 --- a/win32/scintilla.mak +++ b/win32/scintilla.mak @@ -66,7 +66,7 @@ CXXFLAGS=$(CXXFLAGS) $(INCLUDEDIRS)  all:	$(COMPONENT) $(LEXCOMPONENT) $(LIBSCI)  clean: -	-del /q $(DIR_O)\*.obj $(DIR_O)\*.pdb $(COMPONENT) $(LEXCOMPONENT) \ +	-del /q $(DIR_O)\*.obj $(DIR_O)\*.pdb $(DIR_O)\*.asm $(COMPONENT) $(LEXCOMPONENT) \  	$(DIR_O)\*.res $(DIR_BIN)\*.map $(DIR_BIN)\*.exp $(DIR_BIN)\*.pdb $(DIR_BIN)\*.lib  # Required for base Scintilla  | 
