diff options
| author | mitchell <unknown> | 2018-05-05 11:52:27 -0400 | 
|---|---|---|
| committer | mitchell <unknown> | 2018-05-05 11:52:27 -0400 | 
| commit | 93462d87c3c8f398d5900be84349f29cb088d849 (patch) | |
| tree | 631eb19ad8d818e262f4988139d96709b4ee8cd5 /src | |
| parent | 156c3f0e53ea2a7f932f6079cb122c2cf66fb3df (diff) | |
| download | scintilla-mirror-93462d87c3c8f398d5900be84349f29cb088d849.tar.gz | |
Backport: Feature [feature-requests:#1211]. Use pre-computed table for UTF8BytesOfLead.
Friendlier treatment of invalid UTF-8. Add tests for UniConversion handling invalid UTF-8. Simplify UTF8Classify tests.
Backport of changeset 6643:ebbb4e5aaf93.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Document.cxx | 2 |
| -rw-r--r-- | src/UniConversion.cxx | 180 |
| -rw-r--r-- | src/UniConversion.h | 10 | 
3 files changed, 102 insertions, 90 deletions
| diff --git a/src/Document.cxx b/src/Document.cxx index 6018ca96c..16e5bec9e 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -120,8 +120,6 @@ Document::Document(int options) :  	matchesValid = false; -	UTF8BytesOfLeadInitialise(); -  	perLineData[ldMarkers].reset(new LineMarkers());  	perLineData[ldLevels].reset(new LineLevels());  	perLineData[ldState].reset(new LineState()); diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 255acca1d..c4025c403 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -65,35 +65,15 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {  		putf[k] = '\0';  } -unsigned int UTF8CharLength(unsigned char ch) { -	if (ch < 0x80) { -		return 1; -	} else if (ch < 0x80 + 0x40 + 0x20) { -		return 2; -	} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { -		return 3; -	} else { -		return 4; -	} -} -  size_t UTF16Length(const char *s, size_t len) {  	size_t ulen = 0; -	size_t charLen; -	for (size_t i = 0; i<len;) { -		const unsigned char ch = static_cast<unsigned char>(s[i]); -		if (ch < 0x80) { -			charLen = 1; -		} else if (ch < 0x80 + 0x40 + 0x20) { -			charLen = 2; -		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { -			charLen = 3; -		} else { -			charLen = 4; -			ulen++; -		} -		i += charLen; -		ulen++; +	const unsigned char *us = reinterpret_cast<const unsigned char *>(s); +	for (size_t i = 0; i < len;) { +		const unsigned char ch = us[i]; +		const unsigned int byteCount = UTF8BytesOfLead[ch]; +		const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount); +		i += byteCount; +		ulen += (i > len) ? 
1 : utf16Len;  	}  	return ulen;  } @@ -104,39 +84,60 @@ constexpr unsigned char TrailByteValue(unsigned char c) {  	return c & 0x3F;  } -const unsigned char utf8Start3 = 0xE0; -const unsigned char utf8Start4 = 0xF0; -  size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {  	size_t ui = 0;  	const unsigned char *us = reinterpret_cast<const unsigned char *>(s); -	size_t i = 0; -	while ((i<len) && (ui<tlen)) { -		unsigned char ch = us[i++]; -		if (ch < 0x80) { +	for (size_t i = 0; i < len;) { +		unsigned char ch = us[i]; +		const unsigned int byteCount = UTF8BytesOfLead[ch]; +		unsigned int value; + +		if (i + byteCount > len) { +			// Trying to read past end but still have space to write +			if (ui < tlen) { +			tbuf[ui] = ch; +				ui++; +			} +			break; +		} + +		const size_t outLen = (byteCount < 4) ? 1 : 2; +		if (ui + outLen > tlen) { +			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end"); +		} + +		i++; +		switch (byteCount) { +		case 1:  			tbuf[ui] = ch; -		} else if (ch < utf8Start3) { -			tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6); +			break; +		case 2: +			value = (ch & 0x1F) << 6;  			ch = us[i++]; -			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch)); -		} else if (ch < utf8Start4) { -			tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12); +			value += TrailByteValue(ch); +			tbuf[ui] = static_cast<wchar_t>(value); +			break; +		case 3: +			value = (ch & 0xF) << 12;  			ch = us[i++]; -			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (TrailByteValue(ch) << 6)); +			value += (TrailByteValue(ch) << 6);  			ch = us[i++]; -			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch)); -		} else { +			value += TrailByteValue(ch); +			tbuf[ui] = static_cast<wchar_t>(value); +			break; +		default:  			// Outside the BMP so need two surrogates -			int val = (ch & 0x7) << 18; +			value = (ch & 0x7) << 18;  			ch = us[i++]; -			val += TrailByteValue(ch) << 12; +			value += TrailByteValue(ch) << 12;  			
ch = us[i++]; -			val += TrailByteValue(ch) << 6; +			value += TrailByteValue(ch) << 6;  			ch = us[i++]; -			val += TrailByteValue(ch); -			tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); +			value += TrailByteValue(ch); +			tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);  			ui++; -			tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); +			tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST); +			break;  		}  		ui++;  	} @@ -144,25 +145,44 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {  }  size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) { -	size_t ui=0; +	size_t ui = 0;  	const unsigned char *us = reinterpret_cast<const unsigned char *>(s); -	size_t i=0; -	while ((i<len) && (ui<tlen)) { -		unsigned char ch = us[i++]; -		unsigned int value = 0; -		if (ch < 0x80) { +	for (size_t i = 0; i < len;) { +		unsigned char ch = us[i]; +		const unsigned int byteCount = UTF8BytesOfLead[ch]; +		unsigned int value; + +		if (i + byteCount > len) { +			// Trying to read past end but still have space to write +			if (ui < tlen) { +				tbuf[ui] = ch; +				ui++; +			} +			break; +		} + +		if (ui == tlen) { +			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end"); +		} + +		i++; +		switch (byteCount) { +		case 1:  			value = ch; -		} else if (((len-i) >= 1) && (ch < utf8Start3)) { +			break; +		case 2:  			value = (ch & 0x1F) << 6;  			ch = us[i++];  			value += TrailByteValue(ch); -		} else if (((len-i) >= 2) && (ch < utf8Start4)) { +			break; +		case 3:  			value = (ch & 0xF) << 12;  			ch = us[i++];  			value += TrailByteValue(ch) << 6;  			ch = us[i++];  			value += TrailByteValue(ch); -		} else if ((len-i) >= 3) { +			break; +		default:  			value = (ch & 0x7) << 18;  			ch = us[i++];  			value += TrailByteValue(ch) << 12; @@ -170,6 +190,7 @@ size_t UTF32FromUTF8(const char *s, size_t len, 
unsigned int *tbuf, size_t tlen)  			value += TrailByteValue(ch) << 6;  			ch = us[i++];  			value += TrailByteValue(ch); +			break;  		}  		tbuf[ui] = value;  		ui++; @@ -188,33 +209,24 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {  	}  } -int UTF8BytesOfLead[256]; -static bool initialisedBytesOfLead = false; - -static int BytesFromLead(int leadByte) { -	if (leadByte < 0xC2) { -		// Single byte or invalid -		return 1; -	} else if (leadByte < 0xE0) { -		return 2; -	} else if (leadByte < 0xF0) { -		return 3; -	} else if (leadByte < 0xF5) { -		return 4; -	} else { -		// Characters longer than 4 bytes not possible in current UTF-8 -		return 1; -	} -} - -void UTF8BytesOfLeadInitialise() { -	if (!initialisedBytesOfLead) { -		for (int i=0; i<256; i++) { -			UTF8BytesOfLead[i] = BytesFromLead(i); -		} -		initialisedBytesOfLead = true; -	} -} +const unsigned char UTF8BytesOfLead[256] = { +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF +1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF +2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF +3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF +4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF +};  // Return both the width of the first character in the string and a status  // saying whether it is valid or 
invalid. diff --git a/src/UniConversion.h b/src/UniConversion.h index 2f358c9c5..0f22c06e6 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -16,17 +16,15 @@ const int unicodeReplacementChar = 0xFFFD;  size_t UTF8Length(const wchar_t *uptr, size_t tlen);  void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len); -unsigned int UTF8CharLength(unsigned char ch);  size_t UTF16Length(const char *s, size_t len);  size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen);  size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen);  unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf);  std::string FixInvalidUTF8(const std::string &text); -extern int UTF8BytesOfLead[256]; -void UTF8BytesOfLeadInitialise(); +extern const unsigned char UTF8BytesOfLead[256]; -inline bool UTF8IsTrailByte(int ch) { +inline bool UTF8IsTrailByte(unsigned char ch) {  	return (ch >= 0x80) && (ch < 0xc0);  } @@ -64,6 +62,10 @@ inline unsigned int UTF16CharLength(wchar_t uch) {  	return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;  } +inline unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) { +    return (byteCount < 4) ? 1 : 2; +} +  }  #endif | 
