diff options
| author | nyamatongwe <unknown> | 2012-05-26 12:17:54 +1000 | 
|---|---|---|
| committer | nyamatongwe <unknown> | 2012-05-26 12:17:54 +1000 | 
| commit | 477a06c700990e4b646472ce1682a8e68a93383d (patch) | |
| tree | 53c15b811862eb874a98d071ac854724da52ba1b | |
| parent | c725c015867e59efd1ebe66e0247b62e38e04ac9 (diff) | |
| download | scintilla-mirror-477a06c700990e4b646472ce1682a8e68a93383d.tar.gz | |
Optimize UTF-8 character length calculations by using a 256-entry lookup table (UTF8BytesOfLead) indexed by lead byte.
| -rw-r--r-- | src/Document.cxx | 25 | ||||
| -rw-r--r-- | src/UniConversion.cxx | 28 | ||||
| -rw-r--r-- | src/UniConversion.h | 3 | 
3 files changed, 39 insertions, 17 deletions
| diff --git a/src/Document.cxx b/src/Document.cxx index 6cae14e8a..d427d636d 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -112,6 +112,8 @@ Document::Document() {  	matchesValid = false;  	regex = 0; +	UTF8BytesOfLeadInitialise(); +  	perLineData[ldMarkers] = new LineMarkers();  	perLineData[ldLevels] = new LineLevels();  	perLineData[ldState] = new LineState(); @@ -449,19 +451,13 @@ int Document::LenChar(int pos) {  	} else if (IsCrLf(pos)) {  		return 2;  	} else if (SC_CP_UTF8 == dbcsCodePage) { -		unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); -		if (ch < 0x80) -			return 1; -		int len = 2; -		if (ch >= (0x80 + 0x40 + 0x20 + 0x10)) -			len = 4; -		else if (ch >= (0x80 + 0x40 + 0x20)) -			len = 3; +		const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos)); +		const int widthCharBytes = UTF8BytesOfLead[leadByte];  		int lengthDoc = Length(); -		if ((pos + len) > lengthDoc) -			return lengthDoc -pos; +		if ((pos + widthCharBytes) > lengthDoc) +			return lengthDoc - pos;  		else -			return len; +			return widthCharBytes;  	} else if (dbcsCodePage) {  		return IsDBCSLeadByte(cb.CharAt(pos)) ? 2 : 1;  	} else { @@ -720,12 +716,7 @@ int Document::SafeSegment(const char *text, int length, int lengthSegment) {  		lastEncodingAllowedBreak = j;  		if (dbcsCodePage == SC_CP_UTF8) { -			if (ch < 0x80) { -				j++; -			} else { -				int bytes = BytesFromLead(ch); -				j += bytes ? bytes : 1; -			} +			j += UTF8BytesOfLead[ch];  		} else if (dbcsCodePage) {  			j += IsDBCSLeadByte(ch) ? 
2 : 1;  		} else { diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index e1ad99563..40ac982c9 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -130,6 +130,34 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig  	return ui;  } +int UTF8BytesOfLead[256]; +static bool initialisedBytesOfLead = false; + +static int BytesFromLead(int leadByte) { +	if (leadByte < 0xC2) { +		// Single byte or invalid +		return 1; +	} else if (leadByte < 0xE0) { +		return 2; +	} else if (leadByte < 0xF0) { +		return 3; +	} else if (leadByte < 0xF5) { +		return 4; +	} else { +		// Characters longer than 4 bytes not possible in current UTF-8 +		return 1; +	} +} + +void UTF8BytesOfLeadInitialise() { +	if (!initialisedBytesOfLead) { +		for (int i=0;i<256;i++) { +			UTF8BytesOfLead[i] = BytesFromLead(i); +		} +		initialisedBytesOfLead = true; +	} +} +  // Return both the width of the first character in the string and a status  // saying whether it is valid or invalid.  // Most invalid sequences return a width of 1 so are treated as isolated bytes but diff --git a/src/UniConversion.h b/src/UniConversion.h index 6793221cf..87cc43f77 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -11,6 +11,9 @@ unsigned int UTF8CharLength(unsigned char ch);  unsigned int UTF16Length(const char *s, unsigned int len);  unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen); +extern int UTF8BytesOfLead[256]; +void UTF8BytesOfLeadInitialise(); +  inline bool UTF8IsTrailByte(int ch) {  	return (ch >= 0x80) && (ch < 0xc0);  } | 
