Optimize UTF-8 character length calculations by using an array.

author: nyamatongwe <devnull@localhost> 2012-05-26 12:17:54 +1000
committer: nyamatongwe <devnull@localhost> 2012-05-26 12:17:54 +1000
commit: 032a0017a6e992fc40790214c738dbc59c084dea (patch)
tree: e04ee892cef4668f4e70d3857760348613e70021 /src
parent: a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f (diff)
download: scintilla-mirror-032a0017a6e992fc40790214c738dbc59c084dea.tar.gz
3 files changed, 39 insertions, 17 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 6cae14e8a..d427d636d 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -112,6 +112,8 @@ Document::Document() {
 	matchesValid = false;
 	regex = 0;
 
+	UTF8BytesOfLeadInitialise();
+
 	perLineData[ldMarkers] = new LineMarkers();
 	perLineData[ldLevels] = new LineLevels();
 	perLineData[ldState] = new LineState();
@@ -449,19 +451,13 @@ int Document::LenChar(int pos) {
 	} else if (IsCrLf(pos)) {
 		return 2;
 	} else if (SC_CP_UTF8 == dbcsCodePage) {
-		unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
-		if (ch < 0x80)
-			return 1;
-		int len = 2;
-		if (ch >= (0x80 + 0x40 + 0x20 + 0x10))
-			len = 4;
-		else if (ch >= (0x80 + 0x40 + 0x20))
-			len = 3;
+		const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
+		const int widthCharBytes = UTF8BytesOfLead[leadByte];
 		int lengthDoc = Length();
-		if ((pos + len) > lengthDoc)
-			return lengthDoc -pos;
+		if ((pos + widthCharBytes) > lengthDoc)
+			return lengthDoc - pos;
 		else
-			return len;
+			return widthCharBytes;
 	} else if (dbcsCodePage) {
 		return IsDBCSLeadByte(cb.CharAt(pos)) ? 2 : 1;
 	} else {
@@ -720,12 +716,7 @@ int Document::SafeSegment(const char *text, int length, int lengthSegment) {
 		lastEncodingAllowedBreak = j;
 
 		if (dbcsCodePage == SC_CP_UTF8) {
-			if (ch < 0x80) {
-				j++;
-			} else {
-				int bytes = BytesFromLead(ch);
-				j += bytes ? bytes : 1;
-			}
+			j += UTF8BytesOfLead[ch];
 		} else if (dbcsCodePage) {
 			j += IsDBCSLeadByte(ch) ? 2 : 1;
 		} else {
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index e1ad99563..40ac982c9 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -130,6 +130,34 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig
 	return ui;
 }
 
+int UTF8BytesOfLead[256];	// Byte width of a UTF-8 character, indexed by its lead byte value; filled by UTF8BytesOfLeadInitialise()
+static bool initialisedBytesOfLead = false;	// Set to true once the table has been filled
+
+static int BytesFromLead(int leadByte) {	// Number of bytes to step over for a character that starts with leadByte
+	if (leadByte < 0xC2) {
+		// ASCII (0x00-0x7F), trail byte (0x80-0xBF) or overlong-only lead (0xC0, 0xC1): count as a single byte
+		return 1;
+	} else if (leadByte < 0xE0) {	// 0xC2-0xDF
+		return 2;
+	} else if (leadByte < 0xF0) {	// 0xE0-0xEF
+		return 3;
+	} else if (leadByte < 0xF5) {	// 0xF0-0xF4
+		return 4;
+	} else {
+		// 0xF5-0xFF cannot start a character in current UTF-8 (nothing longer than 4 bytes): count as a single byte
+		return 1;
+	}
+}
+
+void UTF8BytesOfLeadInitialise() {	// Fill UTF8BytesOfLead on the first call; later calls do nothing
+	if (!initialisedBytesOfLead) {	// NOTE(review): plain bool with no synchronisation visible here - confirm the first call cannot race
+		for (int i=0;i<256;i++) {
+			UTF8BytesOfLead[i] = BytesFromLead(i);	// One entry per possible byte value
+		}
+		initialisedBytesOfLead = true;
+	}
+}
+
 // Return both the width of the first character in the string and a status
 // saying whether it is valid or invalid.
 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 6793221cf..87cc43f77 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -11,6 +11,9 @@ unsigned int UTF8CharLength(unsigned char ch);
 unsigned int UTF16Length(const char *s, unsigned int len);
 unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen);
 
+extern int UTF8BytesOfLead[256];
+void UTF8BytesOfLeadInitialise();
+
 inline bool UTF8IsTrailByte(int ch) {
 	return (ch >= 0x80) && (ch < 0xc0);
 }
author	nyamatongwe <devnull@localhost>	2012-05-26 12:17:54 +1000
committer	nyamatongwe <devnull@localhost>	2012-05-26 12:17:54 +1000
commit	032a0017a6e992fc40790214c738dbc59c084dea (patch)
tree	e04ee892cef4668f4e70d3857760348613e70021 /src
parent	a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f (diff)
download	scintilla-mirror-032a0017a6e992fc40790214c738dbc59c084dea.tar.gz