aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author nyamatongwe <unknown> 2012-05-26 12:17:54 +1000
committer nyamatongwe <unknown> 2012-05-26 12:17:54 +1000
commit 477a06c700990e4b646472ce1682a8e68a93383d (patch)
tree 53c15b811862eb874a98d071ac854724da52ba1b
parent c725c015867e59efd1ebe66e0247b62e38e04ac9 (diff)
download scintilla-mirror-477a06c700990e4b646472ce1682a8e68a93383d.tar.gz
Optimize UTF-8 character length calculations by using an array.
-rw-r--r-- src/Document.cxx 25
-rw-r--r-- src/UniConversion.cxx 28
-rw-r--r-- src/UniConversion.h 3
3 files changed, 39 insertions, 17 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 6cae14e8a..d427d636d 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -112,6 +112,8 @@ Document::Document() {
matchesValid = false;
regex = 0;
+ UTF8BytesOfLeadInitialise();
+
perLineData[ldMarkers] = new LineMarkers();
perLineData[ldLevels] = new LineLevels();
perLineData[ldState] = new LineState();
@@ -449,19 +451,13 @@ int Document::LenChar(int pos) {
} else if (IsCrLf(pos)) {
return 2;
} else if (SC_CP_UTF8 == dbcsCodePage) {
- unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
- if (ch < 0x80)
- return 1;
- int len = 2;
- if (ch >= (0x80 + 0x40 + 0x20 + 0x10))
- len = 4;
- else if (ch >= (0x80 + 0x40 + 0x20))
- len = 3;
+ const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
+ const int widthCharBytes = UTF8BytesOfLead[leadByte];
int lengthDoc = Length();
- if ((pos + len) > lengthDoc)
- return lengthDoc -pos;
+ if ((pos + widthCharBytes) > lengthDoc)
+ return lengthDoc - pos;
else
- return len;
+ return widthCharBytes;
} else if (dbcsCodePage) {
return IsDBCSLeadByte(cb.CharAt(pos)) ? 2 : 1;
} else {
@@ -720,12 +716,7 @@ int Document::SafeSegment(const char *text, int length, int lengthSegment) {
lastEncodingAllowedBreak = j;
if (dbcsCodePage == SC_CP_UTF8) {
- if (ch < 0x80) {
- j++;
- } else {
- int bytes = BytesFromLead(ch);
- j += bytes ? bytes : 1;
- }
+ j += UTF8BytesOfLead[ch];
} else if (dbcsCodePage) {
j += IsDBCSLeadByte(ch) ? 2 : 1;
} else {
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index e1ad99563..40ac982c9 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -130,6 +130,34 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig
return ui;
}
+int UTF8BytesOfLead[256]; // Indexed by a byte value 0..255; UTF8BytesOfLeadInitialise() stores BytesFromLead(i) in entry i
+static bool initialisedBytesOfLead = false; // Becomes true once the table has been filled, so later initialise calls skip the loop
+
+static int BytesFromLead(int leadByte) { // Byte count of the UTF-8 sequence that leadByte would start; 1 when it cannot start a multi-byte sequence
+ if (leadByte < 0xC2) {
+ // Single byte (0x00-0x7F), or invalid as a lead: trail bytes 0x80-0xBF and 0xC0/0xC1 all count as one isolated byte
+ return 1;
+ } else if (leadByte < 0xE0) { // 0xC2-0xDF
+ return 2;
+ } else if (leadByte < 0xF0) { // 0xE0-0xEF
+ return 3;
+ } else if (leadByte < 0xF5) { // 0xF0-0xF4
+ return 4;
+ } else {
+ // 0xF5 and above: characters longer than 4 bytes are not possible in current UTF-8, so count the byte on its own
+ return 1;
+ }
+}
+
+void UTF8BytesOfLeadInitialise() { // Fills UTF8BytesOfLead on the first call; every later call finds the flag set and does nothing
+ if (!initialisedBytesOfLead) {
+ for (int i=0;i<256;i++) { // one entry per possible byte value
+ UTF8BytesOfLead[i] = BytesFromLead(i);
+ }
+ initialisedBytesOfLead = true; // NOTE(review): plain bool with no locking visible here - confirm first use happens on a single thread
+ }
+}
+
// Return both the width of the first character in the string and a status
// saying whether it is valid or invalid.
// Most invalid sequences return a width of 1 so are treated as isolated bytes but
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 6793221cf..87cc43f77 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -11,6 +11,9 @@ unsigned int UTF8CharLength(unsigned char ch);
unsigned int UTF16Length(const char *s, unsigned int len);
unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen);
+extern int UTF8BytesOfLead[256]; // Per-byte-value width table; populated by UTF8BytesOfLeadInitialise(), so call that before reading it
+void UTF8BytesOfLeadInitialise(); // Fills UTF8BytesOfLead; safe to call repeatedly, only the first call does the work
+
// True when ch falls in the UTF-8 trail (continuation) byte range 0x80..0xBF inclusive.
inline bool UTF8IsTrailByte(int ch) {
	return !((ch < 0x80) || (ch >= 0xc0));
}