diff options
author | nyamatongwe <devnull@localhost> | 2012-05-26 12:17:54 +1000 |
---|---|---|
committer | nyamatongwe <devnull@localhost> | 2012-05-26 12:17:54 +1000 |
commit | 032a0017a6e992fc40790214c738dbc59c084dea (patch) | |
tree | e04ee892cef4668f4e70d3857760348613e70021 /src | |
parent | a6598d01d00e24f8c0ee2f4e9cc9036dd447c15f (diff) | |
download | scintilla-mirror-032a0017a6e992fc40790214c738dbc59c084dea.tar.gz |
Optimize UTF-8 character length calculations by using an array.
Diffstat (limited to 'src')
-rw-r--r-- | src/Document.cxx | 25 | ||||
-rw-r--r-- | src/UniConversion.cxx | 28 | ||||
-rw-r--r-- | src/UniConversion.h | 3 |
3 files changed, 39 insertions, 17 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 6cae14e8a..d427d636d 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -112,6 +112,8 @@ Document::Document() {
 	matchesValid = false;
 	regex = 0;
 
+	UTF8BytesOfLeadInitialise();
+
 	perLineData[ldMarkers] = new LineMarkers();
 	perLineData[ldLevels] = new LineLevels();
 	perLineData[ldState] = new LineState();
@@ -449,19 +451,13 @@ int Document::LenChar(int pos) {
 	} else if (IsCrLf(pos)) {
 		return 2;
 	} else if (SC_CP_UTF8 == dbcsCodePage) {
-		unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
-		if (ch < 0x80)
-			return 1;
-		int len = 2;
-		if (ch >= (0x80 + 0x40 + 0x20 + 0x10))
-			len = 4;
-		else if (ch >= (0x80 + 0x40 + 0x20))
-			len = 3;
+		const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
+		const int widthCharBytes = UTF8BytesOfLead[leadByte];
 		int lengthDoc = Length();
-		if ((pos + len) > lengthDoc)
-			return lengthDoc -pos;
+		if ((pos + widthCharBytes) > lengthDoc)
+			return lengthDoc - pos;
 		else
-			return len;
+			return widthCharBytes;
 	} else if (dbcsCodePage) {
 		return IsDBCSLeadByte(cb.CharAt(pos)) ? 2 : 1;
 	} else {
@@ -720,12 +716,7 @@ int Document::SafeSegment(const char *text, int length, int lengthSegment) {
 			lastEncodingAllowedBreak = j;
 
 		if (dbcsCodePage == SC_CP_UTF8) {
-			if (ch < 0x80) {
-				j++;
-			} else {
-				int bytes = BytesFromLead(ch);
-				j += bytes ? bytes : 1;
-			}
+			j += UTF8BytesOfLead[ch];
 		} else if (dbcsCodePage) {
 			j += IsDBCSLeadByte(ch) ? 2 : 1;
 		} else {
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index e1ad99563..40ac982c9 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -130,6 +130,34 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig
 	return ui;
 }
 
+int UTF8BytesOfLead[256];
+static bool initialisedBytesOfLead = false;
+
+static int BytesFromLead(int leadByte) {
+	if (leadByte < 0xC2) {
+		// Single byte or invalid
+		return 1;
+	} else if (leadByte < 0xE0) {
+		return 2;
+	} else if (leadByte < 0xF0) {
+		return 3;
+	} else if (leadByte < 0xF5) {
+		return 4;
+	} else {
+		// Characters longer than 4 bytes not possible in current UTF-8
+		return 1;
+	}
+}
+
+void UTF8BytesOfLeadInitialise() {
+	if (!initialisedBytesOfLead) {
+		for (int i=0;i<256;i++) {
+			UTF8BytesOfLead[i] = BytesFromLead(i);
+		}
+		initialisedBytesOfLead = true;
+	}
+}
+
 // Return both the width of the first character in the string and a status
 // saying whether it is valid or invalid.
 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 6793221cf..87cc43f77 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -11,6 +11,9 @@ unsigned int UTF8CharLength(unsigned char ch);
 unsigned int UTF16Length(const char *s, unsigned int len);
 unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen);
 
+extern int UTF8BytesOfLead[256];
+void UTF8BytesOfLeadInitialise();
+
 inline bool UTF8IsTrailByte(int ch) {
 	return (ch >= 0x80) && (ch < 0xc0);
 }