diff options
author | nyamatongwe <devnull@localhost> | 2012-05-26 13:36:25 +1000 |
---|---|---|
committer | nyamatongwe <devnull@localhost> | 2012-05-26 13:36:25 +1000 |
commit | 645d0f2b7c5ebad6d757e35cd257bcbf4a118f68 (patch) | |
tree | 2dc8da26482b7830d196aa0de0b92e329950efe4 | |
parent | e1370f834348a12fea75ad883a0d801dfd1b9d8d (diff) | |
download | scintilla-mirror-645d0f2b7c5ebad6d757e35cd257bcbf4a118f68.tar.gz |
Use standardised and more stringent UTF8Classify for determining validity and
width of UTF-8 characters. Optimize to make up for cost of UTF8Classify.
Drop functions now provided by UniConversion.
Use UTF8IsAscii function instead of test against literal.
-rw-r--r-- | src/Document.cxx | 119 |
1 files changed, 61 insertions, 58 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index 4e9366064..244e96e4f 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -465,51 +465,29 @@ int Document::LenChar(int pos) { } } -static inline bool IsTrailByte(int ch) { - return (ch >= 0x80) && (ch < 0xc0); -} - -static int BytesFromLead(int leadByte) { - if (leadByte > 0xF4) { - // Characters longer than 4 bytes not possible in current UTF-8 - return 0; - } else if (leadByte >= 0xF0) { - return 4; - } else if (leadByte >= 0xE0) { - return 3; - } else if (leadByte >= 0xC2) { - return 2; - } - return 0; -} - bool Document::InGoodUTF8(int pos, int &start, int &end) const { - int lead = pos; - while ((lead>0) && (pos-lead < 4) && IsTrailByte(static_cast<unsigned char>(cb.CharAt(lead-1)))) - lead--; - start = 0; - if (lead > 0) { - start = lead-1; - } - int leadByte = static_cast<unsigned char>(cb.CharAt(start)); - int bytes = BytesFromLead(leadByte); - if (bytes == 0) { + int trail = pos; + while ((trail>0) && (pos-trail < UTF8MaxBytes) && UTF8IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail-1)))) + trail--; + start = (trail > 0) ? trail-1 : trail; + + const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(start)); + const int widthCharBytes = UTF8BytesOfLead[leadByte]; + if (widthCharBytes == 1) { return false; } else { - int trailBytes = bytes - 1; - int len = pos - lead + 1; + int trailBytes = widthCharBytes - 1; + int len = pos - start; if (len > trailBytes) // pos too far from lead return false; - // Check that there are enough trails for this lead - int trail = pos + 1; - while ((trail-lead<trailBytes) && (trail < Length())) { - if (!IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail)))) { - return false; - } - trail++; - } - end = start + bytes; + char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0}; + for (int b=1; b<widthCharBytes && ((start+b) < Length()); b++) + charBytes[b] = cb.CharAt(static_cast<int>(start+b)); + int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes); + if (utf8status & UTF8MaskInvalid) + return false; + end = start + widthCharBytes; return true; } } @@ -538,14 +516,18 @@ int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) { if (dbcsCodePage) { if (SC_CP_UTF8 == dbcsCodePage) { unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); - int startUTF = pos; - int endUTF = pos; - if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) { - // ch is a trail byte within a UTF-8 character - if (moveDir > 0) - pos = endUTF; - else - pos = startUTF; + // If ch is not a trail byte then pos is valid intercharacter position + if (UTF8IsTrailByte(ch)) { + int startUTF = pos; + int endUTF = pos; + if (InGoodUTF8(pos, startUTF, endUTF)) { + // ch is a trail byte within a UTF-8 character + if (moveDir > 0) + pos = endUTF; + else + pos = startUTF; + } + // Else invalid UTF-8 so return position of isolated trail byte } } else { // Anchor DBCS calculations at start of line because start of line can @@ -592,16 +574,37 @@ int Document::NextPosition(int pos, int moveDir) const { if (dbcsCodePage) { if (SC_CP_UTF8 == dbcsCodePage) { - pos += increment; - unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); - int startUTF = pos; - int endUTF = pos; - if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) { - // ch is a trail byte within a UTF-8 character - if (moveDir > 0) - pos = endUTF; - else - pos = startUTF; + if (increment == 1) { + // Simple forward movement case so can avoid some checks + const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos)); + if (UTF8IsAscii(leadByte)) { + // Single byte character or invalid + pos++; + } else { + const int widthCharBytes = UTF8BytesOfLead[leadByte]; + char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0}; + for (int b=1; b<widthCharBytes; b++) + charBytes[b] = cb.CharAt(static_cast<int>(pos+b)); + int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes); + if (utf8status & UTF8MaskInvalid) + pos++; + else + pos += utf8status & UTF8MaskWidth; + } + } else { + // Examine byte before position + pos--; + unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); + // If ch is not a trail byte then pos is valid intercharacter position + if (UTF8IsTrailByte(ch)) { + // If ch is a trail byte in a valid UTF-8 character then return start of character + int startUTF = pos; + int endUTF = pos; + if (InGoodUTF8(pos, startUTF, endUTF)) { + pos = startUTF; + } + // Else invalid UTF-8 so return position of isolated trail byte + } } } else { if (moveDir > 0) { @@ -1246,7 +1249,7 @@ int Document::ParaDown(int pos) { } CharClassify::cc Document::WordCharClass(unsigned char ch) { - if ((SC_CP_UTF8 == dbcsCodePage) && (ch >= 0x80)) + if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch))) return CharClassify::ccWord; return charClass.GetClass(ch); } |