diff options
Diffstat (limited to 'src/Document.cxx')
-rw-r--r-- | src/Document.cxx | 37 |
1 files changed, 26 insertions, 11 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index 9b30b44aa..3ba78c086 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -608,22 +608,37 @@ bool Document::IsCrLf(Sci::Position pos) const { } int Document::LenChar(Sci::Position pos) { - if (pos < 0) { + if (pos < 0 || pos >= Length()) { + // Returning 1 instead of 0 to defend against hanging with a loop that goes (or starts) out of bounds. return 1; } else if (IsCrLf(pos)) { return 2; - } else if (SC_CP_UTF8 == dbcsCodePage) { - const unsigned char leadByte = cb.UCharAt(pos); + } + + const unsigned char leadByte = cb.UCharAt(pos); + if (!dbcsCodePage || UTF8IsAscii(leadByte)) { + // Common case: ASCII character + return 1; + } + if (SC_CP_UTF8 == dbcsCodePage) { const int widthCharBytes = UTF8BytesOfLead[leadByte]; - const Sci::Position lengthDoc = Length(); - if ((pos + widthCharBytes) > lengthDoc) - return static_cast<int>(lengthDoc - pos); - else - return widthCharBytes; - } else if (dbcsCodePage) { - return IsDBCSLeadByteNoExcept(cb.CharAt(pos)) ? 2 : 1; + unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 }; + for (int b = 1; b < widthCharBytes; b++) { + charBytes[b] = cb.UCharAt(pos + b); + } + const int utf8status = UTF8Classify(charBytes, widthCharBytes); + if (utf8status & UTF8MaskInvalid) { + // Treat as invalid and use up just one byte + return 1; + } else { + return utf8status & UTF8MaskWidth; + } } else { - return 1; + if (IsDBCSLeadByteNoExcept(leadByte) && ((pos + 1) < Length())) { + return 2; + } else { + return 1; + } } } |