about | summary | refs | log | tree | commit | diff | homepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/Document.cxx 37
1 files changed, 26 insertions, 11 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 9b30b44aa..3ba78c086 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -608,22 +608,37 @@ bool Document::IsCrLf(Sci::Position pos) const {
}
int Document::LenChar(Sci::Position pos) {
- if (pos < 0) {
+ if (pos < 0 || pos >= Length()) {
+ // Returning 1 instead of 0 to defend against hanging with a loop that goes (or starts) out of bounds.
return 1;
} else if (IsCrLf(pos)) {
return 2;
- } else if (SC_CP_UTF8 == dbcsCodePage) {
- const unsigned char leadByte = cb.UCharAt(pos);
+ }
+
+ const unsigned char leadByte = cb.UCharAt(pos);
+ if (!dbcsCodePage || UTF8IsAscii(leadByte)) {
+ // Common case: ASCII character
+ return 1;
+ }
+ if (SC_CP_UTF8 == dbcsCodePage) {
const int widthCharBytes = UTF8BytesOfLead[leadByte];
- const Sci::Position lengthDoc = Length();
- if ((pos + widthCharBytes) > lengthDoc)
- return static_cast<int>(lengthDoc - pos);
- else
- return widthCharBytes;
- } else if (dbcsCodePage) {
- return IsDBCSLeadByteNoExcept(cb.CharAt(pos)) ? 2 : 1;
+ unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
+ for (int b = 1; b < widthCharBytes; b++) {
+ charBytes[b] = cb.UCharAt(pos + b);
+ }
+ const int utf8status = UTF8Classify(charBytes, widthCharBytes);
+ if (utf8status & UTF8MaskInvalid) {
+ // Treat as invalid and use up just one byte
+ return 1;
+ } else {
+ return utf8status & UTF8MaskWidth;
+ }
} else {
- return 1;
+ if (IsDBCSLeadByteNoExcept(leadByte) && ((pos + 1) < Length())) {
+ return 2;
+ } else {
+ return 1;
+ }
}
}