diff options
Diffstat (limited to 'src/Document.cxx')
-rw-r--r-- | src/Document.cxx | 60 |
1 files changed, 55 insertions, 5 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index e2ca7a32a..ff8d0fbcf 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -287,6 +287,55 @@ int Document::LenChar(int pos) { } } +static bool IsTrailByte(int ch) { + return (ch >= 0x80) && (ch < (0x80 + 0x40)); +} + +static int BytesFromLead(int leadByte) { + if (leadByte > 0xF4) { + // Characters longer than 4 bytes not possible in current UTF-8 + return 0; + } else if (leadByte >= 0xF0) { + return 4; + } else if (leadByte >= 0xE0) { + return 3; + } else if (leadByte >= 0xC2) { + return 2; + } + return 0; +} + +bool Document::InGoodUTF8(int pos, int &start, int &end) { + int lead = pos; + while ((lead>0) && (pos-lead < 4) && IsTrailByte(static_cast<unsigned char>(cb.CharAt(lead-1)))) + lead--; + start = 0; + if (lead > 0) { + start = lead-1; + } + int leadByte = static_cast<unsigned char>(cb.CharAt(start)); + int bytes = BytesFromLead(leadByte); + if (bytes == 0) { + return false; + } else { + int trailBytes = bytes - 1; + int len = pos - lead + 1; + if (len > trailBytes) + // pos too far from lead + return false; + // Check that there are enough trails for this lead + int trail = pos + 1; + while ((trail-lead<trailBytes) && (trail < Length())) { + if (!IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail)))) { + return false; + } + trail++; + } + end = start + bytes; + return true; + } +} + // Normalise a position so that it is not halfway through a two byte character. // This can occur in two situations - // When lines are terminated with \r\n pairs which should be treated as one character. @@ -313,13 +362,14 @@ int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) { if (dbcsCodePage) { if (SC_CP_UTF8 == dbcsCodePage) { unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); - while ((pos > 0) && (pos < Length()) && (ch >= 0x80) && (ch < (0x80 + 0x40))) { - // ch is a trail byte + int startUTF = pos; + int endUTF = pos; + if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) { + // ch is a trail byte within a UTF-8 character if (moveDir > 0) - pos++; + pos = endUTF; else - pos--; - ch = static_cast<unsigned char>(cb.CharAt(pos)); + pos = startUTF; } } else { // Anchor DBCS calculations at start of line because start of line can |