diff options
-rw-r--r-- | doc/ScintillaHistory.html | 3 | ||||
-rw-r--r-- | src/Document.cxx | 101 | ||||
-rw-r--r-- | src/UniConversion.h | 4 | ||||
-rw-r--r-- | test/unit/testDocument.cxx | 56 |
4 files changed, 157 insertions, 7 deletions
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 782801b7a..ef9ad02d4 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -604,6 +604,9 @@
 	Serialize selection type and ranges with SCI_GETSELECTIONSERIALIZED and SCI_SETSELECTIONSERIALIZED.
 	</li>
 	<li>
+	Fix segmentation of long lexemes to avoid breaking before modifiers like accents that must be drawn with their base letters.
+	</li>
+	<li>
 	Fix bug on Qt where double-click stopped working when Scintilla instance had been running for weeks.
 	</li>
     </ul>
diff --git a/src/Document.cxx b/src/Document.cxx
index 379a88786..dc82b1902 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1202,6 +1202,92 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 		&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
 }
 
+namespace {
+
+// Remove any extra bytes after the last valid character so that text ends on a whole UTF-8 character.
+void DiscardEndFragment(std::string_view &text) noexcept {
+	if (!text.empty()) {
+		if (UTF8IsFirstByte(text.back())) {
+			// Ending with start of character byte is invalid
+			text.remove_suffix(1);
+		} else if (UTF8IsTrailByte(text.back())) {
+			// Go back to the start of last character: trail counts its bytes including the lead byte, so cap it at UTF8MaxBytes (assumed 4) and at the text length.
+			const size_t maxTrail = std::min<size_t>(UTF8MaxBytes, text.length());
+			size_t trail = 1;
+			while (trail < maxTrail && UTF8IsTrailByte(text[text.length() - trail])) {
+				trail++;
+			}
+			const std::string_view endPortion = text.substr(text.length() - trail);
+			if (!UTF8IsValid(endPortion)) {
+				text.remove_suffix(trail);
+			}
+		}
+	}
+}
+
+constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
+	// True for categories that can start a combining character sequence: \p{L}\p{N}\p{P}\p{S}\p{Zs}
+	switch (cc) {
+	case ccLu:
+	case ccLl:
+	case ccLt:
+	case ccLm:
+	case ccLo:
+	case ccNd:
+	case ccNl:
+	case ccNo:
+	case ccPc:
+	case ccPd:
+	case ccPs:
+	case ccPe:
+	case ccPi:
+	case ccPf:
+	case ccPo:
+	case ccSm:
+	case ccSc:
+	case ccSk:
+	case ccSo:
+	case ccZs:
+		return true;
+	default:
+		// ccMn, ccMc, ccMe,
+		// ccZl, ccZp,
+		// ccCc, ccCf, ccCs, ccCo, ccCn
+		return false;
+	}
+}
+
+void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
+	// Handle the simple common case where a base character may be followed by
+	// accents and similar marks by discarding until start of base character.
+	//
+	// From Grapheme_Cluster_Boundaries
+	// combining character sequence = ccs-base? ccs-extend+
+	// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
+	// ccs-extend := [\p{M}\p{Join_Control}]
+
+	std::string_view truncated = text;
+	while (truncated.length() > (UTF8MaxBytes * 2)) {
+		// The loop condition gives up once the remaining text is short
+		std::string_view::iterator it = truncated.end() - 1;
+		// For UTF-8 go back to the start of last character.
+		for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+			--it;
+		}
+		const size_t countBytes = truncated.end() - it;
+		const std::string_view svLastCharacter = truncated.substr(truncated.length() - countBytes);
+		const CharacterCategory cc = CategoriseCharacter(UnicodeFromUTF8(svLastCharacter));
+		truncated.remove_suffix(countBytes);
+		if (IsBaseOfGrapheme(cc)) {
+			text = truncated;
+			return;
+		}
+	}
+	// No base character found so just leave as is
+}
+
+}
+
 // Need to break text into segments near end but taking into account the
 // encoding to not break inside a UTF-8 or DBCS character and also trying
 // to avoid breaking inside a pair of combining characters, or inside
@@ -1215,7 +1301,8 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 // In preference order from best to worst:
 // 1) Break before or after spaces or controls
 // 2) Break at word and punctuation boundary for better kerning and ligature support
-// 3) Break after whole character, this may break combining characters
+// 3) Break before letter in UTF-8 to avoid breaking combining characters
+// 4) Break after whole character, this may break combining characters
 
 size_t Document::SafeSegment(std::string_view text) const noexcept {
 	// check space first as most written language use spaces.
@@ -1236,14 +1323,14 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
 			}
 		} while (it != text.begin());
 
-		it = text.end() - 1;
 		if (dbcsCodePage) {
-			// for UTF-8 go back to the start of last character.
-			for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
-				--it;
-			}
+			// UTF-8
+			DiscardEndFragment(text);
+			DiscardLastCombinedCharacter(text);
+			return text.length();
+		} else {
+			return text.length() - 1;
 		}
-		return it - text.begin();
 	}
 
 	{
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 7a51b2d08..5990cca8c 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -49,6 +49,10 @@ constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
 	return (ch >= 0x80) && (ch < 0xc0);
 }
 
+constexpr bool UTF8IsFirstByte(unsigned char ch) noexcept {
+	return (ch >= 0xc2) && (ch <= 0xf4);
+}
+
 constexpr bool UTF8IsAscii(unsigned char ch) noexcept {
 	return ch < 0x80;
 }
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index e4b674987..ad1384ee7 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -34,6 +34,7 @@
 #include "Decoration.h"
 #include "CaseFolder.h"
 #include "Document.h"
+#include "UniConversion.h"
 
 #include "catch.hpp"
 
@@ -957,6 +958,61 @@ TEST_CASE("SafeSegment") {
 		REQUIRE(text[length] == '\xf0');
 	}
 
+	SECTION("UTF-8 Character Fragments") {
+		// PositionCache breaks long texts into fixed length sub-strings that are passed to SafeSegment
+		// so the final character in the sub-string may be incomplete without all needed trail bytes.
+		// For UTF-8, SafeSegment first discards any final bytes that do not represent a valid character
+		// then discards the final whole character.
+
+		const DocPlus doc("", CpUtf8);
+
+		// break before last character after discarding incomplete last character: 0 trail byte
+		std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2"; // Invalid text as ends with start byte
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xac');
+		REQUIRE(text[length] == '\xe8');
+		REQUIRE(UTF8IsValid(text.substr(0, length)));
+
+		// break before last character after discarding incomplete last character: 1 trail byte and 2 needed
+		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe6\x97"; // Invalid text as ends with only 1 trail byte
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xac');
+		REQUIRE(text[length] == '\xe8');
+		REQUIRE(UTF8IsValid(text.substr(0, length)));
+	}
+
+	SECTION("UTF-8 Combining Characters") {
+		const DocPlus doc("", CpUtf8);
+
+		// There may be combining characters like accents and tone marks after the
+		// last letter in a sub-string and these may be included in the sub-string
+		// or follow it.
+		// Correct display requires that the combining characters are measured and
+		// drawn with the letter they follow. Thus the final letter and any
+		// following combining characters are discarded.
+
+		// A Thai text example with 8 characters, each taking 3 bytes:
+		// HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU, LO LING
+		// Most are letters (Lo) but 2 characters are modifiers (Mn):
+		// MAI THO is a tone mark and SARA UU is a vowel.
+		const std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9\xe0\xb8\xa5";
+		REQUIRE(text.length() == 8 * 3);
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(length == (8 - 1) * 3); // Discard last character
+
+		// Remove last character (letter LO LING) then run again.
+		// Should skip past SARA UU combining vowel mark to discard letter MO MA and SARA UU.
+		const std::string_view textWithoutLoLing = text.substr(0, length);
+		length = doc.document.SafeSegment(textWithoutLoLing);
+		REQUIRE(length == (8 - 3) * 3); // Discard 2 characters
+
+		// Remove last character SARA UU combining vowel mark then run again
+		// Final letter may have following combining mark so discard producing same text as previous step.
+		const std::string_view textWithoutSaraUu = text.substr(0, (8 - 2) * 3);
+		length = doc.document.SafeSegment(textWithoutSaraUu);
+		REQUIRE(length == (8 - 3) * 3); // Discard 1 character
+	}
+
 	SECTION("DBCS Shift-JIS") {
 		const DocPlus doc("", 932);
 		// word and punctuation boundary in middle of text: single byte