diff options
author | Neil <nyamatongwe@gmail.com> | 2025-02-04 11:47:48 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2025-02-04 11:47:48 +1100 |
commit | 4c9ddc3121d0488914858ee511028520b96fd0e9 (patch) | |
tree | 03989eae1ce94f479749ef74e1e6c76c10f3e332 /test | |
parent | ef961772c3ced424f034c2055263d7231eccee01 (diff) | |
download | scintilla-mirror-4c9ddc3121d0488914858ee511028520b96fd0e9.tar.gz |
Fix segmentation of long lexemes to avoid breaking before modifiers like accents
that must be drawn with their base letters.
This is only a subset of implementing grapheme cluster boundaries but it
improves behaviour with some Asian scripts like Thai and Javanese.
Javanese is mostly written with Roman (ASCII) characters, so issues there will be rare,
but Thai is written in Thai script, so Thai text is more likely to be affected.
Also slightly improves placement of combining accents in European texts.
https://github.com/notepad-plus-plus/notepad-plus-plus/issues/14822
https://github.com/notepad-plus-plus/notepad-plus-plus/issues/16115
Diffstat (limited to 'test')
-rw-r--r-- | test/unit/testDocument.cxx | 56 |
1 file changed, 56 insertions, 0 deletions
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx index e4b674987..ad1384ee7 100644 --- a/test/unit/testDocument.cxx +++ b/test/unit/testDocument.cxx @@ -34,6 +34,7 @@ #include "Decoration.h" #include "CaseFolder.h" #include "Document.h" +#include "UniConversion.h" #include "catch.hpp" @@ -957,6 +958,61 @@ TEST_CASE("SafeSegment") { REQUIRE(text[length] == '\xf0'); } + SECTION("UTF-8 Character Fragments") { + // PositionCache breaks long texts into fixed length sub-strings that are passed to SafeSegment + // so the final character in the sub-string may be incomplete without all needed trail bytes. + // For UTF-8, SafeSegment first discards any final bytes that do not represent a valid character + // then discards the final whole character. + + const DocPlus doc("", CpUtf8); + + // break before last character after discarding incomplete last character: 0 trail byte + std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2"; // Invalid text as ends with start byte + size_t length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\xac'); + REQUIRE(text[length] == '\xe8'); + REQUIRE(UTF8IsValid(text.substr(0, length))); + + // break before last character after discarding incomplete last character: 1 trail byte and 2 needed + text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe6\x97"; // Invalid text as ends with only 1 trail byte + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\xac'); + REQUIRE(text[length] == '\xe8'); + REQUIRE(UTF8IsValid(text.substr(0, length))); + } + + SECTION("UTF-8 Combining Characters") { + const DocPlus doc("", CpUtf8); + + // There may be combining characters like accents and tone marks after the + // last letter in a sub-string and these may be included in the sub-string + // or follow it. + // Correct display requires that the combining characters are measured and + // drawn with the letter they follow. 
Thus the final letter and any + // following combining characters are discarded. + + // A Thai text example with 8 characters, each taking 3 bytes: + // HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU, LO LING + // Most are letters (Lo) but 2 characters are modifiers (Mn): + // MAI THO is a tone mark and SARA UU is a vowel. + const std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9\xe0\xb8\xa5"; + REQUIRE(text.length() == 8 * 3); + size_t length = doc.document.SafeSegment(text); + REQUIRE(length == (8 - 1) * 3); // Discard last character + + // Remove last character (letter LO LING) then run again. + // Should skip past SARA UU combining vowel mark to discard letter MO MA and SARA UU. + const std::string_view textWithoutLoLing = text.substr(0, length); + length = doc.document.SafeSegment(textWithoutLoLing); + REQUIRE(length == (8 - 3) * 3); // Discard 2 characters + + // Remove last character SARA UU combining vowel mark then run again + // Final letter may have following combining mark so discard producing same text as previous step. + const std::string_view textWithoutSaraUu = text.substr(0, (8 - 2) * 3); + length = doc.document.SafeSegment(textWithoutSaraUu); + REQUIRE(length == (8 - 3) * 3); // Discard 1 character + } + SECTION("DBCS Shift-JIS") { const DocPlus doc("", 932); // word and punctuation boundary in middle of text: single byte |