diff options
author | Neil <nyamatongwe@gmail.com> | 2025-03-03 10:30:30 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2025-03-03 10:30:30 +1100 |
commit | 00838f369d7ca189b498f70c8c446ccc64cc0e2f (patch) | |
tree | 5abaec5831fe4c1fe7bd74e112b42a30cc06e4d4 | |
parent | 8a392e9483783e63e2e63dc71bfd389523fe537d (diff) | |
download | scintilla-mirror-00838f369d7ca189b498f70c8c446ccc64cc0e2f.tar.gz |
Feature [feature-requests:#1417]. Improve UTF-8 segmentation for some control
characters and invalid bytes.
Add more test cases.
-rw-r--r-- | src/Document.cxx | 10 | ||||
-rw-r--r-- | test/unit/testDocument.cxx | 131 |
2 files changed, 139 insertions, 2 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index 8fc76e426..f211f0a7c 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1229,8 +1229,8 @@ void DiscardEndFragment(std::string_view &text) noexcept { } constexpr bool IsBaseOfGrapheme(CharacterCategory cc) { - // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs} switch (cc) { + // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs} case ccLu: case ccLl: case ccLt: @@ -1250,12 +1250,16 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) { case ccSc: case ccSo: case ccZs: + // Control + case ccCc: + case ccCs: + case ccCo: return true; default: // ccMn, ccMc, ccMe, // ccSk, // ccZl, ccZp, - // ccCc, ccCf, ccCs, ccCo, ccCn + // ccCf, ccCn return false; } } @@ -1287,6 +1291,8 @@ bool Scintilla::Internal::DiscardLastCombinedCharacter(std::string_view &text) n // ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}] // ccs-extend := [\p{M}\p{Join_Control}] // Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji + // May break before and after Control which is defined as most of ccC? but not some of ccCf and ccCn + // so treat ccCc, ccCs, ccCo as base for now. std::string_view truncated = text; while (truncated.length() > UTF8MaxBytes) { diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx index e764e209b..fb8448da4 100644 --- a/test/unit/testDocument.cxx +++ b/test/unit/testDocument.cxx @@ -1042,6 +1042,137 @@ TEST_CASE("SafeSegment") { } } +TEST_CASE("DiscardLastCombinedCharacter") { + SECTION("Short") { + const std::string_view base = "12345"; + // Short strings (up to 4 bytes) aren't changed to avoid null and problematic results + for (size_t len = 0; len < 5; len++) { + std::string_view text = base.substr(0, len); + REQUIRE(text.length() == len); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(!changed); + REQUIRE(text.length() == len); + } + } + + SECTION("ASCII") { + std::string_view text = "12345"; + REQUIRE(text.length() == 5); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + REQUIRE(text.length() == 4); + } + + SECTION("Control") { + { + std::string_view text = "12345\007"; + REQUIRE(text.length() == 6); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + REQUIRE(text.length() == 5); + } + { + std::string_view text = "12345\007Z"; + REQUIRE(text.length() == 7); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + REQUIRE(text.length() == 6); + } + } + + SECTION("Japanese") { + std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e"; + REQUIRE(text.length() == 17); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + REQUIRE(text.length() == 14); + } + + SECTION("Thai Combining") { + // Ends with two combined characters + // 7 characters, 5 base characters and 2 combining + // HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU + std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9"; + REQUIRE(text.length() == 21); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + // Discarded 2 x 3-byte characters + REQUIRE(text.length() == 15); + } + + SECTION("Invalid UTF-8") { + { + // Ends with isolated lead byte + std::string_view text = "1234\xe0"; + REQUIRE(text.length() == 5); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + // Discarded final invalid byte + REQUIRE(text.length() == 4); + } + { + // Ends with isolated trail byte + std::string_view text = "1234\xb8"; + REQUIRE(text.length() == 5); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + // Discarded final invalid byte + REQUIRE(text.length() == 4); + } + { + // Ends with lead byte and only one of two required trail bytes + std::string_view text = "1234\xe0\xb8"; + REQUIRE(text.length() == 6); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + // Discarded final invalid byte + REQUIRE(text.length() == 5); + } + } + + SECTION("Private Use UTF-8") { + { + // Ends with private use area U+F8FF - Apple uses for apple symbol. + std::string_view text = "1234\xEF\xA3\xBF"; + REQUIRE(text.length() == 7); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + // Discarded whole final character + REQUIRE(text.length() == 4); + } + { + // At end: PUA + letter: PUA acts as base + std::string_view text = "1234\xEF\xA3\xBFZ"; + REQUIRE(text.length() == 8); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + // Discarded just final character + REQUIRE(text.length() == 7); + } + } + + SECTION("Surrogates") { + { + // Ends with surrogate U+D800. + std::string_view text = "1234\xED\xA0\x80"; + REQUIRE(text.length() == 7); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + // Discarded final invalid byte + REQUIRE(text.length() == 6); + } + { + // Ends with surrogate U+DC00. + std::string_view text = "1234\xED\xB0\x80"; + REQUIRE(text.length() == 7); + const bool changed = DiscardLastCombinedCharacter(text); + REQUIRE(changed); + // Discarded final invalid byte + REQUIRE(text.length() == 6); + } + } +} + TEST_CASE("PerLine") { SECTION("LineMarkers") { DocPlus doc("1\n2\n", CpUtf8); |