aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorNeil <nyamatongwe@gmail.com>2025-03-03 10:30:30 +1100
committerNeil <nyamatongwe@gmail.com>2025-03-03 10:30:30 +1100
commit00838f369d7ca189b498f70c8c446ccc64cc0e2f (patch)
tree5abaec5831fe4c1fe7bd74e112b42a30cc06e4d4
parent8a392e9483783e63e2e63dc71bfd389523fe537d (diff)
downloadscintilla-mirror-00838f369d7ca189b498f70c8c446ccc64cc0e2f.tar.gz
Feature [feature-requests:#1417]. Improve UTF-8 segmentation for some control
characters and invalid bytes. Add more test cases.
-rw-r--r--src/Document.cxx10
-rw-r--r--test/unit/testDocument.cxx131
2 files changed, 139 insertions, 2 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 8fc76e426..f211f0a7c 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1229,8 +1229,8 @@ void DiscardEndFragment(std::string_view &text) noexcept {
}
constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
- // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
switch (cc) {
+ // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
case ccLu:
case ccLl:
case ccLt:
@@ -1250,12 +1250,16 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
case ccSc:
case ccSo:
case ccZs:
+ // Control
+ case ccCc:
+ case ccCs:
+ case ccCo:
return true;
default:
// ccMn, ccMc, ccMe,
// ccSk,
// ccZl, ccZp,
- // ccCc, ccCf, ccCs, ccCo, ccCn
+ // ccCf, ccCn
return false;
}
}
@@ -1287,6 +1291,8 @@ bool Scintilla::Internal::DiscardLastCombinedCharacter(std::string_view &text) n
// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
// ccs-extend := [\p{M}\p{Join_Control}]
// Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji
+ // May break before and after Control which is defined as most of ccC? but not some of ccCf and ccCn
+ // so treat ccCc, ccCs, ccCo as base for now.
std::string_view truncated = text;
while (truncated.length() > UTF8MaxBytes) {
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index e764e209b..fb8448da4 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -1042,6 +1042,137 @@ TEST_CASE("SafeSegment") {
}
}
+TEST_CASE("DiscardLastCombinedCharacter") {
+ SECTION("Short") {
+ const std::string_view base = "12345";
+ // Short strings (up to 4 bytes) aren't changed to avoid null and problematic results
+ for (size_t len = 0; len < 5; len++) {
+ std::string_view text = base.substr(0, len);
+ REQUIRE(text.length() == len);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(!changed);
+ REQUIRE(text.length() == len);
+ }
+ }
+
+ SECTION("ASCII") {
+ std::string_view text = "12345";
+ REQUIRE(text.length() == 5);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ REQUIRE(text.length() == 4);
+ }
+
+ SECTION("Control") {
+ {
+ std::string_view text = "12345\007";
+ REQUIRE(text.length() == 6);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ REQUIRE(text.length() == 5);
+ }
+ {
+ std::string_view text = "12345\007Z";
+ REQUIRE(text.length() == 7);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ REQUIRE(text.length() == 6);
+ }
+ }
+
+ SECTION("Japanese") {
+ std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e";
+ REQUIRE(text.length() == 17);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ REQUIRE(text.length() == 14);
+ }
+
+ SECTION("Thai Combining") {
+ // Ends with two combined characters
+ // 7 characters, 5 base characters and 2 combining
+ // HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU
+ std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9";
+ REQUIRE(text.length() == 21);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ // Discarded 2 x 3-byte characters
+ REQUIRE(text.length() == 15);
+ }
+
+ SECTION("Invalid UTF-8") {
+ {
+ // Ends with isolated lead byte
+ std::string_view text = "1234\xe0";
+ REQUIRE(text.length() == 5);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ // Discarded final invalid byte
+ REQUIRE(text.length() == 4);
+ }
+ {
+ // Ends with isolated trail byte
+ std::string_view text = "1234\xb8";
+ REQUIRE(text.length() == 5);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ // Discarded final invalid byte
+ REQUIRE(text.length() == 4);
+ }
+ {
+ // Ends with lead byte and only one of two required trail bytes
+ std::string_view text = "1234\xe0\xb8";
+ REQUIRE(text.length() == 6);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ // Discarded final invalid byte
+ REQUIRE(text.length() == 5);
+ }
+ }
+
+ SECTION("Private Use UTF-8") {
+ {
+ // Ends with private use area U+F8FF - Apple uses for apple symbol.
+ std::string_view text = "1234\xEF\xA3\xBF";
+ REQUIRE(text.length() == 7);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ // Discarded whole final character
+ REQUIRE(text.length() == 4);
+ }
+ {
+ // At end: PUA + letter: PUA acts as base
+ std::string_view text = "1234\xEF\xA3\xBFZ";
+ REQUIRE(text.length() == 8);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ // Discarded just final character
+ REQUIRE(text.length() == 7);
+ }
+ }
+
+ SECTION("Surrogates") {
+ {
+ // Ends with surrogate U+D800.
+ std::string_view text = "1234\xED\xA0\x80";
+ REQUIRE(text.length() == 7);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ // Discarded final invalid byte
+ REQUIRE(text.length() == 6);
+ }
+ {
+ // Ends with surrogate U+DC00.
+ std::string_view text = "1234\xED\xB0\x80";
+ REQUIRE(text.length() == 7);
+ const bool changed = DiscardLastCombinedCharacter(text);
+ REQUIRE(changed);
+ // Discarded final invalid byte
+ REQUIRE(text.length() == 6);
+ }
+ }
+}
+
TEST_CASE("PerLine") {
SECTION("LineMarkers") {
DocPlus doc("1\n2\n", CpUtf8);