aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorNeil <nyamatongwe@gmail.com>2025-03-03 10:30:30 +1100
committerNeil <nyamatongwe@gmail.com>2025-03-03 10:30:30 +1100
commit00838f369d7ca189b498f70c8c446ccc64cc0e2f (patch)
tree5abaec5831fe4c1fe7bd74e112b42a30cc06e4d4 /src
parent8a392e9483783e63e2e63dc71bfd389523fe537d (diff)
downloadscintilla-mirror-00838f369d7ca189b498f70c8c446ccc64cc0e2f.tar.gz
Feature [feature-requests:#1417]. Improve UTF-8 segmentation for some control
characters and invalid bytes. Add more test cases.
Diffstat (limited to 'src')
-rw-r--r--src/Document.cxx10
1 files changed, 8 insertions, 2 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 8fc76e426..f211f0a7c 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1229,8 +1229,8 @@ void DiscardEndFragment(std::string_view &text) noexcept {
}
constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
- // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
switch (cc) {
+ // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
case ccLu:
case ccLl:
case ccLt:
@@ -1250,12 +1250,16 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
case ccSc:
case ccSo:
case ccZs:
+ // Control
+ case ccCc:
+ case ccCs:
+ case ccCo:
return true;
default:
// ccMn, ccMc, ccMe,
// ccSk,
// ccZl, ccZp,
- // ccCc, ccCf, ccCs, ccCo, ccCn
+ // ccCf, ccCn
return false;
}
}
@@ -1287,6 +1291,8 @@ bool Scintilla::Internal::DiscardLastCombinedCharacter(std::string_view &text) n
// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
// ccs-extend := [\p{M}\p{Join_Control}]
// Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji
+ // Breaks may occur before and after Control, which is defined as most of the ccC? categories,
+ // excluding some of ccCf and ccCn, so treat ccCc, ccCs, ccCo as base for now.
std::string_view truncated = text;
while (truncated.length() > UTF8MaxBytes) {