diff options
author | Neil <nyamatongwe@gmail.com> | 2025-03-03 10:30:30 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2025-03-03 10:30:30 +1100 |
commit | 00838f369d7ca189b498f70c8c446ccc64cc0e2f (patch) | |
tree | 5abaec5831fe4c1fe7bd74e112b42a30cc06e4d4 /src | |
parent | 8a392e9483783e63e2e63dc71bfd389523fe537d (diff) | |
download | scintilla-mirror-00838f369d7ca189b498f70c8c446ccc64cc0e2f.tar.gz |
Feature [feature-requests:#1417]. Improve UTF-8 segmentation for some control
characters and invalid bytes.
Add more test cases.
Diffstat (limited to 'src')
-rw-r--r-- | src/Document.cxx | 10 |
1 file changed, 8 insertions, 2 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 8fc76e426..f211f0a7c 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1229,8 +1229,8 @@ void DiscardEndFragment(std::string_view &text) noexcept {
 }
 
 constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
-	// \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
 	switch (cc) {
+	// \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
 	case ccLu:
 	case ccLl:
 	case ccLt:
@@ -1250,12 +1250,16 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
 	case ccSc:
 	case ccSo:
 	case ccZs:
+	// Control
+	case ccCc:
+	case ccCs:
+	case ccCo:
 		return true;
 	default:
 		// ccMn, ccMc, ccMe,
 		// ccSk,
 		// ccZl, ccZp,
-		// ccCc, ccCf, ccCs, ccCo, ccCn
+		// ccCf, ccCn
 		return false;
 	}
 }
@@ -1287,6 +1291,8 @@ bool Scintilla::Internal::DiscardLastCombinedCharacter(std::string_view &text) n
 	// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
 	// ccs-extend := [\p{M}\p{Join_Control}]
 	// Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji
+	// May break before and after Control which is defined as most of ccC? but not some of ccCf and ccCn
+	// so treat ccCc, ccCs, ccCo as base for now.
 
 	std::string_view truncated = text;
 	while (truncated.length() > UTF8MaxBytes) {