From 00838f369d7ca189b498f70c8c446ccc64cc0e2f Mon Sep 17 00:00:00 2001
From: Neil
Date: Mon, 3 Mar 2025 10:30:30 +1100
Subject: Feature [feature-requests:#1417]. Improve UTF-8 segmentation for some control characters and invalid bytes. Add more test cases.

---
 src/Document.cxx | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/Document.cxx b/src/Document.cxx
index 8fc76e426..f211f0a7c 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1229,8 +1229,8 @@ void DiscardEndFragment(std::string_view &text) noexcept {
 }
 
 constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
-	// \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
 	switch (cc) {
+	// \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
 	case ccLu:
 	case ccLl:
 	case ccLt:
@@ -1250,12 +1250,16 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
 	case ccSc:
 	case ccSo:
 	case ccZs:
+	// Control
+	case ccCc:
+	case ccCs:
+	case ccCo:
 		return true;
 	default:
 		// ccMn, ccMc, ccMe,
 		// ccSk,
 		// ccZl, ccZp,
-		// ccCc, ccCf, ccCs, ccCo, ccCn
+		// ccCf, ccCn
 		return false;
 	}
 }
@@ -1287,6 +1291,8 @@ bool Scintilla::Internal::DiscardLastCombinedCharacter(std::string_view &text) n
 	// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
 	// ccs-extend := [\p{M}\p{Join_Control}]
 	// Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji
+	// May break before and after Control which is defined as most of ccC? but not some of ccCf and ccCn
+	// so treat ccCc, ccCs, ccCo as base for now.
 
 	std::string_view truncated = text;
 	while (truncated.length() > UTF8MaxBytes) {
-- 
cgit v1.2.3