diff options
author | Neil <nyamatongwe@gmail.com> | 2025-02-14 11:11:06 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2025-02-14 11:11:06 +1100 |
commit | 33b4899b327695a66e3dd2ef056516a45a253878 (patch) | |
tree | 283ba9cece7b2cf9d61a2610e687505db3452c7a /src | |
parent | 9cd498d964084581648508bb661511865fefb775 (diff) | |
download | scintilla-mirror-33b4899b327695a66e3dd2ef056516a45a253878.tar.gz |
Feature [feature-requests:#1417]. Fix some UTF-8 segmentation bugs by
prioritising Unicode-safe base character check over ASCII punctuation check and
by treating emoji modifiers as modifiers instead of base characters.
This is better for:
1) Keycap emoji: *, VARIATION SELECTOR-16, COMBINING ENCLOSING KEYCAP
2) Emoji + skin tone: WAVING HAND SIGN, EMOJI MODIFIER FITZPATRICK TYPE-1-2
Diffstat (limited to 'src')
-rw-r--r-- | src/Document.cxx | 26 |
1 file changed, 14 insertions, 12 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index 7543b4940..d7497b825 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1226,7 +1226,7 @@ void DiscardEndFragment(std::string_view &text) noexcept { } constexpr bool IsBaseOfGrapheme(CharacterCategory cc) { - // \p{L}\p{N}\p{P}\p{S}\p{Zs} + // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs} switch (cc) { case ccLu: case ccLl: @@ -1245,19 +1245,19 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) { case ccPo: case ccSm: case ccSc: - case ccSk: case ccSo: case ccZs: return true; default: // ccMn, ccMc, ccMe, + // ccSk, // ccZl, ccZp, // ccCc, ccCf, ccCs, ccCo, ccCn return false; } } -void DiscardLastCombinedCharacter(std::string_view &text) noexcept { +bool DiscardLastCombinedCharacter(std::string_view &text) noexcept { // Handle the simple common case where a base character may be followed by // accents and similar marks by discarding until start of base character. // @@ -1265,6 +1265,7 @@ void DiscardLastCombinedCharacter(std::string_view &text) noexcept { // combining character sequence = ccs-base? ccs-extend+ // ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}] // ccs-extend := [\p{M}\p{Join_Control}] + // Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji std::string_view truncated = text; while (truncated.length() > (UTF8MaxBytes * 2)) { @@ -1280,10 +1281,11 @@ void DiscardLastCombinedCharacter(std::string_view &text) noexcept { truncated.remove_suffix(countBytes); if (IsBaseOfGrapheme(cc)) { text = truncated; - return; + return true; } } // No base character found so just leave as is + return false; } } @@ -1313,6 +1315,13 @@ size_t Document::SafeSegment(std::string_view text) const noexcept { } if (!dbcsCodePage || dbcsCodePage == CpUtf8) { + if (dbcsCodePage) { + // UTF-8 + DiscardEndFragment(text); + if (DiscardLastCombinedCharacter(text)) { + return text.length(); + } + } // backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary. 
std::string_view::iterator it = text.end() - 1; const bool punctuation = IsPunctuation(*it); @@ -1323,14 +1332,7 @@ size_t Document::SafeSegment(std::string_view text) const noexcept { } } while (it != text.begin()); - if (dbcsCodePage) { - // UTF-8 - DiscardEndFragment(text); - DiscardLastCombinedCharacter(text); - return text.length(); - } else { - return text.length() - 1; - } + return text.length() - 1; } { |