diff options
Diffstat (limited to 'src/Document.cxx')
-rw-r--r-- | src/Document.cxx | 26 |
1 files changed, 14 insertions, 12 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index 7543b4940..d7497b825 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1226,7 +1226,7 @@ void DiscardEndFragment(std::string_view &text) noexcept { } constexpr bool IsBaseOfGrapheme(CharacterCategory cc) { - // \p{L}\p{N}\p{P}\p{S}\p{Zs} + // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs} switch (cc) { case ccLu: case ccLl: @@ -1245,19 +1245,19 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) { case ccPo: case ccSm: case ccSc: - case ccSk: case ccSo: case ccZs: return true; default: // ccMn, ccMc, ccMe, + // ccSk, // ccZl, ccZp, // ccCc, ccCf, ccCs, ccCo, ccCn return false; } } -void DiscardLastCombinedCharacter(std::string_view &text) noexcept { +bool DiscardLastCombinedCharacter(std::string_view &text) noexcept { // Handle the simple common case where a base character may be followed by // accents and similar marks by discarding until start of base character. // @@ -1265,6 +1265,7 @@ void DiscardLastCombinedCharacter(std::string_view &text) noexcept { // combining character sequence = ccs-base? ccs-extend+ // ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}] // ccs-extend := [\p{M}\p{Join_Control}] + // Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji std::string_view truncated = text; while (truncated.length() > (UTF8MaxBytes * 2)) { @@ -1280,10 +1281,11 @@ void DiscardLastCombinedCharacter(std::string_view &text) noexcept { truncated.remove_suffix(countBytes); if (IsBaseOfGrapheme(cc)) { text = truncated; - return; + return true; } } // No base character found so just leave as is + return false; } } @@ -1313,6 +1315,13 @@ size_t Document::SafeSegment(std::string_view text) const noexcept { } if (!dbcsCodePage || dbcsCodePage == CpUtf8) { + if (dbcsCodePage) { + // UTF-8 + DiscardEndFragment(text); + if (DiscardLastCombinedCharacter(text)) { + return text.length(); + } + } // backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary. std::string_view::iterator it = text.end() - 1; const bool punctuation = IsPunctuation(*it); @@ -1323,14 +1332,7 @@ size_t Document::SafeSegment(std::string_view text) const noexcept { } } while (it != text.begin()); - if (dbcsCodePage) { - // UTF-8 - DiscardEndFragment(text); - DiscardLastCombinedCharacter(text); - return text.length(); - } else { - return text.length() - 1; - } + return text.length() - 1; } { |