aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/Document.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/Document.cxx')
-rw-r--r--src/Document.cxx26
1 files changed, 14 insertions, 12 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 7543b4940..d7497b825 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1226,7 +1226,7 @@ void DiscardEndFragment(std::string_view &text) noexcept {
}
constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
- // \p{L}\p{N}\p{P}\p{S}\p{Zs}
+ // \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
switch (cc) {
case ccLu:
case ccLl:
@@ -1245,19 +1245,19 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
case ccPo:
case ccSm:
case ccSc:
- case ccSk:
case ccSo:
case ccZs:
return true;
default:
// ccMn, ccMc, ccMe,
+ // ccSk,
// ccZl, ccZp,
// ccCc, ccCf, ccCs, ccCo, ccCn
return false;
}
}
-void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
+bool DiscardLastCombinedCharacter(std::string_view &text) noexcept {
// Handle the simple common case where a base character may be followed by
// accents and similar marks by discarding until start of base character.
//
@@ -1265,6 +1265,7 @@ void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
// combining character sequence = ccs-base? ccs-extend+
// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
// ccs-extend := [\p{M}\p{Join_Control}]
+ // Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji
std::string_view truncated = text;
while (truncated.length() > (UTF8MaxBytes * 2)) {
@@ -1280,10 +1281,11 @@ void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
truncated.remove_suffix(countBytes);
if (IsBaseOfGrapheme(cc)) {
text = truncated;
- return;
+ return true;
}
}
// No base character found so just leave as is
+ return false;
}
}
@@ -1313,6 +1315,13 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
}
if (!dbcsCodePage || dbcsCodePage == CpUtf8) {
+ if (dbcsCodePage) {
+ // UTF-8
+ DiscardEndFragment(text);
+ if (DiscardLastCombinedCharacter(text)) {
+ return text.length();
+ }
+ }
// backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary.
std::string_view::iterator it = text.end() - 1;
const bool punctuation = IsPunctuation(*it);
@@ -1323,14 +1332,7 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
}
} while (it != text.begin());
- if (dbcsCodePage) {
- // UTF-8
- DiscardEndFragment(text);
- DiscardLastCombinedCharacter(text);
- return text.length();
- } else {
- return text.length() - 1;
- }
+ return text.length() - 1;
}
{