Feature [feature-requests:#1417]. Fix some UTF-8 segmentation bugs by prioritising the Unicode-safe base character check over the ASCII punctuation check and by treating emoji modifiers as modifiers instead of base characters.

This gives better segmentation for: 1) keycap emoji: *, VARIATION SELECTOR-16, COMBINING ENCLOSING KEYCAP; 2) emoji + skin tone: WAVING HAND SIGN, EMOJI MODIFIER FITZPATRICK TYPE-1-2.
author: Neil <nyamatongwe@gmail.com> 2025-02-14 11:11:06 +1100
committer: Neil <nyamatongwe@gmail.com> 2025-02-14 11:11:06 +1100
commit: 33b4899b327695a66e3dd2ef056516a45a253878 (patch)
tree: 283ba9cece7b2cf9d61a2610e687505db3452c7a /src
parent: 9cd498d964084581648508bb661511865fefb775 (diff)
download: scintilla-mirror-33b4899b327695a66e3dd2ef056516a45a253878.tar.gz
1 files changed, 14 insertions, 12 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 7543b4940..d7497b825 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1226,7 +1226,7 @@ void DiscardEndFragment(std::string_view &text) noexcept {
 }
 
 constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
-	// \p{L}\p{N}\p{P}\p{S}\p{Zs}
+	// \p{L}\p{N}\p{P}\p{Sm}\p{Sc}\p{So}\p{Zs}
 	switch (cc) {
 	case ccLu:
 	case ccLl:
@@ -1245,19 +1245,19 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
 	case ccPo:
 	case ccSm:
 	case ccSc:
-	case ccSk:
 	case ccSo:
 	case ccZs:
 		return true;
 	default:
 		// ccMn, ccMc, ccMe,
+		// ccSk,
 		// ccZl, ccZp,
 		// ccCc, ccCf, ccCs, ccCo, ccCn
 		return false;
 	}
 }
 
-void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
+bool DiscardLastCombinedCharacter(std::string_view &text) noexcept {
 	// Handle the simple common case where a base character may be followed by
 	// accents and similar marks by discarding until start of base character.
 	// 
@@ -1265,6 +1265,7 @@ void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
 	// combining character sequence = ccs-base? ccs-extend+
 	// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
 	// ccs-extend := [\p{M}\p{Join_Control}]
+	// Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji
 
 	std::string_view truncated = text;
 	while (truncated.length() > (UTF8MaxBytes * 2)) {
@@ -1280,10 +1281,11 @@ void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
 		truncated.remove_suffix(countBytes);
 		if (IsBaseOfGrapheme(cc)) {
 			text = truncated;
-			return;
+			return true;
 		}
 	}
 	// No base character found so just leave as is
+	return false;
 }
 
 }
@@ -1313,6 +1315,13 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
 	}
 
 	if (!dbcsCodePage || dbcsCodePage == CpUtf8) {
+		if (dbcsCodePage) {
+			// UTF-8
+			DiscardEndFragment(text);
+			if (DiscardLastCombinedCharacter(text)) {
+				return text.length();
+			}
+		}
 		// backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary.
 		std::string_view::iterator it = text.end() - 1;
 		const bool punctuation = IsPunctuation(*it);
@@ -1323,14 +1332,7 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
 			}
 		} while (it != text.begin());
 
-		if (dbcsCodePage) {
-			// UTF-8
-			DiscardEndFragment(text);
-			DiscardLastCombinedCharacter(text);
-			return text.length();
-		} else {
-			return text.length() - 1;
-		}
+		return text.length() - 1;
 	}
 
 	{
author	Neil <nyamatongwe@gmail.com>	2025-02-14 11:11:06 +1100
committer	Neil <nyamatongwe@gmail.com>	2025-02-14 11:11:06 +1100
commit	33b4899b327695a66e3dd2ef056516a45a253878 (patch)
tree	283ba9cece7b2cf9d61a2610e687505db3452c7a /src
parent	9cd498d964084581648508bb661511865fefb775 (diff)
download	scintilla-mirror-33b4899b327695a66e3dd2ef056516a45a253878.tar.gz