Fix segmentation of long lexemes to avoid breaking before modifiers like accents

that must be drawn with their base letters. This is only a subset of implementing grapheme cluster boundaries but it improves behaviour with some Asian scripts like Thai and Javanese. Javanese is mostly written with (ASCII) Roman characters so issues will be rare but Thai uses Thai script. Also slightly improves placement of combining accents in European texts. https://github.com/notepad-plus-plus/notepad-plus-plus/issues/14822 https://github.com/notepad-plus-plus/notepad-plus-plus/issues/16115
author: Neil <nyamatongwe@gmail.com> 2025-02-04 11:47:48 +1100
committer: Neil <nyamatongwe@gmail.com> 2025-02-04 11:47:48 +1100
commit: 4c9ddc3121d0488914858ee511028520b96fd0e9 (patch)
tree: 03989eae1ce94f479749ef74e1e6c76c10f3e332
parent: ef961772c3ced424f034c2055263d7231eccee01 (diff)
download: scintilla-mirror-4c9ddc3121d0488914858ee511028520b96fd0e9.tar.gz
4 files changed, 157 insertions, 7 deletions
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 782801b7a..ef9ad02d4 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -604,6 +604,9 @@
 	Serialize selection type and ranges with SCI_GETSELECTIONSERIALIZED and SCI_SETSELECTIONSERIALIZED.
 	</li>
 	<li>
+	Fix segmentation of long lexemes to avoid breaking before modifiers like accents that must be drawn with their base letters.
+	</li>
+	<li>
 	Fix bug on Qt where double-click stopped working when Scintilla instance had been running for weeks.
 	</li>
     </ul>
diff --git a/src/Document.cxx b/src/Document.cxx
index 379a88786..dc82b1902 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1202,6 +1202,92 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 		&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
 }
 
+namespace {
+
+// Remove any incomplete or invalid UTF-8 bytes after the last valid character; text is trimmed in place and left alone if it ends with a valid character.
+void DiscardEndFragment(std::string_view &text) noexcept {
+	if (!text.empty()) {
+		if (UTF8IsFirstByte(text.back())) {
+			// Ending with start of character byte is invalid
+			text.remove_suffix(1);
+		} else if (UTF8IsTrailByte(text.back())) {
+			// Go back to the start of the last character but never further than one whole character or before the start of text.
+			const size_t maxTrail = std::min<size_t>(UTF8MaxBytes, text.length());
+			size_t trail = 1;	// Candidate byte length of the last character: 1 + trail bytes stepped over
+			while (trail < maxTrail && UTF8IsTrailByte(text[text.length() - trail])) {
+				trail++;
+			}
+			const std::string_view endPortion = text.substr(text.length() - trail);	// trail <= text.length() so this cannot throw
+			if (!UTF8IsValid(endPortion)) {
+				text.remove_suffix(trail);
+			}
+		}
+	}
+}
+
+constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
+	// True when cc is a category that can be the base of a combining character sequence (ccs-base): \p{L}\p{N}\p{P}\p{S}\p{Zs}
+	switch (cc) {
+	case ccLu:
+	case ccLl:
+	case ccLt:
+	case ccLm:
+	case ccLo:
+	case ccNd:
+	case ccNl:
+	case ccNo:
+	case ccPc:
+	case ccPd:
+	case ccPs:
+	case ccPe:
+	case ccPi:
+	case ccPf:
+	case ccPo:
+	case ccSm:
+	case ccSc:
+	case ccSk:
+	case ccSo:
+	case ccZs:
+		return true;
+	default:
+		// Not a base: the mark categories ccMn, ccMc, ccMe,
+		// the line and paragraph separators ccZl, ccZp,
+		// and the 'other' categories ccCc, ccCf, ccCs, ccCo, ccCn
+		return false;
+	}
+}
+
+void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
+	// Handle the simple common case where a base character may be followed by
+	// accents and similar marks by discarding until start of base character: text is cut to end just before the last base character.
+	//
+	// From Grapheme_Cluster_Boundaries
+	// combining character sequence = ccs-base? ccs-extend+
+	// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
+	// ccs-extend := [\p{M}\p{Join_Control}]
+
+	std::string_view truncated = text;
+	while (truncated.length() > (UTF8MaxBytes * 2)) {
+		// The loop condition gives up when short: the search stops once UTF8MaxBytes * 2 or fewer bytes remain.
+		std::string_view::iterator it = truncated.end() - 1;
+		// For UTF-8 go back to the start of last character, stepping over at most UTF8MaxBytes - 1 trail bytes.
+		for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+			--it;
+		}
+		const size_t countBytes = truncated.end() - it;
+		const std::string_view svLastCharacter = truncated.substr(truncated.length() - countBytes);
+		const CharacterCategory cc = CategoriseCharacter(UnicodeFromUTF8(svLastCharacter));
+		truncated.remove_suffix(countBytes);
+		if (IsBaseOfGrapheme(cc)) {
+			text = truncated;
+			return;
+		}
+	}
+	// No base character found so just leave as is: text is only modified on the early return above.
+}
+
+}
+
 // Need to break text into segments near end but taking into account the
 // encoding to not break inside a UTF-8 or DBCS character and also trying
 // to avoid breaking inside a pair of combining characters, or inside
@@ -1215,7 +1301,8 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 // In preference order from best to worst:
 //   1) Break before or after spaces or controls
 //   2) Break at word and punctuation boundary for better kerning and ligature support
-//   3) Break after whole character, this may break combining characters
+//   3) Break before letter in UTF-8 to avoid breaking combining characters
+//   4) Break after whole character, this may break combining characters
 
 size_t Document::SafeSegment(std::string_view text) const noexcept {
 	// check space first as most written language use spaces.
@@ -1236,14 +1323,14 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
 			}
 		} while (it != text.begin());
 
-		it = text.end() - 1;
 		if (dbcsCodePage) {
-			// for UTF-8 go back to the start of last character.
-			for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
-				--it;
-			}
+			// UTF-8
+			DiscardEndFragment(text);
+			DiscardLastCombinedCharacter(text);
+			return text.length();
+		} else {
+			return text.length() - 1;
 		}
-		return it - text.begin();
 	}
 
 	{
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 7a51b2d08..5990cca8c 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -49,6 +49,10 @@ constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
 	return (ch >= 0x80) && (ch < 0xc0);
 }
 
+constexpr bool UTF8IsFirstByte(unsigned char ch) noexcept {
+	return (ch >= 0xc2) && (ch <= 0xf4);	// True only for 0xc2..0xf4, the bytes that can start a multi-byte UTF-8 character; ASCII and trail bytes are excluded
+}
+
 constexpr bool UTF8IsAscii(unsigned char ch) noexcept {
 	return ch < 0x80;
 }
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index e4b674987..ad1384ee7 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -34,6 +34,7 @@
 #include "Decoration.h"
 #include "CaseFolder.h"
 #include "Document.h"
+#include "UniConversion.h"
 
 #include "catch.hpp"
 
@@ -957,6 +958,61 @@ TEST_CASE("SafeSegment") {
 		REQUIRE(text[length] == '\xf0');
 	}
 
+	SECTION("UTF-8 Character Fragments") {
+		// PositionCache breaks long texts into fixed length sub-strings that are passed to SafeSegment
+		// so the final character in the sub-string may be incomplete without all needed trail bytes.
+		// For UTF-8, SafeSegment first discards any final bytes that do not represent a valid character
+		// then discards the final whole character.
+
+		const DocPlus doc("", CpUtf8);
+
+		// break before last character after discarding incomplete last character: 0 trail byte
+		std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2";	// Invalid text as ends with start byte
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xac');
+		REQUIRE(text[length] == '\xe8');
+		REQUIRE(UTF8IsValid(text.substr(0, length)));
+
+		// break before last character after discarding incomplete last character: 1 trail byte and 2 needed
+		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe6\x97";	// Invalid text as ends with only 1 trail byte
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xac');
+		REQUIRE(text[length] == '\xe8');
+		REQUIRE(UTF8IsValid(text.substr(0, length)));
+	}
+
+	SECTION("UTF-8 Combining Characters") {
+		const DocPlus doc("", CpUtf8);
+
+		// There may be combining characters like accents and tone marks after the
+		// last letter in a sub-string and these may be included in the sub-string
+		// or follow it.
+		// Correct display requires that the combining characters are measured and
+		// drawn with the letter they follow. Thus the final letter and any
+		// following combining characters are discarded.
+
+		// A Thai text example with 8 characters, each taking 3 bytes:
+		// HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU, LO LING
+		// Most are letters (Lo) but 2 characters are modifiers (Mn):
+		// MAI THO is a tone mark and SARA UU is a vowel.
+		const std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9\xe0\xb8\xa5";
+		REQUIRE(text.length() == 8 * 3);
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(length == (8 - 1) * 3);	// Discard last character
+
+		// Remove last character (letter LO LING) then run again.
+		// Should skip past SARA UU combining vowel mark to discard letter MO MA and SARA UU.
+		const std::string_view textWithoutLoLing = text.substr(0, length);
+		length = doc.document.SafeSegment(textWithoutLoLing);
+		REQUIRE(length == (8 - 3) * 3);	// Discard 2 characters
+
+		// Remove last character SARA UU combining vowel mark then run again
+		// Final letter may have following combining mark so discard producing same text as previous step.
+		const std::string_view textWithoutSaraUu = text.substr(0, (8 - 2) * 3);
+		length = doc.document.SafeSegment(textWithoutSaraUu);
+		REQUIRE(length == (8 - 3) * 3);	// Discard 1 character
+	}
+
 	SECTION("DBCS Shift-JIS") {
 		const DocPlus doc("", 932);
 		// word and punctuation boundary in middle of text: single byte
author	Neil <nyamatongwe@gmail.com>	2025-02-04 11:47:48 +1100
committer	Neil <nyamatongwe@gmail.com>	2025-02-04 11:47:48 +1100
commit	4c9ddc3121d0488914858ee511028520b96fd0e9 (patch)
tree	03989eae1ce94f479749ef74e1e6c76c10f3e332
parent	ef961772c3ced424f034c2055263d7231eccee01 (diff)
download	scintilla-mirror-4c9ddc3121d0488914858ee511028520b96fd0e9.tar.gz