4 files changed, 157 insertions, 7 deletions
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 782801b7a..ef9ad02d4 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -604,6 +604,9 @@
 	Serialize selection type and ranges with SCI_GETSELECTIONSERIALIZED and SCI_SETSELECTIONSERIALIZED.
 	</li>
 	<li>
+	Fix segmentation of long lexemes to avoid breaking before modifiers like accents that must be drawn with their base letters.
+	</li>
+	<li>
 	Fix bug on Qt where double-click stopped working when Scintilla instance had been running for weeks.
 	</li>
     </ul>
diff --git a/src/Document.cxx b/src/Document.cxx
index 379a88786..dc82b1902 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1202,6 +1202,92 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 		&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
 }
 
+namespace {
+
+// Remove any incomplete or invalid bytes after the last valid UTF-8 character; never reads before the start of text.
+void DiscardEndFragment(std::string_view &text) noexcept {
+	if (!text.empty()) {
+		if (UTF8IsFirstByte(text.back())) {
+			// Ending with start of character byte is invalid
+			text.remove_suffix(1);
+		} else if (UTF8IsTrailByte(text.back())) {
+			// Go back to the start of last character: at most one whole character and never past the start of text.
+			const size_t maxTrail = std::min<size_t>(UTF8MaxBytes, text.length());
+			size_t trail = 1;
+			while (trail < maxTrail && UTF8IsTrailByte(text[text.length() - trail])) {
+				trail++;
+			}
+			const std::string_view endPortion = text.substr(text.length() - trail);	// candidate last character
+			if (!UTF8IsValid(endPortion)) {
+				text.remove_suffix(trail);
+			}
+		}
+	}
+}
+
+constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
+	// True for categories that may be the base of a combining character sequence: \p{L}\p{N}\p{P}\p{S}\p{Zs}
+	switch (cc) {
+	case ccLu:
+	case ccLl:
+	case ccLt:
+	case ccLm:
+	case ccLo:
+	case ccNd:
+	case ccNl:
+	case ccNo:
+	case ccPc:
+	case ccPd:
+	case ccPs:
+	case ccPe:
+	case ccPi:
+	case ccPf:
+	case ccPo:
+	case ccSm:
+	case ccSc:
+	case ccSk:
+	case ccSo:
+	case ccZs:
+		return true;
+	default:
+		// Not a base: marks (ccMn, ccMc, ccMe),
+		// line and paragraph separators (ccZl, ccZp),
+		// and the other categories (ccCc, ccCf, ccCs, ccCo, ccCn)
+		return false;
+	}
+}
+
+void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
+	// Handle the simple common case where a base character may be followed by
+	// accents and similar marks by discarding until start of base character.
+	// text is only shortened when a base character is found.
+	// From Grapheme_Cluster_Boundaries
+	// combining character sequence = ccs-base? ccs-extend+
+	// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
+	// ccs-extend := [\p{M}\p{Join_Control}]
+
+	std::string_view truncated = text;
+	while (truncated.length() > (UTF8MaxBytes * 2)) {
+		// The loop gives up once the remaining text is UTF8MaxBytes * 2 bytes or shorter
+		std::string_view::iterator it = truncated.end() - 1;
+		// For UTF-8 go back to the start of last character.
+		for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+			--it;
+		}
+		const size_t countBytes = truncated.end() - it;
+		const std::string_view svLastCharacter = truncated.substr(truncated.length() - countBytes);
+		const CharacterCategory cc = CategoriseCharacter(UnicodeFromUTF8(svLastCharacter));
+		truncated.remove_suffix(countBytes);	// drop this character whether it is a base or not
+		if (IsBaseOfGrapheme(cc)) {
+			text = truncated;	// text now ends just before the base character
+			return;
+		}
+	}
+	// No base character found before the text became short so leave text unchanged
+}
+
+}
+
 // Need to break text into segments near end but taking into account the
 // encoding to not break inside a UTF-8 or DBCS character and also trying
 // to avoid breaking inside a pair of combining characters, or inside
@@ -1215,7 +1301,8 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 // In preference order from best to worst:
 //   1) Break before or after spaces or controls
 //   2) Break at word and punctuation boundary for better kerning and ligature support
-//   3) Break after whole character, this may break combining characters
+//   3) Break before letter in UTF-8 to avoid breaking combining characters
+//   4) Break after whole character, this may break combining characters
 
 size_t Document::SafeSegment(std::string_view text) const noexcept {
 	// check space first as most written language use spaces.
@@ -1236,14 +1323,14 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
 			}
 		} while (it != text.begin());
 
-		it = text.end() - 1;
 		if (dbcsCodePage) {
-			// for UTF-8 go back to the start of last character.
-			for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
-				--it;
-			}
+			// UTF-8
+			DiscardEndFragment(text);
+			DiscardLastCombinedCharacter(text);
+			return text.length();
+		} else {
+			return text.length() - 1;
 		}
-		return it - text.begin();
 	}
 
 	{
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 7a51b2d08..5990cca8c 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -49,6 +49,10 @@ constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
 	return (ch >= 0x80) && (ch < 0xc0);
 }
 
+constexpr bool UTF8IsFirstByte(unsigned char ch) noexcept {
+	return (ch >= 0xc2) && (ch <= 0xf4);	// bytes that can begin a multi-byte sequence in well-formed UTF-8
+}
+
 constexpr bool UTF8IsAscii(unsigned char ch) noexcept {
 	return ch < 0x80;
 }
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index e4b674987..ad1384ee7 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -34,6 +34,7 @@
 #include "Decoration.h"
 #include "CaseFolder.h"
 #include "Document.h"
+#include "UniConversion.h"
 
 #include "catch.hpp"
 
@@ -957,6 +958,61 @@ TEST_CASE("SafeSegment") {
 		REQUIRE(text[length] == '\xf0');
 	}
 
+	SECTION("UTF-8 Character Fragments") {
+		// PositionCache breaks long texts into fixed length sub-strings that are passed to SafeSegment
+		// so the final character in the sub-string may be incomplete without all needed trail bytes.
+		// For UTF-8, SafeSegment first discards any final bytes that do not represent a valid character
+		// then discards the final whole character.
+
+		const DocPlus doc("", CpUtf8);
+
+		// break before last character after discarding incomplete last character: lead byte with 0 trail bytes and 1 needed
+		std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2";	// Invalid text as ends with start byte
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xac');	// segment ends with the last byte of the second 3-byte character
+		REQUIRE(text[length] == '\xe8');	// third 3-byte character starts after the segment
+		REQUIRE(UTF8IsValid(text.substr(0, length)));
+
+		// break before last character after discarding incomplete last character: 1 trail byte and 2 needed
+		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe6\x97";	// Invalid text as ends with only 1 trail byte
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xac');	// same break position as the previous case
+		REQUIRE(text[length] == '\xe8');
+		REQUIRE(UTF8IsValid(text.substr(0, length)));
+	}
+
+	SECTION("UTF-8 Combining Characters") {
+		const DocPlus doc("", CpUtf8);
+
+		// There may be combining characters like accents and tone marks after the
+		// last letter in a sub-string and these may be included in the sub-string
+		// or follow it.
+		// Correct display requires that the combining characters are measured and
+		// drawn with the letter they follow. Thus the final letter and any
+		// following combining characters are discarded.
+
+		// A Thai text example with 8 characters, each taking 3 bytes:
+		// HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU, LO LING
+		// Most are letters (Lo) but 2 characters are modifiers (Mn):
+		// MAI THO is a tone mark and SARA UU is a vowel.
+		const std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9\xe0\xb8\xa5";
+		REQUIRE(text.length() == 8 * 3);
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(length == (8 - 1) * 3);	// Discard last character: LO LING
+
+		// Remove last character (letter LO LING) then run again.
+		// Should skip past SARA UU combining vowel mark to discard letter MO MA and SARA UU.
+		const std::string_view textWithoutLoLing = text.substr(0, length);
+		length = doc.document.SafeSegment(textWithoutLoLing);
+		REQUIRE(length == (8 - 3) * 3);	// Discard 2 characters: MO MA and SARA UU
+
+		// Remove last character SARA UU combining vowel mark then run again
+		// Final letter may have following combining mark so discard producing same text as previous step.
+		const std::string_view textWithoutSaraUu = text.substr(0, (8 - 2) * 3);
+		length = doc.document.SafeSegment(textWithoutSaraUu);
+		REQUIRE(length == (8 - 3) * 3);	// Discard 1 character: MO MA
+	}
+
 	SECTION("DBCS Shift-JIS") {
 		const DocPlus doc("", 932);
 		// word and punctuation boundary in middle of text: single byte