2 files changed, 98 insertions, 7 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 379a88786..dc82b1902 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1202,6 +1202,92 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 		&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
 }
 
+namespace {
+
+// Remove any extra bytes after the last valid character.
+void DiscardEndFragment(std::string_view &text) noexcept {
+	if (!text.empty()) {
+		if (UTF8IsFirstByte(text.back())) {
+			// Ending with start of character byte is invalid
+			text.remove_suffix(1);
+		} else if (UTF8IsTrailByte(text.back())) {
+			// go back to the start of last character.
+			const size_t maxTrail = std::max<size_t>(UTF8MaxBytes - 1, text.length());
+			size_t trail = 1;
+			while (trail < maxTrail && UTF8IsTrailByte(text[text.length() - trail])) {
+				trail++;
+			}
+			const std::string_view endPortion = text.substr(text.length() - trail);
+			if (!UTF8IsValid(endPortion)) {
+				text.remove_suffix(trail);
+			}
+		}
+	}
+}
+
+constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
+	// \p{L}\p{N}\p{P}\p{S}\p{Zs}
+	switch (cc) {
+	case ccLu:
+	case ccLl:
+	case ccLt:
+	case ccLm:
+	case ccLo:
+	case ccNd:
+	case ccNl:
+	case ccNo:
+	case ccPc:
+	case ccPd:
+	case ccPs:
+	case ccPe:
+	case ccPi:
+	case ccPf:
+	case ccPo:
+	case ccSm:
+	case ccSc:
+	case ccSk:
+	case ccSo:
+	case ccZs:
+		return true;
+	default:
+		// ccMn, ccMc, ccMe,
+		// ccZl, ccZp,
+		// ccCc, ccCf, ccCs, ccCo, ccCn
+		return false;
+	}
+}
+
+void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
+	// Handle the simple common case where a base character may be followed by
+	// accents and similar marks by discarding until start of base character.
+	// 
+	// From Grapheme_Cluster_Boundaries
+	// combining character sequence = ccs-base? ccs-extend+
+	// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
+	// ccs-extend := [\p{M}\p{Join_Control}]
+
+	std::string_view truncated = text;
+	while (truncated.length() > (UTF8MaxBytes * 2)) {
+		// Give up when short
+		std::string_view::iterator it = truncated.end() - 1;
+		// For UTF-8 go back to the start of last character.
+		for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+			--it;
+		}
+		const size_t countBytes = truncated.end() - it;
+		const std::string_view svLastCharacter = truncated.substr(truncated.length() - countBytes);
+		const CharacterCategory cc = CategoriseCharacter(UnicodeFromUTF8(svLastCharacter));
+		truncated.remove_suffix(countBytes);
+		if (IsBaseOfGrapheme(cc)) {
+			text = truncated;
+			return;
+		}
+	}
+	// No base character found so just leave as is
+}
+
+}
+
 // Need to break text into segments near end but taking into account the
 // encoding to not break inside a UTF-8 or DBCS character and also trying
 // to avoid breaking inside a pair of combining characters, or inside
@@ -1215,7 +1301,8 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 // In preference order from best to worst:
 //   1) Break before or after spaces or controls
 //   2) Break at word and punctuation boundary for better kerning and ligature support
-//   3) Break after whole character, this may break combining characters
+//   3) Break before letter in UTF-8 to avoid breaking combining characters
+//   4) Break after whole character, this may break combining characters
 
 size_t Document::SafeSegment(std::string_view text) const noexcept {
 	// check space first as most written language use spaces.
@@ -1236,14 +1323,14 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
 			}
 		} while (it != text.begin());
 
-		it = text.end() - 1;
 		if (dbcsCodePage) {
-			// for UTF-8 go back to the start of last character.
-			for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
-				--it;
-			}
+			// UTF-8
+			DiscardEndFragment(text);
+			DiscardLastCombinedCharacter(text);
+			return text.length();
+		} else {
+			return text.length() - 1;
 		}
-		return it - text.begin();
 	}
 
 	{
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 7a51b2d08..5990cca8c 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -49,6 +49,10 @@ constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
 	return (ch >= 0x80) && (ch < 0xc0);
 }
 
+constexpr bool UTF8IsFirstByte(unsigned char ch) noexcept {
+	return (ch >= 0xc2) && (ch <= 0xf4);
+}
+
 constexpr bool UTF8IsAscii(unsigned char ch) noexcept {
 	return ch < 0x80;
 }