Feature [feature-requests:#1417] Use backward iteration to find space / control

character and text / punctuation boundaries in SafeSegment, as this will be simpler and faster in almost all cases. Simplify BreakFinder::Next, which calls SafeSegment.
author: Zufu Liu <unknown> 2021-10-21 22:15:57 +1100
committer: Zufu Liu <unknown> 2021-10-21 22:15:57 +1100
commit: 9975609bf3b39f0e1cd121995ac49aea30a6c48f (patch)
tree: 339887d2052a909480b4e3b4df12f318bbec2be8
parent: a989b1ed63c7cf81c693da8f2f66ab5e29ee341a (diff)
download: scintilla-mirror-9975609bf3b39f0e1cd121995ac49aea30a6c48f.tar.gz
5 files changed, 193 insertions, 43 deletions
diff --git a/src/CharacterType.h b/src/CharacterType.h
index b014f1050..437fb8c5c 100644
--- a/src/CharacterType.h
+++ b/src/CharacterType.h
@@ -32,6 +32,13 @@ constexpr bool IsEOLCharacter(int ch) noexcept {
 	return ch == '\r' || ch == '\n';
 }
 
+constexpr bool IsBreakSpace(int ch) noexcept {
+	// Used for text breaking: treats C0 control characters as spaces.
+	// By default C0 control characters are handled as special representations,
+	// so they do not appear in normal text. 0x7F DEL is omitted to simplify the code.
+	return ch >= 0 && ch <= ' ';
+}
+
 constexpr bool IsADigit(int ch) noexcept {
 	return (ch >= '0') && (ch <= '9');
 }
diff --git a/src/Document.cxx b/src/Document.cxx
index 3ebd357df..0d8b00d09 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1127,47 +1127,74 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 		&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
 }
 
-// Need to break text into segments near lengthSegment but taking into
-// account the encoding to not break inside a UTF-8 or DBCS character
-// and also trying to avoid breaking inside a pair of combining characters.
+// Need to break text into segments near end but taking into account the
+// encoding to not break inside a UTF-8 or DBCS character and also trying
+// to avoid breaking inside a pair of combining characters, or inside
+// ligatures.
+// TODO: implement grapheme cluster boundaries,
+// see https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
+//
 // The segment length must always be long enough (more than 4 bytes)
 // so that there will be at least one whole character to make a segment.
 // For UTF-8, text must consist only of valid whole characters.
 // In preference order from best to worst:
-//   1) Break after space
-//   2) Break before punctuation
-//   3) Break after whole character
-
-int Document::SafeSegment(const char *text, int lengthSegment) const noexcept {
-	int lastSpaceBreak = -1;
-	int lastPunctuationBreak = -1;
-	int lastEncodingAllowedBreak = 0;
-	for (int j=0; j < lengthSegment;) {
-		const unsigned char ch = text[j];
-		if (j > 0) {
-			if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) {
-				lastSpaceBreak = j;
+//   1) Break before or after spaces or controls
+//   2) Break at word and punctuation boundary for better kerning and ligature support
+//   3) Break after whole character, this may break combining characters
+
+size_t Document::SafeSegment(std::string_view text) const noexcept {
+	// check for spaces first as most written languages use spaces.
+	for (std::string_view::iterator it = text.end() - 1; it != text.begin(); --it) {
+		if (IsBreakSpace(*it)) {
+			return it - text.begin();
+		}
+	}
+
+	if (!dbcsCodePage || dbcsCodePage == CpUtf8) {
+		// backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary.
+		std::string_view::iterator it = text.end() - 1;
+		const bool punctuation = IsPunctuation(*it);
+		do {
+			--it;
+			if (punctuation != IsPunctuation(*it)) {
+				return it - text.begin() + 1;
 			}
-			if (ch < 'A') {
-				lastPunctuationBreak = j;
+		} while (it != text.begin());
+
+		it = text.end() - 1;
+		if (dbcsCodePage) {
+			// for UTF-8 go back to the start of the last character.
+			for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+				--it;
 			}
 		}
-		lastEncodingAllowedBreak = j;
+		return it - text.begin();
+	}
 
-		if (dbcsCodePage == CpUtf8) {
-			j += UTF8BytesOfLead[ch];
-		} else if (dbcsCodePage) {
-			j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1;
-		} else {
-			j++;
+	{
+		// forward iterate for DBCS to find word and punctuation boundary.
+		size_t lastPunctuationBreak = 0;
+		size_t lastEncodingAllowedBreak = 0;
+		CharacterClass ccPrev = CharacterClass::space;
+		for (size_t j = 0; j < text.length();) {
+			const unsigned char ch = text[j];
+			lastEncodingAllowedBreak = j++;
+
+			CharacterClass cc = CharacterClass::word;
+			if (UTF8IsAscii(ch)) {
+				if (IsPunctuation(ch)) {
+					cc = CharacterClass::punctuation;
+				}
+			} else {
+				j += IsDBCSLeadByteNoExcept(ch);
+			}
+			if (cc != ccPrev) {
+				ccPrev = cc;
+				lastPunctuationBreak = lastEncodingAllowedBreak;
+			}
 		}
+		return lastPunctuationBreak ? lastPunctuationBreak : lastEncodingAllowedBreak;
 	}
-	if (lastSpaceBreak >= 0) {
-		return lastSpaceBreak;
-	} else if (lastPunctuationBreak >= 0) {
-		return lastPunctuationBreak;
-	}
-	return lastEncodingAllowedBreak;
 }
 
 EncodingFamily Document::CodePageFamily() const noexcept {
diff --git a/src/Document.h b/src/Document.h
index 897a1270c..e406118a7 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -352,7 +352,7 @@ public:
 	bool IsDBCSTrailByteNoExcept(char ch) const noexcept;
 	int DBCSDrawBytes(std::string_view text) const noexcept;
 	bool IsDBCSDualByteAt(Sci::Position pos) const noexcept;
-	int SafeSegment(const char *text, int lengthSegment) const noexcept;
+	size_t SafeSegment(std::string_view text) const noexcept;
 	EncodingFamily CodePageFamily() const noexcept;
 
 	// Gateways to modifying document
diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx
index 6370edb33..c9f4e8793 100644
--- a/src/PositionCache.cxx
+++ b/src/PositionCache.cxx
@@ -755,21 +755,20 @@ TextSegment BreakFinder::Next() {
 		}
 		subBreak = prev;
 	}
+
 	// Splitting up a long run from prev to nextBreak in lots of approximately lengthEachSubdivision.
-	// For very long runs add extra breaks after spaces or if no spaces before low punctuation.
 	const int startSegment = subBreak;
-	if ((nextBreak - subBreak) <= lengthEachSubdivision) {
-		subBreak = -1;
-		return TextSegment(startSegment, nextBreak - startSegment);
+	const int remaining = nextBreak - startSegment;
+	int lengthSegment = remaining;
+	if (lengthSegment > lengthEachSubdivision) {
+		lengthSegment = static_cast<int>(pdoc->SafeSegment(std::string_view(&ll->chars[startSegment], lengthEachSubdivision)));
+	}
+	if (lengthSegment < remaining) {
+		subBreak += lengthSegment;
 	} else {
-		subBreak += pdoc->SafeSegment(&ll->chars[subBreak], lengthEachSubdivision);
-		if (subBreak >= nextBreak) {
-			subBreak = -1;
-			return TextSegment(startSegment, nextBreak - startSegment);
-		} else {
-			return TextSegment(startSegment, subBreak - startSegment);
-		}
+		subBreak = -1;
 	}
+	return TextSegment(startSegment, lengthSegment);
 }
 
 bool BreakFinder::More() const noexcept {
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index 41e3907ae..4a7e20095 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -486,3 +486,120 @@ TEST_CASE("Words") {
 		REQUIRE(!docEndSpace.document.IsWordAt(3, 5));
 	}
 }
+
+TEST_CASE("SafeSegment") {
+	SECTION("Short") {
+		const DocPlus doc("", 0);
+		// all encodings: break before or after last space
+		const std::string_view text = "12 ";
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(length <= text.length());
+		REQUIRE(text[length - 1] == '2');
+		REQUIRE(text[length] == ' ');
+	}
+
+	SECTION("ASCII") {
+		const DocPlus doc("", 0);
+		// all encodings: break before or after last space
+		std::string_view text = "12 3 \t45";
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == ' ');
+		REQUIRE(text[length] == '\t');
+
+		// UTF-8 and ASCII: word and punctuation boundary in middle of text
+		text = "(IsBreakSpace(text[j]))";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == 'j');
+		REQUIRE(text[length] == ']');
+
+		// UTF-8 and ASCII: word and punctuation boundary near start of text
+		text = "(IsBreakSpace";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '(');
+		REQUIRE(text[length] == 'I');
+
+		// UTF-8 and ASCII: word and punctuation boundary near end of text
+		text = "IsBreakSpace)";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == 'e');
+		REQUIRE(text[length] == ')');
+
+		// break before last character
+		text = "JapaneseJa";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == 'J');
+		REQUIRE(text[length] == 'a');
+	}
+
+	SECTION("UTF-8") {
+		const DocPlus doc("", CpUtf8);
+		// break before last character: no trail byte
+		std::string_view text = "JapaneseJa";
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == 'J');
+		REQUIRE(text[length] == 'a');
+
+		// break before last character: 1 trail byte
+		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2\xa9";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\x9e');
+		REQUIRE(text[length] == '\xc2');
+
+		// break before last character: 2 trail bytes
+		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xac');
+		REQUIRE(text[length] == '\xe8');
+
+		// break before last character: 3 trail bytes
+		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xf0\x9f\x98\x8a";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\x9e');
+		REQUIRE(text[length] == '\xf0');
+	}
+
+	SECTION("DBCS Shift-JIS") {
+		const DocPlus doc("", 932);
+		// word and punctuation boundary in middle of text: single byte
+		std::string_view text = "(IsBreakSpace(text[j]))";
+		size_t length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == 'j');
+		REQUIRE(text[length] == ']');
+
+		// word and punctuation boundary in middle of text: double byte
+		text = "(IsBreakSpace(text[\x8c\xea]))";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xea');
+		REQUIRE(text[length] == ']');
+
+		// word and punctuation boundary near start of text
+		text = "(IsBreakSpace";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '(');
+		REQUIRE(text[length] == 'I');
+
+		// word and punctuation boundary near end of text: single byte
+		text = "IsBreakSpace)";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == 'e');
+		REQUIRE(text[length] == ')');
+
+		// word and punctuation boundary near end of text: double byte
+		text = "IsBreakSpace\x8c\xea)";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\xea');
+		REQUIRE(text[length] == ')');
+
+		// break before last character: single byte
+		text = "JapaneseJa";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == 'J');
+		REQUIRE(text[length] == 'a');
+
+		// break before last character: double byte
+		text = "Japanese\x93\xfa\x96\x7b\x8c\xea";
+		length = doc.document.SafeSegment(text);
+		REQUIRE(text[length - 1] == '\x7b');
+		REQUIRE(text[length] == '\x8c');
+	}
+}
author	Zufu Liu <unknown>	2021-10-21 22:15:57 +1100
committer	Zufu Liu <unknown>	2021-10-21 22:15:57 +1100
commit	9975609bf3b39f0e1cd121995ac49aea30a6c48f (patch)
tree	339887d2052a909480b4e3b4df12f318bbec2be8
parent	a989b1ed63c7cf81c693da8f2f66ab5e29ee341a (diff)
download	scintilla-mirror-9975609bf3b39f0e1cd121995ac49aea30a6c48f.tar.gz