aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Zufu Liu <unknown> 2021-10-21 22:15:57 +1100
committer Zufu Liu <unknown> 2021-10-21 22:15:57 +1100
commit 9975609bf3b39f0e1cd121995ac49aea30a6c48f (patch)
tree 339887d2052a909480b4e3b4df12f318bbec2be8
parent a989b1ed63c7cf81c693da8f2f66ab5e29ee341a (diff)
download scintilla-mirror-9975609bf3b39f0e1cd121995ac49aea30a6c48f.tar.gz
Feature [feature-requests:#1417] Use backward iteration to find space / control
character and text / punctuation boundaries in SafeSegment, as this is simpler and faster in almost all cases. Simplify how BreakFinder::Next calls SafeSegment.
-rw-r--r-- src/CharacterType.h 7
-rw-r--r-- src/Document.cxx 89
-rw-r--r-- src/Document.h 2
-rw-r--r-- src/PositionCache.cxx 21
-rw-r--r-- test/unit/testDocument.cxx 117
5 files changed, 193 insertions, 43 deletions
diff --git a/src/CharacterType.h b/src/CharacterType.h
index b014f1050..437fb8c5c 100644
--- a/src/CharacterType.h
+++ b/src/CharacterType.h
@@ -32,6 +32,13 @@ constexpr bool IsEOLCharacter(int ch) noexcept {
return ch == '\r' || ch == '\n';
}
+constexpr bool IsBreakSpace(int ch) noexcept {
+ // Used for text breaking: treat C0 control characters as spaces.
+ // By default, C0 control characters are handled as special representations,
+ // so they do not appear in normal text. 0x7F DEL is omitted to simplify the code.
+ return ch >= 0 && ch <= ' ';
+}
+
constexpr bool IsADigit(int ch) noexcept {
return (ch >= '0') && (ch <= '9');
}
diff --git a/src/Document.cxx b/src/Document.cxx
index 3ebd357df..0d8b00d09 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1127,47 +1127,74 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
}
-// Need to break text into segments near lengthSegment but taking into
-// account the encoding to not break inside a UTF-8 or DBCS character
-// and also trying to avoid breaking inside a pair of combining characters.
+// Need to break text into segments near end but taking into account the
+// encoding to not break inside a UTF-8 or DBCS character and also trying
+// to avoid breaking inside a pair of combining characters, or inside
+// ligatures.
+// TODO: implement grapheme cluster boundaries,
+// see https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
+//
// The segment length must always be long enough (more than 4 bytes)
// so that there will be at least one whole character to make a segment.
// For UTF-8, text must consist only of valid whole characters.
// In preference order from best to worst:
-// 1) Break after space
-// 2) Break before punctuation
-// 3) Break after whole character
-
-int Document::SafeSegment(const char *text, int lengthSegment) const noexcept {
- int lastSpaceBreak = -1;
- int lastPunctuationBreak = -1;
- int lastEncodingAllowedBreak = 0;
- for (int j=0; j < lengthSegment;) {
- const unsigned char ch = text[j];
- if (j > 0) {
- if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) {
- lastSpaceBreak = j;
+// 1) Break before or after spaces or controls
+// 2) Break at word and punctuation boundary for better kerning and ligature support
+// 3) Break after whole character, this may break combining characters
+
+size_t Document::SafeSegment(std::string_view text) const noexcept {
+ // Check for spaces first, as most written languages use spaces.
+ for (std::string_view::iterator it = text.end() - 1; it != text.begin(); --it) {
+ if (IsBreakSpace(*it)) {
+ return it - text.begin();
+ }
+ }
+
+ if (!dbcsCodePage || dbcsCodePage == CpUtf8) {
+ // backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary.
+ std::string_view::iterator it = text.end() - 1;
+ const bool punctuation = IsPunctuation(*it);
+ do {
+ --it;
+ if (punctuation != IsPunctuation(*it)) {
+ return it - text.begin() + 1;
}
- if (ch < 'A') {
- lastPunctuationBreak = j;
+ } while (it != text.begin());
+
+ it = text.end() - 1;
+ if (dbcsCodePage) {
+ // for UTF-8, go back to the start of the last character.
+ for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+ --it;
}
}
- lastEncodingAllowedBreak = j;
+ return it - text.begin();
+ }
- if (dbcsCodePage == CpUtf8) {
- j += UTF8BytesOfLead[ch];
- } else if (dbcsCodePage) {
- j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1;
- } else {
- j++;
+ {
+ // forward iterate for DBCS to find word and punctuation boundary.
+ size_t lastPunctuationBreak = 0;
+ size_t lastEncodingAllowedBreak = 0;
+ CharacterClass ccPrev = CharacterClass::space;
+ for (size_t j = 0; j < text.length();) {
+ const unsigned char ch = text[j];
+ lastEncodingAllowedBreak = j++;
+
+ CharacterClass cc = CharacterClass::word;
+ if (UTF8IsAscii(ch)) {
+ if (IsPunctuation(ch)) {
+ cc = CharacterClass::punctuation;
+ }
+ } else {
+ j += IsDBCSLeadByteNoExcept(ch);
+ }
+ if (cc != ccPrev) {
+ ccPrev = cc;
+ lastPunctuationBreak = lastEncodingAllowedBreak;
+ }
}
+ return lastPunctuationBreak ? lastPunctuationBreak : lastEncodingAllowedBreak;
}
- if (lastSpaceBreak >= 0) {
- return lastSpaceBreak;
- } else if (lastPunctuationBreak >= 0) {
- return lastPunctuationBreak;
- }
- return lastEncodingAllowedBreak;
}
EncodingFamily Document::CodePageFamily() const noexcept {
diff --git a/src/Document.h b/src/Document.h
index 897a1270c..e406118a7 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -352,7 +352,7 @@ public:
bool IsDBCSTrailByteNoExcept(char ch) const noexcept;
int DBCSDrawBytes(std::string_view text) const noexcept;
bool IsDBCSDualByteAt(Sci::Position pos) const noexcept;
- int SafeSegment(const char *text, int lengthSegment) const noexcept;
+ size_t SafeSegment(std::string_view text) const noexcept;
EncodingFamily CodePageFamily() const noexcept;
// Gateways to modifying document
diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx
index 6370edb33..c9f4e8793 100644
--- a/src/PositionCache.cxx
+++ b/src/PositionCache.cxx
@@ -755,21 +755,20 @@ TextSegment BreakFinder::Next() {
}
subBreak = prev;
}
+
// Splitting up a long run from prev to nextBreak in lots of approximately lengthEachSubdivision.
- // For very long runs add extra breaks after spaces or if no spaces before low punctuation.
const int startSegment = subBreak;
- if ((nextBreak - subBreak) <= lengthEachSubdivision) {
- subBreak = -1;
- return TextSegment(startSegment, nextBreak - startSegment);
+ const int remaining = nextBreak - startSegment;
+ int lengthSegment = remaining;
+ if (lengthSegment > lengthEachSubdivision) {
+ lengthSegment = static_cast<int>(pdoc->SafeSegment(std::string_view(&ll->chars[startSegment], lengthEachSubdivision)));
+ }
+ if (lengthSegment < remaining) {
+ subBreak += lengthSegment;
} else {
- subBreak += pdoc->SafeSegment(&ll->chars[subBreak], lengthEachSubdivision);
- if (subBreak >= nextBreak) {
- subBreak = -1;
- return TextSegment(startSegment, nextBreak - startSegment);
- } else {
- return TextSegment(startSegment, subBreak - startSegment);
- }
+ subBreak = -1;
}
+ return TextSegment(startSegment, lengthSegment);
}
bool BreakFinder::More() const noexcept {
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index 41e3907ae..4a7e20095 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -486,3 +486,120 @@ TEST_CASE("Words") {
REQUIRE(!docEndSpace.document.IsWordAt(3, 5));
}
}
+
+TEST_CASE("SafeSegment") {
+ SECTION("Short") {
+ const DocPlus doc("", 0);
+ // all encodings: break before or after last space
+ const std::string_view text = "12 ";
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(length <= text.length());
+ REQUIRE(text[length - 1] == '2');
+ REQUIRE(text[length] == ' ');
+ }
+
+ SECTION("ASCII") {
+ const DocPlus doc("", 0);
+ // all encodings: break before or after last space
+ std::string_view text = "12 3 \t45";
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == ' ');
+ REQUIRE(text[length] == '\t');
+
+ // UTF-8 and ASCII: word and punctuation boundary in middle of text
+ text = "(IsBreakSpace(text[j]))";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'j');
+ REQUIRE(text[length] == ']');
+
+ // UTF-8 and ASCII: word and punctuation boundary near start of text
+ text = "(IsBreakSpace";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '(');
+ REQUIRE(text[length] == 'I');
+
+ // UTF-8 and ASCII: word and punctuation boundary near end of text
+ text = "IsBreakSpace)";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'e');
+ REQUIRE(text[length] == ')');
+
+ // break before last character
+ text = "JapaneseJa";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'J');
+ REQUIRE(text[length] == 'a');
+ }
+
+ SECTION("UTF-8") {
+ const DocPlus doc("", CpUtf8);
+ // break before last character: no trail byte
+ std::string_view text = "JapaneseJa";
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'J');
+ REQUIRE(text[length] == 'a');
+
+ // break before last character: 1 trail byte
+ text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2\xa9";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\x9e');
+ REQUIRE(text[length] == '\xc2');
+
+ // break before last character: 2 trail bytes
+ text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xac');
+ REQUIRE(text[length] == '\xe8');
+
+ // break before last character: 3 trail bytes
+ text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xf0\x9f\x98\x8a";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\x9e');
+ REQUIRE(text[length] == '\xf0');
+ }
+
+ SECTION("DBCS Shift-JIS") {
+ const DocPlus doc("", 932);
+ // word and punctuation boundary in middle of text: single byte
+ std::string_view text = "(IsBreakSpace(text[j]))";
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'j');
+ REQUIRE(text[length] == ']');
+
+ // word and punctuation boundary in middle of text: double byte
+ text = "(IsBreakSpace(text[\x8c\xea]))";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xea');
+ REQUIRE(text[length] == ']');
+
+ // word and punctuation boundary near start of text
+ text = "(IsBreakSpace";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '(');
+ REQUIRE(text[length] == 'I');
+
+ // word and punctuation boundary near end of text: single byte
+ text = "IsBreakSpace)";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'e');
+ REQUIRE(text[length] == ')');
+
+ // word and punctuation boundary near end of text: double byte
+ text = "IsBreakSpace\x8c\xea)";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xea');
+ REQUIRE(text[length] == ')');
+
+ // break before last character: single byte
+ text = "JapaneseJa";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'J');
+ REQUIRE(text[length] == 'a');
+
+ // break before last character: double byte
+ text = "Japanese\x93\xfa\x96\x7b\x8c\xea";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\x7b');
+ REQUIRE(text[length] == '\x8c');
+ }
+}