aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
author Zufu Liu <unknown> 2021-10-21 22:15:57 +1100
committer Zufu Liu <unknown> 2021-10-21 22:15:57 +1100
commit 9975609bf3b39f0e1cd121995ac49aea30a6c48f (patch)
tree 339887d2052a909480b4e3b4df12f318bbec2be8 /src
parent a989b1ed63c7cf81c693da8f2f66ab5e29ee341a (diff)
download scintilla-mirror-9975609bf3b39f0e1cd121995ac49aea30a6c48f.tar.gz
Feature [feature-requests:#1417] Use backward iteration to find space / control
character and text / punctuation boundaries in SafeSegment, as this will be simpler and faster in almost all cases. Simplify BreakFinder::Next calling SafeSegment.
Diffstat (limited to 'src')
-rw-r--r-- src/CharacterType.h 7
-rw-r--r-- src/Document.cxx 89
-rw-r--r-- src/Document.h 2
-rw-r--r-- src/PositionCache.cxx 21
4 files changed, 76 insertions, 43 deletions
diff --git a/src/CharacterType.h b/src/CharacterType.h
index b014f1050..437fb8c5c 100644
--- a/src/CharacterType.h
+++ b/src/CharacterType.h
@@ -32,6 +32,13 @@ constexpr bool IsEOLCharacter(int ch) noexcept {
return ch == '\r' || ch == '\n';
}
+constexpr bool IsBreakSpace(int ch) noexcept {
+ // Used for text breaking: treats C0 control characters as spaces.
+ // By default C0 control characters are handled as special representations,
+ // so they do not appear in normal text. 0x7F DEL is omitted to simplify the code.
+ return ch >= 0 && ch <= ' ';
+}
+
constexpr bool IsADigit(int ch) noexcept {
return (ch >= '0') && (ch <= '9');
}
diff --git a/src/Document.cxx b/src/Document.cxx
index 3ebd357df..0d8b00d09 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1127,47 +1127,74 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
}
-// Need to break text into segments near lengthSegment but taking into
-// account the encoding to not break inside a UTF-8 or DBCS character
-// and also trying to avoid breaking inside a pair of combining characters.
+// Need to break text into segments near end but taking into account the
+// encoding to not break inside a UTF-8 or DBCS character and also trying
+// to avoid breaking inside a pair of combining characters, or inside
+// ligatures.
+// TODO: implement grapheme cluster boundaries,
+// see https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
+//
// The segment length must always be long enough (more than 4 bytes)
// so that there will be at least one whole character to make a segment.
// For UTF-8, text must consist only of valid whole characters.
// In preference order from best to worst:
-// 1) Break after space
-// 2) Break before punctuation
-// 3) Break after whole character
-
-int Document::SafeSegment(const char *text, int lengthSegment) const noexcept {
- int lastSpaceBreak = -1;
- int lastPunctuationBreak = -1;
- int lastEncodingAllowedBreak = 0;
- for (int j=0; j < lengthSegment;) {
- const unsigned char ch = text[j];
- if (j > 0) {
- if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) {
- lastSpaceBreak = j;
+// 1) Break before or after spaces or controls
+// 2) Break at word and punctuation boundary for better kerning and ligature support
+// 3) Break after whole character, this may break combining characters
+
+size_t Document::SafeSegment(std::string_view text) const noexcept {
+ // check for spaces first as most written languages use spaces.
+ for (std::string_view::iterator it = text.end() - 1; it != text.begin(); --it) {
+ if (IsBreakSpace(*it)) {
+ return it - text.begin();
+ }
+ }
+
+ if (!dbcsCodePage || dbcsCodePage == CpUtf8) {
+ // backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary.
+ std::string_view::iterator it = text.end() - 1;
+ const bool punctuation = IsPunctuation(*it);
+ do {
+ --it;
+ if (punctuation != IsPunctuation(*it)) {
+ return it - text.begin() + 1;
}
- if (ch < 'A') {
- lastPunctuationBreak = j;
+ } while (it != text.begin());
+
+ it = text.end() - 1;
+ if (dbcsCodePage) {
+ // for UTF-8 go back to the start of the last character.
+ for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+ --it;
}
}
- lastEncodingAllowedBreak = j;
+ return it - text.begin();
+ }
- if (dbcsCodePage == CpUtf8) {
- j += UTF8BytesOfLead[ch];
- } else if (dbcsCodePage) {
- j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1;
- } else {
- j++;
+ {
+ // forward iterate for DBCS to find word and punctuation boundary.
+ size_t lastPunctuationBreak = 0;
+ size_t lastEncodingAllowedBreak = 0;
+ CharacterClass ccPrev = CharacterClass::space;
+ for (size_t j = 0; j < text.length();) {
+ const unsigned char ch = text[j];
+ lastEncodingAllowedBreak = j++;
+
+ CharacterClass cc = CharacterClass::word;
+ if (UTF8IsAscii(ch)) {
+ if (IsPunctuation(ch)) {
+ cc = CharacterClass::punctuation;
+ }
+ } else {
+ j += IsDBCSLeadByteNoExcept(ch);
+ }
+ if (cc != ccPrev) {
+ ccPrev = cc;
+ lastPunctuationBreak = lastEncodingAllowedBreak;
+ }
}
+ return lastPunctuationBreak ? lastPunctuationBreak : lastEncodingAllowedBreak;
}
- if (lastSpaceBreak >= 0) {
- return lastSpaceBreak;
- } else if (lastPunctuationBreak >= 0) {
- return lastPunctuationBreak;
- }
- return lastEncodingAllowedBreak;
}
EncodingFamily Document::CodePageFamily() const noexcept {
diff --git a/src/Document.h b/src/Document.h
index 897a1270c..e406118a7 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -352,7 +352,7 @@ public:
bool IsDBCSTrailByteNoExcept(char ch) const noexcept;
int DBCSDrawBytes(std::string_view text) const noexcept;
bool IsDBCSDualByteAt(Sci::Position pos) const noexcept;
- int SafeSegment(const char *text, int lengthSegment) const noexcept;
+ size_t SafeSegment(std::string_view text) const noexcept;
EncodingFamily CodePageFamily() const noexcept;
// Gateways to modifying document
diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx
index 6370edb33..c9f4e8793 100644
--- a/src/PositionCache.cxx
+++ b/src/PositionCache.cxx
@@ -755,21 +755,20 @@ TextSegment BreakFinder::Next() {
}
subBreak = prev;
}
+
// Splitting up a long run from prev to nextBreak in lots of approximately lengthEachSubdivision.
- // For very long runs add extra breaks after spaces or if no spaces before low punctuation.
const int startSegment = subBreak;
- if ((nextBreak - subBreak) <= lengthEachSubdivision) {
- subBreak = -1;
- return TextSegment(startSegment, nextBreak - startSegment);
+ const int remaining = nextBreak - startSegment;
+ int lengthSegment = remaining;
+ if (lengthSegment > lengthEachSubdivision) {
+ lengthSegment = static_cast<int>(pdoc->SafeSegment(std::string_view(&ll->chars[startSegment], lengthEachSubdivision)));
+ }
+ if (lengthSegment < remaining) {
+ subBreak += lengthSegment;
} else {
- subBreak += pdoc->SafeSegment(&ll->chars[subBreak], lengthEachSubdivision);
- if (subBreak >= nextBreak) {
- subBreak = -1;
- return TextSegment(startSegment, nextBreak - startSegment);
- } else {
- return TextSegment(startSegment, subBreak - startSegment);
- }
+ subBreak = -1;
}
+ return TextSegment(startSegment, lengthSegment);
}
bool BreakFinder::More() const noexcept {