aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/Document.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/Document.cxx')
-rw-r--r--src/Document.cxx89
1 files changed, 58 insertions, 31 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 3ebd357df..0d8b00d09 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1127,47 +1127,74 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
}
-// Need to break text into segments near lengthSegment but taking into
-// account the encoding to not break inside a UTF-8 or DBCS character
-// and also trying to avoid breaking inside a pair of combining characters.
+// Need to break text into segments near the end of the text but taking into
+// account the encoding to not break inside a UTF-8 or DBCS character and
+// also trying to avoid breaking inside a pair of combining characters, or
+// inside ligatures.
+// TODO: implement grapheme cluster boundaries,
+// see https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
+//
// The segment length must always be long enough (more than 4 bytes)
// so that there will be at least one whole character to make a segment.
// For UTF-8, text must consist only of valid whole characters.
// In preference order from best to worst:
-// 1) Break after space
-// 2) Break before punctuation
-// 3) Break after whole character
-
-int Document::SafeSegment(const char *text, int lengthSegment) const noexcept {
- int lastSpaceBreak = -1;
- int lastPunctuationBreak = -1;
- int lastEncodingAllowedBreak = 0;
- for (int j=0; j < lengthSegment;) {
- const unsigned char ch = text[j];
- if (j > 0) {
- if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) {
- lastSpaceBreak = j;
+// 1) Break before or after spaces or controls
+// 2) Break at word and punctuation boundary for better kerning and ligature support
+// 3) Break after whole character; this may break combining characters
+
+size_t Document::SafeSegment(std::string_view text) const noexcept {
+ // check spaces first as most written languages use spaces.
+ for (std::string_view::iterator it = text.end() - 1; it != text.begin(); --it) {
+ if (IsBreakSpace(*it)) {
+ return it - text.begin();
+ }
+ }
+
+ if (!dbcsCodePage || dbcsCodePage == CpUtf8) {
+ // backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary.
+ std::string_view::iterator it = text.end() - 1;
+ const bool punctuation = IsPunctuation(*it);
+ do {
+ --it;
+ if (punctuation != IsPunctuation(*it)) {
+ return it - text.begin() + 1;
}
- if (ch < 'A') {
- lastPunctuationBreak = j;
+ } while (it != text.begin());
+
+ it = text.end() - 1;
+ if (dbcsCodePage) {
+ // for UTF-8 go back to the start of the last character.
+ for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+ --it;
}
}
- lastEncodingAllowedBreak = j;
+ return it - text.begin();
+ }
- if (dbcsCodePage == CpUtf8) {
- j += UTF8BytesOfLead[ch];
- } else if (dbcsCodePage) {
- j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1;
- } else {
- j++;
+ {
+ // forward iterate for DBCS to find word and punctuation boundary.
+ size_t lastPunctuationBreak = 0;
+ size_t lastEncodingAllowedBreak = 0;
+ CharacterClass ccPrev = CharacterClass::space;
+ for (size_t j = 0; j < text.length();) {
+ const unsigned char ch = text[j];
+ lastEncodingAllowedBreak = j++;
+
+ CharacterClass cc = CharacterClass::word;
+ if (UTF8IsAscii(ch)) {
+ if (IsPunctuation(ch)) {
+ cc = CharacterClass::punctuation;
+ }
+ } else {
+ j += IsDBCSLeadByteNoExcept(ch);
+ }
+ if (cc != ccPrev) {
+ ccPrev = cc;
+ lastPunctuationBreak = lastEncodingAllowedBreak;
+ }
}
+ return lastPunctuationBreak ? lastPunctuationBreak : lastEncodingAllowedBreak;
}
- if (lastSpaceBreak >= 0) {
- return lastSpaceBreak;
- } else if (lastPunctuationBreak >= 0) {
- return lastPunctuationBreak;
- }
- return lastEncodingAllowedBreak;
}
EncodingFamily Document::CodePageFamily() const noexcept {