diff options
Diffstat (limited to 'src/Document.cxx')
-rw-r--r-- | src/Document.cxx | 89 |
1 files changed, 58 insertions, 31 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index 3ebd357df..0d8b00d09 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1127,47 +1127,74 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept { && IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1)); } -// Need to break text into segments near lengthSegment but taking into -// account the encoding to not break inside a UTF-8 or DBCS character -// and also trying to avoid breaking inside a pair of combining characters. +// Need to break text into segments near end but taking into account the +// encoding to not break inside a UTF-8 or DBCS character and also trying +// to avoid breaking inside a pair of combining characters, or inside +// ligatures. +// TODO: implement grapheme cluster boundaries, +// see https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries. +// // The segment length must always be long enough (more than 4 bytes) // so that there will be at least one whole character to make a segment. // For UTF-8, text must consist only of valid whole characters. // In preference order from best to worst: -// 1) Break after space -// 2) Break before punctuation -// 3) Break after whole character - -int Document::SafeSegment(const char *text, int lengthSegment) const noexcept { - int lastSpaceBreak = -1; - int lastPunctuationBreak = -1; - int lastEncodingAllowedBreak = 0; - for (int j=0; j < lengthSegment;) { - const unsigned char ch = text[j]; - if (j > 0) { - if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) { - lastSpaceBreak = j; +// 1) Break before or after spaces or controls +// 2) Break at word and punctuation boundary for better kerning and ligature support +// 3) Break after whole character, this may break combining characters + +size_t Document::SafeSegment(std::string_view text) const noexcept { + // check space first as most written language use spaces. + for (std::string_view::iterator it = text.end() - 1; it != text.begin(); --it) { + if (IsBreakSpace(*it)) { + return it - text.begin(); + } + } + + if (!dbcsCodePage || dbcsCodePage == CpUtf8) { + // backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary. + std::string_view::iterator it = text.end() - 1; + const bool punctuation = IsPunctuation(*it); + do { + --it; + if (punctuation != IsPunctuation(*it)) { + return it - text.begin() + 1; } - if (ch < 'A') { - lastPunctuationBreak = j; + } while (it != text.begin()); + + it = text.end() - 1; + if (dbcsCodePage) { + // for UTF-8 go back the start of last character. + for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) { + --it; } } - lastEncodingAllowedBreak = j; + return it - text.begin(); + } - if (dbcsCodePage == CpUtf8) { - j += UTF8BytesOfLead[ch]; - } else if (dbcsCodePage) { - j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1; - } else { - j++; + { + // forward iterate for DBCS to find word and punctuation boundary. + size_t lastPunctuationBreak = 0; + size_t lastEncodingAllowedBreak = 0; + CharacterClass ccPrev = CharacterClass::space; + for (size_t j = 0; j < text.length();) { + const unsigned char ch = text[j]; + lastEncodingAllowedBreak = j++; + + CharacterClass cc = CharacterClass::word; + if (UTF8IsAscii(ch)) { + if (IsPunctuation(ch)) { + cc = CharacterClass::punctuation; + } + } else { + j += IsDBCSLeadByteNoExcept(ch); + } + if (cc != ccPrev) { + ccPrev = cc; + lastPunctuationBreak = lastEncodingAllowedBreak; + } } + return lastPunctuationBreak ? lastPunctuationBreak : lastEncodingAllowedBreak; } - if (lastSpaceBreak >= 0) { - return lastSpaceBreak; - } else if (lastPunctuationBreak >= 0) { - return lastPunctuationBreak; - } - return lastEncodingAllowedBreak; } EncodingFamily Document::CodePageFamily() const noexcept { |