diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/CharacterType.h | 7 | ||||
-rw-r--r-- | src/Document.cxx | 89 | ||||
-rw-r--r-- | src/Document.h | 2 | ||||
-rw-r--r-- | src/PositionCache.cxx | 21 |
4 files changed, 76 insertions, 43 deletions
diff --git a/src/CharacterType.h b/src/CharacterType.h index b014f1050..437fb8c5c 100644 --- a/src/CharacterType.h +++ b/src/CharacterType.h @@ -32,6 +32,13 @@ constexpr bool IsEOLCharacter(int ch) noexcept { return ch == '\r' || ch == '\n'; } +constexpr bool IsBreakSpace(int ch) noexcept { + // used for text breaking, treat C0 control character as space. + // by default C0 control character is handled as special representation, + // so not appears in normal text. 0x7F DEL is omitted to simplify the code. + return ch >= 0 && ch <= ' '; +} + constexpr bool IsADigit(int ch) noexcept { return (ch >= '0') && (ch <= '9'); } diff --git a/src/Document.cxx b/src/Document.cxx index 3ebd357df..0d8b00d09 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1127,47 +1127,74 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept { && IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1)); } -// Need to break text into segments near lengthSegment but taking into -// account the encoding to not break inside a UTF-8 or DBCS character -// and also trying to avoid breaking inside a pair of combining characters. +// Need to break text into segments near end but taking into account the +// encoding to not break inside a UTF-8 or DBCS character and also trying +// to avoid breaking inside a pair of combining characters, or inside +// ligatures. +// TODO: implement grapheme cluster boundaries, +// see https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries. +// // The segment length must always be long enough (more than 4 bytes) // so that there will be at least one whole character to make a segment. // For UTF-8, text must consist only of valid whole characters. // In preference order from best to worst: -// 1) Break after space -// 2) Break before punctuation -// 3) Break after whole character - -int Document::SafeSegment(const char *text, int lengthSegment) const noexcept { - int lastSpaceBreak = -1; - int lastPunctuationBreak = -1; - int lastEncodingAllowedBreak = 0; - for (int j=0; j < lengthSegment;) { - const unsigned char ch = text[j]; - if (j > 0) { - if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) { - lastSpaceBreak = j; +// 1) Break before or after spaces or controls +// 2) Break at word and punctuation boundary for better kerning and ligature support +// 3) Break after whole character, this may break combining characters + +size_t Document::SafeSegment(std::string_view text) const noexcept { + // check space first as most written language use spaces. + for (std::string_view::iterator it = text.end() - 1; it != text.begin(); --it) { + if (IsBreakSpace(*it)) { + return it - text.begin(); + } + } + + if (!dbcsCodePage || dbcsCodePage == CpUtf8) { + // backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary. + std::string_view::iterator it = text.end() - 1; + const bool punctuation = IsPunctuation(*it); + do { + --it; + if (punctuation != IsPunctuation(*it)) { + return it - text.begin() + 1; } - if (ch < 'A') { - lastPunctuationBreak = j; + } while (it != text.begin()); + + it = text.end() - 1; + if (dbcsCodePage) { + // for UTF-8 go back the start of last character. + for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) { + --it; } } - lastEncodingAllowedBreak = j; + return it - text.begin(); + } - if (dbcsCodePage == CpUtf8) { - j += UTF8BytesOfLead[ch]; - } else if (dbcsCodePage) { - j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1; - } else { - j++; + { + // forward iterate for DBCS to find word and punctuation boundary. + size_t lastPunctuationBreak = 0; + size_t lastEncodingAllowedBreak = 0; + CharacterClass ccPrev = CharacterClass::space; + for (size_t j = 0; j < text.length();) { + const unsigned char ch = text[j]; + lastEncodingAllowedBreak = j++; + + CharacterClass cc = CharacterClass::word; + if (UTF8IsAscii(ch)) { + if (IsPunctuation(ch)) { + cc = CharacterClass::punctuation; + } + } else { + j += IsDBCSLeadByteNoExcept(ch); + } + if (cc != ccPrev) { + ccPrev = cc; + lastPunctuationBreak = lastEncodingAllowedBreak; + } } + return lastPunctuationBreak ? lastPunctuationBreak : lastEncodingAllowedBreak; } - if (lastSpaceBreak >= 0) { - return lastSpaceBreak; - } else if (lastPunctuationBreak >= 0) { - return lastPunctuationBreak; - } - return lastEncodingAllowedBreak; } EncodingFamily Document::CodePageFamily() const noexcept { diff --git a/src/Document.h b/src/Document.h index 897a1270c..e406118a7 100644 --- a/src/Document.h +++ b/src/Document.h @@ -352,7 +352,7 @@ public: bool IsDBCSTrailByteNoExcept(char ch) const noexcept; int DBCSDrawBytes(std::string_view text) const noexcept; bool IsDBCSDualByteAt(Sci::Position pos) const noexcept; - int SafeSegment(const char *text, int lengthSegment) const noexcept; + size_t SafeSegment(std::string_view text) const noexcept; EncodingFamily CodePageFamily() const noexcept; // Gateways to modifying document diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx index 6370edb33..c9f4e8793 100644 --- a/src/PositionCache.cxx +++ b/src/PositionCache.cxx @@ -755,21 +755,20 @@ TextSegment BreakFinder::Next() { } subBreak = prev; } + // Splitting up a long run from prev to nextBreak in lots of approximately lengthEachSubdivision. - // For very long runs add extra breaks after spaces or if no spaces before low punctuation. const int startSegment = subBreak; - if ((nextBreak - subBreak) <= lengthEachSubdivision) { - subBreak = -1; - return TextSegment(startSegment, nextBreak - startSegment); + const int remaining = nextBreak - startSegment; + int lengthSegment = remaining; + if (lengthSegment > lengthEachSubdivision) { + lengthSegment = static_cast<int>(pdoc->SafeSegment(std::string_view(&ll->chars[startSegment], lengthEachSubdivision))); + } + if (lengthSegment < remaining) { + subBreak += lengthSegment; } else { - subBreak += pdoc->SafeSegment(&ll->chars[subBreak], lengthEachSubdivision); - if (subBreak >= nextBreak) { - subBreak = -1; - return TextSegment(startSegment, nextBreak - startSegment); - } else { - return TextSegment(startSegment, subBreak - startSegment); - } + subBreak = -1; } + return TextSegment(startSegment, lengthSegment); } bool BreakFinder::More() const noexcept { |