diff options
author | Zufu Liu <unknown> | 2021-10-21 22:15:57 +1100 |
---|---|---|
committer | Zufu Liu <unknown> | 2021-10-21 22:15:57 +1100 |
commit | 9975609bf3b39f0e1cd121995ac49aea30a6c48f (patch) | |
tree | 339887d2052a909480b4e3b4df12f318bbec2be8 | |
parent | a989b1ed63c7cf81c693da8f2f66ab5e29ee341a (diff) | |
download | scintilla-mirror-9975609bf3b39f0e1cd121995ac49aea30a6c48f.tar.gz |
Feature [feature-requests:#1417] Use backward iteration to find space / control
character and text / punctuation boundaries in SafeSegment, as this will be simpler
and faster in almost all cases.
Simplify the call to SafeSegment in BreakFinder::Next.
-rw-r--r-- | src/CharacterType.h | 7 | ||||
-rw-r--r-- | src/Document.cxx | 89 | ||||
-rw-r--r-- | src/Document.h | 2 | ||||
-rw-r--r-- | src/PositionCache.cxx | 21 | ||||
-rw-r--r-- | test/unit/testDocument.cxx | 117 |
5 files changed, 193 insertions, 43 deletions
diff --git a/src/CharacterType.h b/src/CharacterType.h index b014f1050..437fb8c5c 100644 --- a/src/CharacterType.h +++ b/src/CharacterType.h @@ -32,6 +32,13 @@ constexpr bool IsEOLCharacter(int ch) noexcept { return ch == '\r' || ch == '\n'; } +constexpr bool IsBreakSpace(int ch) noexcept { + // used for text breaking, treat C0 control character as space. + // by default C0 control character is handled as special representation, + // so not appears in normal text. 0x7F DEL is omitted to simplify the code. + return ch >= 0 && ch <= ' '; +} + constexpr bool IsADigit(int ch) noexcept { return (ch >= '0') && (ch <= '9'); } diff --git a/src/Document.cxx b/src/Document.cxx index 3ebd357df..0d8b00d09 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1127,47 +1127,74 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept { && IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1)); } -// Need to break text into segments near lengthSegment but taking into -// account the encoding to not break inside a UTF-8 or DBCS character -// and also trying to avoid breaking inside a pair of combining characters. +// Need to break text into segments near end but taking into account the +// encoding to not break inside a UTF-8 or DBCS character and also trying +// to avoid breaking inside a pair of combining characters, or inside +// ligatures. +// TODO: implement grapheme cluster boundaries, +// see https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries. +// // The segment length must always be long enough (more than 4 bytes) // so that there will be at least one whole character to make a segment. // For UTF-8, text must consist only of valid whole characters. 
// In preference order from best to worst: -// 1) Break after space -// 2) Break before punctuation -// 3) Break after whole character - -int Document::SafeSegment(const char *text, int lengthSegment) const noexcept { - int lastSpaceBreak = -1; - int lastPunctuationBreak = -1; - int lastEncodingAllowedBreak = 0; - for (int j=0; j < lengthSegment;) { - const unsigned char ch = text[j]; - if (j > 0) { - if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) { - lastSpaceBreak = j; +// 1) Break before or after spaces or controls +// 2) Break at word and punctuation boundary for better kerning and ligature support +// 3) Break after whole character, this may break combining characters + +size_t Document::SafeSegment(std::string_view text) const noexcept { + // check space first as most written language use spaces. + for (std::string_view::iterator it = text.end() - 1; it != text.begin(); --it) { + if (IsBreakSpace(*it)) { + return it - text.begin(); + } + } + + if (!dbcsCodePage || dbcsCodePage == CpUtf8) { + // backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary. + std::string_view::iterator it = text.end() - 1; + const bool punctuation = IsPunctuation(*it); + do { + --it; + if (punctuation != IsPunctuation(*it)) { + return it - text.begin() + 1; } - if (ch < 'A') { - lastPunctuationBreak = j; + } while (it != text.begin()); + + it = text.end() - 1; + if (dbcsCodePage) { + // for UTF-8 go back the start of last character. + for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) { + --it; } } - lastEncodingAllowedBreak = j; + return it - text.begin(); + } - if (dbcsCodePage == CpUtf8) { - j += UTF8BytesOfLead[ch]; - } else if (dbcsCodePage) { - j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1; - } else { - j++; + { + // forward iterate for DBCS to find word and punctuation boundary. 
+ size_t lastPunctuationBreak = 0; + size_t lastEncodingAllowedBreak = 0; + CharacterClass ccPrev = CharacterClass::space; + for (size_t j = 0; j < text.length();) { + const unsigned char ch = text[j]; + lastEncodingAllowedBreak = j++; + + CharacterClass cc = CharacterClass::word; + if (UTF8IsAscii(ch)) { + if (IsPunctuation(ch)) { + cc = CharacterClass::punctuation; + } + } else { + j += IsDBCSLeadByteNoExcept(ch); + } + if (cc != ccPrev) { + ccPrev = cc; + lastPunctuationBreak = lastEncodingAllowedBreak; + } } + return lastPunctuationBreak ? lastPunctuationBreak : lastEncodingAllowedBreak; } - if (lastSpaceBreak >= 0) { - return lastSpaceBreak; - } else if (lastPunctuationBreak >= 0) { - return lastPunctuationBreak; - } - return lastEncodingAllowedBreak; } EncodingFamily Document::CodePageFamily() const noexcept { diff --git a/src/Document.h b/src/Document.h index 897a1270c..e406118a7 100644 --- a/src/Document.h +++ b/src/Document.h @@ -352,7 +352,7 @@ public: bool IsDBCSTrailByteNoExcept(char ch) const noexcept; int DBCSDrawBytes(std::string_view text) const noexcept; bool IsDBCSDualByteAt(Sci::Position pos) const noexcept; - int SafeSegment(const char *text, int lengthSegment) const noexcept; + size_t SafeSegment(std::string_view text) const noexcept; EncodingFamily CodePageFamily() const noexcept; // Gateways to modifying document diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx index 6370edb33..c9f4e8793 100644 --- a/src/PositionCache.cxx +++ b/src/PositionCache.cxx @@ -755,21 +755,20 @@ TextSegment BreakFinder::Next() { } subBreak = prev; } + // Splitting up a long run from prev to nextBreak in lots of approximately lengthEachSubdivision. - // For very long runs add extra breaks after spaces or if no spaces before low punctuation. 
const int startSegment = subBreak; - if ((nextBreak - subBreak) <= lengthEachSubdivision) { - subBreak = -1; - return TextSegment(startSegment, nextBreak - startSegment); + const int remaining = nextBreak - startSegment; + int lengthSegment = remaining; + if (lengthSegment > lengthEachSubdivision) { + lengthSegment = static_cast<int>(pdoc->SafeSegment(std::string_view(&ll->chars[startSegment], lengthEachSubdivision))); + } + if (lengthSegment < remaining) { + subBreak += lengthSegment; } else { - subBreak += pdoc->SafeSegment(&ll->chars[subBreak], lengthEachSubdivision); - if (subBreak >= nextBreak) { - subBreak = -1; - return TextSegment(startSegment, nextBreak - startSegment); - } else { - return TextSegment(startSegment, subBreak - startSegment); - } + subBreak = -1; } + return TextSegment(startSegment, lengthSegment); } bool BreakFinder::More() const noexcept { diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx index 41e3907ae..4a7e20095 100644 --- a/test/unit/testDocument.cxx +++ b/test/unit/testDocument.cxx @@ -486,3 +486,120 @@ TEST_CASE("Words") { REQUIRE(!docEndSpace.document.IsWordAt(3, 5)); } } + +TEST_CASE("SafeSegment") { + SECTION("Short") { + const DocPlus doc("", 0); + // all encoding: break before or after last space + const std::string_view text = "12 "; + size_t length = doc.document.SafeSegment(text); + REQUIRE(length <= text.length()); + REQUIRE(text[length - 1] == '2'); + REQUIRE(text[length] == ' '); + } + + SECTION("ASCII") { + const DocPlus doc("", 0); + // all encoding: break before or after last space + std::string_view text = "12 3 \t45"; + size_t length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == ' '); + REQUIRE(text[length] == '\t'); + + // UTF-8 and ASCII: word and punctuation boundary in middle of text + text = "(IsBreakSpace(text[j]))"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == 'j'); + REQUIRE(text[length] == ']'); + + // UTF-8 and ASCII: word and punctuation 
boundary near start of text + text = "(IsBreakSpace"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '('); + REQUIRE(text[length] == 'I'); + + // UTF-8 and ASCII: word and punctuation boundary near end of text + text = "IsBreakSpace)"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == 'e'); + REQUIRE(text[length] == ')'); + + // break before last character + text = "JapaneseJa"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == 'J'); + REQUIRE(text[length] == 'a'); + } + + SECTION("UTF-8") { + const DocPlus doc("", CpUtf8); + // break before last character: no trail byte + std::string_view text = "JapaneseJa"; + size_t length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == 'J'); + REQUIRE(text[length] == 'a'); + + // break before last character: 1 trail byte + text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2\xa9"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\x9e'); + REQUIRE(text[length] == '\xc2'); + + // break before last character: 2 trail bytes + text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\xac'); + REQUIRE(text[length] == '\xe8'); + + // break before last character: 3 trail bytes + text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xf0\x9f\x98\x8a"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\x9e'); + REQUIRE(text[length] == '\xf0'); + } + + SECTION("DBCS Shift-JIS") { + const DocPlus doc("", 932); + // word and punctuation boundary in middle of text: single byte + std::string_view text = "(IsBreakSpace(text[j]))"; + size_t length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == 'j'); + REQUIRE(text[length] == ']'); + + // word and punctuation boundary in middle of text: double byte + text = "(IsBreakSpace(text[\x8c\xea]))"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\xea'); 
+ REQUIRE(text[length] == ']'); + + // word and punctuation boundary near start of text + text = "(IsBreakSpace"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '('); + REQUIRE(text[length] == 'I'); + + // word and punctuation boundary near end of text: single byte + text = "IsBreakSpace)"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == 'e'); + REQUIRE(text[length] == ')'); + + // word and punctuation boundary near end of text: double byte + text = "IsBreakSpace\x8c\xea)"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\xea'); + REQUIRE(text[length] == ')'); + + // break before last character: single byte + text = "JapaneseJa"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == 'J'); + REQUIRE(text[length] == 'a'); + + // break before last character: double byte + text = "Japanese\x93\xfa\x96\x7b\x8c\xea"; + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\x7b'); + REQUIRE(text[length] == '\x8c'); + } +} |