diff options
author | Neil <nyamatongwe@gmail.com> | 2025-02-14 11:48:31 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2025-02-14 11:48:31 +1100 |
commit | 10e12f5d8d45ce1dadc97a09c11f9e0f340a79d3 (patch) | |
tree | 6b9572523c2cb7ab28a87094d7fc68efca13a5ec /src | |
parent | 33b4899b327695a66e3dd2ef056516a45a253878 (diff) | |
download | scintilla-mirror-10e12f5d8d45ce1dadc97a09c11f9e0f340a79d3.tar.gz |
Implement LastCharacter to return the last character or character fragment in a
potentially invalid UTF-8 string. Use this in DiscardLastCombinedCharacter.
Place DiscardLastCombinedCharacter in Scintilla::Internal namespace for use in
text wrap.
Diffstat (limited to 'src')
-rw-r--r-- | src/Document.cxx | 29 | ||||
-rw-r--r-- | src/Document.h | 2 |
2 files changed, 27 insertions, 4 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index d7497b825..0b4316bbc 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1257,7 +1257,25 @@ constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
 	}
 }
 
-bool DiscardLastCombinedCharacter(std::string_view &text) noexcept {
+CharacterExtracted LastCharacter(std::string_view text) noexcept {
+	if (text.empty())
+		return { unicodeReplacementChar, 1 };
+	const size_t length = text.length();
+	size_t trail = length;
+	while ((trail > 0) && (length - trail < UTF8MaxBytes) && UTF8IsTrailByte(text[trail - 1]))
+		trail--;
+	const size_t start = (trail > 0) ? trail - 1 : trail;
+	const int utf8status = UTF8Classify(text.substr(start));
+	if (utf8status & UTF8MaskInvalid) {
+		return { unicodeReplacementChar, 1 };
+	}
+	return { static_cast<unsigned int>(UnicodeFromUTF8(text.substr(start))),
+		static_cast<unsigned int>(utf8status & UTF8MaskWidth) };
+}
+
+}
+
+bool Scintilla::Internal::DiscardLastCombinedCharacter(std::string_view &text) noexcept {
 	// Handle the simple common case where a base character may be followed by
 	// accents and similar marks by discarding until start of base character.
 	//
@@ -1268,7 +1286,8 @@ bool DiscardLastCombinedCharacter(std::string_view &text) noexcept {
 	// Modified to move Sk (Symbol Modifier) from ccs-base to ccs-extend to preserve modified emoji
 
 	std::string_view truncated = text;
-	while (truncated.length() > (UTF8MaxBytes * 2)) {
+	while (truncated.length() > UTF8MaxBytes) {
+		/*
 		// Give up when short
 		std::string_view::iterator it = truncated.end() - 1;
 		// For UTF-8 go back to the start of last character.
@@ -1279,6 +1298,10 @@ bool DiscardLastCombinedCharacter(std::string_view &text) noexcept {
 		const std::string_view svLastCharacter = truncated.substr(truncated.length() - countBytes);
 		const CharacterCategory cc = CategoriseCharacter(UnicodeFromUTF8(svLastCharacter));
 		truncated.remove_suffix(countBytes);
+		*/
+		const CharacterExtracted ce = LastCharacter(truncated);
+		const CharacterCategory cc = CategoriseCharacter(static_cast<int>(ce.character));
+		truncated.remove_suffix(ce.widthBytes);
 		if (IsBaseOfGrapheme(cc)) {
 			text = truncated;
 			return true;
@@ -1288,8 +1311,6 @@ bool DiscardLastCombinedCharacter(std::string_view &text) noexcept {
 	return false;
 }
 
-}
-
 // Need to break text into segments near end but taking into account the
 // encoding to not break inside a UTF-8 or DBCS character and also trying
 // to avoid breaking inside a pair of combining characters, or inside
diff --git a/src/Document.h b/src/Document.h
index 4e18c42d1..4c6832d89 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -273,6 +273,8 @@ struct CharacterExtracted {
 	}
 };
 
+bool DiscardLastCombinedCharacter(std::string_view &text) noexcept;
+
 /**
  */
 class Document : PerLine, public Scintilla::IDocument, public Scintilla::ILoader, public Scintilla::IDocumentEditable {