aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--doc/ScintillaHistory.html3
-rw-r--r--src/Document.cxx101
-rw-r--r--src/UniConversion.h4
-rw-r--r--test/unit/testDocument.cxx56
4 files changed, 157 insertions, 7 deletions
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 782801b7a..ef9ad02d4 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -604,6 +604,9 @@
Serialize selection type and ranges with SCI_GETSELECTIONSERIALIZED and SCI_SETSELECTIONSERIALIZED.
</li>
<li>
+ Fix segmentation of long lexemes to avoid breaking before modifiers like accents that must be drawn with their base letters.
+ </li>
+ <li>
Fix bug on Qt where double-click stopped working when Scintilla instance had been running for weeks.
</li>
</ul>
diff --git a/src/Document.cxx b/src/Document.cxx
index 379a88786..dc82b1902 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1202,6 +1202,92 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
}
+namespace {
+
+// Remove any incomplete or invalid UTF-8 sequence from the end of text so that,
+// where possible, text ends after a whole character.
+// text is narrowed in place; the bytes it views are not modified.
+// Only the final sequence is examined: at most UTF8MaxBytes bytes are removed per call.
+void DiscardEndFragment(std::string_view &text) noexcept {
+	if (text.empty()) {
+		return;
+	}
+	if (UTF8IsFirstByte(text.back())) {
+		// Ending with start of character byte is invalid as its trail bytes are missing
+		text.remove_suffix(1);
+	} else if (UTF8IsTrailByte(text.back())) {
+		// Go back over trail bytes to the presumed start of the last character.
+		// count ends up including the lead byte so the limit is UTF8MaxBytes, and
+		// it must also never exceed text.length(): taking the larger of the two
+		// allowed a short text made only of trail bytes to index before its start.
+		const size_t maxCount = std::min<size_t>(UTF8MaxBytes, text.length());
+		size_t count = 1;
+		while (count < maxCount && UTF8IsTrailByte(text[text.length() - count])) {
+			count++;
+		}
+		// Keep the final sequence only when it forms a complete valid character
+		const std::string_view endPortion = text.substr(text.length() - count);
+		if (!UTF8IsValid(endPortion)) {
+			text.remove_suffix(count);
+		}
+	}
+}
+
+// Report whether a character with general category cc can be the base of a
+// combining character sequence: \p{L}\p{N}\p{P}\p{S}\p{Zs}
+constexpr bool IsBaseOfGrapheme(CharacterCategory cc) {
+	switch (cc) {
+	// Letters
+	case ccLu: case ccLl: case ccLt: case ccLm: case ccLo:
+	// Numbers
+	case ccNd: case ccNl: case ccNo:
+	// Punctuation
+	case ccPc: case ccPd: case ccPs: case ccPe: case ccPi: case ccPf: case ccPo:
+	// Symbols
+	case ccSm: case ccSc: case ccSk: case ccSo:
+	// Space separators
+	case ccZs:
+		return true;
+	default:
+		break;
+	}
+	// Everything else is not a base:
+	// marks ccMn, ccMc, ccMe,
+	// line and paragraph separators ccZl, ccZp,
+	// others ccCc, ccCf, ccCs, ccCo, ccCn
+	return false;
+}
+
+void DiscardLastCombinedCharacter(std::string_view &text) noexcept {
+	// Covers the simple common case of a base character followed by accents
+	// and similar marks: characters are dropped from the end of text up to
+	// and including the last base character.
+	//
+	// From Grapheme_Cluster_Boundaries
+	// combining character sequence = ccs-base? ccs-extend+
+	// ccs-base := [\p{L}\p{N}\p{P}\p{S}\p{Zs}]
+	// ccs-extend := [\p{M}\p{Join_Control}]
+
+	std::string_view remaining = text;
+	// Stop searching once the remaining text is short
+	while (remaining.length() > (UTF8MaxBytes * 2)) {
+		// For UTF-8 step back over trail bytes to the start of the last character.
+		size_t start = remaining.length() - 1;
+		for (int back = 0; back < UTF8MaxBytes - 1 && UTF8IsTrailByte(remaining[start]); back++) {
+			start--;
+		}
+		const std::string_view lastCharacter = remaining.substr(start);
+		const CharacterCategory category = CategoriseCharacter(UnicodeFromUTF8(lastCharacter));
+		remaining.remove_suffix(lastCharacter.length());
+		if (IsBaseOfGrapheme(category)) {
+			// Base found and already removed, so text now ends just before it
+			text = remaining;
+			return;
+		}
+	}
+	// No base character found so text is left unchanged
+}
+
+}
+
// Need to break text into segments near end but taking into account the
// encoding to not break inside a UTF-8 or DBCS character and also trying
// to avoid breaking inside a pair of combining characters, or inside
@@ -1215,7 +1301,8 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
// In preference order from best to worst:
// 1) Break before or after spaces or controls
// 2) Break at word and punctuation boundary for better kerning and ligature support
-// 3) Break after whole character, this may break combining characters
+// 3) Break before a base character (letter, number, punctuation, symbol or space) in UTF-8 to avoid breaking combining characters
+// 4) Break after whole character, this may break combining characters
size_t Document::SafeSegment(std::string_view text) const noexcept {
// check space first as most written language use spaces.
@@ -1236,14 +1323,14 @@ size_t Document::SafeSegment(std::string_view text) const noexcept {
}
} while (it != text.begin());
- it = text.end() - 1;
if (dbcsCodePage) {
- // for UTF-8 go back to the start of last character.
- for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
- --it;
- }
+ // UTF-8
+ DiscardEndFragment(text);
+ DiscardLastCombinedCharacter(text);
+ return text.length();
+ } else {
+ return text.length() - 1;
}
- return it - text.begin();
}
{
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 7a51b2d08..5990cca8c 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -49,6 +49,10 @@ constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
return (ch >= 0x80) && (ch < 0xc0);
}
+// True when ch lies in the inclusive range 0xC2 to 0xF4.
+constexpr bool UTF8IsFirstByte(unsigned char ch) noexcept {
+	constexpr unsigned char lowestFirst = 0xc2;
+	constexpr unsigned char highestFirst = 0xf4;
+	return !((ch < lowestFirst) || (ch > highestFirst));
+}
+
constexpr bool UTF8IsAscii(unsigned char ch) noexcept {
return ch < 0x80;
}
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index e4b674987..ad1384ee7 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -34,6 +34,7 @@
#include "Decoration.h"
#include "CaseFolder.h"
#include "Document.h"
+#include "UniConversion.h"
#include "catch.hpp"
@@ -957,6 +958,61 @@ TEST_CASE("SafeSegment") {
REQUIRE(text[length] == '\xf0');
}
+ SECTION("UTF-8 Character Fragments") {
+ // PositionCache breaks long texts into fixed length sub-strings that are passed to SafeSegment
+ // so the final character in the sub-string may be incomplete without all needed trail bytes.
+ // For UTF-8, SafeSegment first discards any final bytes that do not represent a valid character
+ // then discards the final whole character.
+
+ const DocPlus doc("", CpUtf8);
+
+ // break before last character after discarding incomplete last character: 0 trail bytes
+ std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2"; // Invalid text as ends with start byte
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xac');
+ REQUIRE(text[length] == '\xe8');
+ REQUIRE(UTF8IsValid(text.substr(0, length)));
+
+ // break before last character after discarding incomplete last character: 1 trail byte and 2 needed
+ text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe6\x97"; // Invalid text as ends with only 1 trail byte
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xac');
+ REQUIRE(text[length] == '\xe8');
+ REQUIRE(UTF8IsValid(text.substr(0, length)));
+ }
+
+ SECTION("UTF-8 Combining Characters") {
+ const DocPlus doc("", CpUtf8);
+
+ // There may be combining characters like accents and tone marks after the
+ // last letter in a sub-string and these may be included in the sub-string
+ // or follow it.
+ // Correct display requires that the combining characters are measured and
+ // drawn with the letter they follow. Thus the final letter and any
+ // following combining characters are discarded.
+
+ // A Thai text example with 8 characters, each taking 3 bytes:
+ // HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU, LO LING
+ // Most are letters (Lo) but 2 characters are modifiers (Mn):
+ // MAI THO is a tone mark and SARA UU is a vowel.
+ const std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9\xe0\xb8\xa5";
+ REQUIRE(text.length() == 8 * 3);
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(length == (8 - 1) * 3); // Discard last character
+
+ // Remove last character (letter LO LING) then run again.
+ // Should skip past SARA UU combining vowel mark to discard letter MO MA and SARA UU.
+ const std::string_view textWithoutLoLing = text.substr(0, length);
+ length = doc.document.SafeSegment(textWithoutLoLing);
+ REQUIRE(length == (8 - 3) * 3); // Discard 2 characters
+
+ // Remove last character SARA UU combining vowel mark then run again
+ // Final letter may have following combining mark so discard producing same text as previous step.
+ const std::string_view textWithoutSaraUu = text.substr(0, (8 - 2) * 3);
+ length = doc.document.SafeSegment(textWithoutSaraUu);
+ REQUIRE(length == (8 - 3) * 3); // Discard 1 character
+ }
+
SECTION("DBCS Shift-JIS") {
const DocPlus doc("", 932);
// word and punctuation boundary in middle of text: single byte