aboutsummaryrefslogtreecommitdiffhomepage
path: root/test
diff options
context:
space:
mode:
Diffstat (limited to 'test')
-rw-r--r--test/unit/testDocument.cxx56
1 files changed, 56 insertions, 0 deletions
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index e4b674987..ad1384ee7 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -34,6 +34,7 @@
#include "Decoration.h"
#include "CaseFolder.h"
#include "Document.h"
+#include "UniConversion.h"
#include "catch.hpp"
@@ -957,6 +958,61 @@ TEST_CASE("SafeSegment") {
REQUIRE(text[length] == '\xf0');
}
+ SECTION("UTF-8 Character Fragments") {
+ // PositionCache breaks long texts into fixed length sub-strings that are passed to SafeSegment
+ // so the final character in the sub-string may be incomplete without all needed trail bytes.
+ // For UTF-8, SafeSegment first discards any final bytes that do not represent a valid character
+ // then discards the final whole character.
+ // Both cases below use "Japanese" followed by three 3-byte characters (U+65E5, U+672C, U+8A9E)
+ // and a truncated character, and expect the break to fall between the second and third of them.
+
+ const DocPlus doc("", CpUtf8);
+
+ // Case 1: break before last whole character after discarding incomplete last character: 0 trail bytes present and 1 needed
+ std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2"; // Invalid text as it ends with a lead byte (0xC2) that has no trail byte
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xac'); // Last byte kept is the final byte of the second 3-byte character
+ REQUIRE(text[length] == '\xe8'); // First byte dropped is the lead byte of the third 3-byte character
+ REQUIRE(UTF8IsValid(text.substr(0, length))); // The retained prefix must pass UTF-8 validation
+
+ // Case 2: break before last whole character after discarding incomplete last character: 1 trail byte present and 2 needed
+ text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe6\x97"; // Invalid text as it ends with a 3-byte lead (0xE6) and only 1 of its 2 trail bytes
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xac'); // Same break position as case 1: after the second 3-byte character
+ REQUIRE(text[length] == '\xe8'); // The third 3-byte character is discarded along with the fragment
+ REQUIRE(UTF8IsValid(text.substr(0, length))); // The retained prefix must pass UTF-8 validation
+ }
+
+ SECTION("UTF-8 Combining Characters") {
+ const DocPlus doc("", CpUtf8);
+
+ // There may be combining characters like accents and tone marks after the
+ // last letter in a sub-string and these may be included in the sub-string
+ // or follow it.
+ // Correct display requires that the combining characters are measured and
+ // drawn with the letter they follow. Thus the final letter and any
+ // following combining characters are discarded.
+
+ // A Thai text example with 8 characters, each taking 3 bytes in UTF-8:
+ // HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU, LO LING
+ // Most are letters (Lo) but 2 characters are nonspacing marks (Mn):
+ // MAI THO is a tone mark and SARA UU is a vowel sign.
+ const std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9\xe0\xb8\xa5";
+ REQUIRE(text.length() == 8 * 3); // Sanity check on the literal: 8 characters of 3 bytes each
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(length == (8 - 1) * 3); // Discard last character (letter LO LING)
+
+ // Remove last character (letter LO LING) then run again on the 7 remaining characters.
+ // Should skip past SARA UU combining vowel mark to discard letter MO MA and SARA UU.
+ const std::string_view textWithoutLoLing = text.substr(0, length);
+ length = doc.document.SafeSegment(textWithoutLoLing);
+ REQUIRE(length == (8 - 3) * 3); // Discard 2 characters: MO MA and SARA UU
+
+ // Remove last character SARA UU combining vowel mark then run again on the first 6 characters.
+ // Final letter may have following combining mark so discard producing same text as previous step.
+ const std::string_view textWithoutSaraUu = text.substr(0, (8 - 2) * 3); // Ends with letter MO MA
+ length = doc.document.SafeSegment(textWithoutSaraUu);
+ REQUIRE(length == (8 - 3) * 3); // Discard 1 character: MO MA
+ }
+
SECTION("DBCS Shift-JIS") {
const DocPlus doc("", 932);
// word and punctuation boundary in middle of text: single byte