diff options
Diffstat (limited to 'test')
-rw-r--r-- | test/unit/testDocument.cxx | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx index e4b674987..ad1384ee7 100644 --- a/test/unit/testDocument.cxx +++ b/test/unit/testDocument.cxx @@ -34,6 +34,7 @@ #include "Decoration.h" #include "CaseFolder.h" #include "Document.h" +#include "UniConversion.h" #include "catch.hpp" @@ -957,6 +958,61 @@ TEST_CASE("SafeSegment") { REQUIRE(text[length] == '\xf0'); } + SECTION("UTF-8 Character Fragments") { + // PositionCache breaks long texts into fixed length sub-strings that are passed to SafeSegment + // so the final character in the sub-string may be incomplete without all needed trail bytes. + // For UTF-8, SafeSegment first discards any final bytes that do not represent a valid character + // then discards the final whole character. + + const DocPlus doc("", CpUtf8); + + // break before last character after discarding incomplete last character: 0 trail byte + std::string_view text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2"; // Invalid text as ends with start byte + size_t length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\xac'); + REQUIRE(text[length] == '\xe8'); + REQUIRE(UTF8IsValid(text.substr(0, length))); + + // break before last character after discarding incomplete last character: 1 trail byte and 2 needed + text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe6\x97"; // Invalid text as ends with only 1 trail byte + length = doc.document.SafeSegment(text); + REQUIRE(text[length - 1] == '\xac'); + REQUIRE(text[length] == '\xe8'); + REQUIRE(UTF8IsValid(text.substr(0, length))); + } + + SECTION("UTF-8 Combining Characters") { + const DocPlus doc("", CpUtf8); + + // There may be combining characters like accents and tone marks after the + // last letter in a sub-string and these may be included in the sub-string + // or follow it. + // Correct display requires that the combining characters are measured and + // drawn with the letter they follow. Thus the final letter and any + // following combining characters are discarded. + + // A Thai text example with 8 characters, each taking 3 bytes: + // HO HIP, SARA AA, KHO KHAI, MAI THO, O ANG, MO MA, SARA UU, LO LING + // Most are letters (Lo) but 2 characters are modifiers (Mn): + // MAI THO is a tone mark and SARA UU is a vowel. + const std::string_view text = "\xe0\xb8\xab\xe0\xb8\xb2\xe0\xb8\x82\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\xa1\xe0\xb8\xb9\xe0\xb8\xa5"; + REQUIRE(text.length() == 8 * 3); + size_t length = doc.document.SafeSegment(text); + REQUIRE(length == (8 - 1) * 3); // Discard last character + + // Remove last character (letter LO LING) then run again. + // Should skip past SARA UU combining vowel mark to discard letter MO MA and SARA UU. + const std::string_view textWithoutLoLing = text.substr(0, length); + length = doc.document.SafeSegment(textWithoutLoLing); + REQUIRE(length == (8 - 3) * 3); // Discard 2 characters + + // Remove last character SARA UU combining vowel mark then run again + // Final letter may have following combining mark so discard producing same text as previous step. + const std::string_view textWithoutSaraUu = text.substr(0, (8 - 2) * 3); + length = doc.document.SafeSegment(textWithoutSaraUu); + REQUIRE(length == (8 - 3) * 3); // Discard 1 character + } + SECTION("DBCS Shift-JIS") { const DocPlus doc("", 932); // word and punctuation boundary in middle of text: single byte |