aboutsummaryrefslogtreecommitdiffhomepage
path: root/test/unit/testDocument.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'test/unit/testDocument.cxx')
-rw-r--r--test/unit/testDocument.cxx117
1 files changed, 117 insertions, 0 deletions
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index 41e3907ae..4a7e20095 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -486,3 +486,120 @@ TEST_CASE("Words") {
REQUIRE(!docEndSpace.document.IsWordAt(3, 5));
}
}
+
+TEST_CASE("SafeSegment") {
+ SECTION("Short") {
+ const DocPlus doc("", 0);
+ // all encoding: break before or after last space
+ const std::string_view text = "12 ";
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(length <= text.length());
+ REQUIRE(text[length - 1] == '2');
+ REQUIRE(text[length] == ' ');
+ }
+
+ SECTION("ASCII") {
+ const DocPlus doc("", 0);
+ // all encoding: break before or after last space
+ std::string_view text = "12 3 \t45";
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == ' ');
+ REQUIRE(text[length] == '\t');
+
+ // UTF-8 and ASCII: word and punctuation boundary in middle of text
+ text = "(IsBreakSpace(text[j]))";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'j');
+ REQUIRE(text[length] == ']');
+
+ // UTF-8 and ASCII: word and punctuation boundary near start of text
+ text = "(IsBreakSpace";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '(');
+ REQUIRE(text[length] == 'I');
+
+ // UTF-8 and ASCII: word and punctuation boundary near end of text
+ text = "IsBreakSpace)";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'e');
+ REQUIRE(text[length] == ')');
+
+ // break before last character
+ text = "JapaneseJa";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'J');
+ REQUIRE(text[length] == 'a');
+ }
+
+ SECTION("UTF-8") {
+ const DocPlus doc("", CpUtf8);
+ // break before last character: no trail byte
+ std::string_view text = "JapaneseJa";
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'J');
+ REQUIRE(text[length] == 'a');
+
+ // break before last character: 1 trail byte
+ text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2\xa9";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\x9e');
+ REQUIRE(text[length] == '\xc2');
+
+ // break before last character: 2 trail bytes
+ text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xac');
+ REQUIRE(text[length] == '\xe8');
+
+ // break before last character: 3 trail bytes
+ text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xf0\x9f\x98\x8a";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\x9e');
+ REQUIRE(text[length] == '\xf0');
+ }
+
+ SECTION("DBCS Shift-JIS") {
+ const DocPlus doc("", 932);
+ // word and punctuation boundary in middle of text: single byte
+ std::string_view text = "(IsBreakSpace(text[j]))";
+ size_t length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'j');
+ REQUIRE(text[length] == ']');
+
+ // word and punctuation boundary in middle of text: double byte
+ text = "(IsBreakSpace(text[\x8c\xea]))";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xea');
+ REQUIRE(text[length] == ']');
+
+ // word and punctuation boundary near start of text
+ text = "(IsBreakSpace";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '(');
+ REQUIRE(text[length] == 'I');
+
+ // word and punctuation boundary near end of text: single byte
+ text = "IsBreakSpace)";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'e');
+ REQUIRE(text[length] == ')');
+
+ // word and punctuation boundary near end of text: double byte
+ text = "IsBreakSpace\x8c\xea)";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\xea');
+ REQUIRE(text[length] == ')');
+
+ // break before last character: single byte
+ text = "JapaneseJa";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == 'J');
+ REQUIRE(text[length] == 'a');
+
+ // break before last character: double byte
+ text = "Japanese\x93\xfa\x96\x7b\x8c\xea";
+ length = doc.document.SafeSegment(text);
+ REQUIRE(text[length - 1] == '\x7b');
+ REQUIRE(text[length] == '\x8c');
+ }
+}