Feature [feature-requests:#1408] Check both bytes of potential DBCS character

before treating as a character.
author: Zufu Liu <unknown> 2021-07-05 16:18:13 +1000
committer: Zufu Liu <unknown> 2021-07-05 16:18:13 +1000
commit: 289314060dd7a44f9844cfc891d1c1d823742f94 (patch)
tree: a414ccc8b1136ea24f0d0e6d60bb5cc2f89d6fac
parent: c9d4cf4a91ae137870804d244945cb90c1dd37f4 (diff)
download: scintilla-mirror-289314060dd7a44f9844cfc891d1c1d823742f94.tar.gz
3 files changed, 70 insertions, 12 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index e5022ad64..6dc14238f 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -674,7 +674,7 @@ int Document::LenChar(Sci::Position pos) const noexcept {
 			return utf8status & UTF8MaskWidth;
 		}
 	} else {
-		if (IsDBCSLeadByteNoExcept(leadByte) && ((pos + 1) < LengthNoExcept())) {
+		if (IsDBCSLeadByteNoExcept(leadByte) && IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1))) {
 			return 2;
 		} else {
 			return 1;
@@ -709,7 +709,7 @@ bool Document::InGoodUTF8(Sci::Position pos, Sci::Position &start, Sci::Position
 	}
 }
 
-// Normalise a position so that it is not halfway through a two byte character.
+// Normalise a position so that it is not part way through a multi-byte character.
 // This can occur in two situations -
 // When lines are terminated with \r\n pairs which should be treated as one character.
 // When displaying DBCS text such as Japanese.
@@ -760,7 +760,7 @@ Sci::Position Document::MovePositionOutsideChar(Sci::Position pos, Sci::Position
 
 			// Check from known start of character.
 			while (posCheck < pos) {
-				const int mbsize = IsDBCSLeadByteNoExcept(cb.CharAt(posCheck)) ? 2 : 1;
+				const int mbsize = IsDBCSDualByteAt(posCheck) ? 2 : 1;
 				if (posCheck + mbsize == pos) {
 					return pos;
 				} else if (posCheck + mbsize > pos) {
@@ -825,7 +825,7 @@ Sci::Position Document::NextPosition(Sci::Position pos, int moveDir) const noexc
 			}
 		} else {
 			if (moveDir > 0) {
-				const int mbsize = IsDBCSLeadByteNoExcept(cb.CharAt(pos)) ? 2 : 1;
+				const int mbsize = IsDBCSDualByteAt(pos) ? 2 : 1;
 				pos += mbsize;
 				if (pos > cb.Length())
 					pos = cb.Length();
@@ -1098,6 +1098,11 @@ int Document::DBCSDrawBytes(std::string_view text) const noexcept {
 	}
 }
 
+bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
+	return IsDBCSLeadByteNoExcept(cb.CharAt(pos))
+		&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));
+}
+
 static constexpr bool IsSpaceOrTab(int ch) noexcept {
 	return ch == ' ' || ch == '\t';
 }
diff --git a/src/Document.h b/src/Document.h
index c40ce2a44..fe27f4936 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -330,8 +330,8 @@ public:
 	bool SCI_METHOD IsDBCSLeadByte(char ch) const override;
 	bool IsDBCSLeadByteNoExcept(char ch) const noexcept;
 	bool IsDBCSTrailByteNoExcept(char ch) const noexcept;
-	bool IsDBCSLeadByteInvalid(char ch) const noexcept;
 	int DBCSDrawBytes(std::string_view text) const noexcept;
+	bool IsDBCSDualByteAt(Sci::Position pos) const noexcept;
 	int SafeSegment(const char *text, int length, int lengthSegment) const noexcept;
 	EncodingFamily CodePageFamily() const noexcept;
 
diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx
index e07f99997..c91868165 100644
--- a/test/unit/testDocument.cxx
+++ b/test/unit/testDocument.cxx
@@ -316,25 +316,78 @@ TEST_CASE("Document") {
 		// Can not test case mapping of double byte text as folder available here does not implement this 
 	}
 
-	SECTION("GetCharacterAndWidth") {
+	SECTION("GetCharacterAndWidth DBCS") {
 		Document doc(DocumentOption::Default);
 		doc.SetDBCSCodePage(932);
 		REQUIRE(doc.CodePage() == 932);
-		const Sci::Position length = doc.InsertString(0, "\x84\xff=", 3);
-		REQUIRE(3 == length);
-		REQUIRE(3 == doc.Length());
+		const Sci::Position length = doc.InsertString(0, "H\x84\xff\x84H", 5);
+		// This text is invalid in code page 932.
+		// A reasonable interpretation is as 4 items: 2 characters and 2 character fragments
+		// The last item is a 2-byte CYRILLIC CAPITAL LETTER ZE character 
+		// H [84] [FF] ZE
+		REQUIRE(5 == length);
+		REQUIRE(5 == doc.Length());
 		Sci::Position width = 0;
+		// test GetCharacterAndWidth()
 		int ch = doc.GetCharacterAndWidth(0, &width);
 		REQUIRE(width == 1);
+		REQUIRE(ch == 'H');
+		ch = doc.GetCharacterAndWidth(1, &width);
+		REQUIRE(width == 1);
 		REQUIRE(ch == 0x84);
 		width = 0;
-		ch = doc.GetCharacterAndWidth(1, &width);
+		ch = doc.GetCharacterAndWidth(2, &width);
 		REQUIRE(width == 1);
 		REQUIRE(ch == 0xff);
 		width = 0;
-		ch = doc.GetCharacterAndWidth(2, &width);
+		ch = doc.GetCharacterAndWidth(3, &width);
+		REQUIRE(width == 2);
+		REQUIRE(ch == 0x8448);
+		// test LenChar()
+		width = doc.LenChar(0);
+		REQUIRE(width == 1);
+		width = doc.LenChar(1);
+		REQUIRE(width == 1);
+		width = doc.LenChar(2);
 		REQUIRE(width == 1);
-		REQUIRE(ch == '=');
+		width = doc.LenChar(3);
+		REQUIRE(width == 2);
+		// test MovePositionOutsideChar()
+		Sci::Position pos = doc.MovePositionOutsideChar(1, 1);
+		REQUIRE(pos == 1);
+		pos = doc.MovePositionOutsideChar(2, 1);
+		REQUIRE(pos == 2);
+		pos = doc.MovePositionOutsideChar(3, 1);
+		REQUIRE(pos == 3);
+		pos = doc.MovePositionOutsideChar(4, 1);
+		REQUIRE(pos == 5);
+		pos = doc.MovePositionOutsideChar(1, -1);
+		REQUIRE(pos == 1);
+		pos = doc.MovePositionOutsideChar(2, -1);
+		REQUIRE(pos == 2);
+		pos = doc.MovePositionOutsideChar(3, -1);
+		REQUIRE(pos == 3);
+		pos = doc.MovePositionOutsideChar(4, -1);
+		REQUIRE(pos == 3);
+		// test NextPosition()
+		pos = doc.NextPosition(0, 1);
+		REQUIRE(pos == 1);
+		pos = doc.NextPosition(1, 1);
+		REQUIRE(pos == 2);
+		pos = doc.NextPosition(2, 1);
+		REQUIRE(pos == 3);
+		pos = doc.NextPosition(3, 1);
+		REQUIRE(pos == 5);
+		pos = doc.NextPosition(1, -1);
+		REQUIRE(pos == 0);
+		// The next two tests are commented out because the implementation of NextPosition
+		// cannot yet handle character fragments correctly when moving backwards.
+		//pos = doc.NextPosition(2, -1);
+		//REQUIRE(pos == 1);
+		//pos = doc.NextPosition(3, -1);
+		//REQUIRE(pos == 2);
+		pos = doc.NextPosition(5, -1);
+		REQUIRE(pos == 3);
 	}
 
 }
author	Zufu Liu <unknown>	2021-07-05 16:18:13 +1000
committer	Zufu Liu <unknown>	2021-07-05 16:18:13 +1000
commit	289314060dd7a44f9844cfc891d1c1d823742f94 (patch)
tree	a414ccc8b1136ea24f0d0e6d60bb5cc2f89d6fac
parent	c9d4cf4a91ae137870804d244945cb90c1dd37f4 (diff)
download	scintilla-mirror-289314060dd7a44f9844cfc891d1c1d823742f94.tar.gz