Backport: Draw invalid bytes in DBCS encodings, when detected, as blobs in a similar way to UTF-8.

Backport of changeset 6962:514fde42ccbf, but without std::string_view.
author: mitchell <unknown> 2018-05-25 17:21:46 -0400
committer: mitchell <unknown> 2018-05-25 17:21:46 -0400
commit: 8bda8cce4b5daf0bc785401a887331182e4f2b74 (patch)
tree: c7adba12e4815cdf34e3803aca4b85178582834a
parent: 96cf9078786ae2e4a8aaf56468e6d069e73ed9c0 (diff)
download: scintilla-mirror-8bda8cce4b5daf0bc785401a887331182e4f2b74.tar.gz
5 files changed, 107 insertions, 1 deletions
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 2d6fdd34a..621cb2c8a 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -555,6 +555,10 @@
 	Indicators are drawn for line end characters when displayed.
 	</li>
 	<li>
+	Most invalid bytes in DBCS encodings are displayed as blobs to make problems clear
+	and ensure something is shown.
+	</li>
+	<li>
 	EDIFACT lexer adds property lexer.edifact.highlight.un.all to highlight all UN* segments.
 	<a href="https://sourceforge.net/p/scintilla/feature-requests/1166/">Feature #1166.</a>
 	</li>
diff --git a/src/Document.cxx b/src/Document.cxx
index 312aa5c02..b7dc4c8d1 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -972,6 +972,93 @@ bool Document::IsDBCSLeadByteNoExcept(char ch) const noexcept {
 	return false;
 }
 
+bool Document::IsDBCSLeadByteInvalid(char ch) const noexcept {
+	const unsigned char lead = ch;
+	switch (dbcsCodePage) {
+	case 932:
+		// Shift_JIS
+		return
+			(lead == 0x85) ||
+			(lead == 0x86) ||
+			(lead == 0xEB) ||
+			(lead == 0xEC) ||
+			(lead == 0xEF) ||
+			(lead == 0xFA) ||
+			(lead == 0xFB) ||
+			(lead == 0xFC);
+	case 936:
+		// GBK
+		return (lead == 0x80) || (lead == 0xFF);
+	case 949:
+		// Korean Wansung KS C-5601-1987
+		return (lead == 0x80) || (lead == 0xC9) || (lead >= 0xFE);
+	case 950:
+		// Big5
+		return
+			((lead >= 0x80) && (lead <= 0xA0)) ||
+			(lead == 0xC8) ||
+			(lead >= 0xFA);
+	case 1361:
+		// Korean Johab KS C-5601-1992
+		return
+			((lead >= 0x80) && (lead <= 0x83)) ||
+			((lead >= 0xD4) && (lead <= 0xD8)) ||
+			(lead == 0xDF) ||
+			(lead >= 0xFA);
+	}
+	return false;
+}
+
+bool Document::IsDBCSTrailByteInvalid(char ch) const noexcept {
+	const unsigned char trail = ch;
+	switch (dbcsCodePage) {
+	case 932:
+		// Shift_JIS
+		return
+			(trail <= 0x3F) ||
+			(trail == 0x7F) ||
+			(trail >= 0xFD);
+	case 936:
+		// GBK
+		return
+			(trail <= 0x3F) ||
+			(trail == 0x7F) ||
+			(trail == 0xFF);
+	case 949:
+		// Korean Wansung KS C-5601-1987
+		return
+			(trail <= 0x40) ||
+			((trail >= 0x5B) && (trail <= 0x60)) ||
+			((trail >= 0x7B) && (trail <= 0x80)) ||
+			(trail == 0xFF);
+	case 950:
+		// Big5
+		return
+			(trail <= 0x3F) ||
+			((trail >= 0x7F) && (trail <= 0xA0)) ||
+			(trail == 0xFF);
+	case 1361:
+		// Korean Johab KS C-5601-1992
+		return
+			(trail <= 0x30) ||
+			(trail == 0x7F) ||
+			(trail == 0x80) ||
+			(trail == 0xFF);
+	}
+	return false;
+}
+
+int Document::DBCSDrawBytes(const char *text, int len) const noexcept {
+	if (len <= 1) {
+		return len;
+	}
+	if (IsDBCSLeadByteNoExcept(text[0])) {
+		return IsDBCSTrailByteInvalid(text[1]) ? 1 : 2;
+	} else {
+		return 1;
+	}
+}
+
 static inline bool IsSpaceOrTab(int ch) noexcept {
 	return ch == ' ' || ch == '\t';
 }
diff --git a/src/Document.h b/src/Document.h
index b4639a9c2..42f9d36ce 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -309,6 +309,9 @@ public:
 	int SCI_METHOD CodePage() const override;
 	bool SCI_METHOD IsDBCSLeadByte(char ch) const override;
 	bool IsDBCSLeadByteNoExcept(char ch) const noexcept;
+	bool IsDBCSLeadByteInvalid(char ch) const noexcept;
+	bool IsDBCSTrailByteInvalid(char ch) const noexcept;
+	int DBCSDrawBytes(const char *text, int len) const noexcept;
 	int SafeSegment(const char *text, int length, int lengthSegment) const;
 	EncodingFamily CodePageFamily() const;
 
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 092c6c28f..7ceef1c50 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -242,6 +242,17 @@ void Editor::SetRepresentations() {
 			sprintf(hexits, "x%2X", k);
 			reprs.SetRepresentation(hiByte, hexits);
 		}
+	} else if (pdoc->dbcsCodePage) {
+		// DBCS invalid single lead bytes
+		for (int k = 0x80; k < 0x100; k++) {
+			char ch = static_cast<char>(k);
+			if (pdoc->IsDBCSLeadByteNoExcept(ch)  || pdoc->IsDBCSLeadByteInvalid(ch)) {
+				const char hiByte[2] = { ch, 0 };
+				char hexits[5];	// Really only needs 4 but that causes warning from gcc 7.1
+				sprintf(hexits, "x%2X", k);
+				reprs.SetRepresentation(hiByte, hexits);
+			}
+		}
 	}
 }
 
diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx
index 543573b38..df58aa9a7 100644
--- a/src/PositionCache.cxx
+++ b/src/PositionCache.cxx
@@ -507,7 +507,8 @@ TextSegment BreakFinder::Next() {
 				charWidth = UTF8DrawBytes(reinterpret_cast<unsigned char *>(&ll->chars[nextBreak]),
 					static_cast<int>(lineRange.end - nextBreak));
 			else if (encodingFamily == efDBCS)
-				charWidth = pdoc->IsDBCSLeadByteNoExcept(ll->chars[nextBreak]) ? 2 : 1;
+				charWidth = pdoc->DBCSDrawBytes(
+					&ll->chars[nextBreak], static_cast<int>(lineRange.end - nextBreak));
 			const Representation *repr = preprs->RepresentationFromCharacter(&ll->chars[nextBreak], charWidth);
 			if (((nextBreak > 0) && (ll->styles[nextBreak] != ll->styles[nextBreak - 1])) ||
 					repr ||
author	mitchell <unknown>	2018-05-25 17:21:46 -0400
committer	mitchell <unknown>	2018-05-25 17:21:46 -0400
commit	8bda8cce4b5daf0bc785401a887331182e4f2b74 (patch)
tree	c7adba12e4815cdf34e3803aca4b85178582834a
parent	96cf9078786ae2e4a8aaf56468e6d069e73ed9c0 (diff)
download	scintilla-mirror-8bda8cce4b5daf0bc785401a887331182e4f2b74.tar.gz