diff options
author | mitchell <unknown> | 2018-05-25 17:21:46 -0400 |
---|---|---|
committer | mitchell <unknown> | 2018-05-25 17:21:46 -0400 |
commit | 8bda8cce4b5daf0bc785401a887331182e4f2b74 (patch) | |
tree | c7adba12e4815cdf34e3803aca4b85178582834a | |
parent | 96cf9078786ae2e4a8aaf56468e6d069e73ed9c0 (diff) | |
download | scintilla-mirror-8bda8cce4b5daf0bc785401a887331182e4f2b74.tar.gz |
Backport: When invalid bytes are detected in DBCS text, draw them as blobs, in a similar way to UTF-8.
Backport of changeset 6962:514fde42ccbf, but without std::string_view.
-rw-r--r-- | doc/ScintillaHistory.html | 4 | ||||
-rw-r--r-- | src/Document.cxx | 87 | ||||
-rw-r--r-- | src/Document.h | 3 | ||||
-rw-r--r-- | src/Editor.cxx | 11 | ||||
-rw-r--r-- | src/PositionCache.cxx | 3 |
5 files changed, 107 insertions, 1 deletion
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index 2d6fdd34a..621cb2c8a 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -555,6 +555,10 @@ Indicators are drawn for line end characters when displayed. </li> <li> + Most invalid bytes in DBCS encodings are displayed as blobs to make problems clear + and ensure something is shown. + </li> + <li> EDIFACT lexer adds property lexer.edifact.highlight.un.all to highlight all UN* segments. <a href="https://sourceforge.net/p/scintilla/feature-requests/1166/">Feature #1166.</a> </li> diff --git a/src/Document.cxx b/src/Document.cxx index 312aa5c02..b7dc4c8d1 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -972,6 +972,93 @@ bool Document::IsDBCSLeadByteNoExcept(char ch) const noexcept { return false; } +bool Document::IsDBCSLeadByteInvalid(char ch) const noexcept { + const unsigned char lead = ch; + switch (dbcsCodePage) { + case 932: + // Shift_jis + return + (lead == 0x85) || + (lead == 0x86) || + (lead == 0xEB) || + (lead == 0xEC) || + (lead == 0xEF) || + (lead == 0xFA) || + (lead == 0xFB) || + (lead == 0xFC); + case 936: + // GBK + return (lead == 0x80) || (lead == 0xFF); + case 949: + // Korean Wansung KS C-5601-1987 + return (lead == 0x80) || (lead == 0xC9) || (lead >= 0xFE); + case 950: + // Big5 + return + ((lead >= 0x80) && (lead <= 0xA0)) || + (lead == 0xC8) || + (lead >= 0xFA); + case 1361: + // Korean Johab KS C-5601-1992 + return + ((lead >= 0x80) && (lead <= 0x83)) || + ((lead >= 0xD4) && (lead <= 0xD8)) || + (lead == 0xDF) || + (lead >= 0xFA); + } + return false; +} + +bool Document::IsDBCSTrailByteInvalid(char ch) const noexcept { + const unsigned char trail = ch; + switch (dbcsCodePage) { + case 932: + // Shift_jis + return + (trail <= 0x3F) || + (trail == 0x7F) || + (trail >= 0xFD); + case 936: + // GBK + return + (trail <= 0x3F) || + (trail == 0x7F) || + (trail == 0xFF); + case 949: + // Korean Wansung KS C-5601-1987 + return + (trail <= 0x40) || + 
((trail >= 0x5B) && (trail <= 0x60)) || + ((trail >= 0x7B) && (trail <= 0x80)) || + (trail == 0xFF); + case 950: + // Big5 + return + (trail <= 0x3F) || + ((trail >= 0x7F) && (trail <= 0xA0)) || + (trail == 0xFF); + case 1361: + // Korean Johab KS C-5601-1992 + return + (trail <= 0x30) || + (trail == 0x7F) || + (trail == 0x80) || + (trail == 0xFF); + } + return false; +} + +int Document::DBCSDrawBytes(const char *text, int len) const noexcept { + if (len <= 1) { + return len; + } + if (IsDBCSLeadByteNoExcept(text[0])) { + return IsDBCSTrailByteInvalid(text[1]) ? 1 : 2; + } else { + return 1; + } +} + static inline bool IsSpaceOrTab(int ch) noexcept { return ch == ' ' || ch == '\t'; } diff --git a/src/Document.h b/src/Document.h index b4639a9c2..42f9d36ce 100644 --- a/src/Document.h +++ b/src/Document.h @@ -309,6 +309,9 @@ public: int SCI_METHOD CodePage() const override; bool SCI_METHOD IsDBCSLeadByte(char ch) const override; bool IsDBCSLeadByteNoExcept(char ch) const noexcept; + bool IsDBCSLeadByteInvalid(char ch) const noexcept; + bool IsDBCSTrailByteInvalid(char ch) const noexcept; + int DBCSDrawBytes(const char *text, int len) const noexcept; int SafeSegment(const char *text, int length, int lengthSegment) const; EncodingFamily CodePageFamily() const; diff --git a/src/Editor.cxx b/src/Editor.cxx index 092c6c28f..7ceef1c50 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -242,6 +242,17 @@ void Editor::SetRepresentations() { sprintf(hexits, "x%2X", k); reprs.SetRepresentation(hiByte, hexits); } + } else if (pdoc->dbcsCodePage) { + // DBCS invalid single lead bytes + for (int k = 0x80; k < 0x100; k++) { + char ch = static_cast<char>(k); + if (pdoc->IsDBCSLeadByteNoExcept(ch) || pdoc->IsDBCSLeadByteInvalid(ch)) { + const char hiByte[2] = { ch, 0 }; + char hexits[5]; // Really only needs 4 but that causes warning from gcc 7.1 + sprintf(hexits, "x%2X", k); + reprs.SetRepresentation(hiByte, hexits); + } + } } } diff --git a/src/PositionCache.cxx 
b/src/PositionCache.cxx index 543573b38..df58aa9a7 100644 --- a/src/PositionCache.cxx +++ b/src/PositionCache.cxx @@ -507,7 +507,8 @@ TextSegment BreakFinder::Next() { charWidth = UTF8DrawBytes(reinterpret_cast<unsigned char *>(&ll->chars[nextBreak]), static_cast<int>(lineRange.end - nextBreak)); else if (encodingFamily == efDBCS) - charWidth = pdoc->IsDBCSLeadByteNoExcept(ll->chars[nextBreak]) ? 2 : 1; + charWidth = pdoc->DBCSDrawBytes( + &ll->chars[nextBreak], static_cast<int>(lineRange.end - nextBreak)); const Representation *repr = preprs->RepresentationFromCharacter(&ll->chars[nextBreak], charWidth); if (((nextBreak > 0) && (ll->styles[nextBreak] != ll->styles[nextBreak - 1])) || repr || |