about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author Zufu Liu <unknown> 2018-03-22 15:02:38 +1100
committer Zufu Liu <unknown> 2018-03-22 15:02:38 +1100
commit ff707f0fe276677a4d89633ae4964e8b94712ca3 (patch)
tree 103d8741341108a8dc04ef59923e19da6f4a64e4
parent 9e4cdff7752304fff978ab7f606b64ea85310baf (diff)
download scintilla-mirror-ff707f0fe276677a4d89633ae4964e8b94712ca3.tar.gz
Feature [feature-requests:#1211]. Use pre-computed table for UTF8BytesOfLead.
Friendlier treatment of invalid UTF-8. Add tests for UniConversion handling invalid UTF-8. Simplify UTF8Classify tests.
-rw-r--r-- .hgignore 2
-rw-r--r-- cocoa/PlatCocoa.mm 19
-rw-r--r-- doc/ScintillaHistory.html 6
-rw-r--r-- gtk/PlatGTK.cxx 2
-rw-r--r-- qt/ScintillaEditBase/PlatQt.cpp 20
-rw-r--r-- src/Document.cxx 2
-rw-r--r-- src/UniConversion.cxx 180
-rw-r--r-- src/UniConversion.h 10
-rw-r--r-- test/unit/testUniConversion.cxx 167
-rw-r--r-- win32/PlatWin.cxx 19
-rw-r--r-- win32/scintilla.mak 2
11 files changed, 235 insertions, 194 deletions
diff --git a/.hgignore b/.hgignore
index 9fc409a8b..73365de42 100644
--- a/.hgignore
+++ b/.hgignore
@@ -1,6 +1,7 @@
syntax: glob
*.o
*.a
+*.asm
*.lib
*.obj
*.iobj
@@ -9,6 +10,7 @@ syntax: glob
*.dylib
*.framework
*.pyd
+*.exe
*.exp
*.lib
*.pdb
diff --git a/cocoa/PlatCocoa.mm b/cocoa/PlatCocoa.mm
index 5f9d788cd..c89a6f3aa 100644
--- a/cocoa/PlatCocoa.mm
+++ b/cocoa/PlatCocoa.mm
@@ -31,6 +31,7 @@
#include "StringCopy.h"
#include "XPM.h"
+#include "UniConversion.h"
#import "ScintillaView.h"
#import "ScintillaCocoa.h"
@@ -864,18 +865,6 @@ void SurfaceImpl::DrawTextTransparent(PRectangle rc, Font &font_, XYPOSITION yba
textLayout->draw(rc.left, ybase);
}
-static size_t utf8LengthFromLead(unsigned char uch) {
- if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) {
- return 4;
- } else if (uch >= (0x80 + 0x40 + 0x20)) {
- return 3;
- } else if (uch >= (0x80)) {
- return 2;
- } else {
- return 1;
- }
-}
-
//--------------------------------------------------------------------------------------------------
void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *positions) {
@@ -892,10 +881,10 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION
const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
int i=0;
while (ui<fit) {
- size_t lenChar = utf8LengthFromLead(us[i]);
- size_t codeUnits = (lenChar < 4) ? 1 : 2;
+ const unsigned int byteCount = UTF8BytesOfLead[us[i]];
+ const int codeUnits = UTF16LengthFromUTF8ByteCount(byteCount);
CGFloat xPosition = CTLineGetOffsetForStringIndex(mLine, ui+codeUnits, NULL);
- for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) {
+ for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {
positions[i++] = static_cast<XYPOSITION>(xPosition);
}
ui += codeUnits;
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 7b9a51dab..222b29a2e 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -546,6 +546,12 @@
The statically linked version of SciTE, Sc1, links to this static library.
</li>
<li>
+ In some cases, invalid UTF-8 is handled in a way that is a little friendlier.
+ For example, when copying to the clipboard on Windows, an invalid lead byte will be copied as the
+ equivalent ISO 8859-1 character and will not hide the following byte.
+ <a href="http://sourceforge.net/p/scintilla/feature-requests/1211/">Feature #1211.</a>
+ </li>
+ <li>
Lexer added for the Maxima computer algebra language.
<a href="http://sourceforge.net/p/scintilla/feature-requests/1210/">Feature #1210.</a>
</li>
diff --git a/gtk/PlatGTK.cxx b/gtk/PlatGTK.cxx
index ebedc6e93..da04bae61 100644
--- a/gtk/PlatGTK.cxx
+++ b/gtk/PlatGTK.cxx
@@ -781,7 +781,7 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION
positions[i++] = iti.position - (places - place) * iti.distance / places;
positionsCalculated++;
}
- clusterStart += UTF8CharLength(static_cast<unsigned char>(utfForm.c_str()[clusterStart]));
+ clusterStart += UTF8BytesOfLead[static_cast<unsigned char>(utfForm.c_str()[clusterStart])];
place++;
}
}
diff --git a/qt/ScintillaEditBase/PlatQt.cpp b/qt/ScintillaEditBase/PlatQt.cpp
index 713f4c46f..87496a191 100644
--- a/qt/ScintillaEditBase/PlatQt.cpp
+++ b/qt/ScintillaEditBase/PlatQt.cpp
@@ -10,6 +10,7 @@
#include "PlatQt.h"
#include "Scintilla.h"
+#include "UniConversion.h"
#include "DBCS.h"
#include "FontQuality.h"
@@ -438,19 +439,6 @@ void SurfaceImpl::SetClip(PRectangle rc)
GetPainter()->setClipRect(QRectFFromPRect(rc));
}
-static size_t utf8LengthFromLead(unsigned char uch)
-{
- if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) {
- return 4;
- } else if (uch >= (0x80 + 0x40 + 0x20)) {
- return 3;
- } else if (uch >= (0x80)) {
- return 2;
- } else {
- return 1;
- }
-}
-
void SurfaceImpl::MeasureWidths(Font &font,
const char *s,
int len,
@@ -470,10 +458,10 @@ void SurfaceImpl::MeasureWidths(Font &font,
const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
int i=0;
while (ui<fit) {
- size_t lenChar = utf8LengthFromLead(us[i]);
- int codeUnits = (lenChar < 4) ? 1 : 2;
+ const unsigned int byteCount = UTF8BytesOfLead[us[i]];
+ const int codeUnits = UTF16LengthFromUTF8ByteCount(byteCount);
qreal xPosition = tl.cursorToX(ui+codeUnits);
- for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) {
+ for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {
positions[i++] = xPosition;
}
ui += codeUnits;
diff --git a/src/Document.cxx b/src/Document.cxx
index 412798def..48913a16c 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -116,8 +116,6 @@ Document::Document(int options) :
matchesValid = false;
- UTF8BytesOfLeadInitialise();
-
perLineData[ldMarkers] = std::make_unique<LineMarkers>();
perLineData[ldLevels] = std::make_unique<LineLevels>();
perLineData[ldState] = std::make_unique<LineState>();
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 8e537c689..19b968932 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -65,35 +65,15 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {
putf[k] = '\0';
}
-unsigned int UTF8CharLength(unsigned char ch) {
- if (ch < 0x80) {
- return 1;
- } else if (ch < 0x80 + 0x40 + 0x20) {
- return 2;
- } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
- return 3;
- } else {
- return 4;
- }
-}
-
size_t UTF16Length(const char *s, size_t len) {
size_t ulen = 0;
- size_t charLen;
- for (size_t i = 0; i<len;) {
- const unsigned char ch = static_cast<unsigned char>(s[i]);
- if (ch < 0x80) {
- charLen = 1;
- } else if (ch < 0x80 + 0x40 + 0x20) {
- charLen = 2;
- } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
- charLen = 3;
- } else {
- charLen = 4;
- ulen++;
- }
- i += charLen;
- ulen++;
+ const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
+ for (size_t i = 0; i < len;) {
+ const unsigned char ch = us[i];
+ const unsigned int byteCount = UTF8BytesOfLead[ch];
+ const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
+ i += byteCount;
+ ulen += (i > len) ? 1 : utf16Len;
}
return ulen;
}
@@ -104,39 +84,60 @@ constexpr unsigned char TrailByteValue(unsigned char c) {
return c & 0b0011'1111;
}
-const unsigned char utf8Start3 = 0b1110'0000;
-const unsigned char utf8Start4 = 0b1111'0000;
-
size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
size_t ui = 0;
const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
- size_t i = 0;
- while ((i<len) && (ui<tlen)) {
- unsigned char ch = us[i++];
- if (ch < 0x80) {
+ for (size_t i = 0; i < len;) {
+ unsigned char ch = us[i];
+ const unsigned int byteCount = UTF8BytesOfLead[ch];
+ unsigned int value;
+
+ if (i + byteCount > len) {
+ // Trying to read past end but still have space to write
+ if (ui < tlen) {
+ tbuf[ui] = ch;
+ ui++;
+ }
+ break;
+ }
+
+ const size_t outLen = (byteCount < 4) ? 1 : 2;
+ if (ui + outLen > tlen) {
+ throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
+ }
+
+ i++;
+ switch (byteCount) {
+ case 1:
tbuf[ui] = ch;
- } else if (ch < utf8Start3) {
- tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
+ break;
+ case 2:
+ value = (ch & 0x1F) << 6;
ch = us[i++];
- tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch));
- } else if (ch < utf8Start4) {
- tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
+ value += TrailByteValue(ch);
+ tbuf[ui] = static_cast<wchar_t>(value);
+ break;
+ case 3:
+ value = (ch & 0xF) << 12;
ch = us[i++];
- tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (TrailByteValue(ch) << 6));
+ value += (TrailByteValue(ch) << 6);
ch = us[i++];
- tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch));
- } else {
+ value += TrailByteValue(ch);
+ tbuf[ui] = static_cast<wchar_t>(value);
+ break;
+ default:
// Outside the BMP so need two surrogates
- int val = (ch & 0x7) << 18;
+ value = (ch & 0x7) << 18;
ch = us[i++];
- val += TrailByteValue(ch) << 12;
+ value += TrailByteValue(ch) << 12;
ch = us[i++];
- val += TrailByteValue(ch) << 6;
+ value += TrailByteValue(ch) << 6;
ch = us[i++];
- val += TrailByteValue(ch);
- tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
+ value += TrailByteValue(ch);
+ tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
ui++;
- tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
+ tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
+ break;
}
ui++;
}
@@ -144,25 +145,44 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
}
size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
- size_t ui=0;
+ size_t ui = 0;
const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
- size_t i=0;
- while ((i<len) && (ui<tlen)) {
- unsigned char ch = us[i++];
- unsigned int value = 0;
- if (ch < 0x80) {
+ for (size_t i = 0; i < len;) {
+ unsigned char ch = us[i];
+ const unsigned int byteCount = UTF8BytesOfLead[ch];
+ unsigned int value;
+
+ if (i + byteCount > len) {
+ // Trying to read past end but still have space to write
+ if (ui < tlen) {
+ tbuf[ui] = ch;
+ ui++;
+ }
+ break;
+ }
+
+ if (ui == tlen) {
+ throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
+ }
+
+ i++;
+ switch (byteCount) {
+ case 1:
value = ch;
- } else if (((len-i) >= 1) && (ch < utf8Start3)) {
+ break;
+ case 2:
value = (ch & 0x1F) << 6;
ch = us[i++];
value += TrailByteValue(ch);
- } else if (((len-i) >= 2) && (ch < utf8Start4)) {
+ break;
+ case 3:
value = (ch & 0xF) << 12;
ch = us[i++];
value += TrailByteValue(ch) << 6;
ch = us[i++];
value += TrailByteValue(ch);
- } else if ((len-i) >= 3) {
+ break;
+ default:
value = (ch & 0x7) << 18;
ch = us[i++];
value += TrailByteValue(ch) << 12;
@@ -170,6 +190,7 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen)
value += TrailByteValue(ch) << 6;
ch = us[i++];
value += TrailByteValue(ch);
+ break;
}
tbuf[ui] = value;
ui++;
@@ -188,33 +209,24 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
}
}
-int UTF8BytesOfLead[256];
-static bool initialisedBytesOfLead = false;
-
-static int BytesFromLead(int leadByte) {
- if (leadByte < 0xC2) {
- // Single byte or invalid
- return 1;
- } else if (leadByte < 0xE0) {
- return 2;
- } else if (leadByte < 0xF0) {
- return 3;
- } else if (leadByte < 0xF5) {
- return 4;
- } else {
- // Characters longer than 4 bytes not possible in current UTF-8
- return 1;
- }
-}
-
-void UTF8BytesOfLeadInitialise() {
- if (!initialisedBytesOfLead) {
- for (int i=0; i<256; i++) {
- UTF8BytesOfLead[i] = BytesFromLead(i);
- }
- initialisedBytesOfLead = true;
- }
-}
+const unsigned char UTF8BytesOfLead[256] = {
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
+1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
+3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
+4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
+};
// Return both the width of the first character in the string and a status
// saying whether it is valid or invalid.
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 2f358c9c5..0f22c06e6 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -16,17 +16,15 @@ const int unicodeReplacementChar = 0xFFFD;
size_t UTF8Length(const wchar_t *uptr, size_t tlen);
void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len);
-unsigned int UTF8CharLength(unsigned char ch);
size_t UTF16Length(const char *s, size_t len);
size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen);
size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen);
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf);
std::string FixInvalidUTF8(const std::string &text);
-extern int UTF8BytesOfLead[256];
-void UTF8BytesOfLeadInitialise();
+extern const unsigned char UTF8BytesOfLead[256];
-inline bool UTF8IsTrailByte(int ch) {
+inline bool UTF8IsTrailByte(unsigned char ch) {
return (ch >= 0x80) && (ch < 0xc0);
}
@@ -64,6 +62,10 @@ inline unsigned int UTF16CharLength(wchar_t uch) {
return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
}
+inline unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) {
+ return (byteCount < 4) ? 1 : 2;
+}
+
}
#endif
diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx
index 16ea1d974..4d34abd60 100644
--- a/test/unit/testUniConversion.cxx
+++ b/test/unit/testUniConversion.cxx
@@ -53,6 +53,24 @@ TEST_CASE("UTF16Length") {
size_t len = UTF16Length(s, strlen(s));
REQUIRE(len == 2U);
}
+
+ SECTION("UTF16Length Invalid Trail byte in lead position") {
+ const char *s = "a\xB5yz";
+ size_t len = UTF16Length(s, strlen(s));
+ REQUIRE(len == 4U);
+ }
+
+ SECTION("UTF16Length Invalid Lead byte at end") {
+ const char *s = "a\xC2";
+ size_t len = UTF16Length(s, strlen(s));
+ REQUIRE(len == 2U);
+ }
+
+ SECTION("UTF16Length Invalid Lead byte implies 3 trails but only 2") {
+ const char *s = "a\xF1yz";
+ size_t len = UTF16Length(s, strlen(s));
+ REQUIRE(len == 2U);
+ }
}
TEST_CASE("UniConversion") {
@@ -100,6 +118,35 @@ TEST_CASE("UniConversion") {
REQUIRE(tbuf[1] == 0xDF48);
}
+ SECTION("UTF16FromUTF8 Invalid Trail byte in lead position") {
+ const char s[] = "a\xB5yz";
+ wchar_t tbuf[4] = {};
+ size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4);
+ REQUIRE(tlen == 4U);
+ REQUIRE(tbuf[0] == 'a');
+ REQUIRE(tbuf[1] == 0xB5);
+ REQUIRE(tbuf[2] == 'y');
+ REQUIRE(tbuf[3] == 'z');
+ }
+
+ SECTION("UTF16FromUTF8 Invalid Lead byte at end") {
+ const char s[] = "a\xC2";
+ wchar_t tbuf[2] = {};
+ size_t tlen = UTF16FromUTF8(s, 2, tbuf, 2);
+ REQUIRE(tlen == 2U);
+ REQUIRE(tbuf[0] == 'a');
+ REQUIRE(tbuf[1] == 0xC2);
+ }
+
+ SECTION("UTF16FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
+ const char *s = "a\xF1yz";
+ wchar_t tbuf[4] = {};
+ size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4);
+ REQUIRE(tlen == 2U);
+ REQUIRE(tbuf[0] == 'a');
+ REQUIRE(tbuf[1] == 0xF1);
+ }
+
// UTF32FromUTF8
SECTION("UTF32FromUTF8 ASCII") {
@@ -141,6 +188,44 @@ TEST_CASE("UniConversion") {
REQUIRE(tlen == 1U);
REQUIRE(tbuf[0] == 0x10348);
}
+
+ SECTION("UTF32FromUTF8 Invalid Trail byte in lead position") {
+ const char s[] = "a\xB5yz";
+ unsigned int tbuf[4] = {};
+ size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4);
+ REQUIRE(tlen == 4U);
+ REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+ REQUIRE(tbuf[1] == 0xB5);
+ REQUIRE(tbuf[2] == static_cast<unsigned int>('y'));
+ REQUIRE(tbuf[3] == static_cast<unsigned int>('z'));
+ }
+
+ SECTION("UTF32FromUTF8 Invalid Lead byte at end") {
+ const char s[] = "a\xC2";
+ unsigned int tbuf[2] = {};
+ size_t tlen = UTF32FromUTF8(s, 2, tbuf, 2);
+ REQUIRE(tlen == 2U);
+ REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+ REQUIRE(tbuf[1] == 0xC2);
+ }
+
+ SECTION("UTF32FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
+ const char *s = "a\xF1yz";
+ unsigned int tbuf[4] = {};
+ size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4);
+ REQUIRE(tlen == 2U);
+ REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
+ REQUIRE(tbuf[1] == 0xF1);
+ }
+}
+
+namespace {
+
+// Simple adapter to avoid casting
+int UTFClass(const char *s) {
+ return UTF8Classify(reinterpret_cast<const unsigned char *>(s), static_cast<int>(strlen(s)));
+}
+
}
TEST_CASE("UTF8Classify") {
@@ -151,114 +236,76 @@ TEST_CASE("UTF8Classify") {
// Single byte
SECTION("UTF8Classify Simple ASCII") {
- const char *s = "a";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 1);
+ REQUIRE(UTFClass("a") == 1);
}
-
SECTION("UTF8Classify Invalid Too large lead") {
- const char *s = "\xF5";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1|UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid));
}
// 4 byte lead
SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
- const char *s = "\xF0";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify 1FFFF non-character") {
- const char *s = "\xF0\x9F\xBF\xBF";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (4 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
// Maximum Unicode value is 10FFFF so 110000 is out of range
- const char *s = "\xF4\x90\x80\x80";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify 4 byte overlong") {
- const char *s = "\xF0\x80\x80\x80";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify 4 byte valid character") {
- const char *s = "\xF0\x9F\x8C\x90";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 4);
+ REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
}
-
SECTION("UTF8Classify 4 byte bad trails") {
- const char *s = "\xF0xyz";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid));
}
// 3 byte lead
SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
- const char *s = "\xEF";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify 3 byte lead, overlong") {
- const char *s = "\xE0\x80\xAF";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify 3 byte lead, surrogate") {
- const char *s = "\xED\xA0\x80";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xED\xA0\x80") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify FFFE non-character") {
- const char *s = "\xEF\xBF\xBE";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xEF\xBF\xBE") == (3 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify FFFF non-character") {
- const char *s = "\xEF\xBF\xBF";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xEF\xBF\xBF") == (3 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify FDD0 non-character") {
- const char *s = "\xEF\xB7\x90";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xEF\xB7\x90") == (3 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify 3 byte valid character") {
- const char *s = "\xE2\x82\xAC";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 3);
+ REQUIRE(UTFClass("\xE2\x82\xAC") == 3);
}
-
SECTION("UTF8Classify 3 byte bad trails") {
- const char *s = "\xE2qq";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid));
}
// 2 byte lead
SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
- const char *s = "\xD0";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify 2 byte valid character") {
- const char *s = "\xD0\x80";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 2);
+ REQUIRE(UTFClass("\xD0\x80") == 2);
}
-
SECTION("UTF8Classify 2 byte lead trail is invalid") {
- const char *s = "\xD0q";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify Overlong") {
- const char *s = "\xC0";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid));
}
-
SECTION("UTF8Classify single trail byte") {
- const char *s = "\x80";
- REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid));
+ REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid));
}
-} \ No newline at end of file
+}
diff --git a/win32/PlatWin.cxx b/win32/PlatWin.cxx
index 9e89e2f84..79970a969 100644
--- a/win32/PlatWin.cxx
+++ b/win32/PlatWin.cxx
@@ -951,12 +951,14 @@ void SurfaceGDI::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *
return;
}
// Map the widths given for UTF-16 characters back onto the UTF-8 input string
+ const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
for (int ui = 0; ui < fit; ui++) {
- const unsigned int lenChar = UTF8BytesOfLead[static_cast<unsigned char>(s[i])];
- if (lenChar == 4) { // Non-BMP
+ const unsigned char uch = us[i];
+ const unsigned int byteCount = UTF8BytesOfLead[uch];
+ if (byteCount == 4) { // Non-BMP
ui++;
}
- for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) {
+ for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {
positions[i++] = static_cast<XYPOSITION>(poses.buffer[ui]);
}
}
@@ -1623,16 +1625,11 @@ void SurfaceD2D::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *
int i=0;
while (ui<tbuf.tlen) {
const unsigned char uch = us[i];
- unsigned int lenChar = 1;
- if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) {
- lenChar = 4;
+ const unsigned int byteCount = UTF8BytesOfLead[uch];
+ if (byteCount == 4) { // Non-BMP
ui++;
- } else if (uch >= (0x80 + 0x40 + 0x20)) {
- lenChar = 3;
- } else if (uch >= (0x80)) {
- lenChar = 2;
}
- for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) {
+ for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) {
positions[i++] = poses.buffer[ui];
}
ui++;
diff --git a/win32/scintilla.mak b/win32/scintilla.mak
index 29f882032..7f943474b 100644
--- a/win32/scintilla.mak
+++ b/win32/scintilla.mak
@@ -66,7 +66,7 @@ CXXFLAGS=$(CXXFLAGS) $(INCLUDEDIRS)
all: $(COMPONENT) $(LEXCOMPONENT) $(LIBSCI)
clean:
- -del /q $(DIR_O)\*.obj $(DIR_O)\*.pdb $(COMPONENT) $(LEXCOMPONENT) \
+ -del /q $(DIR_O)\*.obj $(DIR_O)\*.pdb $(DIR_O)\*.asm $(COMPONENT) $(LEXCOMPONENT) \
$(DIR_O)\*.res $(DIR_BIN)\*.map $(DIR_BIN)\*.exp $(DIR_BIN)\*.pdb $(DIR_BIN)\*.lib
# Required for base Scintilla