diff options
-rw-r--r-- | .hgignore | 2 | ||||
-rw-r--r-- | cocoa/PlatCocoa.mm | 19 | ||||
-rw-r--r-- | doc/ScintillaHistory.html | 6 | ||||
-rw-r--r-- | gtk/PlatGTK.cxx | 2 | ||||
-rw-r--r-- | qt/ScintillaEditBase/PlatQt.cpp | 20 | ||||
-rw-r--r-- | src/Document.cxx | 2 | ||||
-rw-r--r-- | src/UniConversion.cxx | 180 | ||||
-rw-r--r-- | src/UniConversion.h | 10 | ||||
-rw-r--r-- | test/unit/testUniConversion.cxx | 167 | ||||
-rw-r--r-- | win32/PlatWin.cxx | 19 | ||||
-rw-r--r-- | win32/scintilla.mak | 2 |
11 files changed, 235 insertions, 194 deletions
@@ -1,6 +1,7 @@ syntax: glob *.o *.a +*.asm *.lib *.obj *.iobj @@ -9,6 +10,7 @@ syntax: glob *.dylib *.framework *.pyd +*.exe *.exp *.lib *.pdb diff --git a/cocoa/PlatCocoa.mm b/cocoa/PlatCocoa.mm index 5f9d788cd..c89a6f3aa 100644 --- a/cocoa/PlatCocoa.mm +++ b/cocoa/PlatCocoa.mm @@ -31,6 +31,7 @@ #include "StringCopy.h" #include "XPM.h" +#include "UniConversion.h" #import "ScintillaView.h" #import "ScintillaCocoa.h" @@ -864,18 +865,6 @@ void SurfaceImpl::DrawTextTransparent(PRectangle rc, Font &font_, XYPOSITION yba textLayout->draw(rc.left, ybase); } -static size_t utf8LengthFromLead(unsigned char uch) { - if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) { - return 4; - } else if (uch >= (0x80 + 0x40 + 0x20)) { - return 3; - } else if (uch >= (0x80)) { - return 2; - } else { - return 1; - } -} - //-------------------------------------------------------------------------------------------------- void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION *positions) { @@ -892,10 +881,10 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION const unsigned char *us = reinterpret_cast<const unsigned char *>(s); int i=0; while (ui<fit) { - size_t lenChar = utf8LengthFromLead(us[i]); - size_t codeUnits = (lenChar < 4) ? 1 : 2; + const unsigned int byteCount = UTF8BytesOfLead[us[i]]; + const int codeUnits = UTF16LengthFromUTF8ByteCount(byteCount); CGFloat xPosition = CTLineGetOffsetForStringIndex(mLine, ui+codeUnits, NULL); - for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) { + for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) { positions[i++] = static_cast<XYPOSITION>(xPosition); } ui += codeUnits; diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index 7b9a51dab..222b29a2e 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -546,6 +546,12 @@ The statically linked version of SciTE, Sc1, links to this static library. </li> <li> + In some cases, invalid UTF-8 is handled in a way that is a little friendlier. + For example, when copying to the clipboard on Windows, an invalid lead byte will be copied as the + equivalent ISO 8859-1 character and will not hide the following byte. + <a href="http://sourceforge.net/p/scintilla/feature-requests/1211/">Feature #1211.</a> + </li> + <li> Lexer added for the Maxima computer algebra language. <a href="http://sourceforge.net/p/scintilla/feature-requests/1210/">Feature #1210.</a> </li> diff --git a/gtk/PlatGTK.cxx b/gtk/PlatGTK.cxx index ebedc6e93..da04bae61 100644 --- a/gtk/PlatGTK.cxx +++ b/gtk/PlatGTK.cxx @@ -781,7 +781,7 @@ void SurfaceImpl::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION positions[i++] = iti.position - (places - place) * iti.distance / places; positionsCalculated++; } - clusterStart += UTF8CharLength(static_cast<unsigned char>(utfForm.c_str()[clusterStart])); + clusterStart += UTF8BytesOfLead[static_cast<unsigned char>(utfForm.c_str()[clusterStart])]; place++; } } diff --git a/qt/ScintillaEditBase/PlatQt.cpp b/qt/ScintillaEditBase/PlatQt.cpp index 713f4c46f..87496a191 100644 --- a/qt/ScintillaEditBase/PlatQt.cpp +++ b/qt/ScintillaEditBase/PlatQt.cpp @@ -10,6 +10,7 @@ #include "PlatQt.h" #include "Scintilla.h" +#include "UniConversion.h" #include "DBCS.h" #include "FontQuality.h" @@ -438,19 +439,6 @@ void SurfaceImpl::SetClip(PRectangle rc) GetPainter()->setClipRect(QRectFFromPRect(rc)); } -static size_t utf8LengthFromLead(unsigned char uch) -{ - if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) { - return 4; - } else if (uch >= (0x80 + 0x40 + 0x20)) { - return 3; - } else if (uch >= (0x80)) { - return 2; - } else { - return 1; - } -} - void SurfaceImpl::MeasureWidths(Font &font, const char *s, int len, @@ -470,10 +458,10 @@ void SurfaceImpl::MeasureWidths(Font &font, const unsigned char *us = reinterpret_cast<const unsigned char *>(s); int i=0; while (ui<fit) { - size_t lenChar = utf8LengthFromLead(us[i]); - int codeUnits = (lenChar < 4) ? 1 : 2; + const unsigned int byteCount = UTF8BytesOfLead[us[i]]; + const int codeUnits = UTF16LengthFromUTF8ByteCount(byteCount); qreal xPosition = tl.cursorToX(ui+codeUnits); - for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) { + for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) { positions[i++] = xPosition; } ui += codeUnits; diff --git a/src/Document.cxx b/src/Document.cxx index 412798def..48913a16c 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -116,8 +116,6 @@ Document::Document(int options) : matchesValid = false; - UTF8BytesOfLeadInitialise(); - perLineData[ldMarkers] = std::make_unique<LineMarkers>(); perLineData[ldLevels] = std::make_unique<LineLevels>(); perLineData[ldState] = std::make_unique<LineState>(); diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 8e537c689..19b968932 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -65,35 +65,15 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) { putf[k] = '\0'; } -unsigned int UTF8CharLength(unsigned char ch) { - if (ch < 0x80) { - return 1; - } else if (ch < 0x80 + 0x40 + 0x20) { - return 2; - } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { - return 3; - } else { - return 4; - } -} - size_t UTF16Length(const char *s, size_t len) { size_t ulen = 0; - size_t charLen; - for (size_t i = 0; i<len;) { - const unsigned char ch = static_cast<unsigned char>(s[i]); - if (ch < 0x80) { - charLen = 1; - } else if (ch < 0x80 + 0x40 + 0x20) { - charLen = 2; - } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { - charLen = 3; - } else { - charLen = 4; - ulen++; - } - i += charLen; - ulen++; + const unsigned char *us = reinterpret_cast<const unsigned char *>(s); + for (size_t i = 0; i < len;) { + const unsigned char ch = us[i]; + const unsigned int byteCount = UTF8BytesOfLead[ch]; + const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount); + i += byteCount; + ulen += (i > len) ? 1 : utf16Len; } return ulen; } @@ -104,39 +84,60 @@ constexpr unsigned char TrailByteValue(unsigned char c) { return c & 0b0011'1111; } -const unsigned char utf8Start3 = 0b1110'0000; -const unsigned char utf8Start4 = 0b1111'0000; - size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { size_t ui = 0; const unsigned char *us = reinterpret_cast<const unsigned char *>(s); - size_t i = 0; - while ((i<len) && (ui<tlen)) { - unsigned char ch = us[i++]; - if (ch < 0x80) { + for (size_t i = 0; i < len;) { + unsigned char ch = us[i]; + const unsigned int byteCount = UTF8BytesOfLead[ch]; + unsigned int value; + + if (i + byteCount > len) { + // Trying to read past end but still have space to write + if (ui < tlen) { + tbuf[ui] = ch; + ui++; + } + break; + } + + const size_t outLen = (byteCount < 4) ? 1 : 2; + if (ui + outLen > tlen) { + throw std::runtime_error("UTF16FromUTF8: attempted write beyond end"); + } + + i++; + switch (byteCount) { + case 1: tbuf[ui] = ch; - } else if (ch < utf8Start3) { - tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6); + break; + case 2: + value = (ch & 0x1F) << 6; ch = us[i++]; - tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch)); - } else if (ch < utf8Start4) { - tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12); + value += TrailByteValue(ch); + tbuf[ui] = static_cast<wchar_t>(value); + break; + case 3: + value = (ch & 0xF) << 12; ch = us[i++]; - tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (TrailByteValue(ch) << 6)); + value += (TrailByteValue(ch) << 6); ch = us[i++]; - tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch)); - } else { + value += TrailByteValue(ch); + tbuf[ui] = static_cast<wchar_t>(value); + break; + default: // Outside the BMP so need two surrogates - int val = (ch & 0x7) << 18; + value = (ch & 0x7) << 18; ch = us[i++]; - val += TrailByteValue(ch) << 12; + value += TrailByteValue(ch) << 12; ch = us[i++]; - val += TrailByteValue(ch) << 6; + value += TrailByteValue(ch) << 6; ch = us[i++]; - val += TrailByteValue(ch); - tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); + value += TrailByteValue(ch); + tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); ui++; - tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST); + tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST); + break; } ui++; } @@ -144,25 +145,44 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) { } size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) { - size_t ui=0; + size_t ui = 0; const unsigned char *us = reinterpret_cast<const unsigned char *>(s); - size_t i=0; - while ((i<len) && (ui<tlen)) { - unsigned char ch = us[i++]; - unsigned int value = 0; - if (ch < 0x80) { + for (size_t i = 0; i < len;) { + unsigned char ch = us[i]; + const unsigned int byteCount = UTF8BytesOfLead[ch]; + unsigned int value; + + if (i + byteCount > len) { + // Trying to read past end but still have space to write + if (ui < tlen) { + tbuf[ui] = ch; + ui++; + } + break; + } + + if (ui == tlen) { + throw std::runtime_error("UTF32FromUTF8: attempted write beyond end"); + } + + i++; + switch (byteCount) { + case 1: value = ch; - } else if (((len-i) >= 1) && (ch < utf8Start3)) { + break; + case 2: value = (ch & 0x1F) << 6; ch = us[i++]; value += TrailByteValue(ch); - } else if (((len-i) >= 2) && (ch < utf8Start4)) { + break; + case 3: value = (ch & 0xF) << 12; ch = us[i++]; value += TrailByteValue(ch) << 6; ch = us[i++]; value += TrailByteValue(ch); - } else if ((len-i) >= 3) { + break; + default: value = (ch & 0x7) << 18; ch = us[i++]; value += TrailByteValue(ch) << 12; @@ -170,6 +190,7 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) value += TrailByteValue(ch) << 6; ch = us[i++]; value += TrailByteValue(ch); + break; } tbuf[ui] = value; ui++; @@ -188,33 +209,24 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) { } } -int UTF8BytesOfLead[256]; -static bool initialisedBytesOfLead = false; - -static int BytesFromLead(int leadByte) { - if (leadByte < 0xC2) { - // Single byte or invalid - return 1; - } else if (leadByte < 0xE0) { - return 2; - } else if (leadByte < 0xF0) { - return 3; - } else if (leadByte < 0xF5) { - return 4; - } else { - // Characters longer than 4 bytes not possible in current UTF-8 - return 1; - } -} - -void UTF8BytesOfLeadInitialise() { - if (!initialisedBytesOfLead) { - for (int i=0; i<256; i++) { - UTF8BytesOfLead[i] = BytesFromLead(i); - } - initialisedBytesOfLead = true; - } -} +const unsigned char UTF8BytesOfLead[256] = { +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF +1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF +2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF +3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF +4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF +}; // Return both the width of the first character in the string and a status // saying whether it is valid or invalid. diff --git a/src/UniConversion.h b/src/UniConversion.h index 2f358c9c5..0f22c06e6 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -16,17 +16,15 @@ const int unicodeReplacementChar = 0xFFFD; size_t UTF8Length(const wchar_t *uptr, size_t tlen); void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len); -unsigned int UTF8CharLength(unsigned char ch); size_t UTF16Length(const char *s, size_t len); size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen); size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen); unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf); std::string FixInvalidUTF8(const std::string &text); -extern int UTF8BytesOfLead[256]; -void UTF8BytesOfLeadInitialise(); +extern const unsigned char UTF8BytesOfLead[256]; -inline bool UTF8IsTrailByte(int ch) { +inline bool UTF8IsTrailByte(unsigned char ch) { return (ch >= 0x80) && (ch < 0xc0); } @@ -64,6 +62,10 @@ inline unsigned int UTF16CharLength(wchar_t uch) { return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1; } +inline unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) { + return (byteCount < 4) ? 1 : 2; +} + } #endif diff --git a/test/unit/testUniConversion.cxx b/test/unit/testUniConversion.cxx index 16ea1d974..4d34abd60 100644 --- a/test/unit/testUniConversion.cxx +++ b/test/unit/testUniConversion.cxx @@ -53,6 +53,24 @@ TEST_CASE("UTF16Length") { size_t len = UTF16Length(s, strlen(s)); REQUIRE(len == 2U); } + + SECTION("UTF16Length Invalid Trail byte in lead position") { + const char *s = "a\xB5yz"; + size_t len = UTF16Length(s, strlen(s)); + REQUIRE(len == 4U); + } + + SECTION("UTF16Length Invalid Lead byte at end") { + const char *s = "a\xC2"; + size_t len = UTF16Length(s, strlen(s)); + REQUIRE(len == 2U); + } + + SECTION("UTF16Length Invalid Lead byte implies 3 trails but only 2") { + const char *s = "a\xF1yz"; + size_t len = UTF16Length(s, strlen(s)); + REQUIRE(len == 2U); + } } TEST_CASE("UniConversion") { @@ -100,6 +118,35 @@ TEST_CASE("UniConversion") { REQUIRE(tbuf[1] == 0xDF48); } + SECTION("UTF16FromUTF8 Invalid Trail byte in lead position") { + const char s[] = "a\xB5yz"; + wchar_t tbuf[4] = {}; + size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4); + REQUIRE(tlen == 4U); + REQUIRE(tbuf[0] == 'a'); + REQUIRE(tbuf[1] == 0xB5); + REQUIRE(tbuf[2] == 'y'); + REQUIRE(tbuf[3] == 'z'); + } + + SECTION("UTF16FromUTF8 Invalid Lead byte at end") { + const char s[] = "a\xC2"; + wchar_t tbuf[2] = {}; + size_t tlen = UTF16FromUTF8(s, 2, tbuf, 2); + REQUIRE(tlen == 2U); + REQUIRE(tbuf[0] == 'a'); + REQUIRE(tbuf[1] == 0xC2); + } + + SECTION("UTF16FromUTF8 Invalid Lead byte implies 3 trails but only 2") { + const char *s = "a\xF1yz"; + wchar_t tbuf[4] = {}; + size_t tlen = UTF16FromUTF8(s, 4, tbuf, 4); + REQUIRE(tlen == 2U); + REQUIRE(tbuf[0] == 'a'); + REQUIRE(tbuf[1] == 0xF1); + } + // UTF32FromUTF8 SECTION("UTF32FromUTF8 ASCII") { @@ -141,6 +188,44 @@ TEST_CASE("UniConversion") { REQUIRE(tlen == 1U); REQUIRE(tbuf[0] == 0x10348); } + + SECTION("UTF32FromUTF8 Invalid Trail byte in lead position") { + const char s[] = "a\xB5yz"; + unsigned int tbuf[4] = {}; + size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4); + REQUIRE(tlen == 4U); + REQUIRE(tbuf[0] == static_cast<unsigned int>('a')); + REQUIRE(tbuf[1] == 0xB5); + REQUIRE(tbuf[2] == static_cast<unsigned int>('y')); + REQUIRE(tbuf[3] == static_cast<unsigned int>('z')); + } + + SECTION("UTF32FromUTF8 Invalid Lead byte at end") { + const char s[] = "a\xC2"; + unsigned int tbuf[2] = {}; + size_t tlen = UTF32FromUTF8(s, 2, tbuf, 2); + REQUIRE(tlen == 2U); + REQUIRE(tbuf[0] == static_cast<unsigned int>('a')); + REQUIRE(tbuf[1] == 0xC2); + } + + SECTION("UTF32FromUTF8 Invalid Lead byte implies 3 trails but only 2") { + const char *s = "a\xF1yz"; + unsigned int tbuf[4] = {}; + size_t tlen = UTF32FromUTF8(s, 4, tbuf, 4); + REQUIRE(tlen == 2U); + REQUIRE(tbuf[0] == static_cast<unsigned int>('a')); + REQUIRE(tbuf[1] == 0xF1); + } +} + +namespace { + +// Simple adapter to avoid casting +int UTFClass(const char *s) { + return UTF8Classify(reinterpret_cast<const unsigned char *>(s), static_cast<int>(strlen(s))); +} + } TEST_CASE("UTF8Classify") { @@ -151,114 +236,76 @@ TEST_CASE("UTF8Classify") { // Single byte SECTION("UTF8Classify Simple ASCII") { - const char *s = "a"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 1); + REQUIRE(UTFClass("a") == 1); } - SECTION("UTF8Classify Invalid Too large lead") { - const char *s = "\xF5"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1|UTF8MaskInvalid)); + REQUIRE(UTFClass("\xF5") == (1|UTF8MaskInvalid)); } // 4 byte lead SECTION("UTF8Classify 4 byte lead, string less than 4 long") { - const char *s = "\xF0"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xF0") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 1FFFF non-character") { - const char *s = "\xF0\x9F\xBF\xBF"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (4 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 1 Greater than max Unicode 110000") { // Maximum Unicode value is 10FFFF so 110000 is out of range - const char *s = "\xF4\x90\x80\x80"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xF4\x90\x80\x80") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 4 byte overlong") { - const char *s = "\xF0\x80\x80\x80"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xF0\x80\x80\x80") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 4 byte valid character") { - const char *s = "\xF0\x9F\x8C\x90"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 4); + REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4); } - SECTION("UTF8Classify 4 byte bad trails") { - const char *s = "\xF0xyz"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xF0xyz") == (1 | UTF8MaskInvalid)); } // 3 byte lead SECTION("UTF8Classify 3 byte lead, string less than 3 long") { - const char *s = "\xEF"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xEF") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 3 byte lead, overlong") { - const char *s = "\xE0\x80\xAF"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xE0\x80\xAF") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 3 byte lead, surrogate") { - const char *s = "\xED\xA0\x80"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xED\xA0\x80") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify FFFE non-character") { - const char *s = "\xEF\xBF\xBE"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xEF\xBF\xBE") == (3 | UTF8MaskInvalid)); } - SECTION("UTF8Classify FFFF non-character") { - const char *s = "\xEF\xBF\xBF"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xEF\xBF\xBF") == (3 | UTF8MaskInvalid)); } - SECTION("UTF8Classify FDD0 non-character") { - const char *s = "\xEF\xB7\x90"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (3 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xEF\xB7\x90") == (3 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 3 byte valid character") { - const char *s = "\xE2\x82\xAC"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 3); + REQUIRE(UTFClass("\xE2\x82\xAC") == 3); } - SECTION("UTF8Classify 3 byte bad trails") { - const char *s = "\xE2qq"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xE2qq") == (1 | UTF8MaskInvalid)); } // 2 byte lead SECTION("UTF8Classify 2 byte lead, string less than 2 long") { - const char *s = "\xD0"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xD0") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify 2 byte valid character") { - const char *s = "\xD0\x80"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == 2); + REQUIRE(UTFClass("\xD0\x80") == 2); } - SECTION("UTF8Classify 2 byte lead trail is invalid") { - const char *s = "\xD0q"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xD0q") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify Overlong") { - const char *s = "\xC0"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\xC0") == (1 | UTF8MaskInvalid)); } - SECTION("UTF8Classify single trail byte") { - const char *s = "\x80"; - REQUIRE(UTF8Classify(reinterpret_cast<const unsigned char *>(s), strlen(s)) == (1 | UTF8MaskInvalid)); + REQUIRE(UTFClass("\x80") == (1 | UTF8MaskInvalid)); } -}
\ No newline at end of file +} diff --git a/win32/PlatWin.cxx b/win32/PlatWin.cxx index 9e89e2f84..79970a969 100644 --- a/win32/PlatWin.cxx +++ b/win32/PlatWin.cxx @@ -951,12 +951,14 @@ void SurfaceGDI::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION * return; } // Map the widths given for UTF-16 characters back onto the UTF-8 input string + const unsigned char *us = reinterpret_cast<const unsigned char *>(s); for (int ui = 0; ui < fit; ui++) { - const unsigned int lenChar = UTF8BytesOfLead[static_cast<unsigned char>(s[i])]; - if (lenChar == 4) { // Non-BMP + const unsigned char uch = us[i]; + const unsigned int byteCount = UTF8BytesOfLead[uch]; + if (byteCount == 4) { // Non-BMP ui++; } - for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) { + for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) { positions[i++] = static_cast<XYPOSITION>(poses.buffer[ui]); } } @@ -1623,16 +1625,11 @@ void SurfaceD2D::MeasureWidths(Font &font_, const char *s, int len, XYPOSITION * int i=0; while (ui<tbuf.tlen) { const unsigned char uch = us[i]; - unsigned int lenChar = 1; - if (uch >= (0x80 + 0x40 + 0x20 + 0x10)) { - lenChar = 4; + const unsigned int byteCount = UTF8BytesOfLead[uch]; + if (byteCount == 4) { // Non-BMP ui++; - } else if (uch >= (0x80 + 0x40 + 0x20)) { - lenChar = 3; - } else if (uch >= (0x80)) { - lenChar = 2; } - for (unsigned int bytePos=0; (bytePos<lenChar) && (i<len); bytePos++) { + for (unsigned int bytePos=0; (bytePos<byteCount) && (i<len); bytePos++) { positions[i++] = poses.buffer[ui]; } ui++; diff --git a/win32/scintilla.mak b/win32/scintilla.mak index 29f882032..7f943474b 100644 --- a/win32/scintilla.mak +++ b/win32/scintilla.mak @@ -66,7 +66,7 @@ CXXFLAGS=$(CXXFLAGS) $(INCLUDEDIRS) all: $(COMPONENT) $(LEXCOMPONENT) $(LIBSCI) clean: - -del /q $(DIR_O)\*.obj $(DIR_O)\*.pdb $(COMPONENT) $(LEXCOMPONENT) \ + -del /q $(DIR_O)\*.obj $(DIR_O)\*.pdb $(DIR_O)\*.asm $(COMPONENT) $(LEXCOMPONENT) \ $(DIR_O)\*.res $(DIR_BIN)\*.map $(DIR_BIN)\*.exp $(DIR_BIN)\*.pdb $(DIR_BIN)\*.lib # Required for base Scintilla |