diff options
-rw-r--r-- | src/Document.cxx | 50 | ||||
-rw-r--r-- | src/Document.h | 1 | ||||
-rw-r--r-- | src/UniConversion.h | 6 |
3 files changed, 29 insertions, 28 deletions
diff --git a/src/Document.cxx b/src/Document.cxx index dca0ccc51..4e9366064 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1373,19 +1373,6 @@ static inline char MakeLowerCase(char ch) { return static_cast<char>(ch - 'A' + 'a'); } -size_t Document::ExtractChar(int pos, char *bytes) { - unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); - size_t widthChar = UTF8CharLength(ch); - bytes[0] = ch; - for (size_t i=1; i<widthChar; i++) { - bytes[i] = cb.CharAt(static_cast<int>(pos+i)); - if (!IsTrailByte(static_cast<unsigned char>(bytes[i]))) { // Bad byte - widthChar = 1; - } - } - return widthChar; -} - CaseFolderTable::CaseFolderTable() { for (size_t iChar=0; iChar<sizeof(mapping); iChar++) { mapping[iChar] = static_cast<char>(iChar); @@ -1476,37 +1463,46 @@ long Document::FindText(int minPos, int maxPos, const char *search, break; } } else if (SC_CP_UTF8 == dbcsCodePage) { - const size_t maxBytesCharacter = 4; const size_t maxFoldingExpansion = 4; - std::vector<char> searchThing(lengthFind * maxBytesCharacter * maxFoldingExpansion + 1); + std::vector<char> searchThing(lengthFind * UTF8MaxBytes * maxFoldingExpansion + 1); const int lenSearch = static_cast<int>( pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind)); + char bytes[UTF8MaxBytes + 1]; + char folded[UTF8MaxBytes * maxFoldingExpansion + 1]; while (forward ? (pos < endPos) : (pos >= endPos)) { int widthFirstCharacter = 0; - int indexDocument = 0; + int posIndexDocument = pos; int indexSearch = 0; bool characterMatches = true; - while (characterMatches && - ((pos + indexDocument) < limitPos) && - (indexSearch < lenSearch)) { - char bytes[maxBytesCharacter + 1]; - bytes[maxBytesCharacter] = 0; - const int widthChar = static_cast<int>(ExtractChar(pos + indexDocument, bytes)); + for (;;) { + const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(posIndexDocument)); + bytes[0] = leadByte; + int widthChar = 1; + if (!UTF8IsAscii(leadByte)) { + const int widthCharBytes = UTF8BytesOfLead[leadByte]; + for (int b=1; b<widthCharBytes; b++) { + bytes[b] = cb.CharAt(posIndexDocument+b); + } + widthChar = UTF8Classify(reinterpret_cast<const unsigned char *>(bytes), widthCharBytes) & UTF8MaskWidth; + } if (!widthFirstCharacter) widthFirstCharacter = widthChar; - if ((pos + indexDocument + widthChar) > limitPos) + if ((posIndexDocument + widthChar) > limitPos) break; - char folded[maxBytesCharacter * maxFoldingExpansion + 1]; const int lenFlat = static_cast<int>(pcf->Fold(folded, sizeof(folded), bytes, widthChar)); folded[lenFlat] = 0; // Does folded match the buffer characterMatches = 0 == memcmp(folded, &searchThing[0] + indexSearch, lenFlat); - indexDocument += widthChar; + if (!characterMatches) + break; + posIndexDocument += widthChar; indexSearch += lenFlat; + if (indexSearch >= lenSearch) + break; } if (characterMatches && (indexSearch == static_cast<int>(lenSearch))) { - if (MatchesWordOptions(word, wordStart, pos, indexDocument)) { - *length = indexDocument; + if (MatchesWordOptions(word, wordStart, pos, posIndexDocument - pos)) { + *length = posIndexDocument - pos; return pos; } } diff --git a/src/Document.h b/src/Document.h index ec41603eb..18bf00a3d 100644 --- a/src/Document.h +++ b/src/Document.h @@ -352,7 +352,6 @@ public: int NextWordEnd(int pos, int delta); int SCI_METHOD Length() const { return cb.Length(); } void Allocate(int newSize) { cb.Allocate(newSize); } - size_t ExtractChar(int pos, char *bytes); bool MatchesWordOptions(bool word, bool wordStart, int pos, int length); long FindText(int minPos, int maxPos, const char *search, bool caseSensitive, bool word, bool wordStart, bool regExp, int flags, int *length, CaseFolder *pcf); diff --git a/src/UniConversion.h b/src/UniConversion.h index 87cc43f77..704f16239 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -5,6 +5,8 @@ // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org> // The License.txt file describes the conditions under which this software may be distributed. +const int UTF8MaxBytes = 4; + unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen); void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len); unsigned int UTF8CharLength(unsigned char ch); @@ -18,5 +20,9 @@ inline bool UTF8IsTrailByte(int ch) { return (ch >= 0x80) && (ch < 0xc0); } +inline bool UTF8IsAscii(int ch) { + return ch < 0x80; +} + enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 }; int UTF8Classify(const unsigned char *us, int len); |