From e1370f834348a12fea75ad883a0d801dfd1b9d8d Mon Sep 17 00:00:00 2001 From: nyamatongwe Date: Sat, 26 May 2012 13:26:11 +1000 Subject: For case-insensitive UTF-8 searching, use UTF8Classify for finding valid character width so that it is compatible with other similar code. Optimize treatment of single-byte ASCII characters and also optimize loop conditions. These mostly make up for the performance decrease from calling UTF8Classify. Add support definitions UTF8MaxBytes and UTF8IsAscii in UniConversion. Remove ExtractChar as it is no longer needed. --- src/Document.cxx | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'src/Document.cxx') diff --git a/src/Document.cxx b/src/Document.cxx index dca0ccc51..4e9366064 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1373,19 +1373,6 @@ static inline char MakeLowerCase(char ch) { return static_cast(ch - 'A' + 'a'); } -size_t Document::ExtractChar(int pos, char *bytes) { - unsigned char ch = static_cast(cb.CharAt(pos)); - size_t widthChar = UTF8CharLength(ch); - bytes[0] = ch; - for (size_t i=1; i(pos+i)); - if (!IsTrailByte(static_cast(bytes[i]))) { // Bad byte - widthChar = 1; - } - } - return widthChar; -} - CaseFolderTable::CaseFolderTable() { for (size_t iChar=0; iChar(iChar); @@ -1476,37 +1463,46 @@ long Document::FindText(int minPos, int maxPos, const char *search, break; } } else if (SC_CP_UTF8 == dbcsCodePage) { - const size_t maxBytesCharacter = 4; const size_t maxFoldingExpansion = 4; - std::vector searchThing(lengthFind * maxBytesCharacter * maxFoldingExpansion + 1); + std::vector searchThing(lengthFind * UTF8MaxBytes * maxFoldingExpansion + 1); const int lenSearch = static_cast( pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind)); + char bytes[UTF8MaxBytes + 1]; + char folded[UTF8MaxBytes * maxFoldingExpansion + 1]; while (forward ? 
(pos < endPos) : (pos >= endPos)) { int widthFirstCharacter = 0; - int indexDocument = 0; + int posIndexDocument = pos; int indexSearch = 0; bool characterMatches = true; - while (characterMatches && - ((pos + indexDocument) < limitPos) && - (indexSearch < lenSearch)) { - char bytes[maxBytesCharacter + 1]; - bytes[maxBytesCharacter] = 0; - const int widthChar = static_cast(ExtractChar(pos + indexDocument, bytes)); + for (;;) { + const unsigned char leadByte = static_cast(cb.CharAt(posIndexDocument)); + bytes[0] = leadByte; + int widthChar = 1; + if (!UTF8IsAscii(leadByte)) { + const int widthCharBytes = UTF8BytesOfLead[leadByte]; + for (int b=1; b(bytes), widthCharBytes) & UTF8MaskWidth; + } if (!widthFirstCharacter) widthFirstCharacter = widthChar; - if ((pos + indexDocument + widthChar) > limitPos) + if ((posIndexDocument + widthChar) > limitPos) break; - char folded[maxBytesCharacter * maxFoldingExpansion + 1]; const int lenFlat = static_cast(pcf->Fold(folded, sizeof(folded), bytes, widthChar)); folded[lenFlat] = 0; // Does folded match the buffer characterMatches = 0 == memcmp(folded, &searchThing[0] + indexSearch, lenFlat); - indexDocument += widthChar; + if (!characterMatches) + break; + posIndexDocument += widthChar; indexSearch += lenFlat; + if (indexSearch >= lenSearch) + break; } if (characterMatches && (indexSearch == static_cast(lenSearch))) { - if (MatchesWordOptions(word, wordStart, pos, indexDocument)) { - *length = indexDocument; + if (MatchesWordOptions(word, wordStart, pos, posIndexDocument - pos)) { + *length = posIndexDocument - pos; return pos; } } -- cgit v1.2.3