diff options
author | Neil <nyamatongwe@gmail.com> | 2025-03-27 20:05:39 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2025-03-27 20:05:39 +1100 |
commit | 44a2755858009d5c14732d82814f9b5558f99ef4 (patch) | |
tree | 5d814d09c1a00c94f198c69b860e1d27e36ad68c /win32/ScintillaWin.cxx | |
parent | b451f612b65dfe1ad678080e7b897733be310d01 (diff) | |
download | scintilla-mirror-44a2755858009d5c14732d82814f9b5558f99ef4.tar.gz |
Optimize case-insensitive DBCS search to be around 5 times faster by using 64K
memory to cache folding data for each DBCS code page used.
Diffstat (limited to 'win32/ScintillaWin.cxx')
-rw-r--r-- | win32/ScintillaWin.cxx | 109 |
1 files changed, 64 insertions, 45 deletions
diff --git a/win32/ScintillaWin.cxx b/win32/ScintillaWin.cxx index bebaad6eb..0dc8acf68 100644 --- a/win32/ScintillaWin.cxx +++ b/win32/ScintillaWin.cxx @@ -19,6 +19,7 @@ #include <string> #include <string_view> #include <vector> +#include <array> #include <map> #include <set> #include <optional> @@ -2791,63 +2792,83 @@ void ScintillaWin::NotifyDoubleClick(Point pt, KeyMod modifiers) { namespace { constexpr unsigned int safeFoldingSize = 20; +constexpr int highByteFirst = 0x80; +constexpr int highByteLast = 0xFF; +constexpr int minTrailByte = 0x30; + +// CreateFoldMap creates a fold map by calling platform APIs so will differ between platforms. +void CreateFoldMap(int codePage, FoldMap *foldingMap) { + for (int byte1 = highByteFirst; byte1 <= highByteLast; byte1++) { + const char ch1 = byte1 & 0xFF; // & 0xFF avoids warnings but has no real effect. + if (DBCSIsLeadByte(codePage, ch1)) { + for (int byte2 = minTrailByte; byte2 <= highByteLast; byte2++) { + const char ch2 = byte2 & 0xFF; + if (DBCSIsTrailByte(codePage, ch2)) { + const DBCSPair pair{ ch1, ch2 }; + const uint16_t index = DBCSIndex(ch1, ch2); + (*foldingMap)[index] = pair; + const std::string_view svBytes(pair.chars, 2); + const int lenUni = WideCharLenFromMultiByte(codePage, svBytes); + if (lenUni == 1) { + // DBCS pair must produce a single Unicode BMP code point + wchar_t codePoint = 0; + WideCharFromMultiByte(codePage, svBytes, &codePoint, 1); + if (codePoint) { + // Could create a DBCS -> Unicode conversion map here + const char *foldedUTF8 = CaseConvert(codePoint, CaseConversion::fold); + if (foldedUTF8) { + wchar_t wFolded[safeFoldingSize]{}; + const size_t charsConverted = UTF16FromUTF8(foldedUTF8, wFolded, std::size(wFolded)); + char back[safeFoldingSize]{}; + const int lengthBack = MultiByteFromWideChar(codePage, std::wstring(wFolded, charsConverted), + back, std::size(back)); + if (lengthBack == 2) { + // Only allow cases where input length and folded length are both 2 + (*foldingMap)[index] = { back[0], back[1] }; + } + } + } + } + } + } + } + } +} class CaseFolderDBCS : public CaseFolderTable { // Allocate the expandable storage here so that it does not need to be reallocated // for each call to Fold. - std::vector<wchar_t> utf16Mixed; - std::vector<wchar_t> utf16Folded; + const FoldMap *foldingMap; UINT cp; public: explicit CaseFolderDBCS(UINT cp_) : cp(cp_) { + if (!DBCSHasFoldMap(cp)) { + CreateFoldMap(cp, DBCSGetMutableFoldMap(cp)); + } + foldingMap = DBCSGetFoldMap(cp); } size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) override; }; size_t CaseFolderDBCS::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { - if ((lenMixed == 1) && (sizeFolded > 0)) { - folded[0] = mapping[static_cast<unsigned char>(mixed[0])]; - return 1; - } - if (lenMixed > utf16Mixed.size()) { - utf16Mixed.resize(lenMixed + 8); - } - const size_t nUtf16Mixed = WideCharFromMultiByte(cp, - std::string_view(mixed, lenMixed), - utf16Mixed.data(), - utf16Mixed.size()); - - if (nUtf16Mixed == 0) { - // Failed to convert -> bad input - folded[0] = '\0'; - return 1; - } - - size_t lenFlat = 0; - for (size_t mixIndex = 0; mixIndex < nUtf16Mixed; mixIndex++) { - if ((lenFlat + safeFoldingSize) > utf16Folded.size()) - utf16Folded.resize(lenFlat + 60); - const char *foldedUTF8 = CaseConvert(utf16Mixed[mixIndex], CaseConversion::fold); - if (foldedUTF8) { - // Maximum length of a case conversion is 6 bytes, 3 characters - wchar_t wFolded[safeFoldingSize]; - const size_t charsConverted = UTF16FromUTF8(std::string_view(foldedUTF8), - wFolded, std::size(wFolded)); - for (size_t j = 0; j < charsConverted; j++) - utf16Folded[lenFlat++] = wFolded[j]; - } else { - utf16Folded[lenFlat++] = utf16Mixed[mixIndex]; + // This loop outputs the same length as input as for each char 1-byte -> 1-byte; 2-byte -> 2-byte + size_t lenOut = 0; + for (size_t i = 0; i < lenMixed; i++) { + const ptrdiff_t lenLeft = lenMixed - i; + const char ch = mixed[i]; + if ((lenLeft >= 2) && DBCSIsLeadByte(cp, ch) && ((lenOut + 2) <= sizeFolded)) { + i++; + const char ch2 = mixed[i]; + const uint16_t ind = DBCSIndex(ch, ch2); + const char *pair = foldingMap->at(ind).chars; + folded[lenOut++] = pair[0]; + folded[lenOut++] = pair[1]; + } else if ((lenOut + 1) <= sizeFolded) { + const unsigned char uch = ch; + folded[lenOut++] = mapping[uch]; } } - - const std::wstring_view wsvFolded(utf16Folded.data(), lenFlat); - const size_t lenOut = MultiByteLenFromWideChar(cp, wsvFolded); - - if (lenOut < sizeFolded) { - MultiByteFromWideChar(cp, wsvFolded, folded, lenOut); - return lenOut; - } - return 0; + return lenOut; } } @@ -2862,8 +2883,6 @@ std::unique_ptr<CaseFolder> ScintillaWin::CaseFolderForEncoding() { } std::unique_ptr<CaseFolderTable> pcf = std::make_unique<CaseFolderTable>(); // Only for single byte encodings - constexpr int highByteFirst = 0x80; - constexpr int highByteLast = 0xFF; for (int i=highByteFirst; i<=highByteLast; i++) { char sCharacter[2] = "A"; sCharacter[0] = static_cast<char>(i); |