diff options
author | Neil <nyamatongwe@gmail.com> | 2025-03-27 20:05:39 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2025-03-27 20:05:39 +1100 |
commit | 44a2755858009d5c14732d82814f9b5558f99ef4 (patch) | |
tree | 5d814d09c1a00c94f198c69b860e1d27e36ad68c | |
parent | b451f612b65dfe1ad678080e7b897733be310d01 (diff) | |
download | scintilla-mirror-44a2755858009d5c14732d82814f9b5558f99ef4.tar.gz |
Optimize case-insensitive DBCS search to be around 5 times faster by using 64K
memory to cache folding data for each DBCS code page used.
-rw-r--r-- | doc/ScintillaHistory.html | 4 | ||||
-rw-r--r-- | src/DBCS.cxx | 61 | ||||
-rw-r--r-- | src/DBCS.h | 21 | ||||
-rw-r--r-- | win32/ScintillaWin.cxx | 109 |
4 files changed, 150 insertions, 45 deletions
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index 98346def6..b9ff5bb30 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -608,6 +608,10 @@ On Win32, use DirectWrite for autocompletion lists when DirectWrite chosen for document text. </li> <li> + On Win32, optimize case-insensitive DBCS search to be around 5 times faster + by using 64K memory to cache folding data for each DBCS code page used. + </li> + <li> When using Visual C++ through nmake, fix building for ARM64. <a href="https://sourceforge.net/p/scintilla/feature-requests/1546/">Feature #1546</a>. </li> diff --git a/src/DBCS.cxx b/src/DBCS.cxx index d5512cc14..0f22a705a 100644 --- a/src/DBCS.cxx +++ b/src/DBCS.cxx @@ -5,6 +5,11 @@ // Copyright 2017 by Neil Hodgson <neilh@scintilla.org> // The License.txt file describes the conditions under which this software may be distributed. +#include <cstdint> + +#include <array> +#include <map> + #include "DBCS.h" using namespace Scintilla::Internal; @@ -35,6 +40,41 @@ bool DBCSIsLeadByte(int codePage, char ch) noexcept { ((uch >= 0x84) && (uch <= 0xD3)) || ((uch >= 0xD8) && (uch <= 0xDE)) || ((uch >= 0xE0) && (uch <= 0xF9)); + default: + break; + } + return false; +} + +bool DBCSIsTrailByte(int codePage, char ch) noexcept { + const unsigned char trail = ch; + switch (codePage) { + case cp932: + // Shift_jis + return (trail != 0x7F) && + ((trail >= 0x40) && (trail <= 0xFC)); + case cp936: + // GBK + return (trail != 0x7F) && + ((trail >= 0x40) && (trail <= 0xFE)); + case cp949: + // Korean Wansung KS C-5601-1987 + return + ((trail >= 0x41) && (trail <= 0x5A)) || + ((trail >= 0x61) && (trail <= 0x7A)) || + ((trail >= 0x81) && (trail <= 0xFE)); + case cp950: + // Big5 + return + ((trail >= 0x40) && (trail <= 0x7E)) || + ((trail >= 0xA1) && (trail <= 0xFE)); + case cp1361: + // Korean Johab KS C-5601-1992 + return + ((trail >= 0x31) && (trail <= 0x7E)) || + ((trail >= 0x81) && (trail <= 0xFE)); + default: + break; } return false; } @@ -51,4 +91,25 @@ bool IsDBCSValidSingleByte(int codePage, int ch) noexcept { } } +using CodePageToFoldMap = std::map<int, FoldMap>; +CodePageToFoldMap cpToFoldMap; + +bool DBCSHasFoldMap(int codePage) { + const CodePageToFoldMap::const_iterator it = cpToFoldMap.find(codePage); + return it != cpToFoldMap.end(); +} + +void DBCSSetFoldMap(int codePage, const FoldMap &foldMap) { + cpToFoldMap[codePage] = foldMap; +} + +FoldMap *DBCSGetMutableFoldMap(int codePage) { + // Constructs if needed + return &cpToFoldMap[codePage]; +} + +const FoldMap *DBCSGetFoldMap(int codePage) { + return &cpToFoldMap[codePage]; +} + } diff --git a/src/DBCS.h b/src/DBCS.h index 01830c843..12bbaf986 100644 --- a/src/DBCS.h +++ b/src/DBCS.h @@ -25,8 +25,29 @@ constexpr bool IsDBCSCodePage(int codePage) noexcept { } bool DBCSIsLeadByte(int codePage, char ch) noexcept; +bool DBCSIsTrailByte(int codePage, char ch) noexcept; bool IsDBCSValidSingleByte(int codePage, int ch) noexcept; +// Calculate a number from a DBCS byte pair that can be used to index into an array or map. +// Should only be called with genuine DBCS character pairs which means that ch1 has top bit set. +constexpr uint16_t DBCSIndex(char ch1, char ch2) noexcept { + constexpr unsigned int highStart = 0x80U; + constexpr unsigned int byteMultiply = 0x100U; + const unsigned char uch1 = ch1; + const unsigned char uch2 = ch2; + return ((uch1 - highStart) * byteMultiply) + uch2; +} + +struct DBCSPair { + char chars[2]; +}; +using FoldMap = std::array<DBCSPair, 0x8000>; + +bool DBCSHasFoldMap(int codePage); +void DBCSSetFoldMap(int codePage, const FoldMap &foldMap); +FoldMap *DBCSGetMutableFoldMap(int codePage); +const FoldMap *DBCSGetFoldMap(int codePage); + } #endif diff --git a/win32/ScintillaWin.cxx b/win32/ScintillaWin.cxx index bebaad6eb..0dc8acf68 100644 --- a/win32/ScintillaWin.cxx +++ b/win32/ScintillaWin.cxx @@ -19,6 +19,7 @@ #include <string> #include <string_view> #include <vector> +#include <array> #include <map> #include <set> #include <optional> @@ -2791,63 +2792,83 @@ void ScintillaWin::NotifyDoubleClick(Point pt, KeyMod modifiers) { namespace { constexpr unsigned int safeFoldingSize = 20; +constexpr int highByteFirst = 0x80; +constexpr int highByteLast = 0xFF; +constexpr int minTrailByte = 0x30; + +// CreateFoldMap creates a fold map by calling platform APIs so will differ between platforms. +void CreateFoldMap(int codePage, FoldMap *foldingMap) { + for (int byte1 = highByteFirst; byte1 <= highByteLast; byte1++) { + const char ch1 = byte1 & 0xFF; // & 0xFF avoids warnings but has no real effect. + if (DBCSIsLeadByte(codePage, ch1)) { + for (int byte2 = minTrailByte; byte2 <= highByteLast; byte2++) { + const char ch2 = byte2 & 0xFF; + if (DBCSIsTrailByte(codePage, ch2)) { + const DBCSPair pair{ ch1, ch2 }; + const uint16_t index = DBCSIndex(ch1, ch2); + (*foldingMap)[index] = pair; + const std::string_view svBytes(pair.chars, 2); + const int lenUni = WideCharLenFromMultiByte(codePage, svBytes); + if (lenUni == 1) { + // DBCS pair must produce a single Unicode BMP code point + wchar_t codePoint = 0; + WideCharFromMultiByte(codePage, svBytes, &codePoint, 1); + if (codePoint) { + // Could create a DBCS -> Unicode conversion map here + const char *foldedUTF8 = CaseConvert(codePoint, CaseConversion::fold); + if (foldedUTF8) { + wchar_t wFolded[safeFoldingSize]{}; + const size_t charsConverted = UTF16FromUTF8(foldedUTF8, wFolded, std::size(wFolded)); + char back[safeFoldingSize]{}; + const int lengthBack = MultiByteFromWideChar(codePage, std::wstring(wFolded, charsConverted), + back, std::size(back)); + if (lengthBack == 2) { + // Only allow cases where input length and folded length are both 2 + (*foldingMap)[index] = { back[0], back[1] }; + } + } + } + } + } + } + } + } +} class CaseFolderDBCS : public CaseFolderTable { // Allocate the expandable storage here so that it does not need to be reallocated // for each call to Fold. - std::vector<wchar_t> utf16Mixed; - std::vector<wchar_t> utf16Folded; + const FoldMap *foldingMap; UINT cp; public: explicit CaseFolderDBCS(UINT cp_) : cp(cp_) { + if (!DBCSHasFoldMap(cp)) { + CreateFoldMap(cp, DBCSGetMutableFoldMap(cp)); + } + foldingMap = DBCSGetFoldMap(cp); } size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) override; }; size_t CaseFolderDBCS::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) { - if ((lenMixed == 1) && (sizeFolded > 0)) { - folded[0] = mapping[static_cast<unsigned char>(mixed[0])]; - return 1; - } - if (lenMixed > utf16Mixed.size()) { - utf16Mixed.resize(lenMixed + 8); - } - const size_t nUtf16Mixed = WideCharFromMultiByte(cp, - std::string_view(mixed, lenMixed), - utf16Mixed.data(), - utf16Mixed.size()); - - if (nUtf16Mixed == 0) { - // Failed to convert -> bad input - folded[0] = '\0'; - return 1; - } - - size_t lenFlat = 0; - for (size_t mixIndex = 0; mixIndex < nUtf16Mixed; mixIndex++) { - if ((lenFlat + safeFoldingSize) > utf16Folded.size()) - utf16Folded.resize(lenFlat + 60); - const char *foldedUTF8 = CaseConvert(utf16Mixed[mixIndex], CaseConversion::fold); - if (foldedUTF8) { - // Maximum length of a case conversion is 6 bytes, 3 characters - wchar_t wFolded[safeFoldingSize]; - const size_t charsConverted = UTF16FromUTF8(std::string_view(foldedUTF8), - wFolded, std::size(wFolded)); - for (size_t j = 0; j < charsConverted; j++) - utf16Folded[lenFlat++] = wFolded[j]; - } else { - utf16Folded[lenFlat++] = utf16Mixed[mixIndex]; + // This loop outputs the same length as input as for each char 1-byte -> 1-byte; 2-byte -> 2-byte + size_t lenOut = 0; + for (size_t i = 0; i < lenMixed; i++) { + const ptrdiff_t lenLeft = lenMixed - i; + const char ch = mixed[i]; + if ((lenLeft >= 2) && DBCSIsLeadByte(cp, ch) && ((lenOut + 2) <= sizeFolded)) { + i++; + const char ch2 = mixed[i]; + const uint16_t ind = DBCSIndex(ch, ch2); + const char *pair = foldingMap->at(ind).chars; + folded[lenOut++] = pair[0]; + folded[lenOut++] = pair[1]; + } else if ((lenOut + 1) <= sizeFolded) { + const unsigned char uch = ch; + folded[lenOut++] = mapping[uch]; } } - - const std::wstring_view wsvFolded(utf16Folded.data(), lenFlat); - const size_t lenOut = MultiByteLenFromWideChar(cp, wsvFolded); - - if (lenOut < sizeFolded) { - MultiByteFromWideChar(cp, wsvFolded, folded, lenOut); - return lenOut; - } - return 0; + return lenOut; } } @@ -2862,8 +2883,6 @@ std::unique_ptr<CaseFolder> ScintillaWin::CaseFolderForEncoding() { } std::unique_ptr<CaseFolderTable> pcf = std::make_unique<CaseFolderTable>(); // Only for single byte encodings - constexpr int highByteFirst = 0x80; - constexpr int highByteLast = 0xFF; for (int i=highByteFirst; i<=highByteLast; i++) { char sCharacter[2] = "A"; sCharacter[0] = static_cast<char>(i); |