aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorNeil <nyamatongwe@gmail.com>2025-03-27 20:05:39 +1100
committerNeil <nyamatongwe@gmail.com>2025-03-27 20:05:39 +1100
commit44a2755858009d5c14732d82814f9b5558f99ef4 (patch)
tree5d814d09c1a00c94f198c69b860e1d27e36ad68c
parentb451f612b65dfe1ad678080e7b897733be310d01 (diff)
downloadscintilla-mirror-44a2755858009d5c14732d82814f9b5558f99ef4.tar.gz
Optimize case-insensitive DBCS search to be around 5 times faster by using 64K
memory to cache folding data for each DBCS code page used.
-rw-r--r--doc/ScintillaHistory.html4
-rw-r--r--src/DBCS.cxx61
-rw-r--r--src/DBCS.h21
-rw-r--r--win32/ScintillaWin.cxx109
4 files changed, 150 insertions, 45 deletions
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 98346def6..b9ff5bb30 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -608,6 +608,10 @@
On Win32, use DirectWrite for autocompletion lists when DirectWrite chosen for document text.
</li>
<li>
+ On Win32, optimize case-insensitive DBCS search to be around 5 times faster
+ by using 64K memory to cache folding data for each DBCS code page used.
+ </li>
+ <li>
When using Visual C++ through nmake, fix building for ARM64.
<a href="https://sourceforge.net/p/scintilla/feature-requests/1546/">Feature #1546</a>.
</li>
diff --git a/src/DBCS.cxx b/src/DBCS.cxx
index d5512cc14..0f22a705a 100644
--- a/src/DBCS.cxx
+++ b/src/DBCS.cxx
@@ -5,6 +5,11 @@
// Copyright 2017 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
+#include <cstdint>
+
+#include <array>
+#include <map>
+
#include "DBCS.h"
using namespace Scintilla::Internal;
@@ -35,6 +40,41 @@ bool DBCSIsLeadByte(int codePage, char ch) noexcept {
((uch >= 0x84) && (uch <= 0xD3)) ||
((uch >= 0xD8) && (uch <= 0xDE)) ||
((uch >= 0xE0) && (uch <= 0xF9));
+ default:
+ break;
+ }
+ return false;
+}
+
+bool DBCSIsTrailByte(int codePage, char ch) noexcept {
+ const unsigned char trail = ch;
+ switch (codePage) {
+ case cp932:
+ // Shift_jis
+ return (trail != 0x7F) &&
+ ((trail >= 0x40) && (trail <= 0xFC));
+ case cp936:
+ // GBK
+ return (trail != 0x7F) &&
+ ((trail >= 0x40) && (trail <= 0xFE));
+ case cp949:
+ // Korean Wansung KS C-5601-1987
+ return
+ ((trail >= 0x41) && (trail <= 0x5A)) ||
+ ((trail >= 0x61) && (trail <= 0x7A)) ||
+ ((trail >= 0x81) && (trail <= 0xFE));
+ case cp950:
+ // Big5
+ return
+ ((trail >= 0x40) && (trail <= 0x7E)) ||
+ ((trail >= 0xA1) && (trail <= 0xFE));
+ case cp1361:
+ // Korean Johab KS C-5601-1992
+ return
+ ((trail >= 0x31) && (trail <= 0x7E)) ||
+ ((trail >= 0x81) && (trail <= 0xFE));
+ default:
+ break;
}
return false;
}
@@ -51,4 +91,25 @@ bool IsDBCSValidSingleByte(int codePage, int ch) noexcept {
}
}
+using CodePageToFoldMap = std::map<int, FoldMap>;
+CodePageToFoldMap cpToFoldMap;
+
+bool DBCSHasFoldMap(int codePage) {
+ const CodePageToFoldMap::const_iterator it = cpToFoldMap.find(codePage);
+ return it != cpToFoldMap.end();
+}
+
+void DBCSSetFoldMap(int codePage, const FoldMap &foldMap) {
+ cpToFoldMap[codePage] = foldMap;
+}
+
+FoldMap *DBCSGetMutableFoldMap(int codePage) {
+ // Constructs if needed
+ return &cpToFoldMap[codePage];
+}
+
+const FoldMap *DBCSGetFoldMap(int codePage) {
+ return &cpToFoldMap[codePage];
+}
+
}
diff --git a/src/DBCS.h b/src/DBCS.h
index 01830c843..12bbaf986 100644
--- a/src/DBCS.h
+++ b/src/DBCS.h
@@ -25,8 +25,29 @@ constexpr bool IsDBCSCodePage(int codePage) noexcept {
}
bool DBCSIsLeadByte(int codePage, char ch) noexcept;
+bool DBCSIsTrailByte(int codePage, char ch) noexcept;
bool IsDBCSValidSingleByte(int codePage, int ch) noexcept;
+// Calculate a number from a DBCS byte pair that can be used to index into an array or map.
+// Should only be called with genuine DBCS character pairs which means that ch1 has top bit set.
+constexpr uint16_t DBCSIndex(char ch1, char ch2) noexcept {
+ constexpr unsigned int highStart = 0x80U;
+ constexpr unsigned int byteMultiply = 0x100U;
+ const unsigned char uch1 = ch1;
+ const unsigned char uch2 = ch2;
+ return ((uch1 - highStart) * byteMultiply) + uch2;
+}
+
+struct DBCSPair {
+ char chars[2];
+};
+using FoldMap = std::array<DBCSPair, 0x8000>;
+
+bool DBCSHasFoldMap(int codePage);
+void DBCSSetFoldMap(int codePage, const FoldMap &foldMap);
+FoldMap *DBCSGetMutableFoldMap(int codePage);
+const FoldMap *DBCSGetFoldMap(int codePage);
+
}
#endif
diff --git a/win32/ScintillaWin.cxx b/win32/ScintillaWin.cxx
index bebaad6eb..0dc8acf68 100644
--- a/win32/ScintillaWin.cxx
+++ b/win32/ScintillaWin.cxx
@@ -19,6 +19,7 @@
#include <string>
#include <string_view>
#include <vector>
+#include <array>
#include <map>
#include <set>
#include <optional>
@@ -2791,63 +2792,83 @@ void ScintillaWin::NotifyDoubleClick(Point pt, KeyMod modifiers) {
namespace {
constexpr unsigned int safeFoldingSize = 20;
+constexpr int highByteFirst = 0x80;
+constexpr int highByteLast = 0xFF;
+constexpr int minTrailByte = 0x30;
+
+// CreateFoldMap creates a fold map by calling platform APIs so will differ between platforms.
+void CreateFoldMap(int codePage, FoldMap *foldingMap) {
+ for (int byte1 = highByteFirst; byte1 <= highByteLast; byte1++) {
+ const char ch1 = byte1 & 0xFF; // & 0xFF avoids warnings but has no real effect.
+ if (DBCSIsLeadByte(codePage, ch1)) {
+ for (int byte2 = minTrailByte; byte2 <= highByteLast; byte2++) {
+ const char ch2 = byte2 & 0xFF;
+ if (DBCSIsTrailByte(codePage, ch2)) {
+ const DBCSPair pair{ ch1, ch2 };
+ const uint16_t index = DBCSIndex(ch1, ch2);
+ (*foldingMap)[index] = pair;
+ const std::string_view svBytes(pair.chars, 2);
+ const int lenUni = WideCharLenFromMultiByte(codePage, svBytes);
+ if (lenUni == 1) {
+ // DBCS pair must produce a single Unicode BMP code point
+ wchar_t codePoint = 0;
+ WideCharFromMultiByte(codePage, svBytes, &codePoint, 1);
+ if (codePoint) {
+ // Could create a DBCS -> Unicode conversion map here
+ const char *foldedUTF8 = CaseConvert(codePoint, CaseConversion::fold);
+ if (foldedUTF8) {
+ wchar_t wFolded[safeFoldingSize]{};
+ const size_t charsConverted = UTF16FromUTF8(foldedUTF8, wFolded, std::size(wFolded));
+ char back[safeFoldingSize]{};
+ const int lengthBack = MultiByteFromWideChar(codePage, std::wstring(wFolded, charsConverted),
+ back, std::size(back));
+ if (lengthBack == 2) {
+ // Only allow cases where input length and folded length are both 2
+ (*foldingMap)[index] = { back[0], back[1] };
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
class CaseFolderDBCS : public CaseFolderTable {
// Allocate the expandable storage here so that it does not need to be reallocated
// for each call to Fold.
- std::vector<wchar_t> utf16Mixed;
- std::vector<wchar_t> utf16Folded;
+ const FoldMap *foldingMap;
UINT cp;
public:
explicit CaseFolderDBCS(UINT cp_) : cp(cp_) {
+ if (!DBCSHasFoldMap(cp)) {
+ CreateFoldMap(cp, DBCSGetMutableFoldMap(cp));
+ }
+ foldingMap = DBCSGetFoldMap(cp);
}
size_t Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) override;
};
size_t CaseFolderDBCS::Fold(char *folded, size_t sizeFolded, const char *mixed, size_t lenMixed) {
- if ((lenMixed == 1) && (sizeFolded > 0)) {
- folded[0] = mapping[static_cast<unsigned char>(mixed[0])];
- return 1;
- }
- if (lenMixed > utf16Mixed.size()) {
- utf16Mixed.resize(lenMixed + 8);
- }
- const size_t nUtf16Mixed = WideCharFromMultiByte(cp,
- std::string_view(mixed, lenMixed),
- utf16Mixed.data(),
- utf16Mixed.size());
-
- if (nUtf16Mixed == 0) {
- // Failed to convert -> bad input
- folded[0] = '\0';
- return 1;
- }
-
- size_t lenFlat = 0;
- for (size_t mixIndex = 0; mixIndex < nUtf16Mixed; mixIndex++) {
- if ((lenFlat + safeFoldingSize) > utf16Folded.size())
- utf16Folded.resize(lenFlat + 60);
- const char *foldedUTF8 = CaseConvert(utf16Mixed[mixIndex], CaseConversion::fold);
- if (foldedUTF8) {
- // Maximum length of a case conversion is 6 bytes, 3 characters
- wchar_t wFolded[safeFoldingSize];
- const size_t charsConverted = UTF16FromUTF8(std::string_view(foldedUTF8),
- wFolded, std::size(wFolded));
- for (size_t j = 0; j < charsConverted; j++)
- utf16Folded[lenFlat++] = wFolded[j];
- } else {
- utf16Folded[lenFlat++] = utf16Mixed[mixIndex];
+ // This loop outputs the same length as input as for each char 1-byte -> 1-byte; 2-byte -> 2-byte
+ size_t lenOut = 0;
+ for (size_t i = 0; i < lenMixed; i++) {
+ const ptrdiff_t lenLeft = lenMixed - i;
+ const char ch = mixed[i];
+ if ((lenLeft >= 2) && DBCSIsLeadByte(cp, ch) && ((lenOut + 2) <= sizeFolded)) {
+ i++;
+ const char ch2 = mixed[i];
+ const uint16_t ind = DBCSIndex(ch, ch2);
+ const char *pair = foldingMap->at(ind).chars;
+ folded[lenOut++] = pair[0];
+ folded[lenOut++] = pair[1];
+ } else if ((lenOut + 1) <= sizeFolded) {
+ const unsigned char uch = ch;
+ folded[lenOut++] = mapping[uch];
}
}
-
- const std::wstring_view wsvFolded(utf16Folded.data(), lenFlat);
- const size_t lenOut = MultiByteLenFromWideChar(cp, wsvFolded);
-
- if (lenOut < sizeFolded) {
- MultiByteFromWideChar(cp, wsvFolded, folded, lenOut);
- return lenOut;
- }
- return 0;
+ return lenOut;
}
}
@@ -2862,8 +2883,6 @@ std::unique_ptr<CaseFolder> ScintillaWin::CaseFolderForEncoding() {
}
std::unique_ptr<CaseFolderTable> pcf = std::make_unique<CaseFolderTable>();
// Only for single byte encodings
- constexpr int highByteFirst = 0x80;
- constexpr int highByteLast = 0xFF;
for (int i=highByteFirst; i<=highByteLast; i++) {
char sCharacter[2] = "A";
sCharacter[0] = static_cast<char>(i);