about summary refs log tree commit diff homepage
path: root/src
diff options
context:
space:
mode:
author Neil <nyamatongwe@gmail.com> 2025-03-27 20:05:39 +1100
committer Neil <nyamatongwe@gmail.com> 2025-03-27 20:05:39 +1100
commit 44a2755858009d5c14732d82814f9b5558f99ef4 (patch)
tree 5d814d09c1a00c94f198c69b860e1d27e36ad68c /src
parent b451f612b65dfe1ad678080e7b897733be310d01 (diff)
download scintilla-mirror-44a2755858009d5c14732d82814f9b5558f99ef4.tar.gz
Optimize case-insensitive DBCS search to be around 5 times faster by using 64K
memory to cache folding data for each DBCS code page used.
Diffstat (limited to 'src')
-rw-r--r-- src/DBCS.cxx 61
-rw-r--r-- src/DBCS.h 21
2 files changed, 82 insertions, 0 deletions
diff --git a/src/DBCS.cxx b/src/DBCS.cxx
index d5512cc14..0f22a705a 100644
--- a/src/DBCS.cxx
+++ b/src/DBCS.cxx
@@ -5,6 +5,11 @@
// Copyright 2017 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
+#include <cstdint>
+
+#include <array>
+#include <map>
+
#include "DBCS.h"
using namespace Scintilla::Internal;
@@ -35,6 +40,41 @@ bool DBCSIsLeadByte(int codePage, char ch) noexcept {
((uch >= 0x84) && (uch <= 0xD3)) ||
((uch >= 0xD8) && (uch <= 0xDE)) ||
((uch >= 0xE0) && (uch <= 0xF9));
+ default:
+ break;
+ }
+ return false;
+}
+
+bool DBCSIsTrailByte(int codePage, char ch) noexcept { // Is ch inside the second-byte range listed below for codePage? Code pages not listed yield false.
+ const unsigned char trail = ch; // unsigned so values of 0x80 and above compare correctly even where char is signed
+ switch (codePage) {
+ case cp932:
+ // Shift_jis: accepts 0x40..0xFC except 0x7F
+ return (trail != 0x7F) &&
+ ((trail >= 0x40) && (trail <= 0xFC));
+ case cp936:
+ // GBK: accepts 0x40..0xFE except 0x7F
+ return (trail != 0x7F) &&
+ ((trail >= 0x40) && (trail <= 0xFE));
+ case cp949:
+ // Korean Wansung KS C-5601-1987: accepts 0x41..0x5A, 0x61..0x7A and 0x81..0xFE
+ return
+ ((trail >= 0x41) && (trail <= 0x5A)) ||
+ ((trail >= 0x61) && (trail <= 0x7A)) ||
+ ((trail >= 0x81) && (trail <= 0xFE));
+ case cp950:
+ // Big5: accepts 0x40..0x7E and 0xA1..0xFE
+ return
+ ((trail >= 0x40) && (trail <= 0x7E)) ||
+ ((trail >= 0xA1) && (trail <= 0xFE));
+ case cp1361:
+ // Korean Johab KS C-5601-1992: accepts 0x31..0x7E and 0x81..0xFE
+ return
+ ((trail >= 0x31) && (trail <= 0x7E)) ||
+ ((trail >= 0x81) && (trail <= 0xFE));
+ default: // any other code page leaves the switch and reaches the final return false
+ break;
}
return false;
}
@@ -51,4 +91,25 @@ bool IsDBCSValidSingleByte(int codePage, int ch) noexcept {
}
}
+using CodePageToFoldMap = std::map<int, FoldMap>; // keyed by code page number, one FoldMap per key
+CodePageToFoldMap cpToFoldMap; // storage read and written by the DBCS*FoldMap functions in this file
+
+bool DBCSHasFoldMap(int codePage) { // Does cpToFoldMap already contain an entry for codePage?
+ const CodePageToFoldMap::const_iterator it = cpToFoldMap.find(codePage); // find() looks up without inserting
+ return it != cpToFoldMap.end();
+}
+
+void DBCSSetFoldMap(int codePage, const FoldMap &foldMap) { // Store a copy of foldMap as the entry for codePage, replacing any existing one.
+ cpToFoldMap[codePage] = foldMap; // operator[] creates the entry when absent, then the whole array is copy-assigned
+}
+
+FoldMap *DBCSGetMutableFoldMap(int codePage) { // Return a writable pointer to the FoldMap stored for codePage.
+ // Constructs if needed: operator[] inserts a value-initialized FoldMap when codePage has no entry yet
+ return &cpToFoldMap[codePage];
+}
+
+const FoldMap *DBCSGetFoldMap(int codePage) { // Return a read-only pointer to the FoldMap stored for codePage; never null.
+ return &cpToFoldMap[codePage]; // NOTE(review): operator[] inserts an entry when absent, so DBCSHasFoldMap(codePage) is true afterwards - confirm callers test DBCSHasFoldMap first
+}
+
}
diff --git a/src/DBCS.h b/src/DBCS.h
index 01830c843..12bbaf986 100644
--- a/src/DBCS.h
+++ b/src/DBCS.h
@@ -25,8 +25,29 @@ constexpr bool IsDBCSCodePage(int codePage) noexcept {
}
bool DBCSIsLeadByte(int codePage, char ch) noexcept;
+bool DBCSIsTrailByte(int codePage, char ch) noexcept; // true when ch is in the trail-byte range defined for codePage in DBCS.cxx
bool IsDBCSValidSingleByte(int codePage, int ch) noexcept;
+// Calculate a number from a DBCS byte pair that can be used to index into an array or map.
+// Should only be called with genuine DBCS character pairs which means that ch1 has top bit set; the result is then in 0..0x7FFF.
+constexpr uint16_t DBCSIndex(char ch1, char ch2) noexcept {
+ constexpr unsigned int highStart = 0x80U; // smallest byte value with the top bit set
+ constexpr unsigned int byteMultiply = 0x100U; // shifts the rebased lead byte above the 8 bits of the trail byte
+ const unsigned char uch1 = ch1; // unsigned so bytes of 0x80 and above are not negative where char is signed
+ const unsigned char uch2 = ch2;
+ return ((uch1 - highStart) * byteMultiply) + uch2; // a ch1 below 0x80 wraps in unsigned arithmetic and gives 0x8000 or more after narrowing to uint16_t
+}
+
+struct DBCSPair { // two chars; presumably the lead and trail byte of one DBCS character - confirm against the code that fills FoldMap
+ char chars[2];
+};
+using FoldMap = std::array<DBCSPair, 0x8000>; // 0x8000 entries: one per DBCSIndex value when ch1 has its top bit set
+
+bool DBCSHasFoldMap(int codePage); // true when a FoldMap is already stored for codePage
+void DBCSSetFoldMap(int codePage, const FoldMap &foldMap); // stores a copy of foldMap for codePage
+FoldMap *DBCSGetMutableFoldMap(int codePage); // writable FoldMap for codePage, created if absent
+const FoldMap *DBCSGetFoldMap(int codePage); // read-only FoldMap for codePage
+
}
#endif