diff options
author | Neil <nyamatongwe@gmail.com> | 2025-03-27 20:05:39 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2025-03-27 20:05:39 +1100 |
commit | 44a2755858009d5c14732d82814f9b5558f99ef4 (patch) | |
tree | 5d814d09c1a00c94f198c69b860e1d27e36ad68c /src | |
parent | b451f612b65dfe1ad678080e7b897733be310d01 (diff) | |
download | scintilla-mirror-44a2755858009d5c14732d82814f9b5558f99ef4.tar.gz |
Optimize case-insensitive DBCS search to be around 5 times faster by using 64K
memory to cache folding data for each DBCS code page used.
Diffstat (limited to 'src')
-rw-r--r-- | src/DBCS.cxx | 61 | ||||
-rw-r--r-- | src/DBCS.h | 21 |
2 files changed, 82 insertions, 0 deletions
```diff
diff --git a/src/DBCS.cxx b/src/DBCS.cxx
index d5512cc14..0f22a705a 100644
--- a/src/DBCS.cxx
+++ b/src/DBCS.cxx
@@ -5,6 +5,11 @@
 // Copyright 2017 by Neil Hodgson <neilh@scintilla.org>
 // The License.txt file describes the conditions under which this software may be distributed.
 
+#include <cstdint>
+
+#include <array>
+#include <map>
+
 #include "DBCS.h"
 
 using namespace Scintilla::Internal;
@@ -35,6 +40,41 @@ bool DBCSIsLeadByte(int codePage, char ch) noexcept {
 			((uch >= 0x84) && (uch <= 0xD3)) ||
 			((uch >= 0xD8) && (uch <= 0xDE)) ||
 			((uch >= 0xE0) && (uch <= 0xF9));
+	default:
+		break;
+	}
+	return false;
+}
+
+bool DBCSIsTrailByte(int codePage, char ch) noexcept {
+	const unsigned char trail = ch;
+	switch (codePage) {
+	case cp932:
+		// Shift_jis
+		return (trail != 0x7F) &&
+			((trail >= 0x40) && (trail <= 0xFC));
+	case cp936:
+		// GBK
+		return (trail != 0x7F) &&
+			((trail >= 0x40) && (trail <= 0xFE));
+	case cp949:
+		// Korean Wansung KS C-5601-1987
+		return
+			((trail >= 0x41) && (trail <= 0x5A)) ||
+			((trail >= 0x61) && (trail <= 0x7A)) ||
+			((trail >= 0x81) && (trail <= 0xFE));
+	case cp950:
+		// Big5
+		return
+			((trail >= 0x40) && (trail <= 0x7E)) ||
+			((trail >= 0xA1) && (trail <= 0xFE));
+	case cp1361:
+		// Korean Johab KS C-5601-1992
+		return
+			((trail >= 0x31) && (trail <= 0x7E)) ||
+			((trail >= 0x81) && (trail <= 0xFE));
+	default:
+		break;
 	}
 	return false;
 }
@@ -51,4 +91,25 @@ bool IsDBCSValidSingleByte(int codePage, int ch) noexcept {
 	}
 }
 
+using CodePageToFoldMap = std::map<int, FoldMap>;
+CodePageToFoldMap cpToFoldMap;
+
+bool DBCSHasFoldMap(int codePage) {
+	const CodePageToFoldMap::const_iterator it = cpToFoldMap.find(codePage);
+	return it != cpToFoldMap.end();
+}
+
+void DBCSSetFoldMap(int codePage, const FoldMap &foldMap) {
+	cpToFoldMap[codePage] = foldMap;
+}
+
+FoldMap *DBCSGetMutableFoldMap(int codePage) {
+	// Constructs if needed
+	return &cpToFoldMap[codePage];
+}
+
+const FoldMap *DBCSGetFoldMap(int codePage) {
+	return &cpToFoldMap[codePage];
+}
+
 }
diff --git a/src/DBCS.h b/src/DBCS.h
index 01830c843..12bbaf986 100644
--- a/src/DBCS.h
+++ b/src/DBCS.h
@@ -25,8 +25,29 @@ constexpr bool IsDBCSCodePage(int codePage) noexcept {
 }
 
 bool DBCSIsLeadByte(int codePage, char ch) noexcept;
+bool DBCSIsTrailByte(int codePage, char ch) noexcept;
 bool IsDBCSValidSingleByte(int codePage, int ch) noexcept;
 
+// Calculate a number from a DBCS byte pair that can be used to index into an array or map.
+// Should only be called with genuine DBCS character pairs which means that ch1 has top bit set.
+constexpr uint16_t DBCSIndex(char ch1, char ch2) noexcept {
+	constexpr unsigned int highStart = 0x80U;
+	constexpr unsigned int byteMultiply = 0x100U;
+	const unsigned char uch1 = ch1;
+	const unsigned char uch2 = ch2;
+	return ((uch1 - highStart) * byteMultiply) + uch2;
+}
+
+struct DBCSPair {
+	char chars[2];
+};
+using FoldMap = std::array<DBCSPair, 0x8000>;
+
+bool DBCSHasFoldMap(int codePage);
+void DBCSSetFoldMap(int codePage, const FoldMap &foldMap);
+FoldMap *DBCSGetMutableFoldMap(int codePage);
+const FoldMap *DBCSGetFoldMap(int codePage);
+
 }
 
 #endif
```