diff options
author | Neil <nyamatongwe@gmail.com> | 2019-03-29 09:05:14 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2019-03-29 09:05:14 +1100 |
commit | 6d1bf18357b56cbec7d289c9c9434a7a8888b386 (patch) | |
tree | e03227ff253d79d62c42b331c2b66d32b81dc9c0 | |
parent | 62b31d42a2ee02b4992134d325fab6f297729094 (diff) | |
download | scintilla-mirror-6d1bf18357b56cbec7d289c9c9434a7a8888b386.tar.gz |
Feature [feature-requests:#1259]. Add SCI_SETCHARACTERCATEGORYOPTIMIZATION API
to optimize speed of character category features.
-rw-r--r-- | doc/ScintillaDoc.html | 11 | ||||
-rw-r--r-- | doc/ScintillaHistory.html | 6 | ||||
-rw-r--r-- | include/Scintilla.h | 2 | ||||
-rw-r--r-- | include/Scintilla.iface | 6 | ||||
-rw-r--r-- | lexlib/CharacterCategory.cxx | 31 | ||||
-rw-r--r-- | lexlib/CharacterCategory.h | 17 | ||||
-rw-r--r-- | scripts/GenerateCharacterCategory.py | 20 | ||||
-rw-r--r-- | src/Document.cxx | 10 | ||||
-rw-r--r-- | src/Document.h | 3 | ||||
-rw-r--r-- | src/Editor.cxx | 7 | ||||
-rw-r--r-- | test/simpleTests.py | 5 |
11 files changed, 112 insertions, 6 deletions
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html index 47d057ff4..e91f8ec3b 100644 --- a/doc/ScintillaDoc.html +++ b/doc/ScintillaDoc.html @@ -2470,6 +2470,8 @@ struct Sci_TextToFind { <a class="message" href="#SCI_SETPUNCTUATIONCHARS">SCI_SETPUNCTUATIONCHARS(<unused>, const char *characters)</a><br /> <a class="message" href="#SCI_GETPUNCTUATIONCHARS">SCI_GETPUNCTUATIONCHARS(<unused>, char *characters) → int</a><br /> <a class="message" href="#SCI_SETCHARSDEFAULT">SCI_SETCHARSDEFAULT</a><br /> + <a class="message" href="#SCI_SETCHARACTERCATEGORYOPTIMIZATION">SCI_SETCHARACTERCATEGORYOPTIMIZATION(int countCharacters)</a><br /> + <a class="message" href="#SCI_GETCHARACTERCATEGORYOPTIMIZATION">SCI_GETCHARACTERCATEGORYOPTIMIZATION → int</a><br /> <p><b id="SCI_WORDENDPOSITION">SCI_WORDENDPOSITION(int pos, bool onlyWordCharacters) → int</b><br /> <b id="SCI_WORDSTARTPOSITION">SCI_WORDSTARTPOSITION(int pos, bool onlyWordCharacters) → int</b><br /> @@ -2597,6 +2599,15 @@ struct Sci_TextToFind { characters with codes less than 0x20, with word characters set to alphanumeric and '_'. </p> + <p><b id="SCI_SETCHARACTERCATEGORYOPTIMIZATION">SCI_SETCHARACTERCATEGORYOPTIMIZATION(int countCharacters)</b><br /> + <b id="SCI_GETCHARACTERCATEGORYOPTIMIZATION">SCI_GETCHARACTERCATEGORYOPTIMIZATION → int</b><br /> + Optimize speed of character category features like determining whether a character is a space or number at the expense of memory. + Mostly used for Unicode documents. + The countCharacters parameter determines how many characters starting from 0 are added to a look-up table with one byte used for each character. + It is reasonable to cover the set of characters likely to be used in a document so 0x100 for simple Roman text, + 0x1000 to cover most simple alphabets, 0x10000 to cover most of East Asian languages, and 0x110000 to cover all possible characters. 
+ </p> + <p>Word keyboard commands are:</p> <ul> <li class="message" id="SCI_WORDLEFT">SCI_WORDLEFT</li> diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index 6f08fe033..8e8d734dd 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -571,6 +571,12 @@ Implement high-priority idle on Win32 to make redraw smoother and more efficient. </li> <li> + Add SCI_SETCHARACTERCATEGORYOPTIMIZATION API to optimize speed + of character category features like determining whether a character is a space or number + at the expense of memory. + <a href="https://sourceforge.net/p/scintilla/feature-requests/1259/">Feature #1259</a>. + </li> + <li> Fix flicker when inserting primary selection on GTK. <a href="https://sourceforge.net/p/scintilla/bugs/2087/">Bug #2087</a>. </li> diff --git a/include/Scintilla.h b/include/Scintilla.h index dc0a73b7e..c8ef2fa37 100644 --- a/include/Scintilla.h +++ b/include/Scintilla.h @@ -266,6 +266,8 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam, #define SCI_SETCARETPERIOD 2076 #define SCI_SETWORDCHARS 2077 #define SCI_GETWORDCHARS 2646 +#define SCI_SETCHARACTERCATEGORYOPTIMIZATION 2720 +#define SCI_GETCHARACTERCATEGORYOPTIMIZATION 2721 #define SCI_BEGINUNDOACTION 2078 #define SCI_ENDUNDOACTION 2079 #define INDIC_PLAIN 0 diff --git a/include/Scintilla.iface b/include/Scintilla.iface index aed70e7f3..cf2d01abc 100644 --- a/include/Scintilla.iface +++ b/include/Scintilla.iface @@ -610,6 +610,12 @@ set void SetWordChars=2077(, string characters) # Returns the number of characters get int GetWordChars=2646(, stringresult characters) +# Set the number of characters to have directly indexed categories +set void SetCharacterCategoryOptimization=2720(int countCharacters,) + +# Get the number of characters to have directly indexed categories +get int GetCharacterCategoryOptimization=2721(,) + # Start a sequence of actions that is undone and redone as a unit. # May be nested. 
fun void BeginUndoAction=2078(,) diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx index bc2fa2336..ca76325df 100644 --- a/lexlib/CharacterCategory.cxx +++ b/lexlib/CharacterCategory.cxx @@ -7,6 +7,7 @@ // Copyright 2013 by Neil Hodgson <neilh@scintilla.org> // The License.txt file describes the conditions under which this software may be distributed. +#include <vector> #include <algorithm> #include <iterator> @@ -3790,6 +3791,7 @@ const int catRanges[] = { 33554397, 33554460, 35651549, +35651613, //--Autogenerated -- end of section automatically generated }; @@ -3963,4 +3965,33 @@ bool IsXidContinue(int character) { } } +CharacterCategoryMap::CharacterCategoryMap() noexcept { + Optimize(256); +} + +int CharacterCategoryMap::Size() const noexcept { + return static_cast<int>(dense.size()); +} + +void CharacterCategoryMap::Optimize(int countCharacters) { + const int characters = std::clamp(countCharacters, 256, maxUnicode + 1); + dense.resize(characters); + + int end = 0; + int index = 0; + int current = catRanges[index]; + ++index; + do { + const int next = catRanges[index]; + const unsigned char category = current & maskCategory; + current >>= 5; + end = std::min(characters, next >> 5); + while (current < end) { + dense[current++] = category; + } + current = next; + ++index; + } while (characters > end); +} + } diff --git a/lexlib/CharacterCategory.h b/lexlib/CharacterCategory.h index 767d79670..d1ac39152 100644 --- a/lexlib/CharacterCategory.h +++ b/lexlib/CharacterCategory.h @@ -28,6 +28,23 @@ bool IsIdContinue(int character); bool IsXidStart(int character); bool IsXidContinue(int character); +class CharacterCategoryMap { +private: + std::vector<unsigned char> dense; +public: + CharacterCategoryMap() noexcept; + CharacterCategory CategoryFor(int character) const { + if (static_cast<size_t>(character) < dense.size()) { + return static_cast<CharacterCategory>(dense[character]); + } else { + // binary search through ranges + return 
CategoriseCharacter(character); + } + } + int Size() const noexcept; + void Optimize(int countCharacters); +}; + } #endif diff --git a/scripts/GenerateCharacterCategory.py b/scripts/GenerateCharacterCategory.py index 4596eec6a..ba6ac858d 100644 --- a/scripts/GenerateCharacterCategory.py +++ b/scripts/GenerateCharacterCategory.py @@ -17,17 +17,27 @@ def findCategories(filename): def updateCharacterCategory(filename): values = ["// Created with Python %s, Unicode %s" % ( platform.python_version(), unicodedata.unidata_version)] - category = unicodedata.category(chr(0)) + startRange = 0 + category = unicodedata.category(chr(startRange)) + table = [] for ch in range(sys.maxunicode): uch = chr(ch) - if unicodedata.category(uch) != category: + current = unicodedata.category(uch) + if current != category: value = startRange * 32 + categories.index(category) - values.append("%d," % value) - category = unicodedata.category(uch) + table.append(value) + category = current startRange = ch value = startRange * 32 + categories.index(category) - values.append("%d," % value) + table.append(value) + + # the sentinel value is used to simplify CharacterCategoryMap::Optimize() + category = 'Cn' + value = (sys.maxunicode + 1)*32 + categories.index(category) + table.append(value) + + values.extend(["%d," % value for value in table]) Regenerate(filename, "//", values) diff --git a/src/Document.cxx b/src/Document.cxx index 4df512b1d..72dae5f5b 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1707,7 +1707,7 @@ CharClassify::cc Document::WordCharacterClass(unsigned int ch) const { if (dbcsCodePage && (!UTF8IsAscii(ch))) { if (SC_CP_UTF8 == dbcsCodePage) { // Use hard coded Unicode class - const CharacterCategory cc = CategoriseCharacter(ch); + const CharacterCategory cc = charMap.CategoryFor(ch); switch (cc) { // Separator, Line/Paragraph @@ -2166,6 +2166,14 @@ int Document::GetCharsOfClass(CharClassify::cc characterClass, unsigned char *bu return 
charClass.GetCharsOfClass(characterClass, buffer); } +void Document::SetCharacterCategoryOptimization(int countCharacters) { + charMap.Optimize(countCharacters); +} + +int Document::CharacterCategoryOptimization() const noexcept { + return charMap.Size(); +} + void SCI_METHOD Document::StartStyling(Sci_Position position) { endStyled = position; } diff --git a/src/Document.h b/src/Document.h index de986ce2a..c5b078a72 100644 --- a/src/Document.h +++ b/src/Document.h @@ -230,6 +230,7 @@ private: int refCount; CellBuffer cb; CharClassify charClass; + CharacterCategoryMap charMap; std::unique_ptr<CaseFolder> pcf; Sci::Position endStyled; int styleClock; @@ -444,6 +445,8 @@ public: void SetDefaultCharClasses(bool includeWordClass); void SetCharClasses(const unsigned char *chars, CharClassify::cc newCharClass); int GetCharsOfClass(CharClassify::cc characterClass, unsigned char *buffer) const; + void SetCharacterCategoryOptimization(int countCharacters); + int CharacterCategoryOptimization() const noexcept; void SCI_METHOD StartStyling(Sci_Position position) override; bool SCI_METHOD SetStyleFor(Sci_Position length, char style) override; bool SCI_METHOD SetStyles(Sci_Position length, const char *styles) override; diff --git a/src/Editor.cxx b/src/Editor.cxx index 72660dd1b..4bae9b44f 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -6253,6 +6253,13 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) { pdoc->SetDefaultCharClasses(true); break; + case SCI_SETCHARACTERCATEGORYOPTIMIZATION: + pdoc->SetCharacterCategoryOptimization(static_cast<int>(wParam)); + break; + + case SCI_GETCHARACTERCATEGORYOPTIMIZATION: + return pdoc->CharacterCategoryOptimization(); + case SCI_GETLENGTH: return pdoc->Length(); diff --git a/test/simpleTests.py b/test/simpleTests.py index d51908baf..3af25247c 100644 --- a/test/simpleTests.py +++ b/test/simpleTests.py @@ -2282,6 +2282,11 @@ class TestWordChars(unittest.TestCase): data = self.ed.GetPunctuationChars(None) 
self.assertCharSetsEqual(data, expected) + def testCharacterCategoryOptimization(self): + self.assertEquals(self.ed.CharacterCategoryOptimization, 0x100) + self.ed.CharacterCategoryOptimization = 0x1000 + self.assertEquals(self.ed.CharacterCategoryOptimization, 0x1000) + class TestExplicitTabStops(unittest.TestCase): def setUp(self): |