aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorNeil <nyamatongwe@gmail.com>2019-03-29 09:05:14 +1100
committerNeil <nyamatongwe@gmail.com>2019-03-29 09:05:14 +1100
commit6d1bf18357b56cbec7d289c9c9434a7a8888b386 (patch)
treee03227ff253d79d62c42b331c2b66d32b81dc9c0
parent62b31d42a2ee02b4992134d325fab6f297729094 (diff)
downloadscintilla-mirror-6d1bf18357b56cbec7d289c9c9434a7a8888b386.tar.gz
Feature [feature-requests:#1259]. Add SCI_SETCHARACTERCATEGORYOPTIMIZATION API
to optimize speed of character category features.
-rw-r--r--doc/ScintillaDoc.html11
-rw-r--r--doc/ScintillaHistory.html6
-rw-r--r--include/Scintilla.h2
-rw-r--r--include/Scintilla.iface6
-rw-r--r--lexlib/CharacterCategory.cxx31
-rw-r--r--lexlib/CharacterCategory.h17
-rw-r--r--scripts/GenerateCharacterCategory.py20
-rw-r--r--src/Document.cxx10
-rw-r--r--src/Document.h3
-rw-r--r--src/Editor.cxx7
-rw-r--r--test/simpleTests.py5
11 files changed, 112 insertions, 6 deletions
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html
index 47d057ff4..e91f8ec3b 100644
--- a/doc/ScintillaDoc.html
+++ b/doc/ScintillaDoc.html
@@ -2470,6 +2470,8 @@ struct Sci_TextToFind {
<a class="message" href="#SCI_SETPUNCTUATIONCHARS">SCI_SETPUNCTUATIONCHARS(&lt;unused&gt;, const char *characters)</a><br />
<a class="message" href="#SCI_GETPUNCTUATIONCHARS">SCI_GETPUNCTUATIONCHARS(&lt;unused&gt;, char *characters) &rarr; int</a><br />
<a class="message" href="#SCI_SETCHARSDEFAULT">SCI_SETCHARSDEFAULT</a><br />
+ <a class="message" href="#SCI_SETCHARACTERCATEGORYOPTIMIZATION">SCI_SETCHARACTERCATEGORYOPTIMIZATION(int countCharacters)</a><br />
+ <a class="message" href="#SCI_GETCHARACTERCATEGORYOPTIMIZATION">SCI_GETCHARACTERCATEGORYOPTIMIZATION &rarr; int</a><br />
<p><b id="SCI_WORDENDPOSITION">SCI_WORDENDPOSITION(int pos, bool onlyWordCharacters) &rarr; int</b><br />
<b id="SCI_WORDSTARTPOSITION">SCI_WORDSTARTPOSITION(int pos, bool onlyWordCharacters) &rarr; int</b><br />
@@ -2597,6 +2599,15 @@ struct Sci_TextToFind {
characters with codes less than 0x20, with word characters set to alphanumeric and '_'.
</p>
+ <p><b id="SCI_SETCHARACTERCATEGORYOPTIMIZATION">SCI_SETCHARACTERCATEGORYOPTIMIZATION(int countCharacters)</b><br />
+ <b id="SCI_GETCHARACTERCATEGORYOPTIMIZATION">SCI_GETCHARACTERCATEGORYOPTIMIZATION &rarr; int</b><br />
+ Optimize speed of character category features like determining whether a character is a space or number at the expense of memory.
+ Mostly used for Unicode documents.
+ The countCharacters parameter determines how many characters, starting from 0, are added to a look-up table with one byte used for each character.
+ It is reasonable to cover the set of characters likely to be used in a document: 0x100 for simple Roman text,
+ 0x1000 to cover most simple alphabets, 0x10000 to cover most East Asian languages, and 0x110000 to cover all possible characters.
+ </p>
+
<p>Word keyboard commands are:</p>
<ul>
<li class="message" id="SCI_WORDLEFT">SCI_WORDLEFT</li>
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 6f08fe033..8e8d734dd 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -571,6 +571,12 @@
Implement high-priority idle on Win32 to make redraw smoother and more efficient.
</li>
<li>
+ Add SCI_SETCHARACTERCATEGORYOPTIMIZATION API to optimize speed
+ of character category features like determining whether a character is a space or number
+ at the expense of memory.
+ <a href="https://sourceforge.net/p/scintilla/feature-requests/1259/">Feature #1259</a>.
+ </li>
+ <li>
Fix flicker when inserting primary selection on GTK.
<a href="https://sourceforge.net/p/scintilla/bugs/2087/">Bug #2087</a>.
</li>
diff --git a/include/Scintilla.h b/include/Scintilla.h
index dc0a73b7e..c8ef2fa37 100644
--- a/include/Scintilla.h
+++ b/include/Scintilla.h
@@ -266,6 +266,8 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,
#define SCI_SETCARETPERIOD 2076
#define SCI_SETWORDCHARS 2077
#define SCI_GETWORDCHARS 2646
+#define SCI_SETCHARACTERCATEGORYOPTIMIZATION 2720
+#define SCI_GETCHARACTERCATEGORYOPTIMIZATION 2721
#define SCI_BEGINUNDOACTION 2078
#define SCI_ENDUNDOACTION 2079
#define INDIC_PLAIN 0
diff --git a/include/Scintilla.iface b/include/Scintilla.iface
index aed70e7f3..cf2d01abc 100644
--- a/include/Scintilla.iface
+++ b/include/Scintilla.iface
@@ -610,6 +610,12 @@ set void SetWordChars=2077(, string characters)
# Returns the number of characters
get int GetWordChars=2646(, stringresult characters)
+# Set the number of characters to have directly indexed categories
+set void SetCharacterCategoryOptimization=2720(int countCharacters,)
+
+# Get the number of characters to have directly indexed categories
+get int GetCharacterCategoryOptimization=2721(,)
+
# Start a sequence of actions that is undone and redone as a unit.
# May be nested.
fun void BeginUndoAction=2078(,)
diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx
index bc2fa2336..ca76325df 100644
--- a/lexlib/CharacterCategory.cxx
+++ b/lexlib/CharacterCategory.cxx
@@ -7,6 +7,7 @@
// Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
+#include <vector>
#include <algorithm>
#include <iterator>
@@ -3790,6 +3791,7 @@ const int catRanges[] = {
33554397,
33554460,
35651549,
+35651613,
//--Autogenerated -- end of section automatically generated
};
@@ -3963,4 +3965,33 @@ bool IsXidContinue(int character) {
}
}
+CharacterCategoryMap::CharacterCategoryMap() noexcept { // Construct with a dense table covering the first 256 characters.
+ Optimize(256); // NOTE(review): Optimize() is not declared noexcept and resizes a std::vector; if that ever throws, this noexcept constructor would terminate the program - confirm this is acceptable.
+}
+
+int CharacterCategoryMap::Size() const noexcept { // Report how many characters the dense look-up table currently covers.
+ return static_cast<int>(dense.size());
+}
+
+void CharacterCategoryMap::Optimize(int countCharacters) { // Resize the dense table to countCharacters entries (after clamping) and refill it from catRanges.
+ const int characters = std::clamp(countCharacters, 256, maxUnicode + 1); // Never fewer than 256 entries and never more than maxUnicode + 1.
+ dense.resize(characters);
+
+ int end = 0;
+ int index = 0;
+ int current = catRanges[index]; // Each catRanges entry packs a range start (shifted left by 5) with a category in the low bits.
+ ++index;
+ do {
+ const int next = catRanges[index]; // Entry that begins the following range.
+ const unsigned char category = current & maskCategory;
+ current >>= 5; // Drop the category bits, leaving the first character of this range.
+ end = std::min(characters, next >> 5); // Fill up to the start of the next range but not past the end of the table.
+ while (current < end) {
+ dense[current++] = category;
+ }
+ current = next;
+ ++index;
+ } while (characters > end); // Stop once the table is full; the generator script appends a sentinel entry so a following range is always available.
+}
+
}
diff --git a/lexlib/CharacterCategory.h b/lexlib/CharacterCategory.h
index 767d79670..d1ac39152 100644
--- a/lexlib/CharacterCategory.h
+++ b/lexlib/CharacterCategory.h
@@ -28,6 +28,23 @@ bool IsIdContinue(int character);
bool IsXidStart(int character);
bool IsXidContinue(int character);
+class CharacterCategoryMap { // Looks up character categories, using a directly indexed table for low character values.
+private:
+ std::vector<unsigned char> dense; // One byte per covered character, holding its category value.
+public:
+ CharacterCategoryMap() noexcept;
+ CharacterCategory CategoryFor(int character) const { // Return the category of character, from the table when it is covered.
+ if (static_cast<size_t>(character) < dense.size()) { // The unsigned comparison also routes negative values to the fallback.
+ return static_cast<CharacterCategory>(dense[character]);
+ } else {
+ // binary search through ranges
+ return CategoriseCharacter(character);
+ }
+ }
+ int Size() const noexcept; // Number of entries currently held in the table.
+ void Optimize(int countCharacters); // Resize the table to cover countCharacters characters and refill it.
+};
+
}
#endif
diff --git a/scripts/GenerateCharacterCategory.py b/scripts/GenerateCharacterCategory.py
index 4596eec6a..ba6ac858d 100644
--- a/scripts/GenerateCharacterCategory.py
+++ b/scripts/GenerateCharacterCategory.py
@@ -17,17 +17,27 @@ def findCategories(filename):
def updateCharacterCategory(filename):
values = ["// Created with Python %s, Unicode %s" % (
platform.python_version(), unicodedata.unidata_version)]
- category = unicodedata.category(chr(0))
+
startRange = 0
+ category = unicodedata.category(chr(startRange))
+ table = []
for ch in range(sys.maxunicode):
uch = chr(ch)
- if unicodedata.category(uch) != category:
+ current = unicodedata.category(uch)
+ if current != category:
value = startRange * 32 + categories.index(category)
- values.append("%d," % value)
- category = unicodedata.category(uch)
+ table.append(value)
+ category = current
startRange = ch
value = startRange * 32 + categories.index(category)
- values.append("%d," % value)
+ table.append(value)
+
+ # the sentinel value is used to simplify CharacterCategoryMap::Optimize()
+ category = 'Cn'
+ value = (sys.maxunicode + 1)*32 + categories.index(category)
+ table.append(value)
+
+ values.extend(["%d," % value for value in table])
Regenerate(filename, "//", values)
diff --git a/src/Document.cxx b/src/Document.cxx
index 4df512b1d..72dae5f5b 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1707,7 +1707,7 @@ CharClassify::cc Document::WordCharacterClass(unsigned int ch) const {
if (dbcsCodePage && (!UTF8IsAscii(ch))) {
if (SC_CP_UTF8 == dbcsCodePage) {
// Use hard coded Unicode class
- const CharacterCategory cc = CategoriseCharacter(ch);
+ const CharacterCategory cc = charMap.CategoryFor(ch);
switch (cc) {
// Separator, Line/Paragraph
@@ -2166,6 +2166,14 @@ int Document::GetCharsOfClass(CharClassify::cc characterClass, unsigned char *bu
return charClass.GetCharsOfClass(characterClass, buffer);
}
+void Document::SetCharacterCategoryOptimization(int countCharacters) { // Set how many characters this document's category map indexes directly.
+ charMap.Optimize(countCharacters); // Forwards to the map, which rebuilds its look-up table.
+}
+
+int Document::CharacterCategoryOptimization() const noexcept { // Return how many characters the category map currently indexes directly.
+ return charMap.Size();
+}
+
void SCI_METHOD Document::StartStyling(Sci_Position position) {
endStyled = position;
}
diff --git a/src/Document.h b/src/Document.h
index de986ce2a..c5b078a72 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -230,6 +230,7 @@ private:
int refCount;
CellBuffer cb;
CharClassify charClass;
+ CharacterCategoryMap charMap;
std::unique_ptr<CaseFolder> pcf;
Sci::Position endStyled;
int styleClock;
@@ -444,6 +445,8 @@ public:
void SetDefaultCharClasses(bool includeWordClass);
void SetCharClasses(const unsigned char *chars, CharClassify::cc newCharClass);
int GetCharsOfClass(CharClassify::cc characterClass, unsigned char *buffer) const;
+ void SetCharacterCategoryOptimization(int countCharacters);
+ int CharacterCategoryOptimization() const noexcept;
void SCI_METHOD StartStyling(Sci_Position position) override;
bool SCI_METHOD SetStyleFor(Sci_Position length, char style) override;
bool SCI_METHOD SetStyles(Sci_Position length, const char *styles) override;
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 72660dd1b..4bae9b44f 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -6253,6 +6253,13 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
pdoc->SetDefaultCharClasses(true);
break;
+ case SCI_SETCHARACTERCATEGORYOPTIMIZATION:
+ pdoc->SetCharacterCategoryOptimization(static_cast<int>(wParam));
+ break;
+
+ case SCI_GETCHARACTERCATEGORYOPTIMIZATION:
+ return pdoc->CharacterCategoryOptimization();
+
case SCI_GETLENGTH:
return pdoc->Length();
diff --git a/test/simpleTests.py b/test/simpleTests.py
index d51908baf..3af25247c 100644
--- a/test/simpleTests.py
+++ b/test/simpleTests.py
@@ -2282,6 +2282,11 @@ class TestWordChars(unittest.TestCase):
data = self.ed.GetPunctuationChars(None)
self.assertCharSetsEqual(data, expected)
+ def testCharacterCategoryOptimization(self): # Checks the default CharacterCategoryOptimization value and that a new value reads back.
+ self.assertEquals(self.ed.CharacterCategoryOptimization, 0x100) # Value reported before anything is set. NOTE(review): assertEquals is a deprecated alias of assertEqual.
+ self.ed.CharacterCategoryOptimization = 0x1000
+ self.assertEquals(self.ed.CharacterCategoryOptimization, 0x1000) # The getter reflects the value just set.
+
class TestExplicitTabStops(unittest.TestCase):
def setUp(self):