aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--curses/ScintillaCurses.cxx1
-rw-r--r--doc/ScintillaDoc.html11
-rw-r--r--doc/ScintillaHistory.html6
-rw-r--r--include/Scintilla.h2
-rw-r--r--include/Scintilla.iface6
-rw-r--r--lexlib/CharacterCategory.cxx33
-rw-r--r--lexlib/CharacterCategory.h17
-rw-r--r--scripts/GenerateCharacterCategory.py20
-rw-r--r--src/Document.cxx10
-rw-r--r--src/Document.h3
-rw-r--r--src/Editor.cxx7
-rw-r--r--test/simpleTests.py5
12 files changed, 115 insertions, 6 deletions
diff --git a/curses/ScintillaCurses.cxx b/curses/ScintillaCurses.cxx
index 1c69801d1..a39ccbec3 100644
--- a/curses/ScintillaCurses.cxx
+++ b/curses/ScintillaCurses.cxx
@@ -22,6 +22,7 @@
#include "ILoader.h"
#include "ILexer.h"
#include "Scintilla.h"
+#include "CharacterCategory.h"
#include "Position.h"
#include "UniqueString.h"
#include "SplitVector.h"
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html
index 76acc4c71..0bd7c2243 100644
--- a/doc/ScintillaDoc.html
+++ b/doc/ScintillaDoc.html
@@ -2467,6 +2467,8 @@ struct Sci_TextToFind {
<a class="message" href="#SCI_SETPUNCTUATIONCHARS">SCI_SETPUNCTUATIONCHARS(&lt;unused&gt;, const char *characters)</a><br />
<a class="message" href="#SCI_GETPUNCTUATIONCHARS">SCI_GETPUNCTUATIONCHARS(&lt;unused&gt;, char *characters) &rarr; int</a><br />
<a class="message" href="#SCI_SETCHARSDEFAULT">SCI_SETCHARSDEFAULT</a><br />
+ <a class="message" href="#SCI_SETCHARACTERCATEGORYOPTIMIZATION">SCI_SETCHARACTERCATEGORYOPTIMIZATION(int countCharacters)</a><br />
+ <a class="message" href="#SCI_GETCHARACTERCATEGORYOPTIMIZATION">SCI_GETCHARACTERCATEGORYOPTIMIZATION &rarr; int</a><br />
<p><b id="SCI_WORDENDPOSITION">SCI_WORDENDPOSITION(int pos, bool onlyWordCharacters) &rarr; int</b><br />
<b id="SCI_WORDSTARTPOSITION">SCI_WORDSTARTPOSITION(int pos, bool onlyWordCharacters) &rarr; int</b><br />
@@ -2594,6 +2596,15 @@ struct Sci_TextToFind {
characters with codes less than 0x20, with word characters set to alphanumeric and '_'.
</p>
+ <p><b id="SCI_SETCHARACTERCATEGORYOPTIMIZATION">SCI_SETCHARACTERCATEGORYOPTIMIZATION(int countCharacters)</b><br />
+ <b id="SCI_GETCHARACTERCATEGORYOPTIMIZATION">SCI_GETCHARACTERCATEGORYOPTIMIZATION &rarr; int</b><br />
+ Optimize speed of character category features like determining whether a character is a space or number at the expense of memory.
+ Mostly used for Unicode documents.
+ The countCharacters parameter determines how many characters starting from 0 are added to a look-up table with one byte used for each character.
+ It is reasonable to cover the set of characters likely to be used in a document so 0x100 for simple Roman text,
+ 0x1000 to cover most simple alphabets, 0x10000 to cover most East Asian languages, and 0x110000 to cover all possible characters.
+ </p>
+
<p>Word keyboard commands are:</p>
<ul>
<li class="message" id="SCI_WORDLEFT">SCI_WORDLEFT</li>
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 343aa7966..1a6b34f49 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -560,6 +560,12 @@
Implement high-priority idle on Win32 to make redraw smoother and more efficient.
</li>
<li>
+ Add SCI_SETCHARACTERCATEGORYOPTIMIZATION API to optimize speed
+ of character category features like determining whether a character is a space or number
+ at the expense of memory.
+ <a href="https://sourceforge.net/p/scintilla/feature-requests/1259/">Feature #1259</a>.
+ </li>
+ <li>
Fix flicker when inserting primary selection on GTK.
<a href="https://sourceforge.net/p/scintilla/bugs/2087/">Bug #2087</a>.
</li>
diff --git a/include/Scintilla.h b/include/Scintilla.h
index cb3259842..31115ad0c 100644
--- a/include/Scintilla.h
+++ b/include/Scintilla.h
@@ -266,6 +266,8 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,
#define SCI_SETCARETPERIOD 2076
#define SCI_SETWORDCHARS 2077
#define SCI_GETWORDCHARS 2646
+#define SCI_SETCHARACTERCATEGORYOPTIMIZATION 2720
+#define SCI_GETCHARACTERCATEGORYOPTIMIZATION 2721
#define SCI_BEGINUNDOACTION 2078
#define SCI_ENDUNDOACTION 2079
#define INDIC_PLAIN 0
diff --git a/include/Scintilla.iface b/include/Scintilla.iface
index a26f7057d..a3820445a 100644
--- a/include/Scintilla.iface
+++ b/include/Scintilla.iface
@@ -610,6 +610,12 @@ set void SetWordChars=2077(, string characters)
# Returns the number of characters
get int GetWordChars=2646(, stringresult characters)
+# Set the number of characters to have directly indexed categories
+set void SetCharacterCategoryOptimization=2720(int countCharacters,)
+
+# Get the number of characters to have directly indexed categories
+get int GetCharacterCategoryOptimization=2721(,)
+
# Start a sequence of actions that is undone and redone as a unit.
# May be nested.
fun void BeginUndoAction=2078(,)
diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx
index bc2fa2336..19c44cabe 100644
--- a/lexlib/CharacterCategory.cxx
+++ b/lexlib/CharacterCategory.cxx
@@ -7,10 +7,13 @@
// Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
+#include <vector>
#include <algorithm>
#include <iterator>
+#include "Scintilla.h" // for ptrdiff_t in Position.h
#include "CharacterCategory.h"
+#include "Position.h" // for Sci::clamp
namespace Scintilla {
@@ -3790,6 +3793,7 @@ const int catRanges[] = {
33554397,
33554460,
35651549,
+35651613,
//--Autogenerated -- end of section automatically generated
};
@@ -3963,4 +3967,33 @@ bool IsXidContinue(int character) {
}
}
+CharacterCategoryMap::CharacterCategoryMap() noexcept { // Build the initial dense table covering the first 256 characters.
+ Optimize(256); // NOTE(review): Optimize() resizes a std::vector, which may throw; in a noexcept constructor that would call std::terminate - confirm this is acceptable.
+}
+
+int CharacterCategoryMap::Size() const noexcept { // Number of characters currently held in the dense look-up table.
+ return static_cast<int>(dense.size());
+}
+
+void CharacterCategoryMap::Optimize(int countCharacters) { // Resize the dense table to countCharacters entries (clamped) and refill it from catRanges.
+ const int characters = Sci::clamp(countCharacters, 256, maxUnicode + 1); // never fewer than 256 entries, never more than maxUnicode + 1
+ dense.resize(characters);
+
+ int end = 0;
+ int index = 0;
+ int current = catRanges[index]; // each entry is generated as (first character of range) * 32 + category
+ ++index;
+ do {
+ const int next = catRanges[index]; // the start of the following range bounds the current one
+ const unsigned char category = current & maskCategory;
+ current >>= 5; // discard the category bits, leaving the first character of the range
+ end = std::min(characters, next >> 5); // do not fill past the end of the table
+ while (current < end) {
+ dense[current++] = category;
+ }
+ current = next;
+ ++index;
+ } while (characters > end); // stops once the table is full; for a full-size table this happens at the sentinel entry the generator appends at maxUnicode + 1
+}
+
}
diff --git a/lexlib/CharacterCategory.h b/lexlib/CharacterCategory.h
index 767d79670..d1ac39152 100644
--- a/lexlib/CharacterCategory.h
+++ b/lexlib/CharacterCategory.h
@@ -28,6 +28,23 @@ bool IsIdContinue(int character);
bool IsXidStart(int character);
bool IsXidContinue(int character);
+class CharacterCategoryMap { // Character-to-category look-up: a directly indexed table for low characters, CategoriseCharacter() for the rest.
+private:
+ std::vector<unsigned char> dense; // one byte per character, indexed by character value
+public:
+ CharacterCategoryMap() noexcept;
+ CharacterCategory CategoryFor(int character) const { // Category of character, read from the table when it is covered.
+ if (static_cast<size_t>(character) < dense.size()) { // unsigned comparison: a negative character converts to a large size_t and takes the fallback path
+ return static_cast<CharacterCategory>(dense[character]);
+ } else {
+ // binary search through ranges
+ return CategoriseCharacter(character);
+ }
+ }
+ int Size() const noexcept; // number of characters held in the dense table
+ void Optimize(int countCharacters); // resize and refill the dense table
+};
+
}
#endif
diff --git a/scripts/GenerateCharacterCategory.py b/scripts/GenerateCharacterCategory.py
index 4596eec6a..ba6ac858d 100644
--- a/scripts/GenerateCharacterCategory.py
+++ b/scripts/GenerateCharacterCategory.py
@@ -17,17 +17,27 @@ def findCategories(filename):
def updateCharacterCategory(filename):
values = ["// Created with Python %s, Unicode %s" % (
platform.python_version(), unicodedata.unidata_version)]
- category = unicodedata.category(chr(0))
+
startRange = 0
+ category = unicodedata.category(chr(startRange))
+ table = []
for ch in range(sys.maxunicode):
uch = chr(ch)
- if unicodedata.category(uch) != category:
+ current = unicodedata.category(uch)
+ if current != category:
value = startRange * 32 + categories.index(category)
- values.append("%d," % value)
- category = unicodedata.category(uch)
+ table.append(value)
+ category = current
startRange = ch
value = startRange * 32 + categories.index(category)
- values.append("%d," % value)
+ table.append(value)
+
+ # the sentinel value is used to simplify CharacterCategoryMap::Optimize()
+ category = 'Cn'
+ value = (sys.maxunicode + 1)*32 + categories.index(category)
+ table.append(value)
+
+ values.extend(["%d," % value for value in table])
Regenerate(filename, "//", values)
diff --git a/src/Document.cxx b/src/Document.cxx
index 4aad2e370..dd11ae42d 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1710,7 +1710,7 @@ CharClassify::cc Document::WordCharacterClass(unsigned int ch) const {
if (dbcsCodePage && (!UTF8IsAscii(ch))) {
if (SC_CP_UTF8 == dbcsCodePage) {
// Use hard coded Unicode class
- const CharacterCategory cc = CategoriseCharacter(ch);
+ const CharacterCategory cc = charMap.CategoryFor(ch);
switch (cc) {
// Separator, Line/Paragraph
@@ -2169,6 +2169,14 @@ int Document::GetCharsOfClass(CharClassify::cc characterClass, unsigned char *bu
return charClass.GetCharsOfClass(characterClass, buffer);
}
+void Document::SetCharacterCategoryOptimization(int countCharacters) { // Pass the requested table size on to this document's charMap.
+ charMap.Optimize(countCharacters);
+}
+
+int Document::CharacterCategoryOptimization() const noexcept { // Report how many characters charMap currently indexes directly.
+ return charMap.Size();
+}
+
void SCI_METHOD Document::StartStyling(Sci_Position position, char) {
endStyled = position;
}
diff --git a/src/Document.h b/src/Document.h
index f97e1393f..adbdc3413 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -230,6 +230,7 @@ private:
int refCount;
CellBuffer cb;
CharClassify charClass;
+ CharacterCategoryMap charMap;
std::unique_ptr<CaseFolder> pcf;
Sci::Position endStyled;
int styleClock;
@@ -444,6 +445,8 @@ public:
void SetDefaultCharClasses(bool includeWordClass);
void SetCharClasses(const unsigned char *chars, CharClassify::cc newCharClass);
int GetCharsOfClass(CharClassify::cc characterClass, unsigned char *buffer) const;
+ void SetCharacterCategoryOptimization(int countCharacters);
+ int CharacterCategoryOptimization() const noexcept;
void SCI_METHOD StartStyling(Sci_Position position, char mask) override;
bool SCI_METHOD SetStyleFor(Sci_Position length, char style) override;
bool SCI_METHOD SetStyles(Sci_Position length, const char *styles) override;
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 75c5d14a0..a9df59f96 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -6249,6 +6249,13 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
pdoc->SetDefaultCharClasses(true);
break;
+ case SCI_SETCHARACTERCATEGORYOPTIMIZATION:
+ pdoc->SetCharacterCategoryOptimization(static_cast<int>(wParam));
+ break;
+
+ case SCI_GETCHARACTERCATEGORYOPTIMIZATION:
+ return pdoc->CharacterCategoryOptimization();
+
case SCI_GETLENGTH:
return pdoc->Length();
diff --git a/test/simpleTests.py b/test/simpleTests.py
index d51908baf..3af25247c 100644
--- a/test/simpleTests.py
+++ b/test/simpleTests.py
@@ -2282,6 +2282,11 @@ class TestWordChars(unittest.TestCase):
data = self.ed.GetPunctuationChars(None)
self.assertCharSetsEqual(data, expected)
+ def testCharacterCategoryOptimization(self):
+ self.assertEquals(self.ed.CharacterCategoryOptimization, 0x100)
+ self.ed.CharacterCategoryOptimization = 0x1000
+ self.assertEquals(self.ed.CharacterCategoryOptimization, 0x1000)
+
class TestExplicitTabStops(unittest.TestCase):
def setUp(self):