From 86266d4700632860705fc2d4e88d4be4f5228be1 Mon Sep 17 00:00:00 2001
From: mitchell <unknown>
Date: Tue, 16 Apr 2019 22:50:17 -0400
Subject: Backport: Feature [feature-requests:#1259]. Add
 SCI_SETCHARACTERCATEGORYOPTIMIZATION API to optimize speed of character
 category features.

Backport of changeset 7392:2832adedd0f4, but with added includes for Sci::clamp().
---
 curses/ScintillaCurses.cxx           |  1 +
 doc/ScintillaDoc.html                | 11 +++++++++++
 doc/ScintillaHistory.html            |  6 ++++++
 include/Scintilla.h                  |  2 ++
 include/Scintilla.iface              |  6 ++++++
 lexlib/CharacterCategory.cxx         | 33 +++++++++++++++++++++++++++++++++
 lexlib/CharacterCategory.h           | 17 +++++++++++++++++
 scripts/GenerateCharacterCategory.py | 20 +++++++++++++++-----
 src/Document.cxx                     | 10 +++++++++-
 src/Document.h                       |  3 +++
 src/Editor.cxx                       |  7 +++++++
 test/simpleTests.py                  |  5 +++++
 12 files changed, 115 insertions(+), 6 deletions(-)
diff --git a/curses/ScintillaCurses.cxx b/curses/ScintillaCurses.cxx
index 1c69801d1..a39ccbec3 100644
--- a/curses/ScintillaCurses.cxx
+++ b/curses/ScintillaCurses.cxx
@@ -22,6 +22,7 @@
 #include "ILoader.h"
 #include "ILexer.h"
 #include "Scintilla.h"
+#include "CharacterCategory.h"
 #include "Position.h"
 #include "UniqueString.h"
 #include "SplitVector.h"
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html
index 76acc4c71..0bd7c2243 100644
--- a/doc/ScintillaDoc.html
+++ b/doc/ScintillaDoc.html
@@ -2467,6 +2467,8 @@ struct Sci_TextToFind {
      <a class="message" href="#SCI_SETPUNCTUATIONCHARS">SCI_SETPUNCTUATIONCHARS(&lt;unused&gt;, const char *characters)</a><br />
      <a class="message" href="#SCI_GETPUNCTUATIONCHARS">SCI_GETPUNCTUATIONCHARS(&lt;unused&gt;, char *characters) &rarr; int</a><br />
      <a class="message" href="#SCI_SETCHARSDEFAULT">SCI_SETCHARSDEFAULT</a><br />
+     <a class="message" href="#SCI_SETCHARACTERCATEGORYOPTIMIZATION">SCI_SETCHARACTERCATEGORYOPTIMIZATION(int countCharacters)</a><br />
+     <a class="message" href="#SCI_GETCHARACTERCATEGORYOPTIMIZATION">SCI_GETCHARACTERCATEGORYOPTIMIZATION &rarr; int</a><br />
 
     <p><b id="SCI_WORDENDPOSITION">SCI_WORDENDPOSITION(int pos, bool onlyWordCharacters) &rarr; int</b><br />
      <b id="SCI_WORDSTARTPOSITION">SCI_WORDSTARTPOSITION(int pos, bool onlyWordCharacters) &rarr; int</b><br />
@@ -2594,6 +2596,15 @@ struct Sci_TextToFind {
      characters with codes less than 0x20, with word characters set to alphanumeric and '_'.
     </p>
 
+    <p><b id="SCI_SETCHARACTERCATEGORYOPTIMIZATION">SCI_SETCHARACTERCATEGORYOPTIMIZATION(int countCharacters)</b><br />
+    <b id="SCI_GETCHARACTERCATEGORYOPTIMIZATION">SCI_GETCHARACTERCATEGORYOPTIMIZATION &rarr; int</b><br />
+      Optimize speed of character category features like determining whether a character is a space or number at the expense of memory.
+      Mostly used for Unicode documents.
+      The countCharacters parameter determines how many characters starting from 0 are added to a look-up table with one byte used for each character.
+      It is reasonable to cover the set of characters likely to be used in a document so 0x100 for simple Roman text,
+      0x1000 to cover most simple alphabets, 0x10000 to cover most East Asian languages, and 0x110000 to cover all possible characters.
+    </p>
+
     <p>Word keyboard commands are:</p>
     <ul>
     <li class="message" id="SCI_WORDLEFT">SCI_WORDLEFT</li>
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 343aa7966..1a6b34f49 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -560,6 +560,12 @@
 	Implement high-priority idle on Win32 to make redraw smoother and more efficient.
 	</li>
  	<li>
+	Add SCI_SETCHARACTERCATEGORYOPTIMIZATION API to optimize speed
+	of character category features like determining whether a character is a space or number
+	at the expense of memory.
+	<a href="https://sourceforge.net/p/scintilla/feature-requests/1259/">Feature #1259</a>.
+	</li>
+ 	<li>
 	Fix flicker when inserting primary selection on GTK.
 	<a href="https://sourceforge.net/p/scintilla/bugs/2087/">Bug #2087</a>.
 	</li>
diff --git a/include/Scintilla.h b/include/Scintilla.h
index cb3259842..31115ad0c 100644
--- a/include/Scintilla.h
+++ b/include/Scintilla.h
@@ -266,6 +266,8 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,
 #define SCI_SETCARETPERIOD 2076
 #define SCI_SETWORDCHARS 2077
 #define SCI_GETWORDCHARS 2646
+#define SCI_SETCHARACTERCATEGORYOPTIMIZATION 2720
+#define SCI_GETCHARACTERCATEGORYOPTIMIZATION 2721
 #define SCI_BEGINUNDOACTION 2078
 #define SCI_ENDUNDOACTION 2079
 #define INDIC_PLAIN 0
diff --git a/include/Scintilla.iface b/include/Scintilla.iface
index a26f7057d..a3820445a 100644
--- a/include/Scintilla.iface
+++ b/include/Scintilla.iface
@@ -610,6 +610,12 @@ set void SetWordChars=2077(, string characters)
 # Returns the number of characters
 get int GetWordChars=2646(, stringresult characters)
 
+# Set the number of characters to have directly indexed categories
+set void SetCharacterCategoryOptimization=2720(int countCharacters,)
+
+# Get the number of characters to have directly indexed categories
+get int GetCharacterCategoryOptimization=2721(,)
+
 # Start a sequence of actions that is undone and redone as a unit.
 # May be nested.
 fun void BeginUndoAction=2078(,)
diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx
index bc2fa2336..19c44cabe 100644
--- a/lexlib/CharacterCategory.cxx
+++ b/lexlib/CharacterCategory.cxx
@@ -7,10 +7,13 @@
 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
 // The License.txt file describes the conditions under which this software may be distributed.
 
+#include <vector>
 #include <algorithm>
 #include <iterator>
 
+#include "Scintilla.h" // for ptrdiff_t in Position.h
 #include "CharacterCategory.h"
+#include "Position.h" // for Sci::clamp
 
 namespace Scintilla {
 
@@ -3790,6 +3793,7 @@ const int catRanges[] = {
 33554397,
 33554460,
 35651549,
+35651613,
 //--Autogenerated -- end of section automatically generated
 };
 
@@ -3963,4 +3967,33 @@ bool IsXidContinue(int character) {
 	}
 }
 
+CharacterCategoryMap::CharacterCategoryMap() noexcept {
+	Optimize(256);
+}
+
+int CharacterCategoryMap::Size() const noexcept {
+	return static_cast<int>(dense.size());
+}
+
+void CharacterCategoryMap::Optimize(int countCharacters) {
+	const int characters = Sci::clamp(countCharacters, 256, maxUnicode + 1);
+	dense.resize(characters);
+
+	int end = 0;
+	int index = 0;
+	int current = catRanges[index];
+	++index;
+	do {
+		const int next = catRanges[index];
+		const unsigned char category = current & maskCategory;
+		current >>= 5;
+		end = std::min(characters, next >> 5);
+		while (current < end) {
+			dense[current++] = category;
+		}
+		current = next;
+		++index;
+	} while (characters > end);
+}
+
 }
diff --git a/lexlib/CharacterCategory.h b/lexlib/CharacterCategory.h
index 767d79670..d1ac39152 100644
--- a/lexlib/CharacterCategory.h
+++ b/lexlib/CharacterCategory.h
@@ -28,6 +28,23 @@ bool IsIdContinue(int character);
 bool IsXidStart(int character);
 bool IsXidContinue(int character);
 
+class CharacterCategoryMap {
+private:
+	std::vector<unsigned char> dense;
+public:
+	CharacterCategoryMap() noexcept;
+	CharacterCategory CategoryFor(int character) const {
+		if (static_cast<size_t>(character) < dense.size()) {
+			return static_cast<CharacterCategory>(dense[character]);
+		} else {
+			// binary search through ranges
+			return CategoriseCharacter(character);
+		}
+	}
+	int Size() const noexcept;
+	void Optimize(int countCharacters);
+};
+
 }
 
 #endif
diff --git a/scripts/GenerateCharacterCategory.py b/scripts/GenerateCharacterCategory.py
index 4596eec6a..ba6ac858d 100644
--- a/scripts/GenerateCharacterCategory.py
+++ b/scripts/GenerateCharacterCategory.py
@@ -17,17 +17,27 @@ def findCategories(filename):
 def updateCharacterCategory(filename):
     values = ["// Created with Python %s,  Unicode %s" % (
         platform.python_version(), unicodedata.unidata_version)]
-    category = unicodedata.category(chr(0))
+
     startRange = 0
+    category = unicodedata.category(chr(startRange))
+    table = []
     for ch in range(sys.maxunicode):
         uch = chr(ch)
-        if unicodedata.category(uch) != category:
+        current = unicodedata.category(uch)
+        if current != category:
             value = startRange * 32 + categories.index(category)
-            values.append("%d," % value)
-            category = unicodedata.category(uch)
+            table.append(value)
+            category = current
             startRange = ch
     value = startRange * 32 + categories.index(category)
-    values.append("%d," % value)
+    table.append(value)
+
+    # the sentinel value is used to simplify CharacterCategoryMap::Optimize()
+    category = 'Cn'
+    value = (sys.maxunicode + 1)*32 + categories.index(category)
+    table.append(value)
+
+    values.extend(["%d," % value for value in table])
 
     Regenerate(filename, "//", values)
 
diff --git a/src/Document.cxx b/src/Document.cxx
index 4aad2e370..dd11ae42d 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -1710,7 +1710,7 @@ CharClassify::cc Document::WordCharacterClass(unsigned int ch) const {
 	if (dbcsCodePage && (!UTF8IsAscii(ch))) {
 		if (SC_CP_UTF8 == dbcsCodePage) {
 			// Use hard coded Unicode class
-			const CharacterCategory cc = CategoriseCharacter(ch);
+			const CharacterCategory cc = charMap.CategoryFor(ch);
 			switch (cc) {
 
 				// Separator, Line/Paragraph
@@ -2169,6 +2169,14 @@ int Document::GetCharsOfClass(CharClassify::cc characterClass, unsigned char *bu
     return charClass.GetCharsOfClass(characterClass, buffer);
 }
 
+void Document::SetCharacterCategoryOptimization(int countCharacters) {
+	charMap.Optimize(countCharacters);
+}
+
+int Document::CharacterCategoryOptimization() const noexcept {
+	return charMap.Size();
+}
+
 void SCI_METHOD Document::StartStyling(Sci_Position position, char) {
 	endStyled = position;
 }
diff --git a/src/Document.h b/src/Document.h
index f97e1393f..adbdc3413 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -230,6 +230,7 @@ private:
 	int refCount;
 	CellBuffer cb;
 	CharClassify charClass;
+	CharacterCategoryMap charMap;
 	std::unique_ptr<CaseFolder> pcf;
 	Sci::Position endStyled;
 	int styleClock;
@@ -444,6 +445,8 @@ public:
 	void SetDefaultCharClasses(bool includeWordClass);
 	void SetCharClasses(const unsigned char *chars, CharClassify::cc newCharClass);
 	int GetCharsOfClass(CharClassify::cc characterClass, unsigned char *buffer) const;
+	void SetCharacterCategoryOptimization(int countCharacters);
+	int CharacterCategoryOptimization() const noexcept;
 	void SCI_METHOD StartStyling(Sci_Position position, char mask) override;
 	bool SCI_METHOD SetStyleFor(Sci_Position length, char style) override;
 	bool SCI_METHOD SetStyles(Sci_Position length, const char *styles) override;
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 75c5d14a0..a9df59f96 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -6249,6 +6249,13 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
 		pdoc->SetDefaultCharClasses(true);
 		break;
 
+	case SCI_SETCHARACTERCATEGORYOPTIMIZATION:
+		pdoc->SetCharacterCategoryOptimization(static_cast<int>(wParam));
+		break;
+
+	case SCI_GETCHARACTERCATEGORYOPTIMIZATION:
+		return pdoc->CharacterCategoryOptimization();
+
 	case SCI_GETLENGTH:
 		return pdoc->Length();
 
diff --git a/test/simpleTests.py b/test/simpleTests.py
index d51908baf..3af25247c 100644
--- a/test/simpleTests.py
+++ b/test/simpleTests.py
@@ -2282,6 +2282,11 @@ class TestWordChars(unittest.TestCase):
 		data = self.ed.GetPunctuationChars(None)
 		self.assertCharSetsEqual(data, expected)
 
+	def testCharacterCategoryOptimization(self):
+		self.assertEquals(self.ed.CharacterCategoryOptimization, 0x100)
+		self.ed.CharacterCategoryOptimization = 0x1000
+		self.assertEquals(self.ed.CharacterCategoryOptimization, 0x1000)
+
 class TestExplicitTabStops(unittest.TestCase):
 
 	def setUp(self):
-- 
cgit v1.2.3