aboutsummaryrefslogtreecommitdiffhomepage
path: root/lexlib/CharacterCategory.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'lexlib/CharacterCategory.cxx')
-rw-r--r--lexlib/CharacterCategory.cxx15
1 files changed, 13 insertions, 2 deletions
diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx
index a75551ccc..a83776028 100644
--- a/lexlib/CharacterCategory.cxx
+++ b/lexlib/CharacterCategory.cxx
@@ -18,7 +18,7 @@ namespace Scintilla {
namespace {
// Use an unnamed namespace to protect the declarations from name conflicts
-static int catRanges[] = {
+const int catRanges[] = {
//++Autogenerated -- start of section automatically generated
// Created with Python 3.3.0, Unicode 6.1.0
25,
@@ -3275,14 +3275,25 @@ static int catRanges[] = {
const int maxUnicode = 0x10ffff;
const int maskCategory = 0x1F;
+const int nRanges = sizeof(catRanges) / sizeof(catRanges[0]);
}
+// Each element in catRanges is the start of a range of Unicode characters in
+// one general category.
+// The value is comprised of a 21-bit character value shifted 5 bits and a 5 bit
+// category matching the CharacterCategory enumeration.
+// Initial version has 3249 entries and adds about 13K to the executable.
+// The array is in ascending order so can be searched using binary search.
+// Therefore the average call takes about log2(3249), i.e. roughly 12, comparisons.
+// For speed, it may be useful to make a linear table for the common values,
+// possibly for 0..0xff for most Western European text or 0..0xfff for most
+// alphabetic languages.
+
CharacterCategory CategoriseCharacter(int character) {
if (character < 0 || character > maxUnicode)
return ccCn;
const int baseValue = character * (maskCategory+1) + maskCategory;
- const int nRanges = sizeof(catRanges) / sizeof(catRanges[0]);
const int *placeAfter = std::lower_bound(catRanges, catRanges+nRanges, baseValue);
return static_cast<CharacterCategory>(*(placeAfter-1) & maskCategory);
}