diff options
-rw-r--r-- | lexlib/CharacterCategory.cxx | 15 |
1 files changed, 13 insertions, 2 deletions
diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx index a75551ccc..a83776028 100644 --- a/lexlib/CharacterCategory.cxx +++ b/lexlib/CharacterCategory.cxx @@ -18,7 +18,7 @@ namespace Scintilla { namespace { // Use an unnamed namespace to protect the declarations from name conflicts -static int catRanges[] = { +const int catRanges[] = { //++Autogenerated -- start of section automatically generated // Created with Python 3.3.0, Unicode 6.1.0 25, @@ -3275,14 +3275,25 @@ static int catRanges[] = { const int maxUnicode = 0x10ffff; const int maskCategory = 0x1F; +const int nRanges = sizeof(catRanges) / sizeof(catRanges[0]); } +// Each element in catRanges is the start of a range of Unicode characters in +// one general category. +// The value is comprised of a 21-bit character value shifted 5 bits and a 5-bit +// category matching the CharacterCategory enumeration. +// Initial version has 3249 entries and adds about 13K to the executable. +// The array is in ascending order so can be searched using binary search. +// Therefore the average call takes log2(3249) = 12 comparisons. +// For speed, it may be useful to make a linear table for the common values, +// possibly for 0..0xff for most Western European text or 0..0xfff for most +// alphabetic languages. + CharacterCategory CategoriseCharacter(int character) { if (character < 0 || character > maxUnicode) return ccCn; const int baseValue = character * (maskCategory+1) + maskCategory; - const int nRanges = sizeof(catRanges) / sizeof(catRanges[0]); const int *placeAfter = std::lower_bound(catRanges, catRanges+nRanges, baseValue); return static_cast<CharacterCategory>(*(placeAfter-1) & maskCategory); } |