1 files changed, 35 insertions, 0 deletions
diff --git a/scripts/GenerateCharacterCategory.py b/scripts/GenerateCharacterCategory.py
new file mode 100644
index 000000000..6e3ffd3ba
--- /dev/null
+++ b/scripts/GenerateCharacterCategory.py
@@ -0,0 +1,35 @@
+# Script to generate CharacterCategory.cxx from Python's Unicode data
+# Should be run rarely when a Python with a new version of Unicode data is available.
+# Should not be run with old versions of Python.
+
+import codecs, os, platform, sys, unicodedata
+
+from FileGenerator import Regenerate
+
+def findCategories(filename):
+    with codecs.open(filename, "r", "UTF-8") as infile:
+        lines = [x.strip() for x in infile.readlines() if "\tcc" in x]
+    values = "".join(lines).replace(" ","").split(",")
+    print(values)
+    return [v[2:] for v in values]
+
+def updateCharacterCategory(filename):
+    values = ["// Created with Python %s,  Unicode %s" % (
+        platform.python_version(), unicodedata.unidata_version)]
+    category = unicodedata.category(chr(0))
+    startRange = 0
+    for ch in range(sys.maxunicode):
+        uch = chr(ch)
+        if unicodedata.category(uch) != category:
+            value = startRange * 32 + categories.index(category)
+            values.append("%d," % value)
+            category = unicodedata.category(uch)
+            startRange = ch
+    value = startRange * 32 + categories.index(category)
+    values.append("%d," % value)
+
+    Regenerate(filename, "//", values)
+
+categories = findCategories("../lexlib/CharacterCategory.h")
+
+updateCharacterCategory("../lexlib/CharacterCategory.cxx")