diff options
author | Neil <nyamatongwe@gmail.com> | 2013-07-11 10:43:40 +1000 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2013-07-11 10:43:40 +1000 |
commit | 5262f832018923ecc8ccd25bedecbd5a291a563f (patch) | |
tree | d14831d0e08cd988dd8425674a29d514326c99f4 /scripts | |
parent | 8ee42f850d492efbec73969eca52243e1acd1c96 (diff) | |
download | scintilla-mirror-5262f832018923ecc8ccd25bedecbd5a291a563f.tar.gz |
Include case conversion data in Scintilla so that all platforms will perform
case conversion of Unicode text in accordance with Unicode.
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/GenerateCaseConvert.py | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/scripts/GenerateCaseConvert.py b/scripts/GenerateCaseConvert.py new file mode 100644 index 000000000..841cdcc51 --- /dev/null +++ b/scripts/GenerateCaseConvert.py @@ -0,0 +1,122 @@ +# Script to generate CaseConvert.cxx from Python's Unicode data +# Should be run rarely when a Python with a new version of Unicode data is available. +# Should not be run with old versions of Python. + +# Current best approach divides case conversions into two cases: +# simple symmetric and complex. +# Simple symmetric is where a lower and upper case pair convert to each +# other and the folded form is the same as the lower case. +# There are 1006 symmetric pairs. +# These are further divided into ranges (stored as lower, upper, range length, +# range pitch and singletons (stored as lower, upper). +# Complex is for cases that don't fit the above: where there are multiple +# characters in one of the forms or fold is different to lower or +# lower(upper(x)) or upper(lower(x)) are not x. These are represented as UTF-8 +# strings with original, folded, upper, and lower separated by '|'. +# There are 126 complex cases. + +import codecs, itertools, os, string, sys, unicodedata + +from FileGenerator import Regenerate + +def contiguousRanges(l, diff): + # l is s list of lists + # group into lists where first element of each element differs by diff + out = [[l[0]]] + for s in l[1:]: + if s[0] != out[-1][-1][0] + diff: + out.append([]) + out[-1].append(s) + return out + +def flatten(listOfLists): + "Flatten one level of nesting" + return itertools.chain.from_iterable(listOfLists) + +def conversionSets(): + # For all Unicode characters, see whether they have case conversions + # Return 2 sets: one of simple symmetric conversion cases and another + # with complex cases. + complexes = [] + symmetrics = [] + for ch in range(sys.maxunicode): + if ch >= 0xd800 and ch <= 0xDBFF: + continue + if ch >= 0xdc00 and ch <= 0xDFFF: + continue + uch = chr(ch) + + fold = uch.casefold() + upper = uch.upper() + lower = uch.lower() + symmetric = False + if uch != upper and len(upper) == 1 and uch == lower and uch == fold: + lowerUpper = upper.lower() + foldUpper = upper.casefold() + if lowerUpper == foldUpper and lowerUpper == uch: + symmetric = True + symmetrics.append((ch, ord(upper), ch - ord(upper))) + if uch != lower and len(lower) == 1 and uch == upper and lower == fold: + upperLower = lower.upper() + if upperLower == uch: + symmetric = True + + if fold == uch: + fold = "" + if upper == uch: + upper = "" + if lower == uch: + lower = "" + + if (fold or upper or lower) and not symmetric: + complexes.append((uch, fold, upper, lower)) + + return symmetrics, complexes + +def groupRanges(symmetrics): + # Group the symmetrics into groups where possible, returning a list + # of ranges and a list of symmetrics that didn't fit into a range + + def distance(s): + return s[2] + + groups = [] + uniquekeys = [] + for k, g in itertools.groupby(symmetrics, distance): + groups.append(list(g)) # Store group iterator as a list + uniquekeys.append(k) + + contiguousGroups = flatten([contiguousRanges(g, 1) for g in groups]) + longGroups = [(x[0][0], x[0][1], len(x), 1) for x in contiguousGroups if len(x) > 4] + + oneDiffs = [s for s in symmetrics if s[2] == 1] + contiguousOnes = flatten([contiguousRanges(g, 2) for g in [oneDiffs]]) + longOneGroups = [(x[0][0], x[0][1], len(x), 2) for x in contiguousOnes if len(x) > 4] + + rangeGroups = sorted(longGroups+longOneGroups, key=lambda s: s[0]) + + rangeCoverage = list(flatten([range(r[0], r[0]+r[2]*r[3], r[3]) for r in rangeGroups])) + + nonRanges = [(l, u) for l, u, d in symmetrics if l not in rangeCoverage] + + return rangeGroups, nonRanges + +def updateCaseConvert(): + symmetrics, complexes = conversionSets() + + rangeGroups, nonRanges = groupRanges(symmetrics) + + print(len(rangeGroups), "ranges") + rangeLines = ["%d,%d,%d,%d, " % x for x in rangeGroups] + + print(len(nonRanges), "non ranges") + nonRangeLines = ["%d,%d, " % x for x in nonRanges] + + print(len(symmetrics), "symmetric") + + complexLines = ['"%s|%s|%s|%s|"' % x for x in complexes] + print(len(complexLines), "complex") + + Regenerate("../src/CaseConvert.cxx", "//", rangeLines, nonRangeLines, complexLines) + +updateCaseConvert() |