Diffstat (limited to 'Tools/unicode')
-rw-r--r--   Tools/unicode/comparecodecs.py   |   2
-rw-r--r--   Tools/unicode/makeunicodedata.py | 271
2 files changed, 189 insertions(+), 84 deletions(-)
diff --git a/Tools/unicode/comparecodecs.py b/Tools/unicode/comparecodecs.py
index 0f5c1e24a3..7de14fdc27 100644
--- a/Tools/unicode/comparecodecs.py
+++ b/Tools/unicode/comparecodecs.py
@@ -14,7 +14,7 @@ def compare_codecs(encoding1, encoding2):
     print('Comparing encoding/decoding of %r and %r' % (encoding1, encoding2))
     mismatch = 0
     # Check encoding
-    for i in range(sys.maxunicode):
+    for i in range(sys.maxunicode+1):
         u = chr(i)
         try:
             c1 = u.encode(encoding1)
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index d50319024c..d9770979e1 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -21,11 +21,17 @@
 # 2004-05-29 perky add east asian width information
 # 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
 # 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
+# 2011-10-21 ezio add support for name aliases and named sequences
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
-import sys, os, zipfile
+import os
+import sys
+import zipfile
+
+from textwrap import dedent
+from operator import itemgetter
 
 SCRIPT = sys.argv[0]
 VERSION = "3.2"
@@ -39,6 +45,17 @@ UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
+NAME_ALIASES = "NameAliases%s.txt"
+NAMED_SEQUENCES = "NamedSequences%s.txt"
+
+# Private Use Areas -- in the BMP and in planes 15, 16
+PUA_1 = range(0xE000, 0xF900)
+PUA_15 = range(0xF0000, 0xFFFFE)
+PUA_16 = range(0x100000, 0x10FFFE)
+
+# we use these ranges of PUA_15 to store name aliases and named sequences
+NAME_ALIASES_START = 0xF0000
+NAMED_SEQUENCES_START = 0xF0100
 
 old_versions = ["3.2.0"]
@@ -692,6 +709,39 @@ def makeunicodename(unicode, trace):
     print("/* name->code dictionary */", file=fp)
     codehash.dump(fp, trace)
 
+    print(file=fp)
+    print('static const unsigned int aliases_start = %#x;' %
+          NAME_ALIASES_START, file=fp)
+    print('static const unsigned int aliases_end = %#x;' %
+          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)
+    print('static const unsigned int name_aliases[] = {', file=fp)
+    for name, codepoint in unicode.aliases:
+        print('    0x%04X,' % codepoint, file=fp)
+    print('};', file=fp)
+
+    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
+    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
+    # sequences or sequences with non-BMP chars are added.
+    # unicodedata_lookup should be adapted too.
+    print(dedent("""
+        typedef struct NamedSequence {
+            int seqlen;
+            Py_UCS2 seq[4];
+        } named_sequence;
+        """), file=fp)
+
+    print('static const unsigned int named_sequences_start = %#x;' %
+          NAMED_SEQUENCES_START, file=fp)
+    print('static const unsigned int named_sequences_end = %#x;' %
+          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
+    print('static const named_sequence named_sequences[] = {', file=fp)
+    for name, sequence in unicode.named_sequences:
+        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
+        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
+    print('};', file=fp)
+
     fp.close()
@@ -726,7 +776,11 @@ def merge_old_version(version, new, old):
             for k in range(len(old.table[i])):
                 if old.table[i][k] != new.table[i][k]:
                     value = old.table[i][k]
-                    if k == 2:
+                    if k == 1 and i in PUA_15:
+                        # the name is not set in the old.table, but in the
+                        # new.table we are using it for aliases and named seq
+                        assert value == ''
+                    elif k == 2:
                         #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                         category_changes[i] = CATEGORY_NAMES.index(value)
                     elif k == 4:
@@ -816,15 +870,15 @@ class UnicodeData:
                  expand=1,
                  cjk_check=True):
         self.changed = []
-        file = open_data(UNICODE_DATA, version)
         table = [None] * 0x110000
-        while 1:
-            s = file.readline()
-            if not s:
-                break
-            s = s.strip().split(";")
-            char = int(s[0], 16)
-            table[char] = s
+        with open_data(UNICODE_DATA, version) as file:
+            while 1:
+                s = file.readline()
+                if not s:
+                    break
+                s = s.strip().split(";")
+                char = int(s[0], 16)
+                table[char] = s
 
         cjk_ranges_found = []
@@ -855,32 +909,78 @@ class UnicodeData:
         self.table = table
         self.chars = list(range(0x110000)) # unicode 3.2
 
-        file = open_data(COMPOSITION_EXCLUSIONS, version)
+        # check for name aliases and named sequences, see #12753
+        # aliases and named sequences are not in 3.2.0
+        if version != '3.2.0':
+            self.aliases = []
+            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
+            # in order to take advantage of the compression and lookup
+            # algorithms used for the other characters
+            pua_index = NAME_ALIASES_START
+            with open_data(NAME_ALIASES, version) as file:
+                for s in file:
+                    s = s.strip()
+                    if not s or s.startswith('#'):
+                        continue
+                    char, name = s.split(';')
+                    char = int(char, 16)
+                    self.aliases.append((name, char))
+                    # also store the name in PUA 15
+                    self.table[pua_index][1] = name
+                    pua_index += 1
+            assert pua_index - NAME_ALIASES_START == len(self.aliases)
+
+            self.named_sequences = []
+            # store named sequences in PUA 15, in range U+F0100..,
+            # in order to take advantage of the compression and lookup
+            # algorithms used for the other characters.
+
+            pua_index = NAMED_SEQUENCES_START
+            with open_data(NAMED_SEQUENCES, version) as file:
+                for s in file:
+                    s = s.strip()
+                    if not s or s.startswith('#'):
+                        continue
+                    name, chars = s.split(';')
+                    chars = tuple(int(char, 16) for char in chars.split())
+                    # check that the structure defined in makeunicodename is OK
+                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
+                        "the NamedSequence struct and in unicodedata_lookup")
+                    self.named_sequences.append((name, chars))
+                    # also store these in PUA 15
+                    self.table[pua_index][1] = name
+                    pua_index += 1
+            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
+
         self.exclusions = {}
-        for s in file:
-            s = s.strip()
-            if not s:
-                continue
-            if s[0] == '#':
-                continue
-            char = int(s.split()[0],16)
-            self.exclusions[char] = 1
+        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
+            for s in file:
+                s = s.strip()
+                if not s:
+                    continue
+                if s[0] == '#':
+                    continue
+                char = int(s.split()[0],16)
+                self.exclusions[char] = 1
 
         widths = [None] * 0x110000
-        for s in open_data(EASTASIAN_WIDTH, version):
-            s = s.strip()
-            if not s:
-                continue
-            if s[0] == '#':
-                continue
-            s = s.split()[0].split(';')
-            if '..' in s[0]:
-                first, last = [int(c, 16) for c in s[0].split('..')]
-                chars = list(range(first, last+1))
-            else:
-                chars = [int(s[0], 16)]
-            for char in chars:
-                widths[char] = s[1]
+        with open_data(EASTASIAN_WIDTH, version) as file:
+            for s in file:
+                s = s.strip()
+                if not s:
+                    continue
+                if s[0] == '#':
+                    continue
+                s = s.split()[0].split(';')
+                if '..' in s[0]:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                    chars = list(range(first, last+1))
+                else:
+                    chars = [int(s[0], 16)]
+                for char in chars:
+                    widths[char] = s[1]
+
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(widths[i])
@@ -888,36 +988,39 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(set())
-        for s in open_data(DERIVED_CORE_PROPERTIES, version):
-            s = s.split('#', 1)[0].strip()
-            if not s:
-                continue
-            r, p = s.split(";")
-            r = r.strip()
-            p = p.strip()
-            if ".." in r:
-                first, last = [int(c, 16) for c in r.split('..')]
-                chars = list(range(first, last+1))
-            else:
-                chars = [int(r, 16)]
-            for char in chars:
-                if table[char]:
-                    # Some properties (e.g. Default_Ignorable_Code_Point)
-                    # apply to unassigned code points; ignore them
-                    table[char][-1].add(p)
-
-        for s in open_data(LINE_BREAK, version):
-            s = s.partition('#')[0]
-            s = [i.strip() for i in s.split(';')]
-            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
-                continue
-            if '..' not in s[0]:
-                first = last = int(s[0], 16)
-            else:
-                first, last = [int(c, 16) for c in s[0].split('..')]
-            for char in range(first, last+1):
-                table[char][-1].add('Line_Break')
+        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
+            for s in file:
+                s = s.split('#', 1)[0].strip()
+                if not s:
+                    continue
+
+                r, p = s.split(";")
+                r = r.strip()
+                p = p.strip()
+                if ".." in r:
+                    first, last = [int(c, 16) for c in r.split('..')]
+                    chars = list(range(first, last+1))
+                else:
+                    chars = [int(r, 16)]
+                for char in chars:
+                    if table[char]:
+                        # Some properties (e.g. Default_Ignorable_Code_Point)
+                        # apply to unassigned code points; ignore them
+                        table[char][-1].add(p)
+
+        with open_data(LINE_BREAK, version) as file:
+            for s in file:
+                s = s.partition('#')[0]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                    continue
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    table[char][-1].add('Line_Break')
 
         # We only want the quickcheck properties
         # Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -928,31 +1031,33 @@ class UnicodeData:
         # for older versions, and no delta records will be created.
         quickchecks = [0] * 0x110000
         qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
-            if '#' in s:
-                s = s[:s.index('#')]
-            s = [i.strip() for i in s.split(';')]
-            if len(s) < 2 or s[1] not in qc_order:
-                continue
-            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-            quickcheck_shift = qc_order.index(s[1])*2
-            quickcheck <<= quickcheck_shift
-            if '..' not in s[0]:
-                first = last = int(s[0], 16)
-            else:
-                first, last = [int(c, 16) for c in s[0].split('..')]
-            for char in range(first, last+1):
-                assert not (quickchecks[char]>>quickcheck_shift)&3
-                quickchecks[char] |= quickcheck
+        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
+            for s in file:
+                if '#' in s:
+                    s = s[:s.index('#')]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in qc_order:
+                    continue
+                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+                quickcheck_shift = qc_order.index(s[1])*2
+                quickcheck <<= quickcheck_shift
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    assert not (quickchecks[char]>>quickcheck_shift)&3
+                    quickchecks[char] |= quickcheck
         for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])
 
-        zip = zipfile.ZipFile(open_data(UNIHAN, version))
-        if version == '3.2.0':
-            data = zip.open('Unihan-3.2.0.txt').read()
-        else:
-            data = zip.open('Unihan_NumericValues.txt').read()
+        with open_data(UNIHAN, version) as file:
+            zip = zipfile.ZipFile(file)
+            if version == '3.2.0':
+                data = zip.open('Unihan-3.2.0.txt').read()
+            else:
+                data = zip.open('Unihan_NumericValues.txt').read()
         for line in data.decode("utf-8").splitlines():
             if not line.startswith('U+'):
                 continue
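
The comparecodecs.py hunk fixes an off-by-one: range() excludes its stop value, so the old loop stopped at U+10FFFE and never compared the last code point. A minimal check (Python 3.3+, where sys.maxunicode is always 0x10FFFF):

    import sys

    assert sys.maxunicode == 0x10FFFF
    assert 0x10FFFF not in range(sys.maxunicode)       # old loop missed it
    assert 0x10FFFF in range(sys.maxunicode + 1)       # fixed loop covers it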
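
The makeunicodedata.py changes park name aliases and named sequences at otherwise-unassigned Plane 15 Private Use Area code points (U+F0000.. for aliases, U+F0100.. for named sequences), so the existing name compression and hash lookup can index them like ordinary characters; the generated aliases_start/aliases_end and named_sequences_start/named_sequences_end bounds then let the C side translate a PUA slot back to the real character(s), which is what the "unicodedata_lookup should be adapted too" comment refers to. The following is only an illustrative Python sketch of that translation, not the actual unicodedata.c code; the two-entry tables are made-up stand-ins for the generated arrays:

    # Illustrative stand-ins for the generated C arrays; the real contents
    # are produced from NameAliases.txt and NamedSequences.txt.
    NAME_ALIASES_START = 0xF0000
    NAMED_SEQUENCES_START = 0xF0100

    name_aliases = [0x01A2, 0x01A3]                  # alias slot -> real code point
    named_sequences = [(2, (0x0100, 0x0300, 0, 0)),  # (seqlen, seq[4]), mirroring
                       (2, (0x0101, 0x0300, 0, 0))]  # the named_sequence struct

    def resolve(code):
        """Translate a code point found by the name->code hash: PUA-15 slots
        stand in for aliases and named sequences, anything else is itself."""
        if NAME_ALIASES_START <= code < NAME_ALIASES_START + len(name_aliases):
            return chr(name_aliases[code - NAME_ALIASES_START])
        if (NAMED_SEQUENCES_START <= code
                < NAMED_SEQUENCES_START + len(named_sequences)):
            seqlen, seq = named_sequences[code - NAMED_SEQUENCES_START]
            return ''.join(chr(c) for c in seq[:seqlen])
        return chr(code)

    print(ascii(resolve(0xF0001)))  # '\u01a3' -- second alias slot
    print(ascii(resolve(0xF0100)))  # '\u0100\u0300' -- first named sequence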
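
Once the unicodedata module is rebuilt from the generated tables, the effect is visible from Python. A sketch assuming a Python 3.3+ build with Unicode 6.0 data; the alias and sequence names below are taken from NameAliases.txt and NamedSequences.txt:

    import unicodedata

    # Aliases resolve through lookup() and \N{...}: U+01A3 is formally
    # LATIN SMALL LETTER OI; Unicode 6.0 adds the alias LATIN SMALL LETTER GHA.
    assert unicodedata.lookup('LATIN SMALL LETTER GHA') == '\u01a3'
    assert '\N{LATIN SMALL LETTER GHA}' == '\u01a3'

    # Named sequences resolve to multi-character strings.
    assert (unicodedata.lookup('LATIN SMALL LETTER A WITH MACRON AND GRAVE')
            == '\u0101\u0300')

    # The PUA-15 slots used for storage must not leak back out:
    # they still have no name of their own.
    try:
        unicodedata.name('\U000F0000')
    except ValueError:
        pass  # expected -- no such name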