Diffstat (limited to 'Tools/unicode')
-rw-r--r--   Tools/unicode/comparecodecs.py   |   2
-rw-r--r--   Tools/unicode/makeunicodedata.py | 271
2 files changed, 189 insertions(+), 84 deletions(-)
diff --git a/Tools/unicode/comparecodecs.py b/Tools/unicode/comparecodecs.py
index 0f5c1e24a3..7de14fdc27 100644
--- a/Tools/unicode/comparecodecs.py
+++ b/Tools/unicode/comparecodecs.py
@@ -14,7 +14,7 @@ def compare_codecs(encoding1, encoding2):
     print('Comparing encoding/decoding of %r and %r' % (encoding1, encoding2))
     mismatch = 0
     # Check encoding
-    for i in range(sys.maxunicode):
+    for i in range(sys.maxunicode+1):
         u = chr(i)
         try:
             c1 = u.encode(encoding1)
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index d50319024c..d9770979e1 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -21,11 +21,17 @@
 # 2004-05-29 perky add east asian width information
 # 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
 # 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
+# 2011-10-21 ezio add support for name aliases and named sequences
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
-import sys, os, zipfile
+import os
+import sys
+import zipfile
+
+from textwrap import dedent
+from operator import itemgetter
 
 SCRIPT = sys.argv[0]
 VERSION = "3.2"
@@ -39,6 +45,17 @@ UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
+NAME_ALIASES = "NameAliases%s.txt"
+NAMED_SEQUENCES = "NamedSequences%s.txt"
+
+# Private Use Areas -- in the BMP and in planes 15, 16
+PUA_1 = range(0xE000, 0xF900)
+PUA_15 = range(0xF0000, 0xFFFFE)
+PUA_16 = range(0x100000, 0x10FFFE)
+
+# we use these ranges of PUA_15 to store name aliases and named sequences
+NAME_ALIASES_START = 0xF0000
+NAMED_SEQUENCES_START = 0xF0100
 
 old_versions = ["3.2.0"]
@@ -692,6 +709,39 @@ def makeunicodename(unicode, trace):
     print("/* name->code dictionary */", file=fp)
     codehash.dump(fp, trace)
 
+    print(file=fp)
+    print('static const unsigned int aliases_start = %#x;' %
+          NAME_ALIASES_START, file=fp)
+    print('static const unsigned int aliases_end = %#x;' %
+          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)
+    print('static const unsigned int name_aliases[] = {', file=fp)
+    for name, codepoint in unicode.aliases:
+        print('    0x%04X,' % codepoint, file=fp)
+    print('};', file=fp)
+
+    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
+    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
+    # sequences or sequences with non-BMP chars are added.
+    # unicodedata_lookup should be adapted too.
+    print(dedent("""
+        typedef struct NamedSequence {
+            int seqlen;
+            Py_UCS2 seq[4];
+        } named_sequence;
+        """), file=fp)
+
+    print('static const unsigned int named_sequences_start = %#x;' %
+          NAMED_SEQUENCES_START, file=fp)
+    print('static const unsigned int named_sequences_end = %#x;' %
+          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
+    print('static const named_sequence named_sequences[] = {', file=fp)
+    for name, sequence in unicode.named_sequences:
+        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
+        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
+    print('};', file=fp)
+
     fp.close()
@@ -726,7 +776,11 @@ def merge_old_version(version, new, old):
             for k in range(len(old.table[i])):
                 if old.table[i][k] != new.table[i][k]:
                     value = old.table[i][k]
-                    if k == 2:
+                    if k == 1 and i in PUA_15:
+                        # the name is not set in the old.table, but in the
+                        # new.table we are using it for aliases and named seq
+                        assert value == ''
+                    elif k == 2:
                         #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                         category_changes[i] = CATEGORY_NAMES.index(value)
                     elif k == 4:
@@ -816,15 +870,15 @@ class UnicodeData:
                  expand=1,
                  cjk_check=True):
         self.changed = []
-        file = open_data(UNICODE_DATA, version)
         table = [None] * 0x110000
-        while 1:
-            s = file.readline()
-            if not s:
-                break
-            s = s.strip().split(";")
-            char = int(s[0], 16)
-            table[char] = s
+        with open_data(UNICODE_DATA, version) as file:
+            while 1:
+                s = file.readline()
+                if not s:
+                    break
+                s = s.strip().split(";")
+                char = int(s[0], 16)
+                table[char] = s
 
         cjk_ranges_found = []
@@ -855,32 +909,78 @@ class UnicodeData:
         self.table = table
         self.chars = list(range(0x110000)) # unicode 3.2
 
-        file = open_data(COMPOSITION_EXCLUSIONS, version)
+        # check for name aliases and named sequences, see #12753
+        # aliases and named sequences are not in 3.2.0
+        if version != '3.2.0':
+            self.aliases = []
+            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
+            # in order to take advantage of the compression and lookup
+            # algorithms used for the other characters
+            pua_index = NAME_ALIASES_START
+            with open_data(NAME_ALIASES, version) as file:
+                for s in file:
+                    s = s.strip()
+                    if not s or s.startswith('#'):
+                        continue
+                    char, name = s.split(';')
+                    char = int(char, 16)
+                    self.aliases.append((name, char))
+                    # also store the name in PUA 15
+                    self.table[pua_index][1] = name
+                    pua_index += 1
+            assert pua_index - NAME_ALIASES_START == len(self.aliases)
+
+            self.named_sequences = []
+            # store named sequences in PUA 15, in range U+F0100..,
+            # in order to take advantage of the compression and lookup
+            # algorithms used for the other characters.
+
+            pua_index = NAMED_SEQUENCES_START
+            with open_data(NAMED_SEQUENCES, version) as file:
+                for s in file:
+                    s = s.strip()
+                    if not s or s.startswith('#'):
+                        continue
+                    name, chars = s.split(';')
+                    chars = tuple(int(char, 16) for char in chars.split())
+                    # check that the structure defined in makeunicodename is OK
+                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
+                        "the NamedSequence struct and in unicodedata_lookup")
+                    self.named_sequences.append((name, chars))
+                    # also store these in PUA 15
+                    self.table[pua_index][1] = name
+                    pua_index += 1
+            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
+
         self.exclusions = {}
-        for s in file:
-            s = s.strip()
-            if not s:
-                continue
-            if s[0] == '#':
-                continue
-            char = int(s.split()[0],16)
-            self.exclusions[char] = 1
+        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
+            for s in file:
+                s = s.strip()
+                if not s:
+                    continue
+                if s[0] == '#':
+                    continue
+                char = int(s.split()[0],16)
+                self.exclusions[char] = 1
 
         widths = [None] * 0x110000
-        for s in open_data(EASTASIAN_WIDTH, version):
-            s = s.strip()
-            if not s:
-                continue
-            if s[0] == '#':
-                continue
-            s = s.split()[0].split(';')
-            if '..' in s[0]:
-                first, last = [int(c, 16) for c in s[0].split('..')]
-                chars = list(range(first, last+1))
-            else:
-                chars = [int(s[0], 16)]
-            for char in chars:
-                widths[char] = s[1]
+        with open_data(EASTASIAN_WIDTH, version) as file:
+            for s in file:
+                s = s.strip()
+                if not s:
+                    continue
+                if s[0] == '#':
+                    continue
+                s = s.split()[0].split(';')
+                if '..' in s[0]:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                    chars = list(range(first, last+1))
+                else:
+                    chars = [int(s[0], 16)]
+                for char in chars:
+                    widths[char] = s[1]
+
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(widths[i])
@@ -888,36 +988,39 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(set())
-        for s in open_data(DERIVED_CORE_PROPERTIES, version):
-            s = s.split('#', 1)[0].strip()
-            if not s:
-                continue
-            r, p = s.split(";")
-            r = r.strip()
-            p = p.strip()
-            if ".." in r:
-                first, last = [int(c, 16) for c in r.split('..')]
-                chars = list(range(first, last+1))
-            else:
-                chars = [int(r, 16)]
-            for char in chars:
-                if table[char]:
-                    # Some properties (e.g. Default_Ignorable_Code_Point)
-                    # apply to unassigned code points; ignore them
-                    table[char][-1].add(p)
-
-        for s in open_data(LINE_BREAK, version):
-            s = s.partition('#')[0]
-            s = [i.strip() for i in s.split(';')]
-            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
-                continue
-            if '..' not in s[0]:
-                first = last = int(s[0], 16)
-            else:
-                first, last = [int(c, 16) for c in s[0].split('..')]
-            for char in range(first, last+1):
-                table[char][-1].add('Line_Break')
+        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
+            for s in file:
+                s = s.split('#', 1)[0].strip()
+                if not s:
+                    continue
+
+                r, p = s.split(";")
+                r = r.strip()
+                p = p.strip()
+                if ".." in r:
+                    first, last = [int(c, 16) for c in r.split('..')]
+                    chars = list(range(first, last+1))
+                else:
+                    chars = [int(r, 16)]
+                for char in chars:
+                    if table[char]:
+                        # Some properties (e.g. Default_Ignorable_Code_Point)
+                        # apply to unassigned code points; ignore them
+                        table[char][-1].add(p)
+
+        with open_data(LINE_BREAK, version) as file:
+            for s in file:
+                s = s.partition('#')[0]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                    continue
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    table[char][-1].add('Line_Break')
 
         # We only want the quickcheck properties
         # Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -928,31 +1031,33 @@ class UnicodeData:
         # for older versions, and no delta records will be created.
         quickchecks = [0] * 0x110000
         qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
-            if '#' in s:
-                s = s[:s.index('#')]
-            s = [i.strip() for i in s.split(';')]
-            if len(s) < 2 or s[1] not in qc_order:
-                continue
-            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-            quickcheck_shift = qc_order.index(s[1])*2
-            quickcheck <<= quickcheck_shift
-            if '..' not in s[0]:
-                first = last = int(s[0], 16)
-            else:
-                first, last = [int(c, 16) for c in s[0].split('..')]
-            for char in range(first, last+1):
-                assert not (quickchecks[char]>>quickcheck_shift)&3
-                quickchecks[char] |= quickcheck
+        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
+            for s in file:
+                if '#' in s:
+                    s = s[:s.index('#')]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in qc_order:
+                    continue
+                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+                quickcheck_shift = qc_order.index(s[1])*2
+                quickcheck <<= quickcheck_shift
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    assert not (quickchecks[char]>>quickcheck_shift)&3
+                    quickchecks[char] |= quickcheck
         for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])
 
-        zip = zipfile.ZipFile(open_data(UNIHAN, version))
-        if version == '3.2.0':
-            data = zip.open('Unihan-3.2.0.txt').read()
-        else:
-            data = zip.open('Unihan_NumericValues.txt').read()
+        with open_data(UNIHAN, version) as file:
+            zip = zipfile.ZipFile(file)
+            if version == '3.2.0':
+                data = zip.open('Unihan-3.2.0.txt').read()
+            else:
+                data = zip.open('Unihan_NumericValues.txt').read()
         for line in data.decode("utf-8").splitlines():
             if not line.startswith('U+'):
                 continue
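
The comparecodecs.py hunk fixes an off-by-one: range() excludes its stop value, so the old loop stopped at U+10FFFE and never compared the last code point. A minimal check (Python 3.3+, where sys.maxunicode is always 0x10FFFF):

    import sys

    assert sys.maxunicode == 0x10FFFF
    assert 0x10FFFF not in range(sys.maxunicode)       # old loop missed it
    assert 0x10FFFF in range(sys.maxunicode + 1)       # fixed loop covers it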
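
The makeunicodedata.py changes park name aliases and named sequences at otherwise-unassigned Plane 15 Private Use Area code points (U+F0000.. for aliases, U+F0100.. for named sequences), so the existing name compression and hash lookup can index them like ordinary characters; the generated aliases_start/aliases_end and named_sequences_start/named_sequences_end bounds then let the C side translate a PUA slot back to the real character(s), which is what the "unicodedata_lookup should be adapted too" comment refers to. The following is only an illustrative Python sketch of that translation, not the actual unicodedata.c code; the two-entry tables are made-up stand-ins for the generated arrays:

    # Illustrative stand-ins for the generated C arrays; the real contents
    # are produced from NameAliases.txt and NamedSequences.txt.
    NAME_ALIASES_START = 0xF0000
    NAMED_SEQUENCES_START = 0xF0100

    name_aliases = [0x01A2, 0x01A3]                  # alias slot -> real code point
    named_sequences = [(2, (0x0100, 0x0300, 0, 0)),  # (seqlen, seq[4]), mirroring
                       (2, (0x0101, 0x0300, 0, 0))]  # the named_sequence struct

    def resolve(code):
        """Translate a code point found by the name->code hash: PUA-15 slots
        stand in for aliases and named sequences, anything else is itself."""
        if NAME_ALIASES_START <= code < NAME_ALIASES_START + len(name_aliases):
            return chr(name_aliases[code - NAME_ALIASES_START])
        if (NAMED_SEQUENCES_START <= code
                < NAMED_SEQUENCES_START + len(named_sequences)):
            seqlen, seq = named_sequences[code - NAMED_SEQUENCES_START]
            return ''.join(chr(c) for c in seq[:seqlen])
        return chr(code)

    print(ascii(resolve(0xF0001)))  # '\u01a3' -- second alias slot
    print(ascii(resolve(0xF0100)))  # '\u0100\u0300' -- first named sequence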
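
Once the unicodedata module is rebuilt from the generated tables, the effect is visible from Python. A sketch assuming a Python 3.3+ build with Unicode 6.0 data; the alias and sequence names below are taken from NameAliases.txt and NamedSequences.txt:

    import unicodedata

    # Aliases resolve through lookup() and \N{...}: U+01A3 is formally
    # LATIN SMALL LETTER OI; Unicode 6.0 adds the alias LATIN SMALL LETTER GHA.
    assert unicodedata.lookup('LATIN SMALL LETTER GHA') == '\u01a3'
    assert '\N{LATIN SMALL LETTER GHA}' == '\u01a3'

    # Named sequences resolve to multi-character strings.
    assert (unicodedata.lookup('LATIN SMALL LETTER A WITH MACRON AND GRAVE')
            == '\u0101\u0300')

    # The PUA-15 slots used for storage must not leak back out:
    # they still have no name of their own.
    try:
        unicodedata.name('\U000F0000')
    except ValueError:
        pass  # expected -- no such name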