diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2017-02-24 17:30:30 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2017-02-24 17:30:30 +0000 |
commit | 8037f71d03b3cd8919248f38448a0a2d3715c18c (patch) | |
tree | 0fb6d719cb178fa234f4acc0029c5b0a38b5ec50 /maint | |
parent | e7991eb5273b5b4162656f4b3d32e68a7430805a (diff) | |
download | pcre-8037f71d03b3cd8919248f38448a0a2d3715c18c.tar.gz |
Fix Unicode property crash for 32-bit characters greater than 0x10ffff.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1688 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'maint')
-rwxr-xr-x | maint/MultiStage2.py | 106 |
1 file changed, 61 insertions, 45 deletions
diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py index 44ad80c..9a75759 100755 --- a/maint/MultiStage2.py +++ b/maint/MultiStage2.py @@ -1,5 +1,7 @@ #! /usr/bin/python +# WARNING! This is a python 2 script. + # Multistage table builder # (c) Peter Kankowski, 2008 @@ -15,10 +17,10 @@ # ./MultiStage2.py >../pcre_ucd.c # # It requires four Unicode data tables, DerivedGeneralCategory.txt, -# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the -# Unicode.tables subdirectory. The first of these is found in the "extracted" -# subdirectory of the Unicode database (UCD) on the Unicode web site; the -# second is in the "auxiliary" subdirectory; the other two are directly in the +# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the +# Unicode.tables subdirectory. The first of these is found in the "extracted" +# subdirectory of the Unicode database (UCD) on the Unicode web site; the +# second is in the "auxiliary" subdirectory; the other two are directly in the # UCD directory. # # Minor modifications made to this script: @@ -42,7 +44,7 @@ # code scans CaseFolding.txt instead of UnicodeData.txt. # # The main tables generated by this script are used by macros defined in -# pcre_internal.h. They look up Unicode character properties using short +# pcre_internal.h. They look up Unicode character properties using short # sequences of code that contains no branches, which makes for greater speed. 
# # Conceptually, there is a table of records (of type ucd_record), containing a @@ -69,13 +71,13 @@ # Example: lowercase "a" (U+0061) is in block 0 # lookup 0 in stage1 table yields 0 # lookup 97 in the first table in stage2 yields 16 -# record 17 is { 33, 5, 11, 0, -32 } +# record 17 is { 33, 5, 11, 0, -32 } # 33 = ucp_Latin => Latin script # 5 = ucp_Ll => Lower case letter # 11 = ucp_gbOther => Grapheme break property "Other" # 0 => not part of a caseless set # -32 => Other case is U+0041 -# +# # Almost all lowercase latin characters resolve to the same record. One or two # are different because they are part of a multi-character caseless set (for # example, k, K and the Kelvin symbol are such a set). @@ -83,17 +85,17 @@ # Example: hiragana letter A (U+3042) is in block 96 (0x60) # lookup 96 in stage1 table yields 88 # lookup 66 in the 88th table in stage2 yields 467 -# record 470 is { 26, 7, 11, 0, 0 } +# record 470 is { 26, 7, 11, 0, 0 } # 26 = ucp_Hiragana => Hiragana script # 7 = ucp_Lo => Other letter # 11 = ucp_gbOther => Grapheme break property "Other" # 0 => not part of a caseless set -# 0 => No other case +# 0 => No other case # # In these examples, no other blocks resolve to the same "virtual" block, as it # happens, but plenty of other blocks do share "virtual" blocks. # -# There is a fourth table, maintained by hand, which translates from the +# There is a fourth table, maintained by hand, which translates from the # individual character types such as ucp_Cc to the general types like ucp_C. # # Philip Hazel, 03 July 2008 @@ -101,8 +103,8 @@ # 01-March-2010: Updated list of scripts for Unicode 5.2.0 # 30-April-2011: Updated list of scripts for Unicode 6.0.0 # July-2012: Updated list of scripts for Unicode 6.1.0 -# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new -# field in the record to hold the value. 
Luckily, the +# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new +# field in the record to hold the value. Luckily, the # structure had a hole in it, so the resulting table is # not much bigger than before. # 18-September-2012: Added code for multiple caseless sets. This uses the @@ -144,14 +146,14 @@ def read_table(file_name, get_value, default_value): if m.group(3) is None: last = char else: - last = int(m.group(3), 16) + last = int(m.group(3), 16) for i in range(char, last + 1): # It is important not to overwrite a previously set # value because in the CaseFolding file there are lines - # to be ignored (returning the default value of 0) - # which often come after a line which has already set - # data. - if table[i] == default_value: + # to be ignored (returning the default value of 0) + # which often come after a line which has already set + # data. + if table[i] == default_value: table[i] = value file.close() return table @@ -192,14 +194,14 @@ def compress_table(table, block_size): stage2 += block blocks[block] = start stage1.append(start) - + return stage1, stage2 # Print a table def print_table(table, table_name, block_size = None): type, size = get_type_size(table) ELEMS_PER_LINE = 16 - + s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) if block_size: s += ", block = %d" % block_size @@ -245,15 +247,15 @@ def get_record_size_struct(records): size = (size + slice_size - 1) & -slice_size size += slice_size structure += '%s property_%d;\n' % (slice_type, i) - + # round up to the first item of the next structure in array record_slice = map(lambda record: record[0], records) slice_type, slice_size = get_type_size(record_slice) size = (size + slice_size - 1) & -slice_size - + structure += '} ucd_record;\n*/\n\n' return size, structure - + def test_record_size(): tests = [ \ ( [(3,), (6,), (6,), (1,)], 1 ), \ @@ -305,7 +307,7 @@ script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Bugines 
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi' ] - + category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] @@ -321,20 +323,20 @@ break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_na other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0) -# This block of code was added by PH in September 2012. I am not a Python -# programmer, so the style is probably dreadful, but it does the job. It scans -# the other_case table to find sets of more than two characters that must all -# match each other caselessly. Later in this script a table of these sets is -# written out. However, we have to do this work here in order to compute the +# This block of code was added by PH in September 2012. I am not a Python +# programmer, so the style is probably dreadful, but it does the job. It scans +# the other_case table to find sets of more than two characters that must all +# match each other caselessly. Later in this script a table of these sets is +# written out. However, we have to do this work here in order to compute the # offsets in the table that are inserted into the main table. # The CaseFolding.txt file lists pairs, but the common logic for reading data -# sets only one value, so first we go through the table and set "return" +# sets only one value, so first we go through the table and set "return" # offsets for those that are not already set. for c in range(0x10ffff): if other_case[c] != 0 and other_case[c + other_case[c]] == 0: - other_case[c + other_case[c]] = -other_case[c] + other_case[c + other_case[c]] = -other_case[c] # Now scan again and create equivalence sets. 
@@ -344,25 +346,25 @@ for c in range(0x10ffff): o = c + other_case[c] # Trigger when this character's other case does not point back here. We - # now have three characters that are case-equivalent. - + # now have three characters that are case-equivalent. + if other_case[o] != -other_case[c]: t = o + other_case[o] - - # Scan the existing sets to see if any of the three characters are already + + # Scan the existing sets to see if any of the three characters are already # part of a set. If so, unite the existing set with the new set. - - appended = 0 + + appended = 0 for s in sets: - found = 0 + found = 0 for x in s: if x == c or x == o or x == t: found = 1 - + # Add new characters to an existing set - + if found: - found = 0 + found = 0 for y in [c, o, t]: for x in s: if x == y: @@ -370,10 +372,10 @@ for c in range(0x10ffff): if not found: s.append(y) appended = 1 - + # If we have not added to an existing set, create a new one. - if not appended: + if not appended: sets.append([c, o, t]) # End of loop looking for caseless sets. @@ -384,7 +386,7 @@ caseless_offsets = [0] * MAX_UNICODE offset = 1; for s in sets: - for x in s: + for x in s: caseless_offsets[x] = offset offset += len(s) + 1 @@ -393,7 +395,7 @@ for s in sets: # Combine the tables -table, records = combine_tables(script, category, break_props, +table, records = combine_tables(script, category, break_props, caseless_offsets, other_case) record_size, record_struct = get_record_size_struct(records.keys()) @@ -450,6 +452,20 @@ print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};" print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};" print "#else" print +print "/* If the 32-bit library is run in non-32-bit mode, character values" +print "greater than 0x10ffff may be encountered. For these we set up a" +print "special record. 
*/" +print +print "#ifdef COMPILE_PCRE32" +print "const ucd_record PRIV(dummy_ucd_record)[] = {{" +print " ucp_Common, /* script */" +print " ucp_Cn, /* type unassigned */" +print " ucp_gbOther, /* grapheme break property */" +print " 0, /* case set */" +print " 0, /* other case */" +print " }};" +print "#endif" +print print record_struct # --- Added by PH: output the table of caseless character sets --- @@ -460,7 +476,7 @@ for s in sets: s = sorted(s) for x in s: print ' 0x%04x,' % x, - print ' NOTACHAR,' + print ' NOTACHAR,' print '};' print |