author     ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>  2017-02-24 17:30:30 +0000
committer  ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>  2017-02-24 17:30:30 +0000
commit     8037f71d03b3cd8919248f38448a0a2d3715c18c (patch)
tree       0fb6d719cb178fa234f4acc0029c5b0a38b5ec50 /maint
parent     e7991eb5273b5b4162656f4b3d32e68a7430805a (diff)
download   pcre-8037f71d03b3cd8919248f38448a0a2d3715c18c.tar.gz
Fix Unicode property crash for 32-bit characters greater than 0x10ffff.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1688 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'maint')
-rwxr-xr-x  maint/MultiStage2.py  106
1 file changed, 61 insertions, 45 deletions
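
For context, the crash being fixed arises when the 32-bit library meets character values above the Unicode maximum of 0x10ffff: such values have no entry in the generated UCD tables. The diff below makes MultiStage2.py emit a special PRIV(dummy_ucd_record) (script Common, type Cn, grapheme break Other, no case data) into pcre_ucd.c when COMPILE_PCRE32 is defined. A minimal Python sketch of the intended fallback, using hypothetical names (the real lookup is performed by macros in pcre_internal.h, which are not part of this diff):

    # Illustrative only; the field values mirror the dummy record added below,
    # but the function and table names here are hypothetical.
    from collections import namedtuple

    UcdRecord = namedtuple('UcdRecord',
                           'script chartype gbprop caseset othercase')

    MAX_UNICODE = 0x10ffff

    # Default record for out-of-range 32-bit characters: script Common,
    # type Cn (unassigned), grapheme break Other, no caseless set, no
    # other case.
    DUMMY_UCD_RECORD = UcdRecord('Common', 'Cn', 'Other', 0, 0)

    def get_ucd(c, table_lookup):
        """Return the property record for code point c, falling back to
        the dummy record when c is beyond the range the tables cover."""
        if c > MAX_UNICODE:
            return DUMMY_UCD_RECORD
        return table_lookup(c)
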
diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py
index 44ad80c..9a75759 100755
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@@ -1,5 +1,7 @@
#! /usr/bin/python
+# WARNING! This is a python 2 script.
+
# Multistage table builder
# (c) Peter Kankowski, 2008
@@ -15,10 +17,10 @@
# ./MultiStage2.py >../pcre_ucd.c
#
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
-# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
-# Unicode.tables subdirectory. The first of these is found in the "extracted"
-# subdirectory of the Unicode database (UCD) on the Unicode web site; the
-# second is in the "auxiliary" subdirectory; the other two are directly in the
+# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
+# Unicode.tables subdirectory. The first of these is found in the "extracted"
+# subdirectory of the Unicode database (UCD) on the Unicode web site; the
+# second is in the "auxiliary" subdirectory; the other two are directly in the
# UCD directory.
#
# Minor modifications made to this script:
@@ -42,7 +44,7 @@
# code scans CaseFolding.txt instead of UnicodeData.txt.
#
# The main tables generated by this script are used by macros defined in
-# pcre_internal.h. They look up Unicode character properties using short
+# pcre_internal.h. They look up Unicode character properties using short
# sequences of code that contains no branches, which makes for greater speed.
#
# Conceptually, there is a table of records (of type ucd_record), containing a
@@ -69,13 +71,13 @@
# Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0
# lookup 97 in the first table in stage2 yields 16
-# record 17 is { 33, 5, 11, 0, -32 }
+# record 17 is { 33, 5, 11, 0, -32 }
# 33 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter
# 11 = ucp_gbOther => Grapheme break property "Other"
# 0 => not part of a caseless set
# -32 => Other case is U+0041
-#
+#
# Almost all lowercase latin characters resolve to the same record. One or two
# are different because they are part of a multi-character caseless set (for
# example, k, K and the Kelvin symbol are such a set).
@@ -83,17 +85,17 @@
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 88
# lookup 66 in the 88th table in stage2 yields 467
-# record 470 is { 26, 7, 11, 0, 0 }
+# record 470 is { 26, 7, 11, 0, 0 }
# 26 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter
# 11 = ucp_gbOther => Grapheme break property "Other"
# 0 => not part of a caseless set
-# 0 => No other case
+# 0 => No other case
#
# In these examples, no other blocks resolve to the same "virtual" block, as it
# happens, but plenty of other blocks do share "virtual" blocks.
#
-# There is a fourth table, maintained by hand, which translates from the
+# There is a fourth table, maintained by hand, which translates from the
# individual character types such as ucp_Cc to the general types like ucp_C.
#
# Philip Hazel, 03 July 2008
@@ -101,8 +103,8 @@
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
# July-2012: Updated list of scripts for Unicode 6.1.0
-# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
-# field in the record to hold the value. Luckily, the
+# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
+# field in the record to hold the value. Luckily, the
# structure had a hole in it, so the resulting table is
# not much bigger than before.
# 18-September-2012: Added code for multiple caseless sets. This uses the
@@ -144,14 +146,14 @@ def read_table(file_name, get_value, default_value):
if m.group(3) is None:
last = char
else:
- last = int(m.group(3), 16)
+ last = int(m.group(3), 16)
for i in range(char, last + 1):
# It is important not to overwrite a previously set
# value because in the CaseFolding file there are lines
- # to be ignored (returning the default value of 0)
- # which often come after a line which has already set
- # data.
- if table[i] == default_value:
+ # to be ignored (returning the default value of 0)
+ # which often come after a line which has already set
+ # data.
+ if table[i] == default_value:
table[i] = value
file.close()
return table
@@ -192,14 +194,14 @@ def compress_table(table, block_size):
stage2 += block
blocks[block] = start
stage1.append(start)
-
+
return stage1, stage2
# Print a table
def print_table(table, table_name, block_size = None):
type, size = get_type_size(table)
ELEMS_PER_LINE = 16
-
+
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
if block_size:
s += ", block = %d" % block_size
@@ -245,15 +247,15 @@ def get_record_size_struct(records):
size = (size + slice_size - 1) & -slice_size
size += slice_size
structure += '%s property_%d;\n' % (slice_type, i)
-
+
# round up to the first item of the next structure in array
record_slice = map(lambda record: record[0], records)
slice_type, slice_size = get_type_size(record_slice)
size = (size + slice_size - 1) & -slice_size
-
+
structure += '} ucd_record;\n*/\n\n'
return size, structure
-
+
def test_record_size():
tests = [ \
( [(3,), (6,), (6,), (1,)], 1 ), \
@@ -305,7 +307,7 @@ script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Bugines
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi'
]
-
+
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
@@ -321,20 +323,20 @@ break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_na
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
-# This block of code was added by PH in September 2012. I am not a Python
-# programmer, so the style is probably dreadful, but it does the job. It scans
-# the other_case table to find sets of more than two characters that must all
-# match each other caselessly. Later in this script a table of these sets is
-# written out. However, we have to do this work here in order to compute the
+# This block of code was added by PH in September 2012. I am not a Python
+# programmer, so the style is probably dreadful, but it does the job. It scans
+# the other_case table to find sets of more than two characters that must all
+# match each other caselessly. Later in this script a table of these sets is
+# written out. However, we have to do this work here in order to compute the
# offsets in the table that are inserted into the main table.
# The CaseFolding.txt file lists pairs, but the common logic for reading data
-# sets only one value, so first we go through the table and set "return"
+# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.
for c in range(0x10ffff):
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
- other_case[c + other_case[c]] = -other_case[c]
+ other_case[c + other_case[c]] = -other_case[c]
# Now scan again and create equivalence sets.
@@ -344,25 +346,25 @@ for c in range(0x10ffff):
o = c + other_case[c]
# Trigger when this character's other case does not point back here. We
- # now have three characters that are case-equivalent.
-
+ # now have three characters that are case-equivalent.
+
if other_case[o] != -other_case[c]:
t = o + other_case[o]
-
- # Scan the existing sets to see if any of the three characters are already
+
+ # Scan the existing sets to see if any of the three characters are already
# part of a set. If so, unite the existing set with the new set.
-
- appended = 0
+
+ appended = 0
for s in sets:
- found = 0
+ found = 0
for x in s:
if x == c or x == o or x == t:
found = 1
-
+
# Add new characters to an existing set
-
+
if found:
- found = 0
+ found = 0
for y in [c, o, t]:
for x in s:
if x == y:
@@ -370,10 +372,10 @@ for c in range(0x10ffff):
if not found:
s.append(y)
appended = 1
-
+
# If we have not added to an existing set, create a new one.
- if not appended:
+ if not appended:
sets.append([c, o, t])
# End of loop looking for caseless sets.
@@ -384,7 +386,7 @@ caseless_offsets = [0] * MAX_UNICODE
offset = 1;
for s in sets:
- for x in s:
+ for x in s:
caseless_offsets[x] = offset
offset += len(s) + 1
@@ -393,7 +395,7 @@ for s in sets:
# Combine the tables
-table, records = combine_tables(script, category, break_props,
+table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case)
record_size, record_struct = get_record_size_struct(records.keys())
@@ -450,6 +452,20 @@ print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
print "#else"
print
+print "/* If the 32-bit library is run in non-32-bit mode, character values"
+print "greater than 0x10ffff may be encountered. For these we set up a"
+print "special record. */"
+print
+print "#ifdef COMPILE_PCRE32"
+print "const ucd_record PRIV(dummy_ucd_record)[] = {{"
+print " ucp_Common, /* script */"
+print " ucp_Cn, /* type unassigned */"
+print " ucp_gbOther, /* grapheme break property */"
+print " 0, /* case set */"
+print " 0, /* other case */"
+print " }};"
+print "#endif"
+print
print record_struct
# --- Added by PH: output the table of caseless character sets ---
@@ -460,7 +476,7 @@ for s in sets:
s = sorted(s)
for x in s:
print ' 0x%04x,' % x,
- print ' NOTACHAR,'
+ print ' NOTACHAR,'
print '};'
print
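
A footnote on the table layout described in the header comments above: stage1 maps a character's block number to a "virtual" block, stage2 maps the offset within that block to a record index, and each record holds script, character type, grapheme-break property, caseless-set offset and other-case offset. The U+0061 and U+3042 examples imply a block size of 128. A minimal sketch of that two-stage lookup, with hypothetical table names (in PCRE itself this is done by branch-free macros in pcre_internal.h, not by Python code):

    BLOCK_SIZE = 128   # implied by "hiragana A (U+3042) is in block 96 (0x60)"

    def lookup_ucd_record(c, stage1, stage2, records):
        """Two-stage lookup sketched from the script's header comments.

        stage1  -- list mapping block number (c >> 7) to a virtual block
        stage2  -- flat list of record indices, BLOCK_SIZE per virtual block
        records -- list of (script, chartype, gbprop, caseset, othercase)
        """
        block = stage1[c >> 7]                      # virtual block number
        index = stage2[block * BLOCK_SIZE + (c & (BLOCK_SIZE - 1))]
        return records[index]

    # For lowercase "a" (U+0061): stage1[0] selects virtual block 0, entry 97
    # of that block selects a record such as (Latin, Ll, Other, 0, -32); the
    # -32 says the other case is 32 code points away, at U+0041.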