author     ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>   2017-02-24 17:30:30 +0000
committer  ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>   2017-02-24 17:30:30 +0000
commit     8037f71d03b3cd8919248f38448a0a2d3715c18c
tree       0fb6d719cb178fa234f4acc0029c5b0a38b5ec50
parent     e7991eb5273b5b4162656f4b3d32e68a7430805a
download   pcre-8037f71d03b3cd8919248f38448a0a2d3715c18c.tar.gz

Fix Unicode property crash for 32-bit characters greater than 0x10ffff.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1688 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--   ChangeLog            |   6
-rwxr-xr-x   maint/MultiStage2.py | 106
-rw-r--r--   pcre_internal.h      |  11
-rw-r--r--   pcre_ucd.c           |  14
4 files changed, 90 insertions, 47 deletions
diff --git a/ChangeLog b/ChangeLog
index a226e21..9e6ca0c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -10,7 +10,7 @@ Version 8.41
1. Fixed typo in CMakeLists.txt (wrong number of arguments for
PCRE_STATIC_RUNTIME (affects MSVC only).
-2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline
+2. Issue 1 for 8.40 below was not correctly fixed. If pcregrep in multiline
mode with --only-matching matched several lines, it restarted scanning at the
next line instead of moving on to the end of the matched string, which can be
several lines after the start.
@@ -29,6 +29,10 @@ are fixed:
(a) Check for values < 256 when calling isprint() in pcretest.
(b) Give an error for too big a number after \O.
+
+7. In the 32-bit library in non-UTF mode, an attempt to find a Unicode
+property for a character with a code point greater than 0x10ffff (the Unicode
+maximum) caused a crash.
Version 8.40 11-January-2017
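
The crash described above comes from the unguarded stage-1 index: the lookup divides the code point by the block size (128), but the generated stage-1 table only covers code points up to the Unicode maximum. A minimal standalone sketch of the arithmetic follows; the constant names are mine, and the sizes mirror what MultiStage2.py generates.

/* Illustration only: why an unguarded lookup overruns the stage-1 table.
The generated table has one entry per block of 128 code points covering
0x000000-0x10ffff, i.e. 0x110000 / 128 = 8704 entries. */

#include <stdio.h>

#define UCD_BLOCK_SIZE 128
#define MAX_UNICODE    0x110000u

int main(void)
{
unsigned int ch = 0x7fffffffu;   /* can occur in the 32-bit library in non-UTF mode */
unsigned int stage1_entries = MAX_UNICODE / UCD_BLOCK_SIZE;   /* 8704 */
unsigned int stage1_index = ch / UCD_BLOCK_SIZE;              /* 16777215 */
printf("stage1 has %u entries; the lookup wants index %u\n",
  stage1_entries, stage1_index);   /* far out of bounds, hence the crash */
return 0;
}
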
diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py
index 44ad80c..9a75759 100755
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@@ -1,5 +1,7 @@
#! /usr/bin/python
+# WARNING! This is a python 2 script.
+
# Multistage table builder
# (c) Peter Kankowski, 2008
@@ -15,10 +17,10 @@
# ./MultiStage2.py >../pcre_ucd.c
#
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
-# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
-# Unicode.tables subdirectory. The first of these is found in the "extracted"
-# subdirectory of the Unicode database (UCD) on the Unicode web site; the
-# second is in the "auxiliary" subdirectory; the other two are directly in the
+# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
+# Unicode.tables subdirectory. The first of these is found in the "extracted"
+# subdirectory of the Unicode database (UCD) on the Unicode web site; the
+# second is in the "auxiliary" subdirectory; the other two are directly in the
# UCD directory.
#
# Minor modifications made to this script:
@@ -42,7 +44,7 @@
# code scans CaseFolding.txt instead of UnicodeData.txt.
#
# The main tables generated by this script are used by macros defined in
-# pcre_internal.h. They look up Unicode character properties using short
+# pcre_internal.h. They look up Unicode character properties using short
# sequences of code that contains no branches, which makes for greater speed.
#
# Conceptually, there is a table of records (of type ucd_record), containing a
@@ -69,13 +71,13 @@
# Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0
# lookup 97 in the first table in stage2 yields 16
-# record 17 is { 33, 5, 11, 0, -32 }
+# record 17 is { 33, 5, 11, 0, -32 }
# 33 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter
# 11 = ucp_gbOther => Grapheme break property "Other"
# 0 => not part of a caseless set
# -32 => Other case is U+0041
-#
+#
# Almost all lowercase latin characters resolve to the same record. One or two
# are different because they are part of a multi-character caseless set (for
# example, k, K and the Kelvin symbol are such a set).
@@ -83,17 +85,17 @@
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 88
# lookup 66 in the 88th table in stage2 yields 467
-# record 470 is { 26, 7, 11, 0, 0 }
+# record 470 is { 26, 7, 11, 0, 0 }
# 26 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter
# 11 = ucp_gbOther => Grapheme break property "Other"
# 0 => not part of a caseless set
-# 0 => No other case
+# 0 => No other case
#
# In these examples, no other blocks resolve to the same "virtual" block, as it
# happens, but plenty of other blocks do share "virtual" blocks.
#
-# There is a fourth table, maintained by hand, which translates from the
+# There is a fourth table, maintained by hand, which translates from the
# individual character types such as ucp_Cc to the general types like ucp_C.
#
# Philip Hazel, 03 July 2008
@@ -101,8 +103,8 @@
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
# July-2012: Updated list of scripts for Unicode 6.1.0
-# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
-# field in the record to hold the value. Luckily, the
+# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
+# field in the record to hold the value. Luckily, the
# structure had a hole in it, so the resulting table is
# not much bigger than before.
# 18-September-2012: Added code for multiple caseless sets. This uses the
@@ -144,14 +146,14 @@ def read_table(file_name, get_value, default_value):
if m.group(3) is None:
last = char
else:
- last = int(m.group(3), 16)
+ last = int(m.group(3), 16)
for i in range(char, last + 1):
# It is important not to overwrite a previously set
# value because in the CaseFolding file there are lines
- # to be ignored (returning the default value of 0)
- # which often come after a line which has already set
- # data.
- if table[i] == default_value:
+ # to be ignored (returning the default value of 0)
+ # which often come after a line which has already set
+ # data.
+ if table[i] == default_value:
table[i] = value
file.close()
return table
@@ -192,14 +194,14 @@ def compress_table(table, block_size):
stage2 += block
blocks[block] = start
stage1.append(start)
-
+
return stage1, stage2
# Print a table
def print_table(table, table_name, block_size = None):
type, size = get_type_size(table)
ELEMS_PER_LINE = 16
-
+
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
if block_size:
s += ", block = %d" % block_size
@@ -245,15 +247,15 @@ def get_record_size_struct(records):
size = (size + slice_size - 1) & -slice_size
size += slice_size
structure += '%s property_%d;\n' % (slice_type, i)
-
+
# round up to the first item of the next structure in array
record_slice = map(lambda record: record[0], records)
slice_type, slice_size = get_type_size(record_slice)
size = (size + slice_size - 1) & -slice_size
-
+
structure += '} ucd_record;\n*/\n\n'
return size, structure
-
+
def test_record_size():
tests = [ \
( [(3,), (6,), (6,), (1,)], 1 ), \
@@ -305,7 +307,7 @@ script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Bugines
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi'
]
-
+
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
@@ -321,20 +323,20 @@ break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_na
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
-# This block of code was added by PH in September 2012. I am not a Python
-# programmer, so the style is probably dreadful, but it does the job. It scans
-# the other_case table to find sets of more than two characters that must all
-# match each other caselessly. Later in this script a table of these sets is
-# written out. However, we have to do this work here in order to compute the
+# This block of code was added by PH in September 2012. I am not a Python
+# programmer, so the style is probably dreadful, but it does the job. It scans
+# the other_case table to find sets of more than two characters that must all
+# match each other caselessly. Later in this script a table of these sets is
+# written out. However, we have to do this work here in order to compute the
# offsets in the table that are inserted into the main table.
# The CaseFolding.txt file lists pairs, but the common logic for reading data
-# sets only one value, so first we go through the table and set "return"
+# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.
for c in range(0x10ffff):
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
- other_case[c + other_case[c]] = -other_case[c]
+ other_case[c + other_case[c]] = -other_case[c]
# Now scan again and create equivalence sets.
@@ -344,25 +346,25 @@ for c in range(0x10ffff):
o = c + other_case[c]
# Trigger when this character's other case does not point back here. We
- # now have three characters that are case-equivalent.
-
+ # now have three characters that are case-equivalent.
+
if other_case[o] != -other_case[c]:
t = o + other_case[o]
-
- # Scan the existing sets to see if any of the three characters are already
+
+ # Scan the existing sets to see if any of the three characters are already
# part of a set. If so, unite the existing set with the new set.
-
- appended = 0
+
+ appended = 0
for s in sets:
- found = 0
+ found = 0
for x in s:
if x == c or x == o or x == t:
found = 1
-
+
# Add new characters to an existing set
-
+
if found:
- found = 0
+ found = 0
for y in [c, o, t]:
for x in s:
if x == y:
@@ -370,10 +372,10 @@ for c in range(0x10ffff):
if not found:
s.append(y)
appended = 1
-
+
# If we have not added to an existing set, create a new one.
- if not appended:
+ if not appended:
sets.append([c, o, t])
# End of loop looking for caseless sets.
@@ -384,7 +386,7 @@ caseless_offsets = [0] * MAX_UNICODE
offset = 1;
for s in sets:
- for x in s:
+ for x in s:
caseless_offsets[x] = offset
offset += len(s) + 1
@@ -393,7 +395,7 @@ for s in sets:
# Combine the tables
-table, records = combine_tables(script, category, break_props,
+table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case)
record_size, record_struct = get_record_size_struct(records.keys())
@@ -450,6 +452,20 @@ print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
print "#else"
print
+print "/* If the 32-bit library is run in non-32-bit mode, character values"
+print "greater than 0x10ffff may be encountered. For these we set up a"
+print "special record. */"
+print
+print "#ifdef COMPILE_PCRE32"
+print "const ucd_record PRIV(dummy_ucd_record)[] = {{"
+print " ucp_Common, /* script */"
+print " ucp_Cn, /* type unassigned */"
+print " ucp_gbOther, /* grapheme break property */"
+print " 0, /* case set */"
+print " 0, /* other case */"
+print " }};"
+print "#endif"
+print
print record_struct
# --- Added by PH: output the table of caseless character sets ---
@@ -460,7 +476,7 @@ for s in sets:
s = sorted(s)
for x in s:
print ' 0x%04x,' % x,
- print ' NOTACHAR,'
+ print ' NOTACHAR,'
print '};'
print
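
For reference, the lookup that these generated tables support is the one walked through in the comments above: stage 1 maps each 128-character block to a "virtual" block, and stage 2 maps the offset within that block to a record index. Below is a sketch in C, not PCRE source; the table parameters and the record typedef are placeholders for PRIV(ucd_stage1), PRIV(ucd_stage2) and PRIV(ucd_records), and only the script and chartype field names are confirmed by the UCD macros in pcre_internal.h.

/* Sketch of the two-stage lookup described in the MultiStage2.py comments. */

#define UCD_BLOCK_SIZE 128

typedef struct {
  unsigned char script;      /* e.g. ucp_Latin */
  unsigned char chartype;    /* e.g. ucp_Ll */
  unsigned char gbprop;      /* grapheme break property */
  unsigned char caseset;     /* offset into the caseless-sets table, 0 if none */
  int other_case;            /* signed offset to the other case, 0 if none */
} ucd_record_sketch;

static const ucd_record_sketch *
lookup_ucd(unsigned int ch, const unsigned char *stage1,
  const unsigned short *stage2, const ucd_record_sketch *records)
{
/* stage1 has one entry per block of 128 code points, giving a "virtual"
block number; stage2 holds 128 record indices per virtual block. For
U+0061 this walks block 0 to a record such as { ucp_Latin, ucp_Ll, ... }. */
unsigned int block = stage1[ch / UCD_BLOCK_SIZE];
unsigned int index = stage2[block * UCD_BLOCK_SIZE + ch % UCD_BLOCK_SIZE];
return records + index;
}
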
diff --git a/pcre_internal.h b/pcre_internal.h
index 2923b29..154d3f6 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -2772,6 +2772,9 @@ extern const pcre_uint8 PRIV(ucd_stage1)[];
extern const pcre_uint16 PRIV(ucd_stage2)[];
extern const pcre_uint32 PRIV(ucp_gentype)[];
extern const pcre_uint32 PRIV(ucp_gbtable)[];
+#ifdef COMPILE_PCRE32
+extern const ucd_record PRIV(dummy_ucd_record)[];
+#endif
#ifdef SUPPORT_JIT
extern const int PRIV(ucp_typerange)[];
#endif
@@ -2780,9 +2783,15 @@ extern const int PRIV(ucp_typerange)[];
/* UCD access macros */
#define UCD_BLOCK_SIZE 128
-#define GET_UCD(ch) (PRIV(ucd_records) + \
+#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \
PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
+
+#ifdef COMPILE_PCRE32
+#define GET_UCD(ch) ((ch > 0x10ffff)? PRIV(dummy_ucd_record) : REAL_GET_UCD(ch))
+#else
+#define GET_UCD(ch) REAL_GET_UCD(ch)
+#endif
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
#define UCD_SCRIPT(ch) GET_UCD(ch)->script
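
With this guard, callers of the UCD macros can no longer index past the tables: in 32-bit mode any code point above 0x10ffff resolves to the dummy record (script Common, type Cn). A hypothetical caller, written as it might appear inside a PCRE source file that already includes pcre_internal.h (the helper itself is not PCRE code):

/* Hypothetical helper, illustrating the effect of the guarded macro. */

static int
char_is_letter(pcre_uint32 ch)
{
int type = UCD_CHARTYPE(ch);   /* ucp_Cn when ch > 0x10ffff in 32-bit mode */
return type == ucp_Lu || type == ucp_Ll || type == ucp_Lt ||
       type == ucp_Lm || type == ucp_Lo;
}
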
diff --git a/pcre_ucd.c b/pcre_ucd.c
index 69c4fd4..f22f826 100644
--- a/pcre_ucd.c
+++ b/pcre_ucd.c
@@ -38,6 +38,20 @@ const pcre_uint16 PRIV(ucd_stage2)[] = {0};
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
#else
+/* If the 32-bit library is run in non-32-bit mode, character values
+greater than 0x10ffff may be encountered. For these we set up a
+special record. */
+
+#ifdef COMPILE_PCRE32
+const ucd_record PRIV(dummy_ucd_record)[] = {{
+ ucp_Common, /* script */
+ ucp_Cn, /* type unassigned */
+ ucp_gbOther, /* grapheme break property */
+ 0, /* case set */
+ 0, /* other case */
+ }};
+#endif
+
/* When recompiling tables with a new Unicode version, please check the
types in this structure definition from pcre_internal.h (the actual
field names will be different):
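
A regression-style way to exercise the fix from the public API is sketched below. It is not part of the PCRE test suite, and it assumes a library built with --enable-pcre32 and --enable-unicode-properties. Before this change, the pcre32_exec() call could crash while looking up the property of 0x7fffffff; with the dummy record it simply fails to match.

/* Sketch of a regression check, assuming 32-bit and UCP support are built in.
Compile with something like: cc test32.c -lpcre32 */

#include <stdio.h>
#include <pcre.h>

int main(void)
{
const char *err;
int erroffset, rc, ov[3];
PCRE_UCHAR32 pattern[] = { '\\', 'p', '{', 'L', '}', 0 };
PCRE_UCHAR32 subject[] = { 0x7fffffffu };   /* above the Unicode maximum */
pcre32 *re;

re = pcre32_compile(pattern, 0, &err, &erroffset, NULL);
if (re == NULL) { fprintf(stderr, "compile failed: %s\n", err); return 1; }

rc = pcre32_exec(re, NULL, subject, 1, 0, 0, ov, 3);
printf("pcre32_exec returned %d (expect PCRE_ERROR_NOMATCH, not a crash)\n", rc);

pcre32_free(re);
return 0;
}
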