A more correct fix for the chartables bug with UTF-8 and non-std tables.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@539 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-13 21:35:04 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-13 21:35:04 +0000
commit: d940107b6f55a1f47fb03ddb48e6246175ba669b (patch)
tree: b8a88629ff4c4d8a502de861d64786299c53abbb
parent: ccf5f782818c0ab97070ba43f1d41188fd3c252c (diff)
download: pcre-d940107b6f55a1f47fb03ddb48e6246175ba669b.tar.gz
5 files changed, 244 insertions, 57 deletions
diff --git a/ChangeLog b/ChangeLog
index d1821c3..b1c2853 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -81,9 +81,8 @@ Version 8.10 03-Jun-2010
     used to create a list of bytes that can start a match. For \s, it was
     including 0x85 and 0xa0, which of course cannot start UTF-8 characters. I
     have changed the code so that only real ASCII characters (less than 128)
-    are set in this case because the \s etc escapes are documented as 
-    recognizing only ASCII characters. (When PCRE_UCP is set - see 9 above - 
-    the code is different altogether.)
+    and the correct starting bytes for UTF-8 encodings are set in this case.
+    (When PCRE_UCP is set - see 9 above - the code is different altogether.)
 
 
 Version 8.02 19-Mar-2010
diff --git a/configure.ac b/configure.ac
index 499ae87..f88ad39 100644
--- a/configure.ac
+++ b/configure.ac
@@ -10,8 +10,8 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
 
 m4_define(pcre_major, [8])
 m4_define(pcre_minor, [10])
-m4_define(pcre_prerelease, [-RC1])
-m4_define(pcre_date, [2010-06-03])
+m4_define(pcre_prerelease, [-RC2])
+m4_define(pcre_date, [2010-06-11])
 
 # Libtool shared library interface versions (current:revision:age)
 m4_define(libpcre_version, [0:1:0])
diff --git a/maint/README b/maint/README
index f6c9102..06b2883 100644
--- a/maint/README
+++ b/maint/README
@@ -35,6 +35,12 @@ MultiStage2.py   A Python script that generates the file pcre_ucd.c from three
                  Unicode web site. Run this script in the "maint" directory.
                  The generated file contains the tables for a 2-stage lookup
                  of Unicode properties.
+                 
+pcre_chartables.c.non-standard
+                 This is a set of character tables that came from a Windows
+                 system. It has characters greater than 128 that are set as
+                 spaces, amongst other things. I kept it so that it can be
+                 used for testing from time to time.    
 
 README           This file.
 
diff --git a/maint/pcre_chartables.c.non-standard b/maint/pcre_chartables.c.non-standard
new file mode 100644
index 0000000..c18d49d
--- /dev/null
+++ b/maint/pcre_chartables.c.non-standard
@@ -0,0 +1,138 @@
+const unsigned char _pcre_default_tables[] = {
+0,1,2,3,4,5,6,7,
+8,9,10,11,12,13,14,15,
+16,17,18,19,20,21,22,23,
+24,25,26,27,28,29,30,31,
+32,33,34,35,36,37,38,39,
+40,41,42,43,44,45,46,47,
+48,49,50,51,52,53,54,55,
+56,57,58,59,60,61,62,63,
+64,97,98,99,100,101,102,103,
+104,105,106,107,108,109,110,111,
+112,113,114,115,116,117,118,119,
+120,121,122,91,92,93,94,95,
+96,97,98,99,100,101,102,103,
+104,105,106,107,108,109,110,111,
+112,113,114,115,116,117,118,119,
+120,121,122,123,124,125,126,127,
+128,129,130,131,132,133,134,135,
+136,137,138,139,140,141,142,143,
+144,145,146,147,148,149,150,151,
+152,153,154,155,156,157,158,159,
+160,161,162,163,164,165,166,167,
+168,169,170,171,172,173,174,175,
+176,177,178,179,180,181,182,183,
+184,185,186,187,188,189,190,191,
+224,225,226,227,228,229,230,231,
+232,233,234,235,236,237,238,239,
+240,241,242,243,244,245,246,215,
+248,249,250,251,252,253,254,223,
+224,225,226,227,228,229,230,231,
+232,233,234,235,236,237,238,239,
+240,241,242,243,244,245,246,247,
+248,249,250,251,252,253,254,255,
+0,1,2,3,4,5,6,7,
+8,9,10,11,12,13,14,15,
+16,17,18,19,20,21,22,23,
+24,25,26,27,28,29,30,31,
+32,33,34,35,36,37,38,39,
+40,41,42,43,44,45,46,47,
+48,49,50,51,52,53,54,55,
+56,57,58,59,60,61,62,63,
+64,97,98,99,100,101,102,103,
+104,105,106,107,108,109,110,111,
+112,113,114,115,116,117,118,119,
+120,121,122,91,92,93,94,95,
+96,65,66,67,68,69,70,71,
+72,73,74,75,76,77,78,79,
+80,81,82,83,84,85,86,87,
+88,89,90,123,124,125,126,127,
+128,129,130,131,132,133,134,135,
+136,137,138,139,140,141,142,143,
+144,145,146,147,148,149,150,151,
+152,153,154,155,156,157,158,159,
+160,161,162,163,164,165,166,167,
+168,169,170,171,172,173,174,175,
+176,177,178,179,180,181,182,183,
+184,185,186,187,188,189,190,191,
+224,225,226,227,228,229,230,231,
+232,233,234,235,236,237,238,239,
+240,241,242,243,244,245,246,215,
+248,249,250,251,252,253,254,223,
+192,193,194,195,196,197,198,199,
+200,201,202,203,204,205,206,207,
+208,209,210,211,212,213,214,247,
+216,217,218,219,220,221,222,255,
+0,62,0,0,1,0,0,0,
+0,0,0,0,0,0,0,0,
+32,0,0,0,1,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,255,3,
+126,0,0,0,126,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,255,3,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,12,2,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+254,255,255,7,0,0,0,0,
+0,0,0,0,0,0,0,0,
+255,255,127,127,0,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,254,255,255,7,
+0,0,0,0,0,4,32,4,
+0,0,0,128,255,255,127,255,
+0,0,0,0,0,0,255,3,
+254,255,255,135,254,255,255,7,
+0,0,0,0,0,4,44,6,
+255,255,127,255,255,255,127,255,
+0,0,0,0,254,255,255,255,
+255,255,255,255,255,255,255,127,
+0,0,0,0,254,255,255,255,
+255,255,255,255,255,255,255,255,
+0,2,0,0,255,255,255,255,
+255,255,255,255,255,255,255,127,
+0,0,0,0,255,255,255,255,
+255,255,255,255,255,255,255,255,
+0,0,0,0,254,255,0,252,
+1,0,0,248,1,0,0,120,
+0,0,0,0,254,255,255,255,
+0,0,128,0,0,0,128,0,
+255,255,255,255,0,0,0,0,
+0,0,0,0,0,0,0,128,
+255,255,255,255,0,0,0,0,
+0,0,0,0,0,0,0,0,
+128,0,0,0,0,0,0,0,
+0,1,1,0,1,1,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+1,0,0,0,128,0,0,0,
+128,128,128,128,0,0,128,0,
+28,28,28,28,28,28,28,28,
+28,28,0,0,0,0,0,128,
+0,26,26,26,26,26,26,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,128,128,0,128,16,
+0,26,26,26,26,26,26,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,128,128,0,0,0,
+0,0,0,0,0,1,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+1,0,0,0,0,0,0,0,
+0,0,18,0,0,0,0,0,
+0,0,20,20,0,18,0,0,
+0,20,18,0,0,0,0,0,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,0,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,0,
+18,18,18,18,18,18,18,18
+};
diff --git a/pcre_study.c b/pcre_study.c
index 51457a7..71d2526 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -490,6 +490,77 @@ return p + 1;
 
 
 /*************************************************
+*     Set bits for a positive character type     *
+*************************************************/
+
+/* This function sets starting bits for a character type. In UTF-8 mode, we can
+only do a direct setting for bytes less than 128, as otherwise there can be
+confusion with bytes in the middle of UTF-8 characters. In a "traditional"
+environment, the tables will only recognize ASCII characters anyway, but in at
+least one Windows environment, some higher bytes bits were set in the tables.
+So we deal with that case by considering the UTF-8 encoding.
+
+Arguments:
+  start_bits     the starting bitmap
+  cbit type      the type of character wanted
+  table_limit    32 for non-UTF-8; 16 for UTF-8
+  cd             the block with char table pointers 
+
+Returns:         nothing
+*/
+
+static void
+set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
+  compile_data *cd)
+{
+register int c; 
+for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+if (table_limit == 32) return;
+for (c = 128; c < 256; c++)
+  {
+  if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
+    {
+    uschar buff[8];
+    (void)_pcre_ord2utf8(c, buff);
+    SET_BIT(buff[0]);  
+    }  
+  }        
+}
+
+
+/*************************************************
+*     Set bits for a negative character type     *
+*************************************************/
+
+/* This function sets starting bits for a negative character type such as \D.
+In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
+otherwise there can be confusion with bytes in the middle of UTF-8 characters.
+Unlike in the positive case, where we can set appropriate starting bits for 
+specific high-valued UTF-8 characters, in this case we have to set the bits for
+all high-valued characters. The lowest is 0xc2, but we overkill by starting at 
+0xc0 (192) for simplicity.
+
+Arguments:
+  start_bits     the starting bitmap
+  cbit type      the type of character wanted
+  table_limit    32 for non-UTF-8; 16 for UTF-8
+  cd             the block with char table pointers 
+
+Returns:         nothing
+*/
+
+static void
+set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+  compile_data *cd)
+{
+register int c; 
+for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
+}
+
+
+
+/*************************************************
 *          Create bitmap of starting bytes       *
 *************************************************/
 
@@ -705,64 +776,48 @@ do
 
       /* Single character types set the bits and stop. Note that if PCRE_UCP
       is set, we do not see these op codes because \d etc are converted to
-      properties. Therefore, these apply in the case when only ASCII characters
-      are recognized to match the types. In UTF-8 mode, we must restrict 
-      ourselves to bytes less than 128, as otherwise there can be confusion 
-      with bytes in the middle of UTF-8 characters. (In a "traditional" 
-      environment, the tables will only recognize ASCII characters anyway, but 
-      in at least one Windows environment, some higher bytes bits were set in 
-      the tables.) */
+      properties. Therefore, these apply in the case when only characters less 
+      than 256 are recognized to match the types. */
 
       case OP_NOT_DIGIT:
-      for (c = 0; c < table_limit; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_digit];
+      set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
       try_next = FALSE;
       break;
 
       case OP_DIGIT:
-      for (c = 0; c < table_limit; c++)
-        start_bits[c] |= cd->cbits[c+cbit_digit];
+      set_type_bits(start_bits, cbit_digit, table_limit, cd);
       try_next = FALSE;
       break;
 
       /* The cbit_space table has vertical tab as whitespace; we have to
-      discard it. */
+      ensure it is set as not whitespace. */
 
       case OP_NOT_WHITESPACE:
-      for (c = 0; c < table_limit; c++)
-        {
-        int d = cd->cbits[c+cbit_space];
-        if (c == 1) d &= ~0x08;
-        start_bits[c] |= ~d;
-        }
+      set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+      start_bits[1] |= 0x08;
       try_next = FALSE;
       break;
 
       /* The cbit_space table has vertical tab as whitespace; we have to
-      discard it. */
+      not set it from the table. */
 
       case OP_WHITESPACE:
-      for (c = 0; c < table_limit; c++)
-        {
-        int d = cd->cbits[c+cbit_space];
-        if (c == 1) d &= ~0x08;
-        start_bits[c] |= d;
-        }
+      c = start_bits[1];    /* Save in case it was already set */
+      set_type_bits(start_bits, cbit_space, table_limit, cd);
+      start_bits[1] = (start_bits[1] & ~0x08) | c;
       try_next = FALSE;
       break;
 
       case OP_NOT_WORDCHAR:
-      for (c = 0; c < table_limit; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_word];
+      set_nottype_bits(start_bits, cbit_word, table_limit, cd);
       try_next = FALSE;
       break;
 
       case OP_WORDCHAR:
-      for (c = 0; c < table_limit; c++)
-        start_bits[c] |= cd->cbits[c+cbit_word];
+      set_type_bits(start_bits, cbit_word, table_limit, cd);
       try_next = FALSE;
       break;
-
+ 
       /* One or more character type fudges the pointer and restarts, knowing
       it will hit a single character type and stop there. */
 
@@ -825,47 +880,36 @@ do
         break;
 
         case OP_NOT_DIGIT:
-        for (c = 0; c < table_limit; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_digit];
+        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         break;
 
         case OP_DIGIT:
-        for (c = 0; c < table_limit; c++)
-          start_bits[c] |= cd->cbits[c+cbit_digit];
+        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         break;
 
         /* The cbit_space table has vertical tab as whitespace; we have to
-        discard it. */
+        ensure it gets set as not whitespace. */
 
         case OP_NOT_WHITESPACE:
-        for (c = 0; c < table_limit; c++)
-          {
-          int d = cd->cbits[c+cbit_space];
-          if (c == 1) d &= ~0x08;
-          start_bits[c] |= ~d;
-          }
+        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+        start_bits[1] |= 0x08; 
         break;
 
         /* The cbit_space table has vertical tab as whitespace; we have to
-        discard it. */
+        avoid setting it. */
 
         case OP_WHITESPACE:
-        for (c = 0; c < table_limit; c++)
-          {
-          int d = cd->cbits[c+cbit_space];
-          if (c == 1) d &= ~0x08;
-          start_bits[c] |= d;
-          }
+        c = start_bits[1];    /* Save in case it was already set */
+        set_type_bits(start_bits, cbit_space, table_limit, cd);
+        start_bits[1] = (start_bits[1] & ~0x08) | c;
         break;
 
         case OP_NOT_WORDCHAR:
-        for (c = 0; c < table_limit; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_word];
+        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         break;
 
         case OP_WORDCHAR:
-        for (c = 0; c < table_limit; c++)
-          start_bits[c] |= cd->cbits[c+cbit_word];
+        set_type_bits(start_bits, cbit_word, table_limit, cd);
         break;
         }
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-06-13 21:35:04 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-06-13 21:35:04 +0000
commit	d940107b6f55a1f47fb03ddb48e6246175ba669b (patch)
tree	b8a88629ff4c4d8a502de861d64786299c53abbb
parent	ccf5f782818c0ab97070ba43f1d41188fd3c252c (diff)
download	pcre-d940107b6f55a1f47fb03ddb48e6246175ba669b.tar.gz