Fix pcre_study() problem with non-C-locale chartables in UTF-8 mode.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@538 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-09 19:30:57 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-09 19:30:57 +0000
commit: ccf5f782818c0ab97070ba43f1d41188fd3c252c (patch)
tree: c8ac7b66a820a73bcb7524aa17d7a71ae3b1b82d
parent: 84fb9259ee23fe98b0595dac5b7fac1f65574650 (diff)
download: pcre-ccf5f782818c0ab97070ba43f1d41188fd3c252c.tar.gz
6 files changed, 84 insertions, 30 deletions
diff --git a/ChangeLog b/ChangeLog
index 2e120f8..d1821c3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -72,6 +72,18 @@ Version 8.10 03-Jun-2010
 18. If the last data line in a file for pcretest does not have a newline on
     the end, a newline was missing in the output. 
     
+19. The default pcre_chartables.c file recognizes only ASCII characters (values 
+    less than 128) in its various bitmaps. However, there is a facility for 
+    generating tables according to the current locale when PCRE is compiled. It 
+    turns out that in some environments, 0x85 and 0xa0, which are Unicode space 
+    characters, are recognized by isspace() and therefore were getting set in 
+    these tables. This caused a problem in UTF-8 mode when pcre_study() was
+    used to create a list of bytes that can start a match. For \s, it was
+    including 0x85 and 0xa0, which of course cannot start UTF-8 characters. I
+    have changed the code so that only real ASCII characters (less than 128)
+    are set in this case, because \s and the similar escapes are documented as 
+    recognizing only ASCII characters. (When PCRE_UCP is set - see 9 above - 
+    the code is different altogether.)
 
 
 Version 8.02 19-Mar-2010
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 3c6e44b..09677aa 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -3109,8 +3109,16 @@ for (;;)
         while (current_subject < end_subject)
           {
           register unsigned int c = *current_subject;
-          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
-            else break;
+          if ((start_bits[c/8] & (1 << (c&7))) == 0) 
+            {
+            current_subject++;
+#ifdef SUPPORT_UTF8
+            if (utf8)
+              while(current_subject < end_subject && 
+                    (*current_subject & 0xc0) == 0x80) current_subject++;
+#endif            
+            }
+          else break;
           }
         }
       }
diff --git a/pcre_exec.c b/pcre_exec.c
index 6d630c8..cdd66ce 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -5959,8 +5959,16 @@ for(;;)
       while (start_match < end_subject)
         {
         register unsigned int c = *start_match;
-        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
-          else break;
+        if ((start_bits[c/8] & (1 << (c&7))) == 0) 
+          {
+          start_match++;
+#ifdef SUPPORT_UTF8
+          if (utf8)
+            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+              start_match++;
+#endif            
+          }
+        else break;
         }
       }
     }   /* Starting optimizations */
diff --git a/pcre_study.c b/pcre_study.c
index e473fdd..51457a7 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -519,6 +519,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
 {
 register int c;
 int yield = SSB_DONE;
+int table_limit = utf8? 16:32;
 
 #if 0
 /* ========================================================================= */
@@ -676,13 +677,14 @@ do
       case OP_HSPACE:
       SET_BIT(0x09);
       SET_BIT(0x20);
-      SET_BIT(0xA0);
       if (utf8)
         {
+        SET_BIT(0xC2);  /* For U+00A0 */ 
         SET_BIT(0xE1);  /* For U+1680, U+180E */
         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
         SET_BIT(0xE3);  /* For U+3000 */
         }
+      else SET_BIT(0xA0);
       try_next = FALSE;
       break;
 
@@ -692,24 +694,33 @@ do
       SET_BIT(0x0B);
       SET_BIT(0x0C);
       SET_BIT(0x0D);
-      SET_BIT(0x85);
-      if (utf8) SET_BIT(0xE2);    /* For U+2028, U+2029 */
+      if (utf8) 
+        { 
+        SET_BIT(0xC2);  /* For U+0085 */ 
+        SET_BIT(0xE2);  /* For U+2028, U+2029 */
+        } 
+      else SET_BIT(0x85);
       try_next = FALSE;
       break;
 
       /* Single character types set the bits and stop. Note that if PCRE_UCP
       is set, we do not see these op codes because \d etc are converted to
       properties. Therefore, these apply in the case when only ASCII characters
-      are recognized to match the types. */
+      are recognized to match the types. In UTF-8 mode, we must restrict 
+      ourselves to bytes less than 128, as otherwise there can be confusion 
+      with bytes in the middle of UTF-8 characters. (In a "traditional" 
+      environment, the tables will only recognize ASCII characters anyway, but 
+      in at least one Windows environment, bits for some higher bytes were set in 
+      the tables.) */
 
       case OP_NOT_DIGIT:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         start_bits[c] |= ~cd->cbits[c+cbit_digit];
       try_next = FALSE;
       break;
 
       case OP_DIGIT:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         start_bits[c] |= cd->cbits[c+cbit_digit];
       try_next = FALSE;
       break;
@@ -718,7 +729,7 @@ do
       discard it. */
 
       case OP_NOT_WHITESPACE:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         {
         int d = cd->cbits[c+cbit_space];
         if (c == 1) d &= ~0x08;
@@ -731,7 +742,7 @@ do
       discard it. */
 
       case OP_WHITESPACE:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         {
         int d = cd->cbits[c+cbit_space];
         if (c == 1) d &= ~0x08;
@@ -741,13 +752,13 @@ do
       break;
 
       case OP_NOT_WORDCHAR:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         start_bits[c] |= ~cd->cbits[c+cbit_word];
       try_next = FALSE;
       break;
 
       case OP_WORDCHAR:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         start_bits[c] |= cd->cbits[c+cbit_word];
       try_next = FALSE;
       break;
@@ -789,13 +800,14 @@ do
         case OP_HSPACE:
         SET_BIT(0x09);
         SET_BIT(0x20);
-        SET_BIT(0xA0);
         if (utf8)
           {
+          SET_BIT(0xC2);  /* For U+00A0 */ 
           SET_BIT(0xE1);  /* For U+1680, U+180E */
           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
           SET_BIT(0xE3);  /* For U+3000 */
           }
+        else SET_BIT(0xA0);
         break;
 
         case OP_ANYNL:
@@ -804,17 +816,21 @@ do
         SET_BIT(0x0B);
         SET_BIT(0x0C);
         SET_BIT(0x0D);
-        SET_BIT(0x85);
-        if (utf8) SET_BIT(0xE2);    /* For U+2028, U+2029 */
+        if (utf8) 
+          {
+          SET_BIT(0xC2);  /* For U+0085 */ 
+          SET_BIT(0xE2);  /* For U+2028, U+2029 */
+          } 
+        else SET_BIT(0x85);
         break;
 
         case OP_NOT_DIGIT:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           start_bits[c] |= ~cd->cbits[c+cbit_digit];
         break;
 
         case OP_DIGIT:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           start_bits[c] |= cd->cbits[c+cbit_digit];
         break;
 
@@ -822,7 +838,7 @@ do
         discard it. */
 
         case OP_NOT_WHITESPACE:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           {
           int d = cd->cbits[c+cbit_space];
           if (c == 1) d &= ~0x08;
@@ -834,7 +850,7 @@ do
         discard it. */
 
         case OP_WHITESPACE:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           {
           int d = cd->cbits[c+cbit_space];
           if (c == 1) d &= ~0x08;
@@ -843,12 +859,12 @@ do
         break;
 
         case OP_NOT_WORDCHAR:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           start_bits[c] |= ~cd->cbits[c+cbit_word];
         break;
 
         case OP_WORDCHAR:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           start_bits[c] |= cd->cbits[c+cbit_word];
         break;
         }
diff --git a/testdata/testinput5 b/testdata/testinput5
index 71dff04..156aa03 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -777,4 +777,6 @@ can't tell the difference.) --/
     
 /\v+A/SI8
 
+/\s?xxx\s/8SI
+
 /-- End of testinput5 --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index aa8ebfa..37ec599 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2090,13 +2090,13 @@ Options: utf8
 No first char
 No need char
 Subject length lower bound = 1
-Starting byte set: \x09 \x20 \xa0 \xe1 \xe2 \xe3 
+Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3 
     ABC\x{09}
  0: \x{09}
     ABC\x{20}
  0:  
     ABC\x{a0}
- 0: \xa0
+ 0: \x{a0}
     ABC\x{1680}
  0: \x{1680}
     ABC\x{180e}
@@ -2124,7 +2124,7 @@ Options: utf8
 No first char
 No need char
 Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
     ABC\x{0a}
  0: \x{0a}
     ABC\x{0b}
@@ -2134,7 +2134,7 @@ Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
     ABC\x{0d}
  0: \x{0d}
     ABC\x{85}
- 0: \x85
+ 0: \x{85}
     ABC\x{2028}
  0: \x{2028}
 
@@ -2152,7 +2152,7 @@ Options: utf8
 No first char
 No need char
 Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
 
 /\h*A/SI8
 Capturing subpattern count = 0
@@ -2160,7 +2160,7 @@ Options: utf8
 No first char
 Need char = 'A'
 Subject length lower bound = 1
-Starting byte set: \x09 \x20 A \xa0 \xe1 \xe2 \xe3 
+Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 
     CDBABC
  0: A
     
@@ -2170,6 +2170,14 @@ Options: utf8
 No first char
 Need char = 'A'
 Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
+
+/\s?xxx\s/8SI
+Capturing subpattern count = 0
+Options: utf8
+No first char
+Need char = 'x'
+Subject length lower bound = 4
+Starting byte set: \x09 \x0a \x0c \x0d \x20 x 
 
 /-- End of testinput5 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-06-09 19:30:57 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-06-09 19:30:57 +0000
commit	ccf5f782818c0ab97070ba43f1d41188fd3c252c (patch)
tree	c8ac7b66a820a73bcb7524aa17d7a71ae3b1b82d
parent	84fb9259ee23fe98b0595dac5b7fac1f65574650 (diff)
download	pcre-ccf5f782818c0ab97070ba43f1d41188fd3c252c.tar.gz