summary refs log tree commit diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-09 19:30:57 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-09 19:30:57 +0000
commit ccf5f782818c0ab97070ba43f1d41188fd3c252c (patch)
tree c8ac7b66a820a73bcb7524aa17d7a71ae3b1b82d
parent 84fb9259ee23fe98b0595dac5b7fac1f65574650 (diff)
download pcre-ccf5f782818c0ab97070ba43f1d41188fd3c252c.tar.gz
Fix pcre_study() problem with non-C-locale chartables in UTF-8 mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@538 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 12
-rw-r--r-- pcre_dfa_exec.c 12
-rw-r--r-- pcre_exec.c 12
-rw-r--r-- pcre_study.c 54
-rw-r--r-- testdata/testinput5 2
-rw-r--r-- testdata/testoutput5 22
6 files changed, 84 insertions, 30 deletions
diff --git a/ChangeLog b/ChangeLog
index 2e120f8..d1821c3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -72,6 +72,18 @@ Version 8.10 03-Jun-2010
18. If the last data line in a file for pcretest does not have a newline on
the end, a newline was missing in the output.
+19. The default pcre_chartables.c file recognizes only ASCII characters (values
+ less than 128) in its various bitmaps. However, there is a facility for
+ generating tables according to the current locale when PCRE is compiled. It
+ turns out that in some environments, 0x85 and 0xa0, which are Unicode space
+ characters, are recognized by isspace() and therefore were getting set in
+ these tables. This caused a problem in UTF-8 mode when pcre_study() was
+ used to create a list of bytes that can start a match. For \s, it was
+ including 0x85 and 0xa0, which of course cannot start UTF-8 characters. I
+ have changed the code so that only real ASCII characters (less than 128)
+ are set in this case because the \s etc escapes are documented as
+ recognizing only ASCII characters. (When PCRE_UCP is set - see 9 above -
+ the code is different altogether.)
Version 8.02 19-Mar-2010
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 3c6e44b..09677aa 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -3109,8 +3109,16 @@ for (;;)
while (current_subject < end_subject)
{
register unsigned int c = *current_subject;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ current_subject++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ while(current_subject < end_subject &&
+ (*current_subject & 0xc0) == 0x80) current_subject++;
+#endif
+ }
+ else break;
}
}
}
diff --git a/pcre_exec.c b/pcre_exec.c
index 6d630c8..cdd66ce 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -5959,8 +5959,16 @@ for(;;)
while (start_match < end_subject)
{
register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ start_match++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+#endif
+ }
+ else break;
}
}
} /* Starting optimizations */
diff --git a/pcre_study.c b/pcre_study.c
index e473fdd..51457a7 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -519,6 +519,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
{
register int c;
int yield = SSB_DONE;
+int table_limit = utf8? 16:32;
#if 0
/* ========================================================================= */
@@ -676,13 +677,14 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- SET_BIT(0xA0);
if (utf8)
{
+ SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
}
+ else SET_BIT(0xA0);
try_next = FALSE;
break;
@@ -692,24 +694,33 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- SET_BIT(0x85);
- if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
try_next = FALSE;
break;
/* Single character types set the bits and stop. Note that if PCRE_UCP
is set, we do not see these op codes because \d etc are converted to
properties. Therefore, these apply in the case when only ASCII characters
- are recognized to match the types. */
+ are recognized to match the types. In UTF-8 mode, we must restrict
+ ourselves to bytes less than 128, as otherwise there can be confusion
+ with bytes in the middle of UTF-8 characters. (In a "traditional"
+ environment, the tables will only recognize ASCII characters anyway, but
+ in at least one Windows environment, bits for some higher bytes were set in
+ the tables.) */
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
try_next = FALSE;
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_digit];
try_next = FALSE;
break;
@@ -718,7 +729,7 @@ do
discard it. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -731,7 +742,7 @@ do
discard it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -741,13 +752,13 @@ do
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_word];
try_next = FALSE;
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_word];
try_next = FALSE;
break;
@@ -789,13 +800,14 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- SET_BIT(0xA0);
if (utf8)
{
+ SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
}
+ else SET_BIT(0xA0);
break;
case OP_ANYNL:
@@ -804,17 +816,21 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- SET_BIT(0x85);
- if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
break;
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_digit];
break;
@@ -822,7 +838,7 @@ do
discard it. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -834,7 +850,7 @@ do
discard it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -843,12 +859,12 @@ do
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_word];
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_word];
break;
}
diff --git a/testdata/testinput5 b/testdata/testinput5
index 71dff04..156aa03 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -777,4 +777,6 @@ can't tell the difference.) --/
/\v+A/SI8
+/\s?xxx\s/8SI
+
/-- End of testinput5 --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index aa8ebfa..37ec599 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2090,13 +2090,13 @@ Options: utf8
No first char
No need char
Subject length lower bound = 1
-Starting byte set: \x09 \x20 \xa0 \xe1 \xe2 \xe3
+Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3
ABC\x{09}
0: \x{09}
ABC\x{20}
0:
ABC\x{a0}
- 0: \xa0
+ 0: \x{a0}
ABC\x{1680}
0: \x{1680}
ABC\x{180e}
@@ -2124,7 +2124,7 @@ Options: utf8
No first char
No need char
Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
ABC\x{0a}
0: \x{0a}
ABC\x{0b}
@@ -2134,7 +2134,7 @@ Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
ABC\x{0d}
0: \x{0d}
ABC\x{85}
- 0: \x85
+ 0: \x{85}
ABC\x{2028}
0: \x{2028}
@@ -2152,7 +2152,7 @@ Options: utf8
No first char
No need char
Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
/\h*A/SI8
Capturing subpattern count = 0
@@ -2160,7 +2160,7 @@ Options: utf8
No first char
Need char = 'A'
Subject length lower bound = 1
-Starting byte set: \x09 \x20 A \xa0 \xe1 \xe2 \xe3
+Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3
CDBABC
0: A
@@ -2170,6 +2170,14 @@ Options: utf8
No first char
Need char = 'A'
Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
+
+/\s?xxx\s/8SI
+Capturing subpattern count = 0
+Options: utf8
+No first char
+Need char = 'x'
+Subject length lower bound = 4
+Starting byte set: \x09 \x0a \x0c \x0d \x20 x
/-- End of testinput5 --/