diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2010-06-09 19:30:57 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2010-06-09 19:30:57 +0000 |
commit | ccf5f782818c0ab97070ba43f1d41188fd3c252c (patch) | |
tree | c8ac7b66a820a73bcb7524aa17d7a71ae3b1b82d | |
parent | 84fb9259ee23fe98b0595dac5b7fac1f65574650 (diff) | |
download | pcre-ccf5f782818c0ab97070ba43f1d41188fd3c252c.tar.gz |
Fix pcre_study() problem with non-C-locale chartables in UTF-8 mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@538 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 12 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 12 | ||||
-rw-r--r-- | pcre_exec.c | 12 | ||||
-rw-r--r-- | pcre_study.c | 54 | ||||
-rw-r--r-- | testdata/testinput5 | 2 | ||||
-rw-r--r-- | testdata/testoutput5 | 22 |
6 files changed, 84 insertions, 30 deletions
@@ -72,6 +72,18 @@ Version 8.10 03-Jun-2010 18. If the last data line in a file for pcretest does not have a newline on the end, a newline was missing in the output. +19. The default pcre_chartables.c file recognizes only ASCII characters (values + less than 128) in its various bitmaps. However, there is a facility for + generating tables according to the current locale when PCRE is compiled. It + turns out that in some environments, 0x85 and 0xa0, which are Unicode space + characters, are recognized by isspace() and therefore were getting set in + these tables. This caused a problem in UTF-8 mode when pcre_study() was + used to create a list of bytes that can start a match. For \s, it was + including 0x85 and 0xa0, which of course cannot start UTF-8 characters. I + have changed the code so that only real ASCII characters (less than 128) + are set in this case because the \s etc escapes are documented as + recognizing only ASCII characters. (When PCRE_UCP is set - see 9 above - + the code is different altogether.) Version 8.02 19-Mar-2010 diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index 3c6e44b..09677aa 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -3109,8 +3109,16 @@ for (;;) while (current_subject < end_subject) { register unsigned int c = *current_subject; - if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) + { + current_subject++; +#ifdef SUPPORT_UTF8 + if (utf8) + while(current_subject < end_subject && + (*current_subject & 0xc0) == 0x80) current_subject++; +#endif + } + else break; } } } diff --git a/pcre_exec.c b/pcre_exec.c index 6d630c8..cdd66ce 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -5959,8 +5959,16 @@ for(;;) while (start_match < end_subject) { register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) + { + start_match++; +#ifdef SUPPORT_UTF8 + if (utf8) + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; +#endif + } + else break; } } } /* Starting optimizations */ diff --git a/pcre_study.c b/pcre_study.c index e473fdd..51457a7 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -519,6 +519,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, { register int c; int yield = SSB_DONE; +int table_limit = utf8? 16:32; #if 0 /* ========================================================================= */ @@ -676,13 +677,14 @@ do case OP_HSPACE: SET_BIT(0x09); SET_BIT(0x20); - SET_BIT(0xA0); if (utf8) { + SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ } + else SET_BIT(0xA0); try_next = FALSE; break; @@ -692,24 +694,33 @@ do SET_BIT(0x0B); SET_BIT(0x0C); SET_BIT(0x0D); - SET_BIT(0x85); - if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */ + if (utf8) + { + SET_BIT(0xC2); /* For U+0085 */ + SET_BIT(0xE2); /* For U+2028, U+2029 */ + } + else SET_BIT(0x85); try_next = FALSE; break; /* Single character types set the bits and stop. Note that if PCRE_UCP is set, we do not see these op codes because \d etc are converted to properties. Therefore, these apply in the case when only ASCII characters - are recognized to match the types. */ + are recognized to match the types. In UTF-8 mode, we must restrict + ourselves to bytes less than 128, as otherwise there can be confusion + with bytes in the middle of UTF-8 characters. (In a "traditional" + environment, the tables will only recognize ASCII characters anyway, but + in at least one Windows environment, some higher bytes bits were set in + the tables.) */ case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_digit]; try_next = FALSE; break; case OP_DIGIT: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_digit]; try_next = FALSE; break; @@ -718,7 +729,7 @@ do discard it. */ case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; @@ -731,7 +742,7 @@ do discard it. */ case OP_WHITESPACE: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; @@ -741,13 +752,13 @@ do break; case OP_NOT_WORDCHAR: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_word]; try_next = FALSE; break; case OP_WORDCHAR: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_word]; try_next = FALSE; break; @@ -789,13 +800,14 @@ do case OP_HSPACE: SET_BIT(0x09); SET_BIT(0x20); - SET_BIT(0xA0); if (utf8) { + SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ } + else SET_BIT(0xA0); break; case OP_ANYNL: @@ -804,17 +816,21 @@ do SET_BIT(0x0B); SET_BIT(0x0C); SET_BIT(0x0D); - SET_BIT(0x85); - if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */ + if (utf8) + { + SET_BIT(0xC2); /* For U+0085 */ + SET_BIT(0xE2); /* For U+2028, U+2029 */ + } + else SET_BIT(0x85); break; case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_digit]; break; @@ -822,7 +838,7 @@ do discard it. */ case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; @@ -834,7 +850,7 @@ do discard it. */ case OP_WHITESPACE: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; @@ -843,12 +859,12 @@ do break; case OP_NOT_WORDCHAR: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_word]; break; case OP_WORDCHAR: - for (c = 0; c < 32; c++) + for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_word]; break; } diff --git a/testdata/testinput5 b/testdata/testinput5 index 71dff04..156aa03 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -777,4 +777,6 @@ can't tell the difference.) --/ /\v+A/SI8 +/\s?xxx\s/8SI + /-- End of testinput5 --/ diff --git a/testdata/testoutput5 b/testdata/testoutput5 index aa8ebfa..37ec599 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -2090,13 +2090,13 @@ Options: utf8 No first char No need char Subject length lower bound = 1 -Starting byte set: \x09 \x20 \xa0 \xe1 \xe2 \xe3 +Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3 ABC\x{09} 0: \x{09} ABC\x{20} 0: ABC\x{a0} - 0: \xa0 + 0: \x{a0} ABC\x{1680} 0: \x{1680} ABC\x{180e} @@ -2124,7 +2124,7 @@ Options: utf8 No first char No need char Subject length lower bound = 1 -Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 +Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 ABC\x{0a} 0: \x{0a} ABC\x{0b} @@ -2134,7 +2134,7 @@ Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 ABC\x{0d} 0: \x{0d} ABC\x{85} - 0: \x85 + 0: \x{85} ABC\x{2028} 0: \x{2028} @@ -2152,7 +2152,7 @@ Options: utf8 No first char No need char Subject length lower bound = 2 -Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 +Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 /\h*A/SI8 Capturing subpattern count = 0 @@ -2160,7 +2160,7 @@ Options: utf8 No first char Need char = 'A' Subject length lower bound = 1 -Starting byte set: \x09 \x20 A \xa0 \xe1 \xe2 \xe3 +Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 CDBABC 0: A @@ -2170,6 +2170,14 @@ Options: utf8 No first char Need char = 'A' Subject length lower bound = 2 -Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 +Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 + +/\s?xxx\s/8SI +Capturing subpattern count = 0 +Options: utf8 +No first char +Need char = 'x' +Subject length lower bound = 4 +Starting byte set: \x09 \x0a \x0c \x0d \x20 x /-- End of testinput5 --/ |