summary | refs | log | tree | commit | diff
path: root/pcre_study.c
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-09 19:30:57 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-09 19:30:57 +0000
commit ccf5f782818c0ab97070ba43f1d41188fd3c252c (patch)
tree c8ac7b66a820a73bcb7524aa17d7a71ae3b1b82d /pcre_study.c
parent 84fb9259ee23fe98b0595dac5b7fac1f65574650 (diff)
download pcre-ccf5f782818c0ab97070ba43f1d41188fd3c252c.tar.gz
Fix pcre_study() problem with non-C-locale chartables in UTF-8 mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@538 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_study.c')
-rw-r--r-- pcre_study.c 54
1 files changed, 35 insertions, 19 deletions
diff --git a/pcre_study.c b/pcre_study.c
index e473fdd..51457a7 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -519,6 +519,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
{
register int c;
int yield = SSB_DONE;
+int table_limit = utf8? 16:32;
#if 0
/* ========================================================================= */
@@ -676,13 +677,14 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- SET_BIT(0xA0);
if (utf8)
{
+ SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
}
+ else SET_BIT(0xA0);
try_next = FALSE;
break;
@@ -692,24 +694,33 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- SET_BIT(0x85);
- if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
try_next = FALSE;
break;
/* Single character types set the bits and stop. Note that if PCRE_UCP
is set, we do not see these op codes because \d etc are converted to
properties. Therefore, these apply in the case when only ASCII characters
- are recognized to match the types. */
+ are recognized to match the types. In UTF-8 mode, we must restrict
+ ourselves to bytes less than 128, as otherwise there can be confusion
+ with bytes in the middle of UTF-8 characters. (In a "traditional"
+ environment, the tables will only recognize ASCII characters anyway, but
+ in at least one Windows environment, some bits for higher bytes were set in
+ the tables.) */
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
try_next = FALSE;
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_digit];
try_next = FALSE;
break;
@@ -718,7 +729,7 @@ do
discard it. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -731,7 +742,7 @@ do
discard it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -741,13 +752,13 @@ do
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_word];
try_next = FALSE;
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_word];
try_next = FALSE;
break;
@@ -789,13 +800,14 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- SET_BIT(0xA0);
if (utf8)
{
+ SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
}
+ else SET_BIT(0xA0);
break;
case OP_ANYNL:
@@ -804,17 +816,21 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- SET_BIT(0x85);
- if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
break;
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_digit];
break;
@@ -822,7 +838,7 @@ do
discard it. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -834,7 +850,7 @@ do
discard it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -843,12 +859,12 @@ do
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_word];
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_word];
break;
}