summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-13 21:35:04 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-13 21:35:04 +0000
commit d940107b6f55a1f47fb03ddb48e6246175ba669b (patch)
tree b8a88629ff4c4d8a502de861d64786299c53abbb
parent ccf5f782818c0ab97070ba43f1d41188fd3c252c (diff)
download pcre-d940107b6f55a1f47fb03ddb48e6246175ba669b.tar.gz
A more correct fix for the chartables bug with UTF-8 and non-std tables.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@539 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 5
-rw-r--r-- configure.ac 4
-rw-r--r-- maint/README 6
-rw-r--r-- maint/pcre_chartables.c.non-standard 138
-rw-r--r-- pcre_study.c 148
5 files changed, 244 insertions, 57 deletions
diff --git a/ChangeLog b/ChangeLog
index d1821c3..b1c2853 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -81,9 +81,8 @@ Version 8.10 03-Jun-2010
used to create a list of bytes that can start a match. For \s, it was
including 0x85 and 0xa0, which of course cannot start UTF-8 characters. I
have changed the code so that only real ASCII characters (less than 128)
- are set in this case because the \s etc escapes are documented as
- recognizing only ASCII characters. (When PCRE_UCP is set - see 9 above -
- the code is different altogether.)
+ and the correct starting bytes for UTF-8 encodings are set in this case.
+ (When PCRE_UCP is set - see 9 above - the code is different altogether.)
Version 8.02 19-Mar-2010
diff --git a/configure.ac b/configure.ac
index 499ae87..f88ad39 100644
--- a/configure.ac
+++ b/configure.ac
@@ -10,8 +10,8 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
m4_define(pcre_major, [8])
m4_define(pcre_minor, [10])
-m4_define(pcre_prerelease, [-RC1])
-m4_define(pcre_date, [2010-06-03])
+m4_define(pcre_prerelease, [-RC2])
+m4_define(pcre_date, [2010-06-11])
# Libtool shared library interface versions (current:revision:age)
m4_define(libpcre_version, [0:1:0])
diff --git a/maint/README b/maint/README
index f6c9102..06b2883 100644
--- a/maint/README
+++ b/maint/README
@@ -35,6 +35,12 @@ MultiStage2.py A Python script that generates the file pcre_ucd.c from three
Unicode web site. Run this script in the "maint" directory.
The generated file contains the tables for a 2-stage lookup
of Unicode properties.
+
+pcre_chartables.c.non-standard
+ This is a set of character tables that came from a Windows
+ system. It has characters greater than 128 that are set as
+ spaces, amongst other things. I kept it so that it can be
+ used for testing from time to time.
README This file.
diff --git a/maint/pcre_chartables.c.non-standard b/maint/pcre_chartables.c.non-standard
new file mode 100644
index 0000000..c18d49d
--- /dev/null
+++ b/maint/pcre_chartables.c.non-standard
@@ -0,0 +1,138 @@
+const unsigned char _pcre_default_tables[] = { /* 1088 bytes, in four consecutive sections */
+0,1,2,3,4,5,6,7, /* Bytes 0-255: lower-casing map; upper case letters map to lower case (65 -> 97) */
+8,9,10,11,12,13,14,15,
+16,17,18,19,20,21,22,23,
+24,25,26,27,28,29,30,31,
+32,33,34,35,36,37,38,39,
+40,41,42,43,44,45,46,47,
+48,49,50,51,52,53,54,55,
+56,57,58,59,60,61,62,63,
+64,97,98,99,100,101,102,103,
+104,105,106,107,108,109,110,111,
+112,113,114,115,116,117,118,119,
+120,121,122,91,92,93,94,95,
+96,97,98,99,100,101,102,103,
+104,105,106,107,108,109,110,111,
+112,113,114,115,116,117,118,119,
+120,121,122,123,124,125,126,127,
+128,129,130,131,132,133,134,135,
+136,137,138,139,140,141,142,143,
+144,145,146,147,148,149,150,151,
+152,153,154,155,156,157,158,159,
+160,161,162,163,164,165,166,167,
+168,169,170,171,172,173,174,175,
+176,177,178,179,180,181,182,183,
+184,185,186,187,188,189,190,191,
+224,225,226,227,228,229,230,231,
+232,233,234,235,236,237,238,239,
+240,241,242,243,244,245,246,215,
+248,249,250,251,252,253,254,223,
+224,225,226,227,228,229,230,231,
+232,233,234,235,236,237,238,239,
+240,241,242,243,244,245,246,247,
+248,249,250,251,252,253,254,255,
+0,1,2,3,4,5,6,7, /* Bytes 256-511: case-flipping map; each letter maps to its other case (65 -> 97, 97 -> 65) */
+8,9,10,11,12,13,14,15,
+16,17,18,19,20,21,22,23,
+24,25,26,27,28,29,30,31,
+32,33,34,35,36,37,38,39,
+40,41,42,43,44,45,46,47,
+48,49,50,51,52,53,54,55,
+56,57,58,59,60,61,62,63,
+64,97,98,99,100,101,102,103,
+104,105,106,107,108,109,110,111,
+112,113,114,115,116,117,118,119,
+120,121,122,91,92,93,94,95,
+96,65,66,67,68,69,70,71,
+72,73,74,75,76,77,78,79,
+80,81,82,83,84,85,86,87,
+88,89,90,123,124,125,126,127,
+128,129,130,131,132,133,134,135,
+136,137,138,139,140,141,142,143,
+144,145,146,147,148,149,150,151,
+152,153,154,155,156,157,158,159,
+160,161,162,163,164,165,166,167,
+168,169,170,171,172,173,174,175,
+176,177,178,179,180,181,182,183,
+184,185,186,187,188,189,190,191,
+224,225,226,227,228,229,230,231,
+232,233,234,235,236,237,238,239,
+240,241,242,243,244,245,246,215,
+248,249,250,251,252,253,254,223,
+192,193,194,195,196,197,198,199,
+200,201,202,203,204,205,206,207,
+208,209,210,211,212,213,214,247,
+216,217,218,219,220,221,222,255,
+0,62,0,0,1,0,0,0, /* Bytes 512-831: 320 bytes of bit maps, presumably ten 32-byte class maps (confirm); the first has 9-13, 32, 0x85 and 0xa0 set */
+0,0,0,0,0,0,0,0,
+32,0,0,0,1,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,255,3,
+126,0,0,0,126,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,255,3,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,12,2,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+254,255,255,7,0,0,0,0,
+0,0,0,0,0,0,0,0,
+255,255,127,127,0,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,254,255,255,7,
+0,0,0,0,0,4,32,4,
+0,0,0,128,255,255,127,255,
+0,0,0,0,0,0,255,3,
+254,255,255,135,254,255,255,7,
+0,0,0,0,0,4,44,6,
+255,255,127,255,255,255,127,255,
+0,0,0,0,254,255,255,255,
+255,255,255,255,255,255,255,127,
+0,0,0,0,254,255,255,255,
+255,255,255,255,255,255,255,255,
+0,2,0,0,255,255,255,255,
+255,255,255,255,255,255,255,127,
+0,0,0,0,255,255,255,255,
+255,255,255,255,255,255,255,255,
+0,0,0,0,254,255,0,252,
+1,0,0,248,1,0,0,120,
+0,0,0,0,254,255,255,255,
+0,0,128,0,0,0,128,0,
+255,255,255,255,0,0,0,0,
+0,0,0,0,0,0,0,128,
+255,255,255,255,0,0,0,0,
+0,0,0,0,0,0,0,0,
+128,0,0,0,0,0,0,0, /* Bytes 832-1087: one byte per character value; presumably per-character type flags (confirm) */
+0,1,1,0,1,1,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+1,0,0,0,128,0,0,0,
+128,128,128,128,0,0,128,0,
+28,28,28,28,28,28,28,28,
+28,28,0,0,0,0,0,128,
+0,26,26,26,26,26,26,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,128,128,0,128,16,
+0,26,26,26,26,26,26,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,128,128,0,0,0,
+0,0,0,0,0,1,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,
+1,0,0,0,0,0,0,0,
+0,0,18,0,0,0,0,0,
+0,0,20,20,0,18,0,0,
+0,20,18,0,0,0,0,0,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,0,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,18,
+18,18,18,18,18,18,18,0,
+18,18,18,18,18,18,18,18
+};
diff --git a/pcre_study.c b/pcre_study.c
index 51457a7..71d2526 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -490,6 +490,77 @@ return p + 1;
/*************************************************
+* Set bits for a positive character type *
+*************************************************/
+
+/* This function sets starting bits for a character type. In UTF-8 mode, we can
+only do a direct setting for bytes less than 128, as otherwise there can be
+confusion with bytes in the middle of UTF-8 characters. In a "traditional"
+environment, the tables will only recognize ASCII characters anyway, but in at
+least one Windows environment, some higher byte bits were set in the tables. We
+deal with that case by setting the first byte of each UTF-8 encoding instead.
+
+Arguments:
+  start_bits     the starting bitmap
+  cbit_type      the type of character wanted (offset of its map in cd->cbits)
+  table_limit    32 for non-UTF-8; 16 for UTF-8
+  cd             the block with char table pointers
+
+Returns:         nothing
+*/
+
+static void
+set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
+  compile_data *cd)
+{
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+if (table_limit == 32) return;
+for (c = 128; c < 256; c++)   /* Test the map for cbit_type, not the first map */
+  {
+  if ((cd->cbits[c/8 + cbit_type] & (1 << (c&7))) != 0)
+    {
+    uschar buff[8];
+    (void)_pcre_ord2utf8(c, buff);
+    SET_BIT(buff[0]);
+    }
+  }
+}
+
+
+/*************************************************
+* Set bits for a negative character type *
+*************************************************/
+
+/* Set the starting bits for a negated character type such as \D. In UTF-8 mode
+only bytes below 128 can be taken directly from the table, because higher
+values could be confused with bytes in the middle of UTF-8 characters. In the
+positive case the starting bytes of specific high-valued UTF-8 characters can
+be set, but here the bits for all high-valued characters have to be set. The
+lowest such byte is 0xc2; for simplicity we overkill and start at 0xc0 (192).
+
+Arguments:
+  start_bits     the starting bitmap
+  cbit_type      the type of character wanted
+  table_limit    32 for non-UTF-8; 16 for UTF-8
+  cd             the block with char table pointers
+
+Returns:         nothing
+*/
+
+static void
+set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+  compile_data *cd)
+{
+register int i;
+for (i = 0; i < table_limit; i++) start_bits[i] |= ~cd->cbits[cbit_type+i];
+if (table_limit == 32) return;
+for (i = 24; i < 32; i++) start_bits[i] = 0xff;   /* Bytes 0xc0 to 0xff */
+}
+
+
+
+/*************************************************
* Create bitmap of starting bytes *
*************************************************/
@@ -705,64 +776,48 @@ do
/* Single character types set the bits and stop. Note that if PCRE_UCP
is set, we do not see these op codes because \d etc are converted to
- properties. Therefore, these apply in the case when only ASCII characters
- are recognized to match the types. In UTF-8 mode, we must restrict
- ourselves to bytes less than 128, as otherwise there can be confusion
- with bytes in the middle of UTF-8 characters. (In a "traditional"
- environment, the tables will only recognize ASCII characters anyway, but
- in at least one Windows environment, some higher bytes bits were set in
- the tables.) */
+ properties. Therefore, these apply in the case when only characters less
+ than 256 are recognized to match the types. */
case OP_NOT_DIGIT:
- for (c = 0; c < table_limit; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
case OP_DIGIT:
- for (c = 0; c < table_limit; c++)
- start_bits[c] |= cd->cbits[c+cbit_digit];
+ set_type_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ ensure it is set as not whitespace. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < table_limit; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= ~d;
- }
+ set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] |= 0x08;
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ avoid setting it from the table. */
case OP_WHITESPACE:
- for (c = 0; c < table_limit; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= d;
- }
+ c = start_bits[1]; /* Save in case it was already set */
+ set_type_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] = (start_bits[1] & ~0x08) | c;
try_next = FALSE;
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < table_limit; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_word];
+ set_nottype_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
case OP_WORDCHAR:
- for (c = 0; c < table_limit; c++)
- start_bits[c] |= cd->cbits[c+cbit_word];
+ set_type_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
-
+
/* One or more character type fudges the pointer and restarts, knowing
it will hit a single character type and stop there. */
@@ -825,47 +880,36 @@ do
break;
case OP_NOT_DIGIT:
- for (c = 0; c < table_limit; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
break;
case OP_DIGIT:
- for (c = 0; c < table_limit; c++)
- start_bits[c] |= cd->cbits[c+cbit_digit];
+ set_type_bits(start_bits, cbit_digit, table_limit, cd);
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ ensure it gets set as not whitespace. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < table_limit; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= ~d;
- }
+ set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] |= 0x08;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ avoid setting it. */
case OP_WHITESPACE:
- for (c = 0; c < table_limit; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= d;
- }
+ c = start_bits[1]; /* Save in case it was already set */
+ set_type_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] = (start_bits[1] & ~0x08) | c;
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < table_limit; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_word];
+ set_nottype_bits(start_bits, cbit_word, table_limit, cd);
break;
case OP_WORDCHAR:
- for (c = 0; c < table_limit; c++)
- start_bits[c] |= cd->cbits[c+cbit_word];
+ set_type_bits(start_bits, cbit_word, table_limit, cd);
break;
}