diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2010-06-13 21:35:04 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2010-06-13 21:35:04 +0000 |
commit | d940107b6f55a1f47fb03ddb48e6246175ba669b (patch) | |
tree | b8a88629ff4c4d8a502de861d64786299c53abbb | |
parent | ccf5f782818c0ab97070ba43f1d41188fd3c252c (diff) | |
download | pcre-d940107b6f55a1f47fb03ddb48e6246175ba669b.tar.gz |
A more correct fix for the chartables bug with UTF-8 and non-std tables.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@539 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | configure.ac | 4 | ||||
-rw-r--r-- | maint/README | 6 | ||||
-rw-r--r-- | maint/pcre_chartables.c.non-standard | 138 | ||||
-rw-r--r-- | pcre_study.c | 148 |
5 files changed, 244 insertions, 57 deletions
@@ -81,9 +81,8 @@ Version 8.10 03-Jun-2010 used to create a list of bytes that can start a match. For \s, it was including 0x85 and 0xa0, which of course cannot start UTF-8 characters. I have changed the code so that only real ASCII characters (less than 128) - are set in this case because the \s etc escapes are documented as - recognizing only ASCII characters. (When PCRE_UCP is set - see 9 above - - the code is different altogether.) + and the correct starting bytes for UTF-8 encodings are set in this case. + (When PCRE_UCP is set - see 9 above - the code is different altogether.) Version 8.02 19-Mar-2010 diff --git a/configure.ac b/configure.ac index 499ae87..f88ad39 100644 --- a/configure.ac +++ b/configure.ac @@ -10,8 +10,8 @@ dnl be defined as -RC2, for example. For real releases, it should be empty. m4_define(pcre_major, [8]) m4_define(pcre_minor, [10]) -m4_define(pcre_prerelease, [-RC1]) -m4_define(pcre_date, [2010-06-03]) +m4_define(pcre_prerelease, [-RC2]) +m4_define(pcre_date, [2010-06-11]) # Libtool shared library interface versions (current:revision:age) m4_define(libpcre_version, [0:1:0]) diff --git a/maint/README b/maint/README index f6c9102..06b2883 100644 --- a/maint/README +++ b/maint/README @@ -35,6 +35,12 @@ MultiStage2.py A Python script that generates the file pcre_ucd.c from three Unicode web site. Run this script in the "maint" directory. The generated file contains the tables for a 2-stage lookup of Unicode properties. + +pcre_chartables.c.non-standard + This is a set of character tables that came from a Windows + system. It has characters greater than 128 that are set as + spaces, amongst other things. I kept it so that it can be + used for testing from time to time. README This file. diff --git a/maint/pcre_chartables.c.non-standard b/maint/pcre_chartables.c.non-standard new file mode 100644 index 0000000..c18d49d --- /dev/null +++ b/maint/pcre_chartables.c.non-standard @@ -0,0 +1,138 @@ +const unsigned char _pcre_default_tables[] = { +0,1,2,3,4,5,6,7, +8,9,10,11,12,13,14,15, +16,17,18,19,20,21,22,23, +24,25,26,27,28,29,30,31, +32,33,34,35,36,37,38,39, +40,41,42,43,44,45,46,47, +48,49,50,51,52,53,54,55, +56,57,58,59,60,61,62,63, +64,97,98,99,100,101,102,103, +104,105,106,107,108,109,110,111, +112,113,114,115,116,117,118,119, +120,121,122,91,92,93,94,95, +96,97,98,99,100,101,102,103, +104,105,106,107,108,109,110,111, +112,113,114,115,116,117,118,119, +120,121,122,123,124,125,126,127, +128,129,130,131,132,133,134,135, +136,137,138,139,140,141,142,143, +144,145,146,147,148,149,150,151, +152,153,154,155,156,157,158,159, +160,161,162,163,164,165,166,167, +168,169,170,171,172,173,174,175, +176,177,178,179,180,181,182,183, +184,185,186,187,188,189,190,191, +224,225,226,227,228,229,230,231, +232,233,234,235,236,237,238,239, +240,241,242,243,244,245,246,215, +248,249,250,251,252,253,254,223, +224,225,226,227,228,229,230,231, +232,233,234,235,236,237,238,239, +240,241,242,243,244,245,246,247, +248,249,250,251,252,253,254,255, +0,1,2,3,4,5,6,7, +8,9,10,11,12,13,14,15, +16,17,18,19,20,21,22,23, +24,25,26,27,28,29,30,31, +32,33,34,35,36,37,38,39, +40,41,42,43,44,45,46,47, +48,49,50,51,52,53,54,55, +56,57,58,59,60,61,62,63, +64,97,98,99,100,101,102,103, +104,105,106,107,108,109,110,111, +112,113,114,115,116,117,118,119, +120,121,122,91,92,93,94,95, +96,65,66,67,68,69,70,71, +72,73,74,75,76,77,78,79, +80,81,82,83,84,85,86,87, +88,89,90,123,124,125,126,127, +128,129,130,131,132,133,134,135, +136,137,138,139,140,141,142,143, +144,145,146,147,148,149,150,151, +152,153,154,155,156,157,158,159, +160,161,162,163,164,165,166,167, +168,169,170,171,172,173,174,175, +176,177,178,179,180,181,182,183, +184,185,186,187,188,189,190,191, +224,225,226,227,228,229,230,231, +232,233,234,235,236,237,238,239, +240,241,242,243,244,245,246,215, +248,249,250,251,252,253,254,223, +192,193,194,195,196,197,198,199, +200,201,202,203,204,205,206,207, +208,209,210,211,212,213,214,247, +216,217,218,219,220,221,222,255, +0,62,0,0,1,0,0,0, +0,0,0,0,0,0,0,0, +32,0,0,0,1,0,0,0, +0,0,0,0,0,0,0,0, +0,0,0,0,0,0,255,3, +126,0,0,0,126,0,0,0, +0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0, +0,0,0,0,0,0,255,3, +0,0,0,0,0,0,0,0, +0,0,0,0,0,0,12,2, +0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0, +254,255,255,7,0,0,0,0, +0,0,0,0,0,0,0,0, +255,255,127,127,0,0,0,0, +0,0,0,0,0,0,0,0, +0,0,0,0,254,255,255,7, +0,0,0,0,0,4,32,4, +0,0,0,128,255,255,127,255, +0,0,0,0,0,0,255,3, +254,255,255,135,254,255,255,7, +0,0,0,0,0,4,44,6, +255,255,127,255,255,255,127,255, +0,0,0,0,254,255,255,255, +255,255,255,255,255,255,255,127, +0,0,0,0,254,255,255,255, +255,255,255,255,255,255,255,255, +0,2,0,0,255,255,255,255, +255,255,255,255,255,255,255,127, +0,0,0,0,255,255,255,255, +255,255,255,255,255,255,255,255, +0,0,0,0,254,255,0,252, +1,0,0,248,1,0,0,120, +0,0,0,0,254,255,255,255, +0,0,128,0,0,0,128,0, +255,255,255,255,0,0,0,0, +0,0,0,0,0,0,0,128, +255,255,255,255,0,0,0,0, +0,0,0,0,0,0,0,0, +128,0,0,0,0,0,0,0, +0,1,1,0,1,1,0,0, +0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0, +1,0,0,0,128,0,0,0, +128,128,128,128,0,0,128,0, +28,28,28,28,28,28,28,28, +28,28,0,0,0,0,0,128, +0,26,26,26,26,26,26,18, +18,18,18,18,18,18,18,18, +18,18,18,18,18,18,18,18, +18,18,18,128,128,0,128,16, +0,26,26,26,26,26,26,18, +18,18,18,18,18,18,18,18, +18,18,18,18,18,18,18,18, +18,18,18,128,128,0,0,0, +0,0,0,0,0,1,0,0, +0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0, +1,0,0,0,0,0,0,0, +0,0,18,0,0,0,0,0, +0,0,20,20,0,18,0,0, +0,20,18,0,0,0,0,0, +18,18,18,18,18,18,18,18, +18,18,18,18,18,18,18,18, +18,18,18,18,18,18,18,0, +18,18,18,18,18,18,18,18, +18,18,18,18,18,18,18,18, +18,18,18,18,18,18,18,18, +18,18,18,18,18,18,18,0, +18,18,18,18,18,18,18,18 +}; diff --git a/pcre_study.c b/pcre_study.c index 51457a7..71d2526 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -490,6 +490,77 @@ return p + 1; /************************************************* +* Set bits for a positive character type * +*************************************************/ + +/* This function sets starting bits for a character type. In UTF-8 mode, we can +only do a direct setting for bytes less than 128, as otherwise there can be +confusion with bytes in the middle of UTF-8 characters. In a "traditional" +environment, the tables will only recognize ASCII characters anyway, but in at +least one Windows environment, some higher bytes bits were set in the tables. +So we deal with that case by considering the UTF-8 encoding. + +Arguments: + start_bits the starting bitmap + cbit type the type of character wanted + table_limit 32 for non-UTF-8; 16 for UTF-8 + cd the block with char table pointers + +Returns: nothing +*/ + +static void +set_type_bits(uschar *start_bits, int cbit_type, int table_limit, + compile_data *cd) +{ +register int c; +for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; +if (table_limit == 32) return; +for (c = 128; c < 256; c++) + { + if ((cd->cbits[c/8] & (1 << (c&7))) != 0) + { + uschar buff[8]; + (void)_pcre_ord2utf8(c, buff); + SET_BIT(buff[0]); + } + } +} + + +/************************************************* +* Set bits for a negative character type * +*************************************************/ + +/* This function sets starting bits for a negative character type such as \D. +In UTF-8 mode, we can only do a direct setting for bytes less than 128, as +otherwise there can be confusion with bytes in the middle of UTF-8 characters. +Unlike in the positive case, where we can set appropriate starting bits for +specific high-valued UTF-8 characters, in this case we have to set the bits for +all high-valued characters. The lowest is 0xc2, but we overkill by starting at +0xc0 (192) for simplicity. + +Arguments: + start_bits the starting bitmap + cbit type the type of character wanted + table_limit 32 for non-UTF-8; 16 for UTF-8 + cd the block with char table pointers + +Returns: nothing +*/ + +static void +set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit, + compile_data *cd) +{ +register int c; +for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; +if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; +} + + + +/************************************************* * Create bitmap of starting bytes * *************************************************/ @@ -705,64 +776,48 @@ do /* Single character types set the bits and stop. Note that if PCRE_UCP is set, we do not see these op codes because \d etc are converted to - properties. Therefore, these apply in the case when only ASCII characters - are recognized to match the types. In UTF-8 mode, we must restrict - ourselves to bytes less than 128, as otherwise there can be confusion - with bytes in the middle of UTF-8 characters. (In a "traditional" - environment, the tables will only recognize ASCII characters anyway, but - in at least one Windows environment, some higher bytes bits were set in - the tables.) */ + properties. Therefore, these apply in the case when only characters less + than 256 are recognized to match the types. */ case OP_NOT_DIGIT: - for (c = 0; c < table_limit; c++) - start_bits[c] |= ~cd->cbits[c+cbit_digit]; + set_nottype_bits(start_bits, cbit_digit, table_limit, cd); try_next = FALSE; break; case OP_DIGIT: - for (c = 0; c < table_limit; c++) - start_bits[c] |= cd->cbits[c+cbit_digit]; + set_type_bits(start_bits, cbit_digit, table_limit, cd); try_next = FALSE; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + ensure it is set as not whitespace. */ case OP_NOT_WHITESPACE: - for (c = 0; c < table_limit; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= ~d; - } + set_nottype_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] |= 0x08; try_next = FALSE; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + not set it from the table. */ case OP_WHITESPACE: - for (c = 0; c < table_limit; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= d; - } + c = start_bits[1]; /* Save in case it was already set */ + set_type_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] = (start_bits[1] & ~0x08) | c; try_next = FALSE; break; case OP_NOT_WORDCHAR: - for (c = 0; c < table_limit; c++) - start_bits[c] |= ~cd->cbits[c+cbit_word]; + set_nottype_bits(start_bits, cbit_word, table_limit, cd); try_next = FALSE; break; case OP_WORDCHAR: - for (c = 0; c < table_limit; c++) - start_bits[c] |= cd->cbits[c+cbit_word]; + set_type_bits(start_bits, cbit_word, table_limit, cd); try_next = FALSE; break; - + /* One or more character type fudges the pointer and restarts, knowing it will hit a single character type and stop there. */ @@ -825,47 +880,36 @@ do break; case OP_NOT_DIGIT: - for (c = 0; c < table_limit; c++) - start_bits[c] |= ~cd->cbits[c+cbit_digit]; + set_nottype_bits(start_bits, cbit_digit, table_limit, cd); break; case OP_DIGIT: - for (c = 0; c < table_limit; c++) - start_bits[c] |= cd->cbits[c+cbit_digit]; + set_type_bits(start_bits, cbit_digit, table_limit, cd); break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + ensure it gets set as not whitespace. */ case OP_NOT_WHITESPACE: - for (c = 0; c < table_limit; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= ~d; - } + set_nottype_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] |= 0x08; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + avoid setting it. */ case OP_WHITESPACE: - for (c = 0; c < table_limit; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= d; - } + c = start_bits[1]; /* Save in case it was already set */ + set_type_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] = (start_bits[1] & ~0x08) | c; break; case OP_NOT_WORDCHAR: - for (c = 0; c < table_limit; c++) - start_bits[c] |= ~cd->cbits[c+cbit_word]; + set_nottype_bits(start_bits, cbit_word, table_limit, cd); break; case OP_WORDCHAR: - for (c = 0; c < table_limit; c++) - start_bits[c] |= cd->cbits[c+cbit_word]; + set_type_bits(start_bits, cbit_word, table_limit, cd); break; } |