diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-11-28 20:39:30 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-11-28 20:39:30 +0000 |
commit | 00cc776fe74e502bc0774ceca2bb3f11283e189a (patch) | |
tree | 1f19206b0bbf8f9f4d2af9a8a34d0198c8a70808 | |
parent | 4d715f1b6035e095635067d977ad56948ff4e4c2 (diff) | |
download | pcre-00cc776fe74e502bc0774ceca2bb3f11283e189a.tar.gz |
Make character ranges 16 bit friendly
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@770 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | Makefile.am | 3 | ||||
-rw-r--r-- | pcre16_xclass.c | 45 | ||||
-rw-r--r-- | pcre_compile.c | 266 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 2 | ||||
-rw-r--r-- | pcre_exec.c | 67 | ||||
-rw-r--r-- | pcre_internal.h | 25 | ||||
-rw-r--r-- | pcre_jit_compile.c | 46 | ||||
-rw-r--r-- | pcre_printint.src | 14 | ||||
-rw-r--r-- | pcre_study.c | 18 | ||||
-rw-r--r-- | pcre_xclass.c | 7 |
10 files changed, 319 insertions, 174 deletions
diff --git a/Makefile.am b/Makefile.am index 440d699..7d5de86 100644 --- a/Makefile.am +++ b/Makefile.am @@ -219,7 +219,8 @@ libpcre16_la_SOURCES = \ pcre16_tables.c \ pcre16_try_flipped.c \ pcre16_utf16_utils.c \ - pcre16_valid_utf16.c + pcre16_valid_utf16.c \ + pcre16_xclass.c ## This file is generated as part of the building process, so don't distribute. nodist_libpcre16_la_SOURCES = \ diff --git a/pcre16_xclass.c b/pcre16_xclass.c new file mode 100644 index 0000000..acb5631 --- /dev/null +++ b/pcre16_xclass.c @@ -0,0 +1,45 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2011 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 + +#include "pcre_xclass.c" + +/* End of pcre16_xclass.c */ diff --git a/pcre_compile.c b/pcre_compile.c index 1664506..46d881d 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -1764,15 +1764,15 @@ for (;;) /* Check a class for variable quantification */ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 case OP_XCLASS: - cc += GET(cc, 1) - 33; + cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS]; /* Fall through */ #endif case OP_CLASS: case OP_NCLASS: - cc += 33; + cc += PRIV(OP_lengths)[OP_CLASS]; switch (*cc) { @@ -2310,7 +2310,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); case OP_CLASS: case OP_NCLASS: - ccode = code + 33; + ccode = code + PRIV(OP_lengths)[OP_CLASS]; #ifdef SUPPORT_UTF8 CHECK_CLASS_REPEAT: @@ -3299,22 +3299,27 @@ const pcre_uchar *nestptr = NULL; pcre_uchar *previous = NULL; pcre_uchar *previous_callout = NULL; pcre_uchar *save_hwm = NULL; -pcre_uchar classbits[32]; +pcre_uint8 classbits[32]; /* We can fish out the UTF-8 setting once and for all into a BOOL, but we must not do this for other options (e.g. PCRE_EXTENDED) because they may change dynamically as we process the pattern. */ #ifdef SUPPORT_UTF8 -BOOL class_utf8; BOOL utf8 = (options & PCRE_UTF8) != 0; -pcre_uint8 *class_utf8data; -pcre_uint8 *class_utf8data_base; pcre_uint8 utf8_char[6]; #else BOOL utf8 = FALSE; #endif +/* Helper variables for OP_XCLASS opcode (for characters > 255). */ + +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 +BOOL xclass; +pcre_uchar *class_uchardata; +pcre_uchar *class_uchardata_base; +#endif + #ifdef PCRE_DEBUG if (lengthptr != NULL) DPRINTF((">> start branch\n")); #endif @@ -3620,8 +3625,7 @@ for (;; ptr++) { if (ptr[1] == CHAR_E) ptr++; - else if (STRNCMP_UC_C8(ptr + 1, - STR_Q STR_BACKSLASH STR_E, 3) == 0) + else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0) ptr += 3; else break; @@ -3665,10 +3669,10 @@ for (;; ptr++) memset(classbits, 0, 32 * sizeof(pcre_uint8)); -#ifdef SUPPORT_UTF8 - class_utf8 = FALSE; /* No chars >= 256 */ - class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ - class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 + xclass = FALSE; /* No chars >= 256 */ + class_uchardata = code + LINK_SIZE + 2; /* For UTF-8 items */ + class_uchardata_base = class_uchardata; /* For resetting in pass 1 */ #endif /* Process characters until ] is reached. By writing this as a "do" it @@ -3684,18 +3688,19 @@ for (;; ptr++) { /* Braces are required because the */ GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ } +#endif - /* In the pre-compile phase, accumulate the length of any UTF-8 extra +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 + /* In the pre-compile phase, accumulate the length of any extra data and reset the pointer. This is so that very large classes that - contain a zillion UTF-8 characters no longer overwrite the work space + contain a zillion > 255 characters no longer overwrite the work space (which is on the stack). */ if (lengthptr != NULL) { - *lengthptr += class_utf8data - class_utf8data_base; - class_utf8data = class_utf8data_base; + *lengthptr += class_uchardata - class_uchardata_base; + class_uchardata = class_uchardata_base; } - #endif /* Inside \Q...\E everything is literal except \E */ @@ -3896,23 +3901,23 @@ for (;; ptr++) SETBIT(classbits, 0x09); /* VT */ SETBIT(classbits, 0x20); /* SPACE */ SETBIT(classbits, 0xa0); /* NSBP */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf8) { - class_utf8 = TRUE; - *class_utf8data++ = XCL_SINGLE; - class_utf8data += PRIV(ord2utf8)(0x1680, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += PRIV(ord2utf8)(0x180e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x2000, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x200A, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += PRIV(ord2utf8)(0x202f, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += PRIV(ord2utf8)(0x205f, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += PRIV(ord2utf8)(0x3000, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata); } #endif continue; @@ -3931,31 +3936,31 @@ for (;; ptr++) classbits[c] |= x; } -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf8) { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x0100, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x167f, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x1681, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x180d, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x180f, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x1fff, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x200B, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x202e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x2030, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x205e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x2060, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x2fff, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x3001, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x7fffffff, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x167f, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata); } #endif continue; @@ -3966,13 +3971,13 @@ for (;; ptr++) SETBIT(classbits, 0x0c); /* FF */ SETBIT(classbits, 0x0d); /* CR */ SETBIT(classbits, 0x85); /* NEL */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf8) { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x2028, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x2029, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata); } #endif continue; @@ -3994,16 +3999,16 @@ for (;; ptr++) classbits[c] |= x; } -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf8) { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x0100, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x2027, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(0x2029, class_utf8data); - class_utf8data += PRIV(ord2utf8)(0x7fffffff, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata); + class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata); } #endif continue; @@ -4016,11 +4021,11 @@ for (;; ptr++) int pdata; int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); if (ptype < 0) goto FAILED; - class_utf8 = TRUE; - *class_utf8data++ = ((-c == ESC_p) != negated)? + xclass = TRUE; + *class_uchardata++ = ((-c == ESC_p) != negated)? XCL_PROP : XCL_NOTPROP; - *class_utf8data++ = ptype; - *class_utf8data++ = pdata; + *class_uchardata++ = ptype; + *class_uchardata++ = pdata; class_charcount -= 2; /* Not a < 256 character */ continue; } @@ -4042,7 +4047,7 @@ for (;; ptr++) } /* Fall through if we have a single character (c >= 0). This may be - greater than 256 in UTF-8 mode. */ + greater than 256 mode. */ } /* End of backslash handling */ @@ -4140,10 +4145,15 @@ for (;; ptr++) matching for characters > 127 is available only if UCP support is available. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) +#endif +#ifndef COMPILE_PCRE8 + if (d > 255) +#endif +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 { - class_utf8 = TRUE; + xclass = TRUE; /* With UCP support, we can find the other case equivalents of the relevant characters. There may be several ranges. Optimize how @@ -4176,14 +4186,14 @@ for (;; ptr++) if (occ == ocd) { - *class_utf8data++ = XCL_SINGLE; + *class_uchardata++ = XCL_SINGLE; } else { - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(occ, class_utf8data); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf8)(occ, class_uchardata); } - class_utf8data += PRIV(ord2utf8)(ocd, class_utf8data); + class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata); } } #endif /* SUPPORT_UCP */ @@ -4191,30 +4201,38 @@ for (;; ptr++) /* Now record the original range, possibly modified for UCP caseless overlapping ranges. */ - *class_utf8data++ = XCL_RANGE; - class_utf8data += PRIV(ord2utf8)(c, class_utf8data); - class_utf8data += PRIV(ord2utf8)(d, class_utf8data); + *class_uchardata++ = XCL_RANGE; +#ifdef SUPPORT_UTF + class_uchardata += PRIV(ord2utf8)(c, class_uchardata); + class_uchardata += PRIV(ord2utf8)(d, class_uchardata); +#else + *class_uchardata++ = c; + *class_uchardata++ = d; +#endif /* With UCP support, we are done. Without UCP support, there is no - caseless matching for UTF-8 characters > 127; we can use the bit map - for the smaller ones. */ + caseless matching for UTF characters > 127; we can use the bit map + for the smaller ones. As for 16 bit characters without UTF, we + can still use */ #ifdef SUPPORT_UCP continue; /* With next character in the class */ #else +#ifdef SUPPORT_UTF if ((options & PCRE_CASELESS) == 0 || c > 127) continue; - /* Adjust upper limit and fall through to set up the map */ - d = 127; - +#else + if (c > 255) continue; + /* Adjust upper limit and fall through to set up the map */ + d = 255; +#endif /* SUPPORT_UTF */ #endif /* SUPPORT_UCP */ } -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF8 || COMPILE_PCRE16 */ - /* We use the bit map for all cases when not in UTF-8 mode; else - ranges that lie entirely within 0-127 when there is UCP support; else - for partial ranges without UCP support. */ + /* We use the bit map for 8 bit mode, or when the characters fall + partially or entirely to [0-255] ([0-127] for UCP) ranges. */ class_charcount += d - c + 1; class_lastchar = d; @@ -4242,12 +4260,21 @@ for (;; ptr++) /* Handle a character that cannot go in the bit map */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) +#endif +#ifndef COMPILE_PCRE8 + if (c > 255) +#endif +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 { - class_utf8 = TRUE; - *class_utf8data++ = XCL_SINGLE; - class_utf8data += PRIV(ord2utf8)(c, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_SINGLE; +#ifdef SUPPORT_UTF + class_uchardata += PRIV(ord2utf8)(c, class_uchardata); +#else + *class_uchardata++ = c; +#endif #ifdef SUPPORT_UCP if ((options & PCRE_CASELESS) != 0) @@ -4255,8 +4282,8 @@ for (;; ptr++) unsigned int othercase; if ((othercase = UCD_OTHERCASE(c)) != c) { - *class_utf8data++ = XCL_SINGLE; - class_utf8data += PRIV(ord2utf8)(othercase, class_utf8data); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata); } } #endif /* SUPPORT_UCP */ @@ -4312,11 +4339,13 @@ for (;; ptr++) char if this item is first, whatever repeat count may follow. In the case of reqbyte, save the previous value for reinstating. */ -#ifdef SUPPORT_UTF8 - if (class_charcount == 1 && !class_utf8 && +#ifdef SUPPORT_UTF + if (class_charcount == 1 && !xclass && (!utf8 || !negate_class || class_lastchar < 128)) -#else +#elif defined COMPILE_PCRE8 if (class_charcount == 1) +#else + if (class_charcount == 1 && !xclass) #endif { zeroreqbyte = reqbyte; @@ -4364,13 +4393,18 @@ for (;; ptr++) be listed) there are no characters < 256, we can omit the bitmap in the actual compiled code. */ -#ifdef SUPPORT_UTF8 - if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0)) +#ifdef SUPPORT_UTF + if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0)) +#endif +#ifndef COMPILE_PCRE8 + if (xclass && !should_flip_negation) +#endif +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 { - *class_utf8data++ = XCL_END; /* Marks the end of extra data */ + *class_uchardata++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; code += LINK_SIZE; - *code = negate_class? XCL_NOT : 0; + *code = negate_class? XCL_NOT:0; /* If the map is required, move up the extra data to make room for it; otherwise just move the code pointer to the end of the extra data. */ @@ -4378,11 +4412,12 @@ for (;; ptr++) if (class_charcount > 0) { *code++ |= XCL_MAP; - memmove(code + 32, code, class_utf8data - code); + memmove(code + (32 / sizeof(pcre_uchar)), code, + IN_UCHARS(class_uchardata - code)); memcpy(code, classbits, 32); - code = class_utf8data + 32; + code = class_uchardata + (32 / sizeof(pcre_uchar)); } - else code = class_utf8data; + else code = class_uchardata; /* Now fill in the complete length of the item */ @@ -4398,16 +4433,13 @@ for (;; ptr++) negating it if necessary. */ *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; - if (negate_class) - { - if (lengthptr == NULL) /* Save time in the pre-compile phase */ - for (c = 0; c < 32; c++) code[c] = ~classbits[c]; - } - else + if (lengthptr == NULL) /* Save time in the pre-compile phase */ { + if (negate_class) + for (c = 0; c < 32; c++) classbits[c] = ~classbits[c]; memcpy(code, classbits, 32); } - code += 32; + code += 32 / sizeof(pcre_uchar); break; @@ -4761,7 +4793,7 @@ for (;; ptr++) else if (*previous == OP_CLASS || *previous == OP_NCLASS || -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8 *previous == OP_XCLASS || #endif *previous == OP_REF || diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index ea5b00c..0793897 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -2426,7 +2426,7 @@ for (;;) if (codevalue != OP_XCLASS) { - ecode = code + 33; + ecode = code + 1 + (32 / sizeof(pcre_uchar)); if (clen > 0) { isinclass = (c > 255)? (codevalue == OP_NCLASS) : diff --git a/pcre_exec.c b/pcre_exec.c index 41a2482..e532513 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -2706,8 +2706,11 @@ for (;;) case OP_NCLASS: case OP_CLASS: { + /* The data variable is saved across frames, so the byte map needs to + be stored there. */ +#define BYTE_MAP ((pcre_uint8 *)data) data = ecode + 1; /* Save for matching */ - ecode += 33; /* Advance past the item */ + ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */ switch (*ecode) { @@ -2740,7 +2743,7 @@ for (;;) /* First, ensure the minimum number of matches are present. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF /* UTF-8 mode */ if (utf8) { @@ -2757,9 +2760,7 @@ for (;;) if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); } else - { - if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); - } + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } else @@ -2774,7 +2775,14 @@ for (;;) MRRETURN(MATCH_NOMATCH); } c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); +#ifndef COMPILE_PCRE8 + if (c > 255) + { + if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); + } + else +#endif + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } @@ -2788,7 +2796,7 @@ for (;;) if (minimize) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF /* UTF-8 mode */ if (utf8) { @@ -2808,9 +2816,7 @@ for (;;) if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); } else - { - if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); - } + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } else @@ -2828,7 +2834,14 @@ for (;;) MRRETURN(MATCH_NOMATCH); } c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); +#ifndef COMPILE_PCRE8 + if (c > 255) + { + if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); + } + else +#endif + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2840,8 +2853,8 @@ for (;;) { pp = eptr; -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ +#ifdef SUPPORT_UTF + /* UTF mode */ if (utf8) { for (i = min; i < max; i++) @@ -2858,9 +2871,7 @@ for (;;) if (op == OP_CLASS) break; } else - { - if ((data[c/8] & (1 << (c&7))) == 0) break; - } + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr += len; } for (;;) @@ -2873,7 +2884,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = min; i < max; i++) { @@ -2883,7 +2894,14 @@ for (;;) break; } c = *eptr; - if ((data[c/8] & (1 << (c&7))) == 0) break; +#ifndef COMPILE_PCRE8 + if (c > 255) + { + if (op == OP_CLASS) break; + } + else +#endif + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr++; } while (eptr >= pp) @@ -2896,6 +2914,7 @@ for (;;) MRRETURN(MATCH_NOMATCH); } +#undef BYTE_MAP } /* Control never gets here */ @@ -2904,7 +2923,7 @@ for (;;) when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8 mode, because Unicode properties are supported in non-UTF-8 mode. */ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: { data = ecode + 1 + LINK_SIZE; /* Save for matching */ @@ -2991,7 +3010,11 @@ for (;;) SCHECK_PARTIAL(); break; } +#ifdef SUPPORT_UTF GETCHARLENTEST(c, eptr, len); +#else + c = *eptr; +#endif if (!PRIV(xclass)(c, data)) break; eptr += len; } @@ -3000,7 +3023,9 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ +#ifdef SUPPORT_UTF if (utf8) BACKCHAR(eptr); +#endif } MRRETURN(MATCH_NOMATCH); } @@ -6353,7 +6378,11 @@ for(;;) { while (start_match < end_subject) { +#ifdef COMPILE_PCRE8 register unsigned int c = *start_match; +#else + register unsigned int c = *start_match & 0xff; +#endif if ((start_bits[c/8] & (1 << (c&7))) == 0) { start_match++; diff --git a/pcre_internal.h b/pcre_internal.h index 0228207..b9f8dd4 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -71,6 +71,21 @@ script prevents both being selected, but not everybody uses "configure". */ #define SUPPORT_UTF8 1 #endif +/* If SUPPORT_UCP is defined, SUPPORT_UTF16 must also be defined. The +"configure" script ensures this, but not everybody uses "configure". */ + +#if defined SUPPORT_UCP && defined COMPILE_PCRE16 && !defined SUPPORT_UTF16 +#define SUPPORT_UTF16 1 +#endif + +/* This macro is defined if either UTF-8 or UTF-16 support or both are +enabled. */ + +#if defined SUPPORT_UTF8 || defined SUPPORT_UTF16 +/* Unicode Transformation Format is enabled. */ +#define SUPPORT_UTF 1 +#endif + /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef inline, and there are *still* stupid compilers about that don't like indented pre-processor statements, or at least there were when I first wrote this. After @@ -1325,7 +1340,7 @@ only. */ #define PT_WORD 8 /* Word - L plus N plus underscore */ /* Flag bits and data types for the extended class (OP_XCLASS) for classes that -contain UTF-8 characters with values greater than 255. */ +contain characters with values greater than 255. */ #define XCL_NOT 0x01 /* Flag: this is a negative class */ #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ @@ -1522,8 +1537,8 @@ enum { OP_CLASS, /* 106 Match a character class, chars < 256 only */ OP_NCLASS, /* 107 Same, but the bitmap was created from a negative class - the difference is relevant only when a - UTF-8 character > 255 is encountered. */ - OP_XCLASS, /* 108 Extended class for handling UTF-8 chars within the + character > 255 is encountered. */ + OP_XCLASS, /* 108 Extended class for handling > 255 chars within the class. This does both positive and negative. */ OP_REF, /* 109 Match a back reference, casefully */ OP_REFI, /* 110 Match a back reference, caselessly */ @@ -1704,8 +1719,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */ /* Character class & ref repeats */ \ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ 1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \ - 33, /* CLASS */ \ - 33, /* NCLASS */ \ + 1+(32/sizeof(pcre_uchar)), /* CLASS */ \ + 1+(32/sizeof(pcre_uchar)), /* NCLASS */ \ 0, /* XCLASS - variable length */ \ 1+IMM2_SIZE, /* REF */ \ 1+IMM2_SIZE, /* REFI */ \ diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index 5fed4a1..7a2c41d 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -592,9 +592,9 @@ switch(*cc) case OP_CLASS: case OP_NCLASS: - return cc + 33; + return cc + 1 + 32 / sizeof(pcre_uchar); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: return cc + GET(cc, 1); #endif @@ -1879,11 +1879,14 @@ if (firstline) start = LABEL(); leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -#ifdef SUPPORT_UTF8 +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); +#ifdef SUPPORT_UTF if (common->utf8) OP1(SLJIT_MOV, TMP3, 0, TMP1, 0); #endif +#ifndef COMPILE_PCRE8 +OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xff); +#endif OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), start_bits); @@ -1891,11 +1894,11 @@ OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); found = JUMP(SLJIT_C_NOT_ZERO); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf8) OP1(SLJIT_MOV, TMP1, 0, TMP3, 0); #endif -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); #ifdef SUPPORT_UTF8 if (common->utf8) { @@ -2435,7 +2438,7 @@ while (utf8length > 0); return cc; } -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 #define SET_TYPE_OFFSET(value) \ if ((value) != typeoffset) \ @@ -2482,8 +2485,12 @@ read_char(common); if ((*cc++ & XCL_MAP) != 0) { OP1(SLJIT_MOV, TMP3, 0, TMP1, 0); +#ifndef COMPILE_PCRE8 + jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); +#elif defined SUPPORT_UTF8 if (common->utf8) jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); +#endif OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); @@ -2492,13 +2499,17 @@ if ((*cc++ & XCL_MAP) != 0) OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); add_jump(compiler, list, JUMP(SLJIT_C_NOT_ZERO)); +#ifndef COMPILE_PCRE8 + JUMPHERE(jump); +#elif defined SUPPORT_UTF8 if (common->utf8) JUMPHERE(jump); +#endif OP1(SLJIT_MOV, TMP1, 0, TMP3, 0); #ifdef SUPPORT_UCP charsaved = TRUE; #endif - cc += 32; + cc += 32 / sizeof(pcre_uchar); } /* Scanning the necessary info. */ @@ -3179,9 +3190,12 @@ switch(type) case OP_NCLASS: check_input_end(common, fallbacks); read_char(common); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 jump[0] = NULL; +#ifdef SUPPORT_UTF8 + /* This check can only be skipped in pure 8 bit mode. */ if (common->utf8) +#endif { jump[0] = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); if (type == OP_CLASS) @@ -3197,13 +3211,13 @@ switch(type) OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); add_jump(compiler, fallbacks, JUMP(SLJIT_C_ZERO)); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 if (jump[0] != NULL) JUMPHERE(jump[0]); #endif - return cc + 32; + return cc + 32 / sizeof(pcre_uchar); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 case OP_XCLASS: compile_xclass_hotpath(common, cc + LINK_SIZE, fallbacks); return cc + GET(cc, 0) - 1; @@ -4725,7 +4739,7 @@ else SLJIT_ASSERT(*opcode >= OP_CLASS || *opcode <= OP_XCLASS); *type = *opcode; cc++; - class_len = (*type < OP_XCLASS) ? 33 : GET(cc, 0); + class_len = (*type < OP_XCLASS) ? (1 + (32 / sizeof(pcre_uchar))) : GET(cc, 0); *opcode = cc[class_len - 1]; if (*opcode >= OP_CRSTAR && *opcode <= OP_CRMINQUERY) { @@ -5133,13 +5147,13 @@ while (cc < ccend) case OP_CLASS: case OP_NCLASS: - if (cc[33] >= OP_CRSTAR && cc[33] <= OP_CRMINRANGE) + if (cc[1 + (32 / sizeof(pcre_uchar))] >= OP_CRSTAR && cc[1 + (32 / sizeof(pcre_uchar))] <= OP_CRMINRANGE) cc = compile_iterator_hotpath(common, cc, parent); else cc = compile_char1_hotpath(common, *cc, cc + 1, parent->top != NULL ? &parent->top->nextfallbacks : &parent->topfallbacks); break; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 case OP_XCLASS: if (*(cc + GET(cc, 1)) >= OP_CRSTAR && *(cc + GET(cc, 1)) <= OP_CRMINRANGE) cc = compile_iterator_hotpath(common, cc, parent); @@ -5994,7 +6008,9 @@ while (current) case OP_TYPEPOSUPTO: case OP_CLASS: case OP_NCLASS: +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: +#endif compile_iterator_fallbackpath(common, current); break; diff --git a/pcre_printint.src b/pcre_printint.src index a5670e5..5a9f15d 100644 --- a/pcre_printint.src +++ b/pcre_printint.src @@ -471,9 +471,9 @@ for(;;) fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1], code[2])); break; - /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in - having this code always here, and it makes it less messy without all those - #ifdefs. */ + /* OP_XCLASS can only occur in UTF or PCRE16 modes. However, there's no + harm in having this code always here, and it makes it less messy without + all those #ifdefs. */ case OP_CLASS: case OP_NCLASS: @@ -481,6 +481,7 @@ for(;;) { int i, min, max; BOOL printmap; + pcre_uint8 *map; fprintf(f, " ["); @@ -501,13 +502,14 @@ for(;;) if (printmap) { + map = (pcre_uint8 *)ccode; for (i = 0; i < 256; i++) { - if ((ccode[i/8] & (1 << (i&7))) != 0) + if ((map[i/8] & (1 << (i&7))) != 0) { int j; for (j = i+1; j < 256; j++) - if ((ccode[j/8] & (1 << (j&7))) == 0) break; + if ((map[j/8] & (1 << (j&7))) == 0) break; if (i == '-' || i == ']') fprintf(f, "\\"); if (PRINTABLE(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i); @@ -521,7 +523,7 @@ for(;;) i = j; } } - ccode += 32; + ccode += 32 / sizeof(pcre_uchar); } /* For an XCLASS there is always some additional data */ diff --git a/pcre_study.c b/pcre_study.c index 5253c49..661627d 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -322,15 +322,15 @@ for (;;) /* Check a class for variable quantification */ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8 case OP_XCLASS: - cc += GET(cc, 1) - 33; + cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS]; /* Fall through */ #endif case OP_CLASS: case OP_NCLASS: - cc += 33; + cc += PRIV(OP_lengths)[OP_CLASS]; switch (*cc) { @@ -789,7 +789,9 @@ do case OP_SOM: case OP_THEN: case OP_THEN_ARG: +#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8 case OP_XCLASS: +#endif return SSB_FAIL; /* We can ignore word boundary tests. */ @@ -1134,7 +1136,9 @@ do case OP_CLASS: { + pcre_uint8 *map; tcode++; + map = (pcre_uint8 *)tcode; /* In UTF-8 mode, the bits in a bit map correspond to character values, not to byte values. However, the bit map we are constructing is @@ -1145,10 +1149,10 @@ do #ifdef SUPPORT_UTF8 if (utf8) { - for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; + for (c = 0; c < 16; c++) start_bits[c] |= map[c]; for (c = 128; c < 256; c++) { - if ((tcode[c/8] && (1 << (c&7))) != 0) + if ((map[c/8] && (1 << (c&7))) != 0) { int d = (c >> 6) | 0xc0; /* Set bit for this starter */ start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ @@ -1162,13 +1166,13 @@ do else #endif { - for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; + for (c = 0; c < 32; c++) start_bits[c] |= map[c]; } /* Advance past the bit map, and act on what follows. For a zero minimum repeat, continue; otherwise stop processing. */ - tcode += 32; + tcode += 32 / sizeof(pcre_uchar); switch (*tcode) { case OP_CRSTAR: diff --git a/pcre_xclass.c b/pcre_xclass.c index 024d71d..cdb9d07 100644 --- a/pcre_xclass.c +++ b/pcre_xclass.c @@ -75,15 +75,16 @@ additional data. */ if (c < 256) { - if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) - return !negated; /* char found */ + if ((*data & XCL_MAP) != 0 && + (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0) + return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ -if ((*data++ & XCL_MAP) != 0) data += 32; +if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar); while ((t = *data++) != XCL_END) { |