diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-09-23 16:50:00 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-09-23 16:50:00 +0000 |
commit | 05d03818ae1cf4196b9316714f0fd199dfb1b1e2 (patch) | |
tree | 63b12f728462a7a07d2a2f31d1631048a0de1137 /pcre_compile.c | |
parent | 515816cae335b59b2d784ae9bb13711455a7c192 (diff) | |
download | pcre-05d03818ae1cf4196b9316714f0fd199dfb1b1e2.tar.gz |
Update character class handling to use new character case information; rework
\h, \H, \v, and \V to use the same apparatus with centrally defined lists.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1045 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_compile.c')
-rw-r--r-- | pcre_compile.c | 824 |
1 files changed, 371 insertions, 453 deletions
diff --git a/pcre_compile.c b/pcre_compile.c index 58be101..facf3ef 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -68,7 +68,7 @@ COMPILE_PCREx macro will already be appropriately set. */ /* Macro for setting individual bits in class bitmaps. */ -#define SETBIT(a,b) a[b/8] |= (1 << (b%8)) +#define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7)) /* Maximum length value to check against when making sure that the integer that holds the compiled pattern length does not overflow. We make it a bit less than @@ -77,6 +77,17 @@ to check them every time. */ #define OFLOW_MAX (INT_MAX - 20) +/* Definitions to allow mutual recursion */ + +static int + add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *, + const pcre_uint32 *, unsigned int); + +static BOOL + compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, + int, int, int *, int *, branch_chain *, compile_data *, int *); + + /************************************************* * Code parameters and static tables * @@ -631,12 +642,6 @@ static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */ #endif -/* Definition to allow mutual recursion */ - -static BOOL - compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int, - int *, int *, branch_chain *, compile_data *, int *); - /************************************************* @@ -2871,9 +2876,10 @@ PUT(previous_callout, 2 + LINK_SIZE, length); *************************************************/ /* This function is passed the start and end of a class range, in UTF-8 mode -with UCP support. It searches up the characters, looking for internal ranges of +with UCP support. It searches up the characters, looking for ranges of characters in the "other" case. Each call returns the next one, updating the -start address. +start address. A character with multiple other cases is returned on its own +with a special return value. 
Arguments: cptr points to starting character value; updated @@ -2881,19 +2887,34 @@ Arguments: ocptr where to put start of othercase range odptr where to put end of othercase range -Yield: TRUE when range returned; FALSE when no more +Yield: -1 when no more + 0 when a range is returned + >0 the CASESET offset for char with multiple other cases + in this case, ocptr contains the original */ -static BOOL +static int get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, unsigned int *odptr) { unsigned int c, othercase, next; +int co; + +/* Find the first character that has an other case. If it has multiple other +cases, return its case offset value. */ for (c = *cptr; c <= d; c++) - { if ((othercase = UCD_OTHERCASE(c)) != c) break; } + { + if ((co = UCD_CASESET(c)) != 0) + { + *ocptr = c++; /* Character that has the set */ + *cptr = c; /* Rest of input range */ + return co; + } + if ((othercase = UCD_OTHERCASE(c)) != c) break; + } -if (c > d) return FALSE; +if (c > d) return -1; /* Reached end of range */ *ocptr = othercase; next = othercase + 1; @@ -2904,10 +2925,9 @@ for (++c; c <= d; c++) next++; } -*odptr = next - 1; -*cptr = c; - -return TRUE; +*odptr = next - 1; /* End of othercase range */ +*cptr = c; /* Rest of input range */ +return 0; } @@ -3357,6 +3377,243 @@ switch(op_code) /************************************************* +* Add a character or range to a class * +*************************************************/ + +/* This function packages up the logic of adding a character or range of +characters to a class. The character values in the arguments will be within the +valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is +mutually recursive with the function immediately below. + +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options word + cd contains pointers to tables etc. 
+ start start of range character + end end of range character + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static int +add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options, + compile_data *cd, unsigned int start, unsigned int end) +{ +unsigned int c; +int n8 = 0; + +/* If caseless matching is required, scan the range and process alternate +cases. In Unicode, there are 8-bit characters that have alternate cases that +are greater than 255 and vice-versa. Sometimes we can just extend the original +range. */ + +if ((options & PCRE_CASELESS) != 0) + { +#ifdef SUPPORT_UCP + if ((options & PCRE_UTF8) != 0) + { + int rc; + unsigned int oc, od; + + options &= ~PCRE_CASELESS; /* Remove for recursive calls */ + c = start; + + while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) + { + /* Handle a single character that has more than one other case. */ + + if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd, + PRIV(ucd_caseless_sets) + rc, oc); + + /* Do nothing if the other case range is within the original range. */ + + else if (oc >= start && od <= end) continue; + + /* Extend the original range if there is overlap, noting that if oc < start, + we can't have od > end because a subrange is always shorter than the basic + range. Otherwise, use a recursive call to add the additional range. */ + + else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ + else if (od > end && oc <= end + 1) end = od; /* Extend upwards */ + else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od); + } + } + else +#endif /* SUPPORT_UCP */ + + /* Not UTF-mode, or no UCP */ + + for (c = start; c <= end && c < 256; c++) + { + SETBIT(classbits, cd->fcc[c]); + n8++; + } + } + +/* Now handle the original range. Adjust the final value according to the bit +length - this means that the same lists of (e.g.) horizontal spaces can be used +in all cases. 
*/ + +#ifdef COMPILE_PCRE8 +#ifdef SUPPORT_UTF + if ((options & PCRE_UTF8) == 0) +#endif + if (end > 0xff) end = 0xff; +#endif + +#ifdef COMPILE_PCRE16 +#ifdef SUPPORT_UTF + if ((options & PCRE_UTF16) == 0) +#endif + if (end > 0xffff) end = 0xffff; +#endif + +/* If all characters are less than 256, use the bit map. Otherwise use extra +data. */ + +if (end < 0x100) + { + for (c = start; c <= end; c++) + { + n8++; + SETBIT(classbits, c); + } + } + +else + { + pcre_uchar *uchardata = *uchardptr; + +#ifdef SUPPORT_UTF + if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */ + { + if (start < end) + { + *uchardata++ = XCL_RANGE; + uchardata += PRIV(ord2utf)(start, uchardata); + uchardata += PRIV(ord2utf)(end, uchardata); + } + else if (start == end) + { + *uchardata++ = XCL_SINGLE; + uchardata += PRIV(ord2utf)(start, uchardata); + } + } + else +#endif /* SUPPORT_UTF */ + + /* Without UTF support, character values are constrained by the bit length, + and can only be > 255 for 16-bit and 32-bit libraries. */ + +#ifdef COMPILE_PCRE8 + {} +#else + if (start < end) + { + *uchardata++ = XCL_RANGE; + *uchardata++ = start; + *uchardata++ = end; + } + else if (start == end) + { + *uchardata++ = XCL_SINGLE; + *uchardata++ = start; + } +#endif + + *uchardptr = uchardata; /* Update extra data pointer */ + } + +return n8; /* Number of 8-bit characters */ +} + + + + +/************************************************* +* Add a list of characters to a class * +*************************************************/ + +/* This function is used for adding a list of case-equivalent characters to a +class, and also for adding a list of horizontal or vertical whitespace. If the +list is in order (which it should be), ranges of characters are detected and +handled appropriately. This function is mutually recursive with the function +above. 
+ +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options word + cd contains pointers to tables etc. + p points to row of 32-bit values, terminated by NOTACHAR + except character to omit; this is used when adding lists of + case-equivalent characters to avoid including the one we + already know about + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static int +add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options, + compile_data *cd, const pcre_uint32 *p, unsigned int except) +{ +int n8 = 0; +while (p[0] < NOTACHAR) + { + int n = 0; + if (p[0] != except) + { + while(p[n+1] == p[0] + n + 1) n++; + n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]); + } + p += n + 1; + } +return n8; +} + + + +/************************************************* +* Add characters not in a list to a class * +*************************************************/ + +/* This function is used for adding the complement of a list of horizontal or +vertical whitespace to a class. The list must be in order. + +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options word + cd contains pointers to tables etc. + p points to row of 32-bit values, terminated by NOTACHAR + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static int +add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, + int options, compile_data *cd, const pcre_uint32 *p) +{ +int n8 = 0; +if (p[0] > 0) + n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1); +while (p[0] < NOTACHAR) + { + while (p[1] == p[0] + 1) p++; + n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1, + (p[1] == NOTACHAR)? 
0x10ffff : p[1] - 1); + p++; + } +return n8; +} + + + +/************************************************* * Compile one branch * *************************************************/ @@ -3474,7 +3731,7 @@ for (;; ptr++) BOOL is_recurse; BOOL reset_bracount; int class_has_8bitchar; - int class_single_char; + int class_one_char; int newoptions; int recno; int refsign; @@ -3772,25 +4029,25 @@ for (;; ptr++) should_flip_negation = FALSE; - /* For optimization purposes, we track some properties of the class. - class_has_8bitchar will be non-zero, if the class contains at least one - < 256 character. class_single_char will be 1 if the class contains only - a single character. */ + /* For optimization purposes, we track some properties of the class: + class_has_8bitchar will be non-zero if the class contains at least one < + 256 character; class_one_char will be 1 if the class contains just one + character. */ class_has_8bitchar = 0; - class_single_char = 0; + class_one_char = 0; /* Initialize the 32-char bit map to all zeros. We build the map in a - temporary bit of memory, in case the class contains only 1 character (less - than 256), because in that case the compiled code doesn't use the bit map. - */ + temporary bit of memory, in case the class contains fewer than two + 8-bit characters because in that case the compiled code doesn't use the bit + map. */ memset(classbits, 0, 32 * sizeof(pcre_uint8)); #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 - xclass = FALSE; /* No chars >= 256 */ - class_uchardata = code + LINK_SIZE + 2; /* For UTF-8 items */ - class_uchardata_base = class_uchardata; /* For resetting in pass 1 */ + xclass = FALSE; + class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ + class_uchardata_base = class_uchardata; /* Save the start */ #endif /* Process characters until ] is reached. By writing this as a "do" it @@ -3812,10 +4069,12 @@ for (;; ptr++) /* In the pre-compile phase, accumulate the length of any extra data and reset the pointer. 
This is so that very large classes that contain a zillion > 255 characters no longer overwrite the work space - (which is on the stack). */ + (which is on the stack). We have to remember that there was XCLASS data, + however. */ - if (lengthptr != NULL) + if (lengthptr != NULL && class_uchardata > class_uchardata_base) { + xclass = TRUE; *lengthptr += class_uchardata - class_uchardata_base; class_uchardata = class_uchardata_base; } @@ -3917,7 +4176,7 @@ for (;; ptr++) for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; } - /* Not see if we need to remove any special characters. An option + /* Now see if we need to remove any special characters. An option value of 1 removes vertical space and 2 removes underscore. */ if (tabopt < 0) tabopt = -tabopt; @@ -3933,10 +4192,10 @@ for (;; ptr++) for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; ptr = tempptr + 1; - /* Every class contains at least one < 256 characters. */ + /* Every class contains at least one < 256 character. */ class_has_8bitchar = 1; /* Every class contains at least two characters. */ - class_single_char = 2; + class_one_char = 2; continue; /* End of POSIX syntax handling */ } @@ -3944,7 +4203,7 @@ for (;; ptr++) of the specials, which just set a flag. The sequence \b is a special case. Inside a class (and only there) it is treated as backspace. We assume that other escapes have more than one character in them, so - speculatively set both class_has_8bitchar and class_single_char bigger + speculatively set both class_has_8bitchar and class_one_char bigger than one. Unrecognized escapes fall through and are either treated as literal characters (by default), or are faulted if PCRE_EXTRA is set. */ @@ -3977,7 +4236,7 @@ for (;; ptr++) /* Every class contains at least two < 256 characters. */ class_has_8bitchar++; /* Every class contains at least two characters. 
*/ - class_single_char += 2; + class_one_char += 2; switch (-c) { @@ -4027,191 +4286,27 @@ for (;; ptr++) for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ continue; + + /* The rest apply in both UCP and non-UCP cases. */ case ESC_h: - SETBIT(classbits, CHAR_HT); - SETBIT(classbits, CHAR_SPACE); -#ifndef EBCDIC - SETBIT(classbits, 0xa0); /* NSBP */ -#ifndef COMPILE_PCRE8 - xclass = TRUE; - *class_uchardata++ = XCL_SINGLE; - *class_uchardata++ = 0x1680; - *class_uchardata++ = XCL_SINGLE; - *class_uchardata++ = 0x180e; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x2000; - *class_uchardata++ = 0x200a; - *class_uchardata++ = XCL_SINGLE; - *class_uchardata++ = 0x202f; - *class_uchardata++ = XCL_SINGLE; - *class_uchardata++ = 0x205f; - *class_uchardata++ = XCL_SINGLE; - *class_uchardata++ = 0x3000; -#elif defined SUPPORT_UTF - if (utf) - { - xclass = TRUE; - *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata); - *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata); - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata); - *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata); - *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata); - *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata); - } -#endif -#endif /* Not EBCDIC */ + (void)add_list_to_class(classbits, &class_uchardata, options, cd, + PRIV(hspace_list), NOTACHAR); continue; case ESC_H: - for (c = 0; c < 32; c++) - { - int x = 0xff; - switch (c) - { - case CHAR_HT/8: x ^= 1 << (CHAR_HT%8); break; - case CHAR_SPACE/8: x ^= 1 << (CHAR_SPACE%8); break; -#ifndef EBCDIC - case 0xa0/8: x ^= 1 << (0xa0%8); break; /* NSBSP */ -#endif - 
default: break; - } - classbits[c] |= x; - } -#ifndef EBCDIC -#ifndef COMPILE_PCRE8 - xclass = TRUE; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x0100; - *class_uchardata++ = 0x167f; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x1681; - *class_uchardata++ = 0x180d; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x180f; - *class_uchardata++ = 0x1fff; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x200b; - *class_uchardata++ = 0x202e; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x2030; - *class_uchardata++ = 0x205e; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x2060; - *class_uchardata++ = 0x2fff; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x3001; -#ifdef SUPPORT_UTF - if (utf) - class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); - else -#endif /* SUPPORT_UTF */ - *class_uchardata++ = 0xffff; -#elif defined SUPPORT_UTF - if (utf) - { - xclass = TRUE; - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata); - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata); - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata); - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata); - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata); - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata); - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata); - 
class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); - } -#endif -#endif /* Not EBCDIC */ + (void)add_not_list_to_class(classbits, &class_uchardata, options, + cd, PRIV(hspace_list)); continue; case ESC_v: - SETBIT(classbits, CHAR_LF); - SETBIT(classbits, CHAR_VT); - SETBIT(classbits, CHAR_FF); - SETBIT(classbits, CHAR_CR); - SETBIT(classbits, CHAR_NEL); -#ifndef EBCDIC -#ifndef COMPILE_PCRE8 - xclass = TRUE; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x2028; - *class_uchardata++ = 0x2029; -#elif defined SUPPORT_UTF - if (utf) - { - xclass = TRUE; - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata); - } -#endif -#endif /* Not EBCDIC */ + (void)add_list_to_class(classbits, &class_uchardata, options, cd, + PRIV(vspace_list), NOTACHAR); continue; case ESC_V: - for (c = 0; c < 32; c++) - { - int x = 0xff; - switch (c) - { - case CHAR_LF/8: x ^= 1 << (CHAR_LF%8); - x ^= 1 << (CHAR_VT%8); - x ^= 1 << (CHAR_FF%8); - x ^= 1 << (CHAR_CR%8); - break; - case CHAR_NEL/8: x ^= 1 << (CHAR_NEL%8); break; - default: break; - } - classbits[c] |= x; - } - -#ifndef EBCDIC -#ifndef COMPILE_PCRE8 - xclass = TRUE; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x0100; - *class_uchardata++ = 0x2027; - *class_uchardata++ = XCL_RANGE; - *class_uchardata++ = 0x202a; -#ifdef SUPPORT_UTF - if (utf) - class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); - else -#endif - *class_uchardata++ = 0xffff; -#elif defined SUPPORT_UTF - if (utf) - { - xclass = TRUE; - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata); - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata); - class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); - } -#endif -#endif /* Not EBCDIC */ + (void)add_not_list_to_class(classbits, 
&class_uchardata, options, + cd, PRIV(vspace_list)); continue; #ifdef SUPPORT_UCP @@ -4222,7 +4317,6 @@ for (;; ptr++) int pdata; int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); if (ptype < 0) goto FAILED; - xclass = TRUE; *class_uchardata++ = ((-c == ESC_p) != negated)? XCL_PROP : XCL_NOTPROP; *class_uchardata++ = ptype; @@ -4242,21 +4336,21 @@ for (;; ptr++) goto FAILED; } class_has_8bitchar--; /* Undo the speculative increase. */ - class_single_char -= 2; /* Undo the speculative increase. */ + class_one_char -= 2; /* Undo the speculative increase. */ c = *ptr; /* Get the final character and fall through */ break; } } - /* Fall through if we have a single character (c >= 0). This may be - greater than 256. */ - + /* Fall through if the escape just defined a single character (c >= 0). + This may be greater than 256. */ + } /* End of backslash handling */ - /* A single character may be followed by '-' to form a range. However, - Perl does not permit ']' to be the end of the range. A '-' character - at the end is treated as a literal. Perl ignores orphaned \E sequences - entirely. The code for handling \Q and \E is messy. */ + /* A character may be followed by '-' to form a range. However, Perl does + not permit ']' to be the end of the range. A '-' character at the end is + treated as a literal. Perl ignores orphaned \E sequences entirely. The + code for handling \Q and \E is messy. */ CHECK_RANGE: while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) @@ -4264,10 +4358,9 @@ for (;; ptr++) inescq = FALSE; ptr += 2; } - oldptr = ptr; - /* Remember \r or \n */ + /* Remember if \r or \n were explicitly used */ if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; @@ -4290,12 +4383,17 @@ for (;; ptr++) inescq = TRUE; break; } + + /* Minus (hyphen) at the end of a class is treated as a literal, so put + back the pointer and jump to handle the character that preceded it. 
*/ if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) { ptr = oldptr; - goto LONE_SINGLE_CHARACTER; + goto CLASS_SINGLE_CHARACTER; } + + /* Otherwise, we have a potential range; pick up the next character */ #ifdef SUPPORT_UTF if (utf) @@ -4315,203 +4413,63 @@ for (;; ptr++) d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - /* \b is backspace; any other special means the '-' was literal */ + /* \b is backspace; any other special means the '-' was literal. */ if (d < 0) { if (d == -ESC_b) d = CHAR_BS; else { ptr = oldptr; - goto LONE_SINGLE_CHARACTER; /* A few lines below */ + goto CLASS_SINGLE_CHARACTER; /* A few lines below */ } } } /* Check that the two values are in the correct order. Optimize - one-character ranges */ + one-character ranges. */ if (d < c) { *errorcodeptr = ERR8; goto FAILED; } + if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */ - if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ + /* We have found a character range, so single character optimizations + cannot be done anymore. Any value greater than 1 indicates that there + is more than one character. */ + + class_one_char = 2; - /* Remember \r or \n */ + /* Remember an explicit \r or \n, and add the range to the class. */ if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; - - /* Since we found a character range, single character optimizations - cannot be done anymore. */ - class_single_char = 2; - - /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless - matching, we have to use an XCLASS with extra data items. Caseless - matching for characters > 127 is available only if UCP support is - available. 
*/ - -#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8) - if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127))) -#elif defined SUPPORT_UTF - if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) -#elif !(defined COMPILE_PCRE8) - if (d > 255) -#endif -#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) - { - xclass = TRUE; - - /* With UCP support, we can find the other case equivalents of - the relevant characters. There may be several ranges. Optimize how - they fit with the basic range. */ - -#ifdef SUPPORT_UCP -#ifndef COMPILE_PCRE8 - if (utf && (options & PCRE_CASELESS) != 0) -#else - if ((options & PCRE_CASELESS) != 0) -#endif - { - unsigned int occ, ocd; - unsigned int cc = c; - unsigned int origd = d; - while (get_othercase_range(&cc, origd, &occ, &ocd)) - { - if (occ >= (unsigned int)c && - ocd <= (unsigned int)d) - continue; /* Skip embedded ranges */ - - if (occ < (unsigned int)c && - ocd >= (unsigned int)c - 1) /* Extend the basic range */ - { /* if there is overlap, */ - c = occ; /* noting that if occ < c */ - continue; /* we can't have ocd > d */ - } /* because a subrange is */ - if (ocd > (unsigned int)d && - occ <= (unsigned int)d + 1) /* always shorter than */ - { /* the basic range. */ - d = ocd; - continue; - } - - if (occ == ocd) - { - *class_uchardata++ = XCL_SINGLE; - } - else - { - *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(occ, class_uchardata); - } - class_uchardata += PRIV(ord2utf)(ocd, class_uchardata); - } - } -#endif /* SUPPORT_UCP */ - - /* Now record the original range, possibly modified for UCP caseless - overlapping ranges. 
*/ - - *class_uchardata++ = XCL_RANGE; -#ifdef SUPPORT_UTF -#ifndef COMPILE_PCRE8 - if (utf) - { - class_uchardata += PRIV(ord2utf)(c, class_uchardata); - class_uchardata += PRIV(ord2utf)(d, class_uchardata); - } - else - { - *class_uchardata++ = c; - *class_uchardata++ = d; - } -#else - class_uchardata += PRIV(ord2utf)(c, class_uchardata); - class_uchardata += PRIV(ord2utf)(d, class_uchardata); -#endif -#else /* SUPPORT_UTF */ - *class_uchardata++ = c; - *class_uchardata++ = d; -#endif /* SUPPORT_UTF */ - - /* With UCP support, we are done. Without UCP support, there is no - caseless matching for UTF characters > 127; we can use the bit map - for the smaller ones. As for 16 bit characters without UTF, we - can still use */ - -#ifdef SUPPORT_UCP -#ifndef COMPILE_PCRE8 - if (utf) -#endif - continue; /* With next character in the class */ -#endif /* SUPPORT_UCP */ - -#if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8) - if (utf) - { - if ((options & PCRE_CASELESS) == 0 || c > 127) continue; - /* Adjust upper limit and fall through to set up the map */ - d = 127; - } - else - { - if (c > 255) continue; - /* Adjust upper limit and fall through to set up the map */ - d = 255; - } -#elif defined SUPPORT_UTF && !defined(SUPPORT_UCP) - if ((options & PCRE_CASELESS) == 0 || c > 127) continue; - /* Adjust upper limit and fall through to set up the map */ - d = 127; -#else - if (c > 255) continue; - /* Adjust upper limit and fall through to set up the map */ - d = 255; -#endif /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */ - } -#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */ - - /* We use the bit map for 8 bit mode, or when the characters fall - partially or entirely to [0-255] ([0-127] for UCP) ranges. */ - - class_has_8bitchar = 1; - - /* We can save a bit of time by skipping this in the pre-compile. 
*/ - - if (lengthptr == NULL) for (; c <= d; c++) - { - classbits[c/8] |= (1 << (c&7)); - if ((options & PCRE_CASELESS) != 0) - { - int uc = cd->fcc[c]; /* flip case */ - classbits[uc/8] |= (1 << (uc&7)); - } - } - + + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cd, c, d); + continue; /* Go get the next char in the class */ } - /* Handle a lone single character - we can get here for a normal - non-escape char, or after \ that introduces a single character or for an - apparent range that isn't. */ - - LONE_SINGLE_CHARACTER: - - /* Only the value of 1 matters for class_single_char. */ - - if (class_single_char < 2) class_single_char++; - - /* If class_charcount is 1, we saw precisely one character. As long as - there was no use of \p or \P, in other words, no use of any XCLASS - features, we can optimize. - - The optimization throws away the bit map. We turn the item into a - 1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative. - In the positive case, it can cause firstchar to be set. Otherwise, there - can be no first char if this item is first, whatever repeat count may - follow. In the case of reqchar, save the previous value for reinstating. */ - - if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + /* Handle a single character - we can get here for a normal non-escape + char, or after \ that introduces a single character or for an apparent + range that isn't. Only the value 1 matters for class_one_char, so don't + increase it if it is already 2 or more ... just in case there's a class + with a zillion characters in it. */ + + CLASS_SINGLE_CHARACTER: + if (class_one_char < 2) class_one_char++; + + /* If class_one_char is 1, we have the first single character in the + class, and there have been no prior ranges, or XCLASS items generated by + escapes. 
If this is the final character in the class, we can optimize by + turning the item into a 1-character OP_CHAR[I] if it's positive, or + OP_NOT[I] if it's negative. In the positive case, it can cause firstchar + to be set. Otherwise, there can be no first char if this item is first, + whatever repeat count may follow. In the case of reqchar, save the + previous value for reinstating. */ + + if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) { ptr++; zeroreqchar = reqchar; @@ -4544,64 +4502,12 @@ for (;; ptr++) } goto ONE_CHAR; } /* End of 1-char optimization */ - - /* Handle a character that cannot go in the bit map. */ - -#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8) - if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127))) -#elif defined SUPPORT_UTF - if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) -#elif !(defined COMPILE_PCRE8) - if (c > 255) -#endif - -#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) - { - xclass = TRUE; - *class_uchardata++ = XCL_SINGLE; -#ifdef SUPPORT_UTF -#ifndef COMPILE_PCRE8 - /* In non 8 bit mode, we can get here even if we are not in UTF mode. */ - if (!utf) - *class_uchardata++ = c; - else -#endif - class_uchardata += PRIV(ord2utf)(c, class_uchardata); -#else /* SUPPORT_UTF */ - *class_uchardata++ = c; -#endif /* SUPPORT_UTF */ - -#ifdef SUPPORT_UCP -#ifdef COMPILE_PCRE8 - if ((options & PCRE_CASELESS) != 0) -#else - /* In non 8 bit mode, we can get here even if we are not in UTF mode. 
*/ - if (utf && (options & PCRE_CASELESS) != 0) -#endif - { - unsigned int othercase; - if ((int)(othercase = UCD_OTHERCASE(c)) != c) - { - *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf)(othercase, class_uchardata); - } - } -#endif /* SUPPORT_UCP */ - - } - else -#endif /* SUPPORT_UTF || COMPILE_PCRE16 */ - - /* Handle a single-byte character */ - { - class_has_8bitchar = 1; - classbits[c/8] |= (1 << (c&7)); - if ((options & PCRE_CASELESS) != 0) - { - c = cd->fcc[c]; /* flip case */ - classbits[c/8] |= (1 << (c&7)); - } - } + + /* There is more than one character in the class, or an XCLASS item + has been generated. Add this character to the class. */ + + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cd, c, c); } /* Loop until ']' reached. This "while" is the end of the "do" far above. @@ -4621,6 +4527,18 @@ for (;; ptr++) goto FAILED; } + /* We will need an XCLASS if data has been placed in class_uchardata. In + the second phase this is a sufficient test. However, in the pre-compile + phase, class_uchardata gets emptied to prevent workspace overflow, so it + only if the very last character in the class needs XCLASS will it contain + anything at this point. For this reason, xclass gets set TRUE above when + uchar_classdata is emptied, and that's why this code is the way it is here + instead of just doing a test on class_uchardata below. */ + +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 + if (class_uchardata > class_uchardata_base) xclass = TRUE; +#endif + /* If this is the first thing in the branch, there can be no first char setting, whatever the repeat count. Any reqchar setting must remain unchanged after any kind of repeat. */ |