diff options
author | Andrei Zmievski <andrei@php.net> | 2006-03-06 21:45:57 +0000 |
---|---|---|
committer | Andrei Zmievski <andrei@php.net> | 2006-03-06 21:45:57 +0000 |
commit | 89a4054106fd1f5a6b7e1442f9d118b6fb0f3375 (patch) | |
tree | a4b8ad99f5a9355463124cd6221fa5e6e44b98be /ext/pcre/pcrelib/pcre_compile.c | |
parent | 5bfe560cab0532054e48737e8ea4bf2fa8ec00ff (diff) | |
download | php-git-89a4054106fd1f5a6b7e1442f9d118b6fb0f3375.tar.gz |
MFH (Import PCRE 6.6)
Diffstat (limited to 'ext/pcre/pcrelib/pcre_compile.c')
-rw-r--r-- | ext/pcre/pcrelib/pcre_compile.c | 287 |
1 files changed, 176 insertions, 111 deletions
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c index 2289952525..0bbc7f5ba1 100644 --- a/ext/pcre/pcrelib/pcre_compile.c +++ b/ext/pcre/pcrelib/pcre_compile.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -45,6 +45,15 @@ supporting internal functions that are not used by other modules. */ #include "pcre_internal.h" +/* When DEBUG is defined, we need the pcre_printint() function, which is also +used by pcretest. DEBUG is not defined when building a production library. */ + +#ifdef DEBUG +#include "pcre_printint.src" +#endif + + + /************************************************* * Code parameters and static tables * *************************************************/ @@ -107,7 +116,7 @@ static const short int escapes[] = { /* Tables of names of POSIX character classes and their lengths. The list is -terminated by a zero length entry. The first three must be alpha, upper, lower, +terminated by a zero length entry. The first three must be alpha, lower, upper, as this is assumed for handling case independence. */ static const char *const posix_names[] = { @@ -118,25 +127,31 @@ static const char *const posix_names[] = { static const uschar posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; -/* Table of class bit maps for each POSIX class; up to three may be combined -to form the class. The table for [:blank:] is dynamically modified to remove -the vertical space characters. */ +/* Table of class bit maps for each POSIX class. Each class is formed from a +base map, with an optional addition or removal of another map. Then, for some +classes, there is some additional tweaking: for [:blank:] the vertical space +characters are removed, and for [:alpha:] and [:alnum:] the underscore +character is removed. The triples in the table consist of the base map offset, +second map offset or -1 if no second map, and a non-negative value for map +addition or a negative value for map subtraction (if there are two maps). The +absolute value of the third field has these meanings: 0 => no tweaking, 1 => +remove vertical space characters, 2 => remove underscore. */ static const int posix_class_maps[] = { - cbit_lower, cbit_upper, -1, /* alpha */ - cbit_lower, -1, -1, /* lower */ - cbit_upper, -1, -1, /* upper */ - cbit_digit, cbit_lower, cbit_upper, /* alnum */ - cbit_print, cbit_cntrl, -1, /* ascii */ - cbit_space, -1, -1, /* blank - a GNU extension */ - cbit_cntrl, -1, -1, /* cntrl */ - cbit_digit, -1, -1, /* digit */ - cbit_graph, -1, -1, /* graph */ - cbit_print, -1, -1, /* print */ - cbit_punct, -1, -1, /* punct */ - cbit_space, -1, -1, /* space */ - cbit_word, -1, -1, /* word - a Perl extension */ - cbit_xdigit,-1, -1 /* xdigit */ + cbit_word, cbit_digit, -2, /* alpha */ + cbit_lower, -1, 0, /* lower */ + cbit_upper, -1, 0, /* upper */ + cbit_word, -1, 2, /* alnum - word without underscore */ + cbit_print, cbit_cntrl, 0, /* ascii */ + cbit_space, -1, 1, /* blank - a GNU extension */ + cbit_cntrl, -1, 0, /* cntrl */ + cbit_digit, -1, 0, /* digit */ + cbit_graph, -1, 0, /* graph */ + cbit_print, -1, 0, /* print */ + cbit_punct, -1, 0, /* punct */ + cbit_space, -1, 0, /* space */ + cbit_word, -1, 0, /* word - a Perl extension */ + cbit_xdigit,-1, 0 /* xdigit */ }; @@ -362,12 +377,15 @@ static int check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass) { -const uschar *ptr = *ptrptr; +BOOL utf8 = (options & PCRE_UTF8) != 0; +const uschar *ptr = *ptrptr + 1; int c, i; +GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ +ptr--; /* Set pointer back to the last byte */ + /* If backslash is at the end of the pattern, it's an error. */ -c = *(++ptr); if (c == 0) *errorcodeptr = ERR1; /* Non-alphamerics are literals. For digits or letters, do an initial lookup in @@ -451,40 +469,44 @@ else c &= 255; /* Take least significant 8 bits */ break; - /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number - which can be greater than 0xff, but only if the ddd are hex digits. */ + /* \x is complicated. \x{ddd} is a character number which can be greater + than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is + treated as a data character. */ case 'x': -#ifdef SUPPORT_UTF8 - if (ptr[1] == '{' && (options & PCRE_UTF8) != 0) + if (ptr[1] == '{') { const uschar *pt = ptr + 2; - register int count = 0; + int count = 0; + c = 0; while ((digitab[*pt] & ctype_xdigit) != 0) { - int cc = *pt++; + register int cc = *pt++; + if (c == 0 && cc == '0') continue; /* Leading zeroes */ count++; + #if !EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ - c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); + c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); #else /* EBCDIC coding */ if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ - c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); + c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); #endif } + if (*pt == '}') { - if (c < 0 || count > 8) *errorcodeptr = ERR34; + if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; ptr = pt; break; } + /* If the sequence of hex digits does not end with '}', then we don't recognize this construct; fall through to the normal \x handling. */ } -#endif - /* Read just a single hex char */ + /* Read just a single-byte hex-defined char */ c = 0; while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) @@ -560,25 +582,26 @@ escape sequence. Argument: ptrptr points to the pattern position pointer negptr points to a boolean that is set TRUE for negation else FALSE + dptr points to an int that is set to the detailed property value errorcodeptr points to the error code variable -Returns: value from ucp_type_table, or -1 for an invalid type +Returns: type value from ucp_type_table, or -1 for an invalid type */ static int -get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr) +get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) { int c, i, bot, top; const uschar *ptr = *ptrptr; -char name[4]; +char name[32]; c = *(++ptr); if (c == 0) goto ERROR_RETURN; *negptr = FALSE; -/* \P or \p can be followed by a one- or two-character name in {}, optionally -preceded by ^ for negation. */ +/* \P or \p can be followed by a name in {}, optionally preceded by ^ for +negation. */ if (c == '{') { @@ -587,18 +610,14 @@ if (c == '{') *negptr = TRUE; ptr++; } - for (i = 0; i <= 2; i++) + for (i = 0; i < sizeof(name) - 1; i++) { c = *(++ptr); if (c == 0) goto ERROR_RETURN; if (c == '}') break; name[i] = c; } - if (c !='}') /* Try to distinguish error cases */ - { - while (*(++ptr) != 0 && *ptr != '}'); - if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN; - } + if (c !='}') goto ERROR_RETURN; name[i] = 0; } @@ -619,13 +638,16 @@ top = _pcre_utt_size; while (bot < top) { - i = (bot + top)/2; + i = (bot + top) >> 1; c = strcmp(name, _pcre_utt[i].name); - if (c == 0) return _pcre_utt[i].value; + if (c == 0) + { + *dptr = _pcre_utt[i].value; + return _pcre_utt[i].type; + } if (c > 0) bot = i + 1; else top = i; } -UNKNOWN_RETURN: *errorcodeptr = ERR47; *ptrptr = ptr; return -1; @@ -928,7 +950,7 @@ for (;;) case OP_PROP: case OP_NOTPROP: - cc++; + cc += 2; /* Fall through */ case OP_NOT_DIGIT: @@ -1488,13 +1510,10 @@ Yield: TRUE when range returned; FALSE when no more static BOOL get_othercase_range(int *cptr, int d, int *ocptr, int *odptr) { -int c, chartype, othercase, next; +int c, othercase, next; for (c = *cptr; c <= d; c++) - { - if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) - break; - } + { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; } if (c > d) return FALSE; @@ -1503,9 +1522,7 @@ next = othercase + 1; for (++c; c <= d; c++) { - if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L || - othercase != next) - break; + if (_pcre_ucp_othercase(c) != next) break; next++; } @@ -1722,11 +1739,11 @@ for (;; ptr++) *code++ = OP_ANY; break; - /* Character classes. If the included characters are all < 255 in value, we - build a 32-byte bitmap of the permitted characters, except in the special - case where there is only one such character. For negated classes, we build - the map as usual, then invert it at the end. However, we use a different - opcode so that data characters > 255 can be handled correctly. + /* Character classes. If the included characters are all < 256, we build a + 32-byte bitmap of the permitted characters, except in the special case + where there is only one such character. For negated classes, we build the + map as usual, then invert it at the end. However, we use a different opcode + so that data characters > 255 can be handled correctly. If the class contains characters outside the 0-255 range, a different opcode is compiled. It may optionally have a bit map for characters < 256, @@ -1817,8 +1834,9 @@ for (;; ptr++) check_posix_syntax(ptr, &tempptr, cd)) { BOOL local_negate = FALSE; - int posix_class, i; + int posix_class, taboffset, tabopt; register const uschar *cbits = cd->cbits; + uschar pbits[32]; if (ptr[1] != ':') { @@ -1847,32 +1865,46 @@ for (;; ptr++) if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) posix_class = 0; - /* Or into the map we are building up to 3 of the static class - tables, or their negations. The [:blank:] class sets up the same - chars as the [:space:] class (all white space). We remove the vertical - white space chars afterwards. */ + /* We build the bit map for the POSIX class in a chunk of local store + because we may be adding and subtracting from it, and we don't want to + subtract bits that may be in the main map already. At the end we or the + result into the bit map that is being built. */ posix_class *= 3; - for (i = 0; i < 3; i++) + + /* Copy in the first table (always present) */ + + memcpy(pbits, cbits + posix_class_maps[posix_class], + 32 * sizeof(uschar)); + + /* If there is a second table, add or remove it as required. */ + + taboffset = posix_class_maps[posix_class + 1]; + tabopt = posix_class_maps[posix_class + 2]; + + if (taboffset >= 0) { - BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0; - int taboffset = posix_class_maps[posix_class + i]; - if (taboffset < 0) break; - if (local_negate) - { - if (i == 0) - for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset]; - else - for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset]; - if (blankclass) classbits[1] |= 0x3c; - } + if (tabopt >= 0) + for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; else - { - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset]; - if (blankclass) classbits[1] &= ~0x3c; - } + for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; } + /* Not see if we need to remove any special characters. An option + value of 1 removes vertical space and 2 removes underscore. */ + + if (tabopt < 0) tabopt = -tabopt; + if (tabopt == 1) pbits[1] &= ~0x3c; + else if (tabopt == 2) pbits[11] &= 0x7f; + + /* Add the POSIX table or its complement into the main table that is + being built and we are done. */ + + if (local_negate) + for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; + else + for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; + ptr = tempptr + 1; class_charcount = 10; /* Set > 1; assumes more than 1 per class */ continue; /* End of POSIX syntax handling */ @@ -1939,12 +1971,14 @@ for (;; ptr++) case ESC_P: { BOOL negated; - int property = get_ucp(&ptr, &negated, errorcodeptr); - if (property < 0) goto FAILED; + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); + if (ptype < 0) goto FAILED; class_utf8 = TRUE; *class_utf8data++ = ((-c == ESC_p) != negated)? XCL_PROP : XCL_NOTPROP; - *class_utf8data++ = property; + *class_utf8data++ = ptype; + *class_utf8data++ = pdata; class_charcount -= 2; /* Not a < 256 character */ } continue; @@ -2126,10 +2160,8 @@ for (;; ptr++) #ifdef SUPPORT_UCP if ((options & PCRE_CASELESS) != 0) { - int chartype; int othercase; - if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 && - othercase > 0) + if ((othercase = _pcre_ucp_othercase(c)) >= 0) { *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); @@ -2414,13 +2446,17 @@ for (;; ptr++) else if (*previous < OP_EODN) { uschar *oldcode; - int prop_type; + int prop_type, prop_value; op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ c = *previous; OUTPUT_SINGLE_REPEAT: - prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)? - previous[1] : -1; + if (*previous == OP_PROP || *previous == OP_NOTPROP) + { + prop_type = previous[1]; + prop_value = previous[2]; + } + else prop_type = prop_value = -1; oldcode = code; code = previous; /* Usually overwrite previous item */ @@ -2481,7 +2517,7 @@ for (;; ptr++) /* If the maximum is unlimited, insert an OP_STAR. Before doing so, we have to insert the character for the previous code. For a repeated - Unicode property match, there is an extra byte that defines the + Unicode property match, there are two extra bytes that define the required property. In UTF-8 mode, long characters have their length in c, with the 0x80 bit as a flag. */ @@ -2497,7 +2533,11 @@ for (;; ptr++) #endif { *code++ = c; - if (prop_type >= 0) *code++ = prop_type; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } } *code++ = OP_STAR + repeat_type; } @@ -2516,7 +2556,11 @@ for (;; ptr++) else #endif *code++ = c; - if (prop_type >= 0) *code++ = prop_type; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } repeat_max -= repeat_min; *code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max); @@ -2535,11 +2579,15 @@ for (;; ptr++) #endif *code++ = c; - /* For a repeated Unicode property match, there is an extra byte that - defines the required property. */ + /* For a repeated Unicode property match, there are two extra bytes that + define the required property. */ #ifdef SUPPORT_UCP - if (prop_type >= 0) *code++ = prop_type; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } #endif } @@ -3007,11 +3055,20 @@ for (;; ptr++) goto FAILED; } - /* Insert the recursion/subroutine item */ + /* Insert the recursion/subroutine item, automatically wrapped inside + "once" brackets. */ + + *code = OP_ONCE; + PUT(code, 1, 2 + 2*LINK_SIZE); + code += 1 + LINK_SIZE; *code = OP_RECURSE; PUT(code, 1, called - cd->start_code); code += 1 + LINK_SIZE; + + *code = OP_KET; + PUT(code, 1, 2 + 2*LINK_SIZE); + code += 1 + LINK_SIZE; } continue; @@ -3281,10 +3338,12 @@ for (;; ptr++) else if (-c == ESC_P || -c == ESC_p) { BOOL negated; - int value = get_ucp(&ptr, &negated, errorcodeptr); + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); previous = code; *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; - *code++ = value; + *code++ = ptype; + *code++ = pdata; } #endif @@ -3839,7 +3898,7 @@ Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set */ -EXPORT pcre * +PCRE_DATA_SCOPE pcre * pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) { @@ -3847,7 +3906,7 @@ return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); } -EXPORT pcre * +PCRE_DATA_SCOPE pcre * pcre_compile2(const char *pattern, int options, int *errorcodeptr, const char **errorptr, int *erroroffset, const unsigned char *tables) { @@ -4040,15 +4099,17 @@ while ((c = *(++ptr)) != 0) #endif /* \P and \p are for Unicode properties, but only when the support has - been compiled. Each item needs 2 bytes. */ + been compiled. Each item needs 3 bytes. */ else if (-c == ESC_P || -c == ESC_p) { #ifdef SUPPORT_UCP BOOL negated; - length += 2; - lastitemlength = 2; - if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN; + BOOL pdata; + length += 3; + lastitemlength = 3; + if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0) + goto PCRE_ERROR_RETURN; continue; #else errorcode = ERR45; @@ -4214,7 +4275,7 @@ while ((c = *(++ptr)) != 0) class_utf8 = TRUE; length += LINK_SIZE + 2; } - length += 2; + length += 3; } #endif } @@ -4477,7 +4538,7 @@ while ((c = *(++ptr)) != 0) errorcode = ERR29; goto PCRE_ERROR_RETURN; } - length += 1 + LINK_SIZE; + length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */ /* If this item is quantified, it will get wrapped inside brackets so as to use the code for quantified brackets. We jump down and use the @@ -4533,6 +4594,7 @@ while ((c = *(++ptr)) != 0) if (*ptr == '=' || *ptr == '>') { + length += 2 + 2*LINK_SIZE; /* Allow for the automatic "once" */ while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); if (*ptr != ')') { @@ -5000,7 +5062,8 @@ if (reqbyte >= 0 && re->options |= PCRE_REQCHSET; } -/* Print out the compiled data for debugging */ +/* Print out the compiled data if debugging is enabled. This is never the +case when building a production library. */ #ifdef DEBUG @@ -5025,7 +5088,8 @@ if (re->options != 0) if ((re->options & PCRE_FIRSTSET) != 0) { int ch = re->first_byte & 255; - const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; + const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? + "" : " (caseless)"; if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); else printf("First char = \\x%02x%s\n", ch, caseless); } @@ -5033,12 +5097,13 @@ if ((re->options & PCRE_FIRSTSET) != 0) if ((re->options & PCRE_REQCHSET) != 0) { int ch = re->req_byte & 255; - const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; + const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? + "" : " (caseless)"; if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); else printf("Req char = \\x%02x%s\n", ch, caseless); } -_pcre_printint(re, stdout); +pcre_printint(re, stdout); /* This check is done here in the debugging case so that the code that was compiled can be seen. */ |