summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_compile.c
diff options
context:
space:
mode:
authorAndrei Zmievski <andrei@php.net>2006-03-06 21:45:57 +0000
committerAndrei Zmievski <andrei@php.net>2006-03-06 21:45:57 +0000
commit89a4054106fd1f5a6b7e1442f9d118b6fb0f3375 (patch)
treea4b8ad99f5a9355463124cd6221fa5e6e44b98be /ext/pcre/pcrelib/pcre_compile.c
parent5bfe560cab0532054e48737e8ea4bf2fa8ec00ff (diff)
downloadphp-git-89a4054106fd1f5a6b7e1442f9d118b6fb0f3375.tar.gz
MFH (Import PCRE 6.6)
Diffstat (limited to 'ext/pcre/pcrelib/pcre_compile.c')
-rw-r--r--ext/pcre/pcrelib/pcre_compile.c287
1 files changed, 176 insertions, 111 deletions
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c
index 2289952525..0bbc7f5ba1 100644
--- a/ext/pcre/pcrelib/pcre_compile.c
+++ b/ext/pcre/pcrelib/pcre_compile.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2005 University of Cambridge
+ Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -45,6 +45,15 @@ supporting internal functions that are not used by other modules. */
#include "pcre_internal.h"
+/* When DEBUG is defined, we need the pcre_printint() function, which is also
+used by pcretest. DEBUG is not defined when building a production library. */
+
+#ifdef DEBUG
+#include "pcre_printint.src"
+#endif
+
+
+
/*************************************************
* Code parameters and static tables *
*************************************************/
@@ -107,7 +116,7 @@ static const short int escapes[] = {
/* Tables of names of POSIX character classes and their lengths. The list is
-terminated by a zero length entry. The first three must be alpha, upper, lower,
+terminated by a zero length entry. The first three must be alpha, lower, upper,
as this is assumed for handling case independence. */
static const char *const posix_names[] = {
@@ -118,25 +127,31 @@ static const char *const posix_names[] = {
static const uschar posix_name_lengths[] = {
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
-/* Table of class bit maps for each POSIX class; up to three may be combined
-to form the class. The table for [:blank:] is dynamically modified to remove
-the vertical space characters. */
+/* Table of class bit maps for each POSIX class. Each class is formed from a
+base map, with an optional addition or removal of another map. Then, for some
+classes, there is some additional tweaking: for [:blank:] the vertical space
+characters are removed, and for [:alpha:] and [:alnum:] the underscore
+character is removed. The triples in the table consist of the base map offset,
+second map offset or -1 if no second map, and a non-negative value for map
+addition or a negative value for map subtraction (if there are two maps). The
+absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
+remove vertical space characters, 2 => remove underscore. */
static const int posix_class_maps[] = {
- cbit_lower, cbit_upper, -1, /* alpha */
- cbit_lower, -1, -1, /* lower */
- cbit_upper, -1, -1, /* upper */
- cbit_digit, cbit_lower, cbit_upper, /* alnum */
- cbit_print, cbit_cntrl, -1, /* ascii */
- cbit_space, -1, -1, /* blank - a GNU extension */
- cbit_cntrl, -1, -1, /* cntrl */
- cbit_digit, -1, -1, /* digit */
- cbit_graph, -1, -1, /* graph */
- cbit_print, -1, -1, /* print */
- cbit_punct, -1, -1, /* punct */
- cbit_space, -1, -1, /* space */
- cbit_word, -1, -1, /* word - a Perl extension */
- cbit_xdigit,-1, -1 /* xdigit */
+ cbit_word, cbit_digit, -2, /* alpha */
+ cbit_lower, -1, 0, /* lower */
+ cbit_upper, -1, 0, /* upper */
+ cbit_word, -1, 2, /* alnum - word without underscore */
+ cbit_print, cbit_cntrl, 0, /* ascii */
+ cbit_space, -1, 1, /* blank - a GNU extension */
+ cbit_cntrl, -1, 0, /* cntrl */
+ cbit_digit, -1, 0, /* digit */
+ cbit_graph, -1, 0, /* graph */
+ cbit_print, -1, 0, /* print */
+ cbit_punct, -1, 0, /* punct */
+ cbit_space, -1, 0, /* space */
+ cbit_word, -1, 0, /* word - a Perl extension */
+ cbit_xdigit,-1, 0 /* xdigit */
};
@@ -362,12 +377,15 @@ static int
check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
int options, BOOL isclass)
{
-const uschar *ptr = *ptrptr;
+BOOL utf8 = (options & PCRE_UTF8) != 0;
+const uschar *ptr = *ptrptr + 1;
int c, i;
+GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
+ptr--; /* Set pointer back to the last byte */
+
/* If backslash is at the end of the pattern, it's an error. */
-c = *(++ptr);
if (c == 0) *errorcodeptr = ERR1;
/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
@@ -451,40 +469,44 @@ else
c &= 255; /* Take least significant 8 bits */
break;
- /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
- which can be greater than 0xff, but only if the ddd are hex digits. */
+ /* \x is complicated. \x{ddd} is a character number which can be greater
+ than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
+ treated as a data character. */
case 'x':
-#ifdef SUPPORT_UTF8
- if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
+ if (ptr[1] == '{')
{
const uschar *pt = ptr + 2;
- register int count = 0;
+ int count = 0;
+
c = 0;
while ((digitab[*pt] & ctype_xdigit) != 0)
{
- int cc = *pt++;
+ register int cc = *pt++;
+ if (c == 0 && cc == '0') continue; /* Leading zeroes */
count++;
+
#if !EBCDIC /* ASCII coding */
if (cc >= 'a') cc -= 32; /* Convert to upper case */
- c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
+ c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
#else /* EBCDIC coding */
if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
- c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
+ c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
#endif
}
+
if (*pt == '}')
{
- if (c < 0 || count > 8) *errorcodeptr = ERR34;
+ if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
ptr = pt;
break;
}
+
/* If the sequence of hex digits does not end with '}', then we don't
recognize this construct; fall through to the normal \x handling. */
}
-#endif
- /* Read just a single hex char */
+ /* Read just a single-byte hex-defined char */
c = 0;
while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
@@ -560,25 +582,26 @@ escape sequence.
Argument:
ptrptr points to the pattern position pointer
negptr points to a boolean that is set TRUE for negation else FALSE
+ dptr points to an int that is set to the detailed property value
errorcodeptr points to the error code variable
-Returns: value from ucp_type_table, or -1 for an invalid type
+Returns: type value from ucp_type_table, or -1 for an invalid type
*/
static int
-get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
+get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
-char name[4];
+char name[32];
c = *(++ptr);
if (c == 0) goto ERROR_RETURN;
*negptr = FALSE;
-/* \P or \p can be followed by a one- or two-character name in {}, optionally
-preceded by ^ for negation. */
+/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
+negation. */
if (c == '{')
{
@@ -587,18 +610,14 @@ if (c == '{')
*negptr = TRUE;
ptr++;
}
- for (i = 0; i <= 2; i++)
+ for (i = 0; i < sizeof(name) - 1; i++)
{
c = *(++ptr);
if (c == 0) goto ERROR_RETURN;
if (c == '}') break;
name[i] = c;
}
- if (c !='}') /* Try to distinguish error cases */
- {
- while (*(++ptr) != 0 && *ptr != '}');
- if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
- }
+ if (c !='}') goto ERROR_RETURN;
name[i] = 0;
}
@@ -619,13 +638,16 @@ top = _pcre_utt_size;
while (bot < top)
{
- i = (bot + top)/2;
+ i = (bot + top) >> 1;
c = strcmp(name, _pcre_utt[i].name);
- if (c == 0) return _pcre_utt[i].value;
+ if (c == 0)
+ {
+ *dptr = _pcre_utt[i].value;
+ return _pcre_utt[i].type;
+ }
if (c > 0) bot = i + 1; else top = i;
}
-UNKNOWN_RETURN:
*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;
@@ -928,7 +950,7 @@ for (;;)
case OP_PROP:
case OP_NOTPROP:
- cc++;
+ cc += 2;
/* Fall through */
case OP_NOT_DIGIT:
@@ -1488,13 +1510,10 @@ Yield: TRUE when range returned; FALSE when no more
static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
-int c, chartype, othercase, next;
+int c, othercase, next;
for (c = *cptr; c <= d; c++)
- {
- if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
- break;
- }
+ { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }
if (c > d) return FALSE;
@@ -1503,9 +1522,7 @@ next = othercase + 1;
for (++c; c <= d; c++)
{
- if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
- othercase != next)
- break;
+ if (_pcre_ucp_othercase(c) != next) break;
next++;
}
@@ -1722,11 +1739,11 @@ for (;; ptr++)
*code++ = OP_ANY;
break;
- /* Character classes. If the included characters are all < 255 in value, we
- build a 32-byte bitmap of the permitted characters, except in the special
- case where there is only one such character. For negated classes, we build
- the map as usual, then invert it at the end. However, we use a different
- opcode so that data characters > 255 can be handled correctly.
+ /* Character classes. If the included characters are all < 256, we build a
+ 32-byte bitmap of the permitted characters, except in the special case
+ where there is only one such character. For negated classes, we build the
+ map as usual, then invert it at the end. However, we use a different opcode
+ so that data characters > 255 can be handled correctly.
If the class contains characters outside the 0-255 range, a different
opcode is compiled. It may optionally have a bit map for characters < 256,
@@ -1817,8 +1834,9 @@ for (;; ptr++)
check_posix_syntax(ptr, &tempptr, cd))
{
BOOL local_negate = FALSE;
- int posix_class, i;
+ int posix_class, taboffset, tabopt;
register const uschar *cbits = cd->cbits;
+ uschar pbits[32];
if (ptr[1] != ':')
{
@@ -1847,32 +1865,46 @@ for (;; ptr++)
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
posix_class = 0;
- /* Or into the map we are building up to 3 of the static class
- tables, or their negations. The [:blank:] class sets up the same
- chars as the [:space:] class (all white space). We remove the vertical
- white space chars afterwards. */
+ /* We build the bit map for the POSIX class in a chunk of local store
+ because we may be adding and subtracting from it, and we don't want to
+ subtract bits that may be in the main map already. At the end we or the
+ result into the bit map that is being built. */
posix_class *= 3;
- for (i = 0; i < 3; i++)
+
+ /* Copy in the first table (always present) */
+
+ memcpy(pbits, cbits + posix_class_maps[posix_class],
+ 32 * sizeof(uschar));
+
+ /* If there is a second table, add or remove it as required. */
+
+ taboffset = posix_class_maps[posix_class + 1];
+ tabopt = posix_class_maps[posix_class + 2];
+
+ if (taboffset >= 0)
{
- BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
- int taboffset = posix_class_maps[posix_class + i];
- if (taboffset < 0) break;
- if (local_negate)
- {
- if (i == 0)
- for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
- else
- for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
- if (blankclass) classbits[1] |= 0x3c;
- }
+ if (tabopt >= 0)
+ for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
else
- {
- for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
- if (blankclass) classbits[1] &= ~0x3c;
- }
+ for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
}
+ /* Not see if we need to remove any special characters. An option
+ value of 1 removes vertical space and 2 removes underscore. */
+
+ if (tabopt < 0) tabopt = -tabopt;
+ if (tabopt == 1) pbits[1] &= ~0x3c;
+ else if (tabopt == 2) pbits[11] &= 0x7f;
+
+ /* Add the POSIX table or its complement into the main table that is
+ being built and we are done. */
+
+ if (local_negate)
+ for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
+ else
+ for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
+
ptr = tempptr + 1;
class_charcount = 10; /* Set > 1; assumes more than 1 per class */
continue; /* End of POSIX syntax handling */
@@ -1939,12 +1971,14 @@ for (;; ptr++)
case ESC_P:
{
BOOL negated;
- int property = get_ucp(&ptr, &negated, errorcodeptr);
- if (property < 0) goto FAILED;
+ int pdata;
+ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
+ if (ptype < 0) goto FAILED;
class_utf8 = TRUE;
*class_utf8data++ = ((-c == ESC_p) != negated)?
XCL_PROP : XCL_NOTPROP;
- *class_utf8data++ = property;
+ *class_utf8data++ = ptype;
+ *class_utf8data++ = pdata;
class_charcount -= 2; /* Not a < 256 character */
}
continue;
@@ -2126,10 +2160,8 @@ for (;; ptr++)
#ifdef SUPPORT_UCP
if ((options & PCRE_CASELESS) != 0)
{
- int chartype;
int othercase;
- if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
- othercase > 0)
+ if ((othercase = _pcre_ucp_othercase(c)) >= 0)
{
*class_utf8data++ = XCL_SINGLE;
class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
@@ -2414,13 +2446,17 @@ for (;; ptr++)
else if (*previous < OP_EODN)
{
uschar *oldcode;
- int prop_type;
+ int prop_type, prop_value;
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
c = *previous;
OUTPUT_SINGLE_REPEAT:
- prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
- previous[1] : -1;
+ if (*previous == OP_PROP || *previous == OP_NOTPROP)
+ {
+ prop_type = previous[1];
+ prop_value = previous[2];
+ }
+ else prop_type = prop_value = -1;
oldcode = code;
code = previous; /* Usually overwrite previous item */
@@ -2481,7 +2517,7 @@ for (;; ptr++)
/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
we have to insert the character for the previous code. For a repeated
- Unicode property match, there is an extra byte that defines the
+ Unicode property match, there are two extra bytes that define the
required property. In UTF-8 mode, long characters have their length in
c, with the 0x80 bit as a flag. */
@@ -2497,7 +2533,11 @@ for (;; ptr++)
#endif
{
*code++ = c;
- if (prop_type >= 0) *code++ = prop_type;
+ if (prop_type >= 0)
+ {
+ *code++ = prop_type;
+ *code++ = prop_value;
+ }
}
*code++ = OP_STAR + repeat_type;
}
@@ -2516,7 +2556,11 @@ for (;; ptr++)
else
#endif
*code++ = c;
- if (prop_type >= 0) *code++ = prop_type;
+ if (prop_type >= 0)
+ {
+ *code++ = prop_type;
+ *code++ = prop_value;
+ }
repeat_max -= repeat_min;
*code++ = OP_UPTO + repeat_type;
PUT2INC(code, 0, repeat_max);
@@ -2535,11 +2579,15 @@ for (;; ptr++)
#endif
*code++ = c;
- /* For a repeated Unicode property match, there is an extra byte that
- defines the required property. */
+ /* For a repeated Unicode property match, there are two extra bytes that
+ define the required property. */
#ifdef SUPPORT_UCP
- if (prop_type >= 0) *code++ = prop_type;
+ if (prop_type >= 0)
+ {
+ *code++ = prop_type;
+ *code++ = prop_value;
+ }
#endif
}
@@ -3007,11 +3055,20 @@ for (;; ptr++)
goto FAILED;
}
- /* Insert the recursion/subroutine item */
+ /* Insert the recursion/subroutine item, automatically wrapped inside
+ "once" brackets. */
+
+ *code = OP_ONCE;
+ PUT(code, 1, 2 + 2*LINK_SIZE);
+ code += 1 + LINK_SIZE;
*code = OP_RECURSE;
PUT(code, 1, called - cd->start_code);
code += 1 + LINK_SIZE;
+
+ *code = OP_KET;
+ PUT(code, 1, 2 + 2*LINK_SIZE);
+ code += 1 + LINK_SIZE;
}
continue;
@@ -3281,10 +3338,12 @@ for (;; ptr++)
else if (-c == ESC_P || -c == ESC_p)
{
BOOL negated;
- int value = get_ucp(&ptr, &negated, errorcodeptr);
+ int pdata;
+ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
previous = code;
*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
- *code++ = value;
+ *code++ = ptype;
+ *code++ = pdata;
}
#endif
@@ -3839,7 +3898,7 @@ Returns: pointer to compiled data block, or NULL on error,
with errorptr and erroroffset set
*/
-EXPORT pcre *
+PCRE_DATA_SCOPE pcre *
pcre_compile(const char *pattern, int options, const char **errorptr,
int *erroroffset, const unsigned char *tables)
{
@@ -3847,7 +3906,7 @@ return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
-EXPORT pcre *
+PCRE_DATA_SCOPE pcre *
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
const char **errorptr, int *erroroffset, const unsigned char *tables)
{
@@ -4040,15 +4099,17 @@ while ((c = *(++ptr)) != 0)
#endif
/* \P and \p are for Unicode properties, but only when the support has
- been compiled. Each item needs 2 bytes. */
+ been compiled. Each item needs 3 bytes. */
else if (-c == ESC_P || -c == ESC_p)
{
#ifdef SUPPORT_UCP
BOOL negated;
- length += 2;
- lastitemlength = 2;
- if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
+ BOOL pdata;
+ length += 3;
+ lastitemlength = 3;
+ if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)
+ goto PCRE_ERROR_RETURN;
continue;
#else
errorcode = ERR45;
@@ -4214,7 +4275,7 @@ while ((c = *(++ptr)) != 0)
class_utf8 = TRUE;
length += LINK_SIZE + 2;
}
- length += 2;
+ length += 3;
}
#endif
}
@@ -4477,7 +4538,7 @@ while ((c = *(++ptr)) != 0)
errorcode = ERR29;
goto PCRE_ERROR_RETURN;
}
- length += 1 + LINK_SIZE;
+ length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */
/* If this item is quantified, it will get wrapped inside brackets so
as to use the code for quantified brackets. We jump down and use the
@@ -4533,6 +4594,7 @@ while ((c = *(++ptr)) != 0)
if (*ptr == '=' || *ptr == '>')
{
+ length += 2 + 2*LINK_SIZE; /* Allow for the automatic "once" */
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
if (*ptr != ')')
{
@@ -5000,7 +5062,8 @@ if (reqbyte >= 0 &&
re->options |= PCRE_REQCHSET;
}
-/* Print out the compiled data for debugging */
+/* Print out the compiled data if debugging is enabled. This is never the
+case when building a production library. */
#ifdef DEBUG
@@ -5025,7 +5088,8 @@ if (re->options != 0)
if ((re->options & PCRE_FIRSTSET) != 0)
{
int ch = re->first_byte & 255;
- const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
+ const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
+ "" : " (caseless)";
if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
else printf("First char = \\x%02x%s\n", ch, caseless);
}
@@ -5033,12 +5097,13 @@ if ((re->options & PCRE_FIRSTSET) != 0)
if ((re->options & PCRE_REQCHSET) != 0)
{
int ch = re->req_byte & 255;
- const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
+ const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
+ "" : " (caseless)";
if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
else printf("Req char = \\x%02x%s\n", ch, caseless);
}
-_pcre_printint(re, stdout);
+pcre_printint(re, stdout);
/* This check is done here in the debugging case so that the code that
was compiled can be seen. */