summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_internal.h
diff options
context:
space:
mode:
authorIlia Alshanetsky <iliaa@php.net>2010-08-14 14:37:13 +0000
committerIlia Alshanetsky <iliaa@php.net>2010-08-14 14:37:13 +0000
commitc5602787fca62e64a63b99e7ae3f310f72e2cac9 (patch)
treec03f412a77986b5f66e76ca2a1155ba7efc511f0 /ext/pcre/pcrelib/pcre_internal.h
parentc1019643253d2d65d813358e1bc37ecf5dd365ed (diff)
downloadphp-git-c5602787fca62e64a63b99e7ae3f310f72e2cac9.tar.gz
MFH: Upgraded bundled PCRE to version 8.10.
Diffstat (limited to 'ext/pcre/pcrelib/pcre_internal.h')
-rw-r--r--ext/pcre/pcrelib/pcre_internal.h81
1 files changed, 52 insertions, 29 deletions
diff --git a/ext/pcre/pcrelib/pcre_internal.h b/ext/pcre/pcrelib/pcre_internal.h
index 0938782a78..e293602fe6 100644
--- a/ext/pcre/pcrelib/pcre_internal.h
+++ b/ext/pcre/pcrelib/pcre_internal.h
@@ -191,9 +191,8 @@ arithmetic. Handle this by defining a macro for the appropriate type. If
stdint.h is available, include it; it may define INT64_MAX. Systems that do not
have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
by "configure". */
-#ifdef PHP_WIN32
-#include "win32/php_stdint.h"
-#elif HAVE_STDINT_H
+
+#if HAVE_STDINT_H
#include <stdint.h>
#elif HAVE_INTTYPES_H
#include <inttypes.h>
@@ -476,7 +475,8 @@ know we are in UTF-8 mode. */
} \
}
-/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
+/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
+This is called when we don't know if we are in UTF-8 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
@@ -513,7 +513,7 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
-know we are in UTF-8 mode. */
+do not know if we are in UTF-8 mode. */
#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
@@ -581,7 +581,7 @@ time, run time, or study time, respectively. */
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
- PCRE_JAVASCRIPT_COMPAT)
+ PCRE_JAVASCRIPT_COMPAT|PCRE_UCP)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
@@ -876,6 +876,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
#define STRING_COMMIT0 "COMMIT\0"
#define STRING_F0 "F\0"
#define STRING_FAIL0 "FAIL\0"
+#define STRING_MARK0 "MARK\0"
#define STRING_PRUNE0 "PRUNE\0"
#define STRING_SKIP0 "SKIP\0"
#define STRING_THEN "THEN"
@@ -905,6 +906,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
#define STRING_UTF8_RIGHTPAR "UTF8)"
+#define STRING_UCP_RIGHTPAR "UCP)"
#else /* SUPPORT_UTF8 */
@@ -1128,6 +1130,7 @@ only. */
#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
#define STRING_F0 STR_F "\0"
#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
+#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0"
#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
#define STRING_THEN STR_T STR_H STR_E STR_N
@@ -1157,6 +1160,7 @@ only. */
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
#endif /* SUPPORT_UTF8 */
@@ -1189,9 +1193,13 @@ only. */
#define PT_ANY 0 /* Any property - matches all chars */
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
-#define PT_GC 2 /* General characteristic (e.g. L) */
-#define PT_PC 3 /* Particular characteristic (e.g. Lu) */
+#define PT_GC 2 /* Specified general characteristic (e.g. L) */
+#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
#define PT_SC 4 /* Script (e.g. Han) */
+#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
+#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
+#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
+#define PT_WORD 8 /* Word - L plus N plus underscore */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain UTF-8 characters with values greater than 255. */
@@ -1208,9 +1216,15 @@ contain UTF-8 characters with values greater than 255. */
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
-definitions below, up to ESC_z. There's a dummy for OP_ANY because it
-corresponds to "." rather than an escape sequence, and another for OP_ALLANY
-(which is used for [^] in JavaScript compatibility mode).
+definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
+corresponds to "." in DOTALL mode rather than an escape sequence. It is also
+used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
+like \N.
+
+The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
+when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
+They must be contiguous, and remain in order so that the replacements can be
+looked up from a table.
The final escape must be ESC_REF as subsequent values are used for
backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
@@ -1220,11 +1234,12 @@ put in between that don't consume a character, that code will have to change.
*/
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
- ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
- ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
+ ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
+ ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
+ ESC_E, ESC_Q, ESC_g, ESC_k,
+ ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu,
ESC_REF };
-
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
OP_EOD must correspond in order to the list of escapes immediately above.
@@ -1248,8 +1263,8 @@ enum {
OP_WHITESPACE, /* 9 \s */
OP_NOT_WORDCHAR, /* 10 \W */
OP_WORDCHAR, /* 11 \w */
- OP_ANY, /* 12 Match any character (subject to DOTALL) */
- OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
+ OP_ANY, /* 12 Match any character except newline */
+ OP_ALLANY, /* 13 Match any character */
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_NOTPROP, /* 15 \P (not Unicode property) */
OP_PROP, /* 16 \p (Unicode property) */
@@ -1379,20 +1394,24 @@ enum {
/* These are backtracking control verbs */
- OP_PRUNE, /* 107 */
- OP_SKIP, /* 108 */
- OP_THEN, /* 109 */
- OP_COMMIT, /* 110 */
+ OP_MARK, /* 107 always has an argument */
+ OP_PRUNE, /* 108 */
+ OP_PRUNE_ARG, /* 109 same, but with argument */
+ OP_SKIP, /* 110 */
+ OP_SKIP_ARG, /* 111 same, but with argument */
+ OP_THEN, /* 112 */
+ OP_THEN_ARG, /* 113 same, but with argument */
+ OP_COMMIT, /* 114 */
/* These are forced failure and success verbs */
- OP_FAIL, /* 111 */
- OP_ACCEPT, /* 112 */
- OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */
+ OP_FAIL, /* 115 */
+ OP_ACCEPT, /* 116 */
+ OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
- OP_SKIPZERO, /* 114 */
+ OP_SKIPZERO, /* 118 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
@@ -1403,7 +1422,7 @@ enum {
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
-called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */
+called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only
@@ -1428,7 +1447,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
"Brazero", "Braminzero", \
- "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
+ "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
+ "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
"Close", "Skip zero"
@@ -1494,8 +1514,9 @@ in UTF-8 mode. The code that uses this table must know about such things. */
3, 3, /* RREF, NRREF */ \
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
- 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
- 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG, */ \
+ 1, 3, 1, 3, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */ \
+ 1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
@@ -1513,7 +1534,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
- ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
+ ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERRCOUNT };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
@@ -1656,6 +1677,7 @@ typedef struct match_data {
BOOL noteol; /* NOTEOL flag */
BOOL utf8; /* UTF8 flag */
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
+ BOOL use_ucp; /* PCRE_UCP flag */
BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */
BOOL notempty_atstart; /* Empty string match at start not wanted */
@@ -1675,6 +1697,7 @@ typedef struct match_data {
int eptrn; /* Next free eptrblock */
recursion_info *recursive; /* Linked list of recursion data */
void *callout_data; /* To pass back to callouts */
+ const uschar *mark; /* Mark pointer to pass back */
} match_data;
/* A similar structure is used for the same purpose by the DFA matching