summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/internal.h
diff options
context:
space:
mode:
authorAndrei Zmievski <andrei@php.net>2005-05-27 17:46:43 +0000
committerAndrei Zmievski <andrei@php.net>2005-05-27 17:46:43 +0000
commit4e72f8a298cec9b14ab623950863e229b83b829d (patch)
tree6e1daec8c779196da1002543254764edb190ba02 /ext/pcre/pcrelib/internal.h
parent743c5bb15bcbc147e2cee4afb13c87b090102e51 (diff)
downloadphp-git-4e72f8a298cec9b14ab623950863e229b83b829d.tar.gz
Upgrade library to version 5.0.
Diffstat (limited to 'ext/pcre/pcrelib/internal.h')
-rw-r--r--ext/pcre/pcrelib/internal.h399
1 files changed, 231 insertions, 168 deletions
diff --git a/ext/pcre/pcrelib/internal.h b/ext/pcre/pcrelib/internal.h
index 43039570f0..d80608bc07 100644
--- a/ext/pcre/pcrelib/internal.h
+++ b/ext/pcre/pcrelib/internal.h
@@ -5,30 +5,38 @@
/* This is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. See
-the file Tech.Notes for some information on the internals.
+the file doc/Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-2003 University of Cambridge
+ Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
@@ -45,6 +53,18 @@ modules, but which are not relevant to the outside. */
# include <php_config.h>
#endif
+/* Standard C headers plus the external interface definition. The only time
+setjmp and stdarg are used is when NO_RECURSE is set. */
+
+#include <ctype.h>
+#include <limits.h>
+#include <setjmp.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
#ifndef PCRE_SPY
#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
#endif
@@ -57,24 +77,45 @@ On Unix systems, "configure" can be used to override this default. */
#define NEWLINE '\n'
#endif
-/* The value of MATCH_LIMIT determines the default number of times the match()
-function can be called during a single execution of pcre_exec(). (There is a
-runtime method of setting a different limit.) The limit exists in order to
-catch runaway regular expressions that take for ever to determine that they do
-not match. The default is set very large so that it does not accidentally catch
-legitimate cases. On Unix systems, "configure" can be used to override this
-default default. */
-
-#ifndef MATCH_LIMIT
-#define MATCH_LIMIT 10000000
-#endif
-
/* If you are compiling for a system that needs some magic to be inserted
* before the definition of an exported function, define this macro to contain
* the relevant magic. It appears at the start of every exported function. */
#define EXPORT
+/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
+cannot determine these outside the compilation (e.g. by running a program as
+part of "configure") because PCRE is often cross-compiled for use on other
+systems. Instead we make use of the maximum sizes that are available at
+preprocessor time in standard C environments. */
+
+#if USHRT_MAX == 65535
+ typedef unsigned short pcre_uint16;
+#elif UINT_MAX == 65535
+ typedef unsigned int pcre_uint16;
+#else
+ #error Cannot determine a type for 16-bit unsigned integers
+#endif
+
+#if UINT_MAX == 4294967295
+ typedef unsigned int pcre_uint32;
+#elif ULONG_MAX == 4294967295
+ typedef unsigned long int pcre_uint32;
+#else
+ #error Cannot determine a type for 32-bit unsigned integers
+#endif
+
+/* All character handling must be done as unsigned characters. Otherwise there
+are problems with top-bit-set characters and functions such as isspace().
+However, we leave the interface to the outside world as char *, because that
+should make things easier for callers. We define a short type for unsigned char
+to save lots of typing. I tried "uchar", but it causes problems on Digital
+Unix, where it is defined in sys/types, so use "uschar" instead. */
+
+typedef unsigned char uschar;
+
+/* Include the public PCRE header */
+
#include "pcre.h"
/* When compiling for use with the Virtual Pascal compiler, these functions
@@ -95,18 +136,6 @@ neither (there some non-Unix environments where this is the case). This assumes
that all calls to memmove are moving strings upwards in store, which is the
case in PCRE. */
-/* Standard C headers plus the external interface definition. The only time
-setjmp and stdarg are used is when NO_RECURSE is set. */
-
-#include <ctype.h>
-#include <limits.h>
-#include <setjmp.h>
-#include <stdarg.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
#if ! HAVE_MEMMOVE
#undef memmove /* some systems may have a macro */
#if HAVE_BCOPY
@@ -126,13 +155,14 @@ for (i = 0; i < n; ++i) *(--dest) = *(--src);
#endif /* not VPCOMPAT */
-/* PCRE keeps offsets in its compiled code as 2-byte quantities by default.
-These are used, for example, to link from the start of a subpattern to its
-alternatives and its end. The use of 2 bytes per offset limits the size of the
-compiled regex to around 64K, which is big enough for almost everybody.
-However, I received a request for an even bigger limit. For this reason, and
-also to make the code easier to maintain, the storing and loading of offsets
-from the byte string is now handled by the macros that are defined here.
+/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
+in big-endian order) by default. These are used, for example, to link from the
+start of a subpattern to its alternatives and its end. The use of 2 bytes per
+offset limits the size of the compiled regex to around 64K, which is big enough
+for almost everybody. However, I received a request for an even bigger limit.
+For this reason, and also to make the code easier to maintain, the storing and
+loading of offsets from the byte string is now handled by the macros that are
+defined here.
The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
the config.h file, but can be overridden by using -D on the command line. This
@@ -208,6 +238,7 @@ Standard C system should have one. */
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
#endif
+
/* These are the public options that can change during matching. */
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
@@ -216,12 +247,13 @@ Standard C system should have one. */
but skip the top bit so we can use ints for convenience without getting tangled
with negative values. The public options defined in pcre.h start at the least
significant end. Make sure they don't overlap, though now that we have expanded
-to four bytes there is plenty of space. */
+to four bytes, there is plenty of space. */
#define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
#define PCRE_REQCHSET 0x20000000 /* req_byte is set */
#define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
#define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
+#define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */
/* Options for the "extra" block produced by pcre_study(). */
@@ -233,10 +265,11 @@ time, run time or study time, respectively. */
#define PUBLIC_OPTIONS \
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
- PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK)
+ PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT)
#define PUBLIC_EXEC_OPTIONS \
- (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK)
+ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
+ PCRE_PARTIAL)
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
@@ -296,12 +329,13 @@ definitions below, up to ESC_z. There's a dummy for OP_ANY because it
corresponds to "." rather than an escape sequence. The final one must be
ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
tests in the code for an escape greater than ESC_b and less than ESC_Z to
-detect the types that may be repeated. These are the types that consume a
-character. If any new escapes are put in between that don't consume a
+detect the types that may be repeated. These are the types that consume
+characters. If any new escapes are put in between that don't consume a
character, that code will have to change. */
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
- ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF };
+ ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
+ ESC_Q, ESC_REF };
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain UTF-8 characters with values greater than 255. */
@@ -312,6 +346,8 @@ contain UTF-8 characters with values greater than 255. */
#define XCL_END 0 /* Marks end of individual items */
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
+#define XCL_PROP 3 /* Unicode property (one property code) follows */
+#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
@@ -337,100 +373,112 @@ enum {
OP_WORDCHAR, /* 10 \w */
OP_ANY, /* 11 Match any character */
OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
- OP_EODN, /* 13 End of data or \n at end of data: \Z. */
- OP_EOD, /* 14 End of data: \z */
-
- OP_OPT, /* 15 Set runtime options */
- OP_CIRC, /* 16 Start of line - varies with multiline switch */
- OP_DOLL, /* 17 End of line - varies with multiline switch */
- OP_CHARS, /* 18 Match string of characters */
- OP_NOT, /* 19 Match anything but the following char */
-
- OP_STAR, /* 20 The maximizing and minimizing versions of */
- OP_MINSTAR, /* 21 all these opcodes must come in pairs, with */
- OP_PLUS, /* 22 the minimizing one second. */
- OP_MINPLUS, /* 23 This first set applies to single characters */
- OP_QUERY, /* 24 */
- OP_MINQUERY, /* 25 */
- OP_UPTO, /* 26 From 0 to n matches */
- OP_MINUPTO, /* 27 */
- OP_EXACT, /* 28 Exactly n matches */
-
- OP_NOTSTAR, /* 29 The maximizing and minimizing versions of */
- OP_NOTMINSTAR, /* 30 all these opcodes must come in pairs, with */
- OP_NOTPLUS, /* 31 the minimizing one second. */
- OP_NOTMINPLUS, /* 32 This set applies to "not" single characters */
- OP_NOTQUERY, /* 33 */
- OP_NOTMINQUERY, /* 34 */
- OP_NOTUPTO, /* 35 From 0 to n matches */
- OP_NOTMINUPTO, /* 36 */
- OP_NOTEXACT, /* 37 Exactly n matches */
-
- OP_TYPESTAR, /* 38 The maximizing and minimizing versions of */
- OP_TYPEMINSTAR, /* 39 all these opcodes must come in pairs, with */
- OP_TYPEPLUS, /* 40 the minimizing one second. These codes must */
- OP_TYPEMINPLUS, /* 41 be in exactly the same order as those above. */
- OP_TYPEQUERY, /* 42 This set applies to character types such as \d */
- OP_TYPEMINQUERY, /* 43 */
- OP_TYPEUPTO, /* 44 From 0 to n matches */
- OP_TYPEMINUPTO, /* 45 */
- OP_TYPEEXACT, /* 46 Exactly n matches */
-
- OP_CRSTAR, /* 47 The maximizing and minimizing versions of */
- OP_CRMINSTAR, /* 48 all these opcodes must come in pairs, with */
- OP_CRPLUS, /* 49 the minimizing one second. These codes must */
- OP_CRMINPLUS, /* 50 be in exactly the same order as those above. */
- OP_CRQUERY, /* 51 These are for character classes and back refs */
- OP_CRMINQUERY, /* 52 */
- OP_CRRANGE, /* 53 These are different to the three seta above. */
- OP_CRMINRANGE, /* 54 */
-
- OP_CLASS, /* 55 Match a character class, chars < 256 only */
- OP_NCLASS, /* 56 Same, but the bitmap was created from a negative
+ OP_NOTPROP, /* 13 \P (not Unicode property) */
+ OP_PROP, /* 14 \p (Unicode property) */
+ OP_EXTUNI, /* 15 \X (extended Unicode sequence) */
+ OP_EODN, /* 16 End of data or \n at end of data: \Z. */
+ OP_EOD, /* 17 End of data: \z */
+
+ OP_OPT, /* 18 Set runtime options */
+ OP_CIRC, /* 19 Start of line - varies with multiline switch */
+ OP_DOLL, /* 20 End of line - varies with multiline switch */
+ OP_CHAR, /* 21 Match one character, casefully */
+ OP_CHARNC, /* 22 Match one character, caselessly */
+ OP_NOT, /* 23 Match anything but the following char */
+
+ OP_STAR, /* 24 The maximizing and minimizing versions of */
+ OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
+ OP_PLUS, /* 26 the minimizing one second. */
+ OP_MINPLUS, /* 27 This first set applies to single characters */
+ OP_QUERY, /* 28 */
+ OP_MINQUERY, /* 29 */
+ OP_UPTO, /* 30 From 0 to n matches */
+ OP_MINUPTO, /* 31 */
+ OP_EXACT, /* 32 Exactly n matches */
+
+ OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
+ OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
+ OP_NOTPLUS, /* 35 the minimizing one second. */
+ OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
+ OP_NOTQUERY, /* 37 */
+ OP_NOTMINQUERY, /* 38 */
+ OP_NOTUPTO, /* 39 From 0 to n matches */
+ OP_NOTMINUPTO, /* 40 */
+ OP_NOTEXACT, /* 41 Exactly n matches */
+
+ OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
+ OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
+ OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
+ OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
+ OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
+ OP_TYPEMINQUERY, /* 47 */
+ OP_TYPEUPTO, /* 48 From 0 to n matches */
+ OP_TYPEMINUPTO, /* 49 */
+ OP_TYPEEXACT, /* 50 Exactly n matches */
+
+ OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
+ OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
+ OP_CRPLUS, /* 53 the minimizing one second. These codes must */
+ OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
+ OP_CRQUERY, /* 55 These are for character classes and back refs */
+ OP_CRMINQUERY, /* 56 */
+ OP_CRRANGE, /* 57 These are different to the three sets above. */
+ OP_CRMINRANGE, /* 58 */
+
+ OP_CLASS, /* 59 Match a character class, chars < 256 only */
+ OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
class - the difference is relevant only when a UTF-8
character > 255 is encountered. */
- OP_XCLASS, /* 57 Extended class for handling UTF-8 chars within the
+ OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
class. This does both positive and negative. */
- OP_REF, /* 58 Match a back reference */
- OP_RECURSE, /* 59 Match a numbered subpattern (possibly recursive) */
- OP_CALLOUT, /* 60 Call out to external function if provided */
+ OP_REF, /* 62 Match a back reference */
+ OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
+ OP_CALLOUT, /* 64 Call out to external function if provided */
- OP_ALT, /* 61 Start of alternation */
- OP_KET, /* 62 End of group that doesn't have an unbounded repeat */
- OP_KETRMAX, /* 63 These two must remain together and in this */
- OP_KETRMIN, /* 64 order. They are for groups the repeat for ever. */
+ OP_ALT, /* 65 Start of alternation */
+ OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
+ OP_KETRMAX, /* 67 These two must remain together and in this */
+ OP_KETRMIN, /* 68 order. They are for groups that repeat for ever. */
/* The assertions must come before ONCE and COND */
- OP_ASSERT, /* 65 Positive lookahead */
- OP_ASSERT_NOT, /* 66 Negative lookahead */
- OP_ASSERTBACK, /* 67 Positive lookbehind */
- OP_ASSERTBACK_NOT, /* 68 Negative lookbehind */
- OP_REVERSE, /* 69 Move pointer back - used in lookbehind assertions */
+ OP_ASSERT, /* 69 Positive lookahead */
+ OP_ASSERT_NOT, /* 70 Negative lookahead */
+ OP_ASSERTBACK, /* 71 Positive lookbehind */
+ OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
+ OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
/* ONCE and COND must come after the assertions, with ONCE first, as there's
a test for >= ONCE for a subpattern that isn't an assertion. */
- OP_ONCE, /* 70 Once matched, don't back up into the subpattern */
- OP_COND, /* 71 Conditional group */
- OP_CREF, /* 72 Used to hold an extraction string number (cond ref) */
+ OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
+ OP_COND, /* 75 Conditional group */
+ OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
- OP_BRAZERO, /* 73 These two must remain together and in this */
- OP_BRAMINZERO, /* 74 order. */
+ OP_BRAZERO, /* 77 These two must remain together and in this */
+ OP_BRAMINZERO, /* 78 order. */
- OP_BRANUMBER, /* 75 Used for extracting brackets whose number is greater
+ OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
than can fit into an opcode. */
- OP_BRA /* 76 This and greater values are used for brackets that
- extract substrings up to a basic limit. After that,
- use is made of OP_BRANUMBER. */
+ OP_BRA /* 80 This and greater values are used for brackets that
+ extract substrings up to EXTRACT_BASIC_MAX. After
+ that, use is made of OP_BRANUMBER. */
};
-/* WARNING: There is an implicit assumption in study.c that all opcodes are
-less than 128 in value. This makes handling UTF-8 character sequences easier.
-*/
+/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
+study.c that all opcodes are less than 128 in value. This makes handling UTF-8
+character sequences easier. */
+
+/* The highest extraction number before we have to start using additional
+bytes. (Originally PCRE didn't have support for extraction counts higher than
+this number.) The value is limited by the number of opcodes left after OP_BRA,
+i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
+opcodes. */
+
+#define EXTRACT_BASIC_MAX 100
/* This macro defines textual names for all the opcodes. These are used only
@@ -439,8 +487,10 @@ macro is referenced only in printint.c. */
#define OP_NAME_LIST \
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
- "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z", \
- "Opt", "^", "$", "chars", "not", \
+ "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
+ "notprop", "prop", "extuni", \
+ "\\Z", "\\z", \
+ "Opt", "^", "$", "char", "charnc", "not", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
@@ -463,8 +513,11 @@ in UTF-8 mode. The code that uses this table must know about such things. */
#define OP_LENGTHS \
1, /* End */ \
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
- 1, 1, 1, 1, 2, 1, 1, /* Any, Anybyte, \Z, \z, Opt, ^, $ */ \
- 2, /* Chars - the minimum length */ \
+ 1, 1, /* Any, Anybyte */ \
+ 2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \
+ 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
+ 2, /* Char - the minimum length */ \
+ 2, /* Charnc - the minimum length */ \
2, /* not */ \
/* Positive single-char repeats ** These are */ \
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
@@ -483,7 +536,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
0, /* XCLASS - variable length */ \
3, /* REF */ \
1+LINK_SIZE, /* RECURSE */ \
- 2, /* CALLOUT */ \
+ 2+2*LINK_SIZE, /* CALLOUT */ \
1+LINK_SIZE, /* Alt */ \
1+LINK_SIZE, /* Ket */ \
1+LINK_SIZE, /* KetRmax */ \
@@ -501,14 +554,6 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1+LINK_SIZE /* BRA */ \
-/* The highest extraction number before we have to start using additional
-bytes. (Originally PCRE didn't have support for extraction counts highter than
-this number.) The value is limited by the number of opcodes left after OP_BRA,
-i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
-opcodes. */
-
-#define EXTRACT_BASIC_MAX 150
-
/* A magic value for OP_CREF to indicate the "in recursion" condition. */
#define CREF_RECURSE 0xffff
@@ -554,7 +599,7 @@ just to accommodate the POSIX wrapper. */
#define ERR34 "character value in \\x{...} sequence is too large"
#define ERR35 "invalid condition (?(0)"
#define ERR36 "\\C not allowed in lookbehind assertion"
-#define ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"
+#define ERR37 "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
#define ERR38 "number after (?C is > 255"
#define ERR39 "closing ) for (?C expected"
#define ERR40 "recursive call could loop indefinitely"
@@ -562,37 +607,51 @@ just to accommodate the POSIX wrapper. */
#define ERR42 "syntax error after (?P"
#define ERR43 "two named groups have the same name"
#define ERR44 "invalid UTF-8 string"
-
-/* All character handling must be done as unsigned characters. Otherwise there
-are problems with top-bit-set characters and functions such as isspace().
-However, we leave the interface to the outside world as char *, because that
-should make things easier for callers. We define a short type for unsigned char
-to save lots of typing. I tried "uchar", but it causes problems on Digital
-Unix, where it is defined in sys/types, so use "uschar" instead. */
-
-typedef unsigned char uschar;
+#define ERR45 "support for \\P, \\p, and \\X has not been compiled"
+#define ERR46 "malformed \\P or \\p sequence"
+#define ERR47 "unknown property name after \\P or \\p"
/* The real format of the start of the pcre block; the index of names and the
-code vector run on as long as necessary after the end. */
+code vector run on as long as necessary after the end. We store an explicit
+offset to the name table so that if a regex is compiled on one host, saved, and
+then run on another where the size of pointers is different, all might still
+be well. For the case of compiled-on-4 and run-on-8, we include an extra
+pointer that is always NULL. For future-proofing, we also include a few dummy
+fields - even though you can never get this planning right!
+
+NOTE NOTE NOTE:
+Because people can now save and re-use compiled patterns, any additions to this
+structure should be made at the end, and something earlier (e.g. a new
+flag in the options or one of the dummy fields) should indicate that the new
+fields are present. Currently PCRE always sets the dummy fields to zero.
+NOTE NOTE NOTE:
+*/
typedef struct real_pcre {
- unsigned long int magic_number;
- size_t size; /* Total that was malloced */
- const unsigned char *tables; /* Pointer to tables */
- unsigned long int options;
- unsigned short int top_bracket;
- unsigned short int top_backref;
- unsigned short int first_byte;
- unsigned short int req_byte;
- unsigned short int name_entry_size; /* Size of any name items; 0 => none */
- unsigned short int name_count; /* Number of name items */
+ pcre_uint32 magic_number;
+ pcre_uint32 size; /* Total that was malloced */
+ pcre_uint32 options;
+ pcre_uint32 dummy1; /* For future use, maybe */
+
+ pcre_uint16 top_bracket;
+ pcre_uint16 top_backref;
+ pcre_uint16 first_byte;
+ pcre_uint16 req_byte;
+ pcre_uint16 name_table_offset; /* Offset to name table that follows */
+ pcre_uint16 name_entry_size; /* Size of any name items */
+ pcre_uint16 name_count; /* Number of name items */
+ pcre_uint16 dummy2; /* For future use, maybe */
+
+ const unsigned char *tables; /* Pointer to tables or NULL for std */
+ const unsigned char *nullpad; /* NULL padding */
} real_pcre;
-/* The format of the block used to store data from pcre_study(). */
+/* The format of the block used to store data from pcre_study(). The same
+remark (see NOTE above) about extending this structure applies. */
typedef struct pcre_study_data {
- size_t size; /* Total that was malloced */
- uschar options;
+ pcre_uint32 size; /* Total that was malloced */
+ pcre_uint32 options;
uschar start_bits[32];
} pcre_study_data;
@@ -605,12 +664,14 @@ typedef struct compile_data {
const uschar *cbits; /* Points to character type table */
const uschar *ctypes; /* Points to table of type maps */
const uschar *start_code; /* The start of the compiled code */
+ const uschar *start_pattern; /* The start of the pattern */
uschar *name_table; /* The name/number table */
int names_found; /* Number of entries so far */
int name_entry_size; /* Size of each entry */
int top_backref; /* Maximum back reference */
unsigned int backref_map; /* Bitmap of low back refs */
int req_varyopt; /* "After variable item" flag for reqbyte */
+ BOOL nopartial; /* Set TRUE if partial won't work */
} compile_data;
/* Structure for maintaining a chain of pointers to the currently incomplete
@@ -660,6 +721,8 @@ typedef struct match_data {
BOOL utf8; /* UTF8 flag */
BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */
+ BOOL partial; /* PARTIAL flag */
+ BOOL hitend; /* Hit the end of the subject at some point */
const uschar *start_code; /* For use when recursing */
const uschar *start_subject; /* Start of the subject string */
const uschar *end_subject; /* End of the subject string */