summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre.c
diff options
context:
space:
mode:
authorAndrei Zmievski <andrei@php.net>2005-05-27 17:46:43 +0000
committerAndrei Zmievski <andrei@php.net>2005-05-27 17:46:43 +0000
commit4e72f8a298cec9b14ab623950863e229b83b829d (patch)
tree6e1daec8c779196da1002543254764edb190ba02 /ext/pcre/pcrelib/pcre.c
parent743c5bb15bcbc147e2cee4afb13c87b090102e51 (diff)
downloadphp-git-4e72f8a298cec9b14ab623950863e229b83b829d.tar.gz
Upgrade library to version 5.0.
Diffstat (limited to 'ext/pcre/pcrelib/pcre.c')
-rw-r--r--ext/pcre/pcrelib/pcre.c2303
1 files changed, 1597 insertions, 706 deletions
diff --git a/ext/pcre/pcrelib/pcre.c b/ext/pcre/pcrelib/pcre.c
index e50b9b69e3..c495ec015f 100644
--- a/ext/pcre/pcrelib/pcre.c
+++ b/ext/pcre/pcrelib/pcre.c
@@ -12,23 +12,31 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2004 University of Cambridge
-----------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
@@ -51,14 +59,14 @@ C headers, and the external pcre header. */
#include "internal.h"
+/* If Unicode Property support is wanted, include a private copy of the
+function that does it, and the table that translates names to numbers. */
-/* Allow compilation as C++ source code, should anybody want to do that. */
-
-#ifdef __cplusplus
-#define class pcre_class
+#ifdef SUPPORT_UCP
+#include "ucp.c"
+#include "ucptypetable.c"
#endif
-
/* Maximum number of items on the nested bracket stacks at compile time. This
applies to the nesting of all kinds of parentheses. It does not limit
un-nested, non-capturing parentheses. This number can be made bigger if
@@ -75,14 +83,6 @@ because the offset vector is always a multiple of 3 long. */
#define REC_STACK_SAVE_MAX 30
-/* The number of bytes in a literal character string above which we can't add
-any more is set at 250 in order to allow for UTF-8 characters. (In theory it
-could be 255 when UTF-8 support is excluded, but that means that some of the
-test output would be different, which just complicates things.) */
-
-#define MAXLIT 250
-
-
/* The maximum remaining length of subject we are prepared to search for a
req_byte match. */
@@ -106,16 +106,16 @@ is invalid. */
#if !EBCDIC /* This is the "normal" table for ASCII systems */
static const short int escapes[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
- 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
- '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
- 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
- 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
- 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
- '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
- 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
- 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
- 0, 0, -ESC_z /* x - z */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
+ 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
+ '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
+-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
+-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
+ '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
+ 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
+-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
+ 0, 0, -ESC_z /* x - z */
};
#else /* This is the "abnormal" table for EBCDIC systems */
@@ -129,7 +129,7 @@ static const short int escapes[] = {
/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
-/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, 0,
+/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
@@ -137,9 +137,9 @@ static const short int escapes[] = {
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
-/* D0 */ '}', 0, 0, 0, 0, 0, 0, 0,
+/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
-/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, 0,
+/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
@@ -562,6 +562,86 @@ return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
/*************************************************
+* Flip bytes in an integer *
+*************************************************/
+
+/* This function is called when the magic number in a regex doesn't match in
+order to flip its bytes to see if we are dealing with a pattern that was
+compiled on a host of different endianness. If so, this function is used to
+flip other byte values.
+
+Arguments:
+ value the number to flip
+ n the number of bytes to flip (assumed to be 2 or 4)
+
+Returns: the flipped value
+*/
+
+static long int
+byteflip(long int value, int n)
+{
+if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
+return ((value & 0x000000ff) << 24) |
+ ((value & 0x0000ff00) << 8) |
+ ((value & 0x00ff0000) >> 8) |
+ ((value & 0xff000000) >> 24);
+}
+
+
+
+/*************************************************
+* Test for a byte-flipped compiled regex *
+*************************************************/
+
+/* This function is called from pce_exec() and also from pcre_fullinfo(). Its
+job is to test whether the regex is byte-flipped - that is, it was compiled on
+a system of opposite endianness. The function is called only when the native
+MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
+relevant values into a different data block, and return it.
+
+Arguments:
+ re points to the regex
+ study points to study data, or NULL
+ internal_re points to a new regex block
+ internal_study points to a new study block
+
+Returns: the new block if is is indeed a byte-flipped regex
+ NULL if it is not
+*/
+
+static real_pcre *
+try_flipped(const real_pcre *re, real_pcre *internal_re,
+ const pcre_study_data *study, pcre_study_data *internal_study)
+{
+if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
+ return NULL;
+
+*internal_re = *re; /* To copy other fields */
+internal_re->size = byteflip(re->size, sizeof(re->size));
+internal_re->options = byteflip(re->options, sizeof(re->options));
+internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
+internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
+internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
+internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
+internal_re->name_table_offset = byteflip(re->name_table_offset,
+ sizeof(re->name_table_offset));
+internal_re->name_entry_size = byteflip(re->name_entry_size,
+ sizeof(re->name_entry_size));
+internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
+
+if (study != NULL)
+ {
+ *internal_study = *study; /* To copy other fields */
+ internal_study->size = byteflip(study->size, sizeof(study->size));
+ internal_study->options = byteflip(study->options, sizeof(study->options));
+ }
+
+return internal_re;
+}
+
+
+
+/*************************************************
* (Obsolete) Return info about compiled pattern *
*************************************************/
@@ -573,7 +653,7 @@ at the low end of it, and so even on 16-bit systems this will still be OK.
Therefore, I haven't changed the API for pcre_info().
Arguments:
- external_re points to compiled code
+ argument_re points to compiled code
optptr where to pass back the options
first_byte where to pass back the first character,
or -1 if multiline and all branches start ^,
@@ -584,11 +664,16 @@ Returns: number of capturing subpatterns
*/
EXPORT int
-pcre_info(const pcre *external_re, int *optptr, int *first_byte)
+pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
{
-const real_pcre *re = (const real_pcre *)external_re;
+real_pcre internal_re;
+const real_pcre *re = (const real_pcre *)argument_re;
if (re == NULL) return PCRE_ERROR_NULL;
-if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = try_flipped(re, &internal_re, NULL, NULL);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ }
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
if (first_byte != NULL)
*first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
@@ -606,7 +691,7 @@ return re->top_bracket;
that additional items can be added compatibly.
Arguments:
- external_re points to compiled code
+ argument_re points to compiled code
extra_data points extra data, or NULL
what what information is required
where where to put the information
@@ -615,18 +700,26 @@ Returns: 0 if data returned, negative on error
*/
EXPORT int
-pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
+pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
void *where)
{
-const real_pcre *re = (const real_pcre *)external_re;
+real_pcre internal_re;
+pcre_study_data internal_study;
+const real_pcre *re = (const real_pcre *)argument_re;
const pcre_study_data *study = NULL;
if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
-if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = try_flipped(re, &internal_re, study, &internal_study);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ if (study != NULL) study = &internal_study;
+ }
+
switch (what)
{
case PCRE_INFO_OPTIONS:
@@ -655,10 +748,13 @@ switch (what)
((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
break;
+ /* Make sure we pass back the pointer to the bit vector in the external
+ block, not the internal copy (with flipped integer fields). */
+
case PCRE_INFO_FIRSTTABLE:
*((const uschar **)where) =
(study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
- study->start_bits : NULL;
+ ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
break;
case PCRE_INFO_LASTLITERAL:
@@ -675,7 +771,11 @@ switch (what)
break;
case PCRE_INFO_NAMETABLE:
- *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
+ *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
+ break;
+
+ case PCRE_INFO_DEFAULT_TABLES:
+ *((const uschar **)where) = (const uschar *)pcre_default_tables;
break;
default: return PCRE_ERROR_BADOPTION;
@@ -713,6 +813,14 @@ switch (what)
#endif
break;
+ case PCRE_CONFIG_UNICODE_PROPERTIES:
+#ifdef SUPPORT_UCP
+ *((int *)where) = 1;
+#else
+ *((int *)where) = 0;
+#endif
+ break;
+
case PCRE_CONFIG_NEWLINE:
*((int *)where) = NEWLINE;
break;
@@ -835,11 +943,8 @@ else
case 'l':
case 'L':
case 'N':
- case 'p':
- case 'P':
case 'u':
case 'U':
- case 'X':
*errorptr = ERR37;
break;
@@ -989,6 +1094,99 @@ return c;
+#ifdef SUPPORT_UCP
+/*************************************************
+* Handle \P and \p *
+*************************************************/
+
+/* This function is called after \P or \p has been encountered, provided that
+PCRE is compiled with support for Unicode properties. On entry, ptrptr is
+pointing at the P or p. On exit, it is pointing at the final character of the
+escape sequence.
+
+Argument:
+ ptrptr points to the pattern position pointer
+ negptr points to a boolean that is set TRUE for negation else FALSE
+ errorptr points to the pointer to the error message
+
+Returns: value from ucp_type_table, or -1 for an invalid type
+*/
+
+static int
+get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
+{
+int c, i, bot, top;
+const uschar *ptr = *ptrptr;
+char name[4];
+
+c = *(++ptr);
+if (c == 0) goto ERROR_RETURN;
+
+*negptr = FALSE;
+
+/* \P or \p can be followed by a one- or two-character name in {}, optionally
+preceded by ^ for negation. */
+
+if (c == '{')
+ {
+ if (ptr[1] == '^')
+ {
+ *negptr = TRUE;
+ ptr++;
+ }
+ for (i = 0; i <= 2; i++)
+ {
+ c = *(++ptr);
+ if (c == 0) goto ERROR_RETURN;
+ if (c == '}') break;
+ name[i] = c;
+ }
+ if (c !='}') /* Try to distinguish error cases */
+ {
+ while (*(++ptr) != 0 && *ptr != '}');
+ if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
+ }
+ name[i] = 0;
+ }
+
+/* Otherwise there is just one following character */
+
+else
+ {
+ name[0] = c;
+ name[1] = 0;
+ }
+
+*ptrptr = ptr;
+
+/* Search for a recognized property name using binary chop */
+
+bot = 0;
+top = sizeof(utt)/sizeof(ucp_type_table);
+
+while (bot < top)
+ {
+ i = (bot + top)/2;
+ c = strcmp(name, utt[i].name);
+ if (c == 0) return utt[i].value;
+ if (c > 0) bot = i + 1; else top = i;
+ }
+
+UNKNOWN_RETURN:
+*errorptr = ERR47;
+*ptrptr = ptr;
+return -1;
+
+ERROR_RETURN:
+*errorptr = ERR46;
+*ptrptr = ptr;
+return -1;
+}
+#endif
+
+
+
+
/*************************************************
* Check for counted repeat *
*************************************************/
@@ -1085,18 +1283,22 @@ return p;
/* This is called by several functions that scan a compiled expression looking
for a fixed first character, or an anchoring op code etc. It skips over things
that do not influence this. For some calls, a change of option is important.
+For some calls, it makes sense to skip negative forward and all backward
+assertions, and also the \b assertion; for others it does not.
Arguments:
- code pointer to the start of the group
- options pointer to external options
- optbit the option bit whose changing is significant, or
- zero if none are
+ code pointer to the start of the group
+ options pointer to external options
+ optbit the option bit whose changing is significant, or
+ zero if none are
+ skipassert TRUE if certain assertions are to be skipped
-Returns: pointer to the first significant opcode
+Returns: pointer to the first significant opcode
*/
static const uschar*
-first_significant_code(const uschar *code, int *options, int optbit)
+first_significant_code(const uschar *code, int *options, int optbit,
+ BOOL skipassert)
{
for (;;)
{
@@ -1111,14 +1313,19 @@ for (;;)
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
+ if (!skipassert) return code;
do code += GET(code, 1); while (*code == OP_ALT);
+ code += OP_lengths[*code];
+ break;
+
+ case OP_WORD_BOUNDARY:
+ case OP_NOT_WORD_BOUNDARY:
+ if (!skipassert) return code;
/* Fall through */
case OP_CALLOUT:
case OP_CREF:
case OP_BRANUMBER:
- case OP_WORD_BOUNDARY:
- case OP_NOT_WORD_BOUNDARY:
code += OP_lengths[*code];
break;
@@ -1220,19 +1427,18 @@ for (;;)
cc += OP_lengths[*cc];
break;
- /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
- This requires a scan of the string, unfortunately. We assume valid UTF-8
- strings, so all we do is reduce the length by one for every byte whose bits
- are 10xxxxxx. */
+ /* Handle literal characters */
- case OP_CHARS:
- branchlength += *(++cc);
+ case OP_CHAR:
+ case OP_CHARNC:
+ branchlength++;
+ cc += 2;
#ifdef SUPPORT_UTF8
if ((options & PCRE_UTF8) != 0)
- for (d = 1; d <= *cc; d++)
- if ((cc[d] & 0xc0) == 0x80) branchlength--;
+ {
+ while ((*cc & 0xc0) == 0x80) cc++;
+ }
#endif
- cc += *cc + 1;
break;
/* Handle exact repetitions. The count is already in characters, but we
@@ -1256,6 +1462,11 @@ for (;;)
/* Handle single-char matchers */
+ case OP_PROP:
+ case OP_NOTPROP:
+ cc++;
+ /* Fall through */
+
case OP_NOT_DIGIT:
case OP_DIGIT:
case OP_NOT_WHITESPACE:
@@ -1342,7 +1553,6 @@ for (;;)
{
register int c = *code;
if (c == OP_END) return NULL;
- else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
else if (c > OP_BRA)
{
int n = c - OP_BRA;
@@ -1358,11 +1568,13 @@ for (;;)
/* In UTF-8 mode, opcodes that are followed by a character may be followed
by a multi-byte character. The length in the table is a minimum, so we have
- to scan along to skip the extra characters. All opcodes are less than 128,
- so we can use relatively efficient code. */
+ to scan along to skip the extra bytes. All opcodes are less than 128, so we
+ can use relatively efficient code. */
if (utf8) switch(c)
{
+ case OP_CHAR:
+ case OP_CHARNC:
case OP_EXACT:
case OP_UPTO:
case OP_MINUPTO:
@@ -1377,7 +1589,7 @@ for (;;)
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
- the table is zero; the actual length is stored in the compled code. */
+ the table is zero; the actual length is stored in the compiled code. */
case OP_XCLASS:
code += GET(code, 1) + 1;
@@ -1416,7 +1628,6 @@ for (;;)
register int c = *code;
if (c == OP_END) return NULL;
else if (c == OP_RECURSE) return code;
- else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
else if (c > OP_BRA)
{
code += OP_lengths[OP_BRA];
@@ -1429,11 +1640,13 @@ for (;;)
/* In UTF-8 mode, opcodes that are followed by a character may be followed
by a multi-byte character. The length in the table is a minimum, so we have
- to scan along to skip the extra characters. All opcodes are less than 128,
- so we can use relatively efficient code. */
+ to scan along to skip the extra bytes. All opcodes are less than 128, so we
+ can use relatively efficient code. */
if (utf8) switch(c)
{
+ case OP_CHAR:
+ case OP_CHARNC:
case OP_EXACT:
case OP_UPTO:
case OP_MINUPTO:
@@ -1448,7 +1661,7 @@ for (;;)
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
- the table is zero; the actual length is stored in the compled code. */
+ the table is zero; the actual length is stored in the compiled code. */
case OP_XCLASS:
code += GET(code, 1) + 1;
@@ -1483,9 +1696,9 @@ static BOOL
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
{
register int c;
-for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
+for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
code < endcode;
- code = first_significant_code(code + OP_lengths[c], NULL, 0))
+ code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
{
const uschar *ccode;
@@ -1551,6 +1764,9 @@ for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
/* Opcodes that must match a character */
+ case OP_PROP:
+ case OP_NOTPROP:
+ case OP_EXTUNI:
case OP_NOT_DIGIT:
case OP_DIGIT:
case OP_NOT_WHITESPACE:
@@ -1559,7 +1775,8 @@ for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
case OP_WORDCHAR:
case OP_ANY:
case OP_ANYBYTE:
- case OP_CHARS:
+ case OP_CHAR:
+ case OP_CHARNC:
case OP_NOT:
case OP_PLUS:
case OP_MINPLUS:
@@ -1734,6 +1951,107 @@ while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
/*************************************************
+* Insert an automatic callout point *
+*************************************************/
+
+/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
+callout points before each pattern item.
+
+Arguments:
+ code current code pointer
+ ptr current pattern pointer
+ cd pointers to tables etc
+
+Returns: new code pointer
+*/
+
+static uschar *
+auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
+{
+*code++ = OP_CALLOUT;
+*code++ = 255;
+PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
+PUT(code, LINK_SIZE, 0); /* Default length */
+return code + 2*LINK_SIZE;
+}
+
+
+
+/*************************************************
+* Complete a callout item *
+*************************************************/
+
+/* A callout item contains the length of the next item in the pattern, which
+we can't fill in till after we have reached the relevant point. This is used
+for both automatic and manual callouts.
+
+Arguments:
+ previous_callout points to previous callout item
+ ptr current pattern pointer
+ cd pointers to tables etc
+
+Returns: nothing
+*/
+
+static void
+complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
+{
+int length = ptr - cd->start_pattern - GET(previous_callout, 2);
+PUT(previous_callout, 2 + LINK_SIZE, length);
+}
+
+
+
+#ifdef SUPPORT_UCP
+/*************************************************
+* Get othercase range *
+*************************************************/
+
+/* This function is passed the start and end of a class range, in UTF-8 mode
+with UCP support. It searches up the characters, looking for internal ranges of
+characters in the "other" case. Each call returns the next one, updating the
+start address.
+
+Arguments:
+ cptr points to starting character value; updated
+ d end value
+ ocptr where to put start of othercase range
+ odptr where to put end of othercase range
+
+Yield: TRUE when range returned; FALSE when no more
+*/
+
+static BOOL
+get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
+{
+int c, chartype, othercase, next;
+
+for (c = *cptr; c <= d; c++)
+ {
+ if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
+ }
+
+if (c > d) return FALSE;
+
+*ocptr = othercase;
+next = othercase + 1;
+
+for (++c; c <= d; c++)
+ {
+ if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
+ break;
+ next++;
+ }
+
+*odptr = next - 1;
+*cptr = c;
+
+return TRUE;
+}
+#endif /* SUPPORT_UCP */
+
+
+/*************************************************
* Compile one branch *
*************************************************/
@@ -1744,7 +2062,7 @@ bits.
Arguments:
optionsptr pointer to the option bits
brackets points to number of extracting brackets used
- code points to the pointer to the current code point
+ codeptr points to the pointer to the current code point
ptrptr points to the current pattern pointer
errorptr points to pointer to error message
firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
@@ -1764,13 +2082,13 @@ compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
int repeat_type, op_type;
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
int bravalue = 0;
-int length;
int greedy_default, greedy_non_default;
int firstbyte, reqbyte;
int zeroreqbyte, zerofirstbyte;
int req_caseopt, reqvary, tempreqvary;
int condcount = 0;
int options = *optionsptr;
+int after_manual_callout = 0;
register int c;
register uschar *code = *codeptr;
uschar *tempcode;
@@ -1779,7 +2097,8 @@ BOOL groupsetfirstbyte = FALSE;
const uschar *ptr = *ptrptr;
const uschar *tempptr;
uschar *previous = NULL;
-uschar class[32];
+uschar *previous_callout = NULL;
+uschar classbits[32];
#ifdef SUPPORT_UTF8
BOOL class_utf8;
@@ -1795,7 +2114,7 @@ BOOL utf8 = FALSE;
greedy_default = ((options & PCRE_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
-/* Initialize no first char, no required char. REQ_UNSET means "no char
+/* Initialize no first byte, no required byte. REQ_UNSET means "no char
matching encountered yet". It gets changed to REQ_NONE if we hit something that
matches a non-fixed char first char; reqbyte just remains unset if we never
find one.
@@ -1810,7 +2129,7 @@ firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
according to the current setting of the caseless flag. REQ_CASELESS is a bit
value > 255. It is added into the firstbyte or reqbyte variables to record the
-case status of the value. */
+case status of the value. This is used only for ASCII characters. */
req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
@@ -1820,6 +2139,7 @@ for (;; ptr++)
{
BOOL negate_class;
BOOL possessive_quantifier;
+ BOOL is_quantifier;
int class_charcount;
int class_lastchar;
int newoptions;
@@ -1827,9 +2147,53 @@ for (;; ptr++)
int skipbytes;
int subreqbyte;
int subfirstbyte;
+ int mclength;
+ uschar mcbuffer[8];
+
+ /* Next byte in the pattern */
c = *ptr;
- if (inescq && c != 0) goto NORMAL_CHAR;
+
+ /* If in \Q...\E, check for the end; if not, we have a literal */
+
+ if (inescq && c != 0)
+ {
+ if (c == '\\' && ptr[1] == 'E')
+ {
+ inescq = FALSE;
+ ptr++;
+ continue;
+ }
+ else
+ {
+ if (previous_callout != NULL)
+ {
+ complete_callout(previous_callout, ptr, cd);
+ previous_callout = NULL;
+ }
+ if ((options & PCRE_AUTO_CALLOUT) != 0)
+ {
+ previous_callout = code;
+ code = auto_callout(code, ptr, cd);
+ }
+ goto NORMAL_CHAR;
+ }
+ }
+
+ /* Fill in length of a previous callout, except when the next thing is
+ a quantifier. */
+
+ is_quantifier = c == '*' || c == '+' || c == '?' ||
+ (c == '{' && is_counted_repeat(ptr+1));
+
+ if (!is_quantifier && previous_callout != NULL &&
+ after_manual_callout-- <= 0)
+ {
+ complete_callout(previous_callout, ptr, cd);
+ previous_callout = NULL;
+ }
+
+ /* In extended mode, skip white space and comments */
if ((options & PCRE_EXTENDED) != 0)
{
@@ -1843,6 +2207,14 @@ for (;; ptr++)
}
}
+ /* No auto callout for quantifiers. */
+
+ if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
+ {
+ previous_callout = code;
+ code = auto_callout(code, ptr, cd);
+ }
+
switch(c)
{
/* The branch terminates at end of string, |, or ). */
@@ -1938,7 +2310,7 @@ for (;; ptr++)
character (< 256), because in that case the compiled code doesn't use the
bit map. */
- memset(class, 0, 32 * sizeof(uschar));
+ memset(classbits, 0, 32 * sizeof(uschar));
/* Process characters until ] is reached. By writing this as a "do" it
means that an initial ] is taken as a data character. The first pass
@@ -2022,13 +2394,16 @@ for (;; ptr++)
if (taboffset < 0) break;
if (local_negate)
{
- for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
- if (blankclass) class[1] |= 0x3c;
+ if (i == 0)
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
+ else
+ for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
+ if (blankclass) classbits[1] |= 0x3c;
}
else
{
- for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
- if (blankclass) class[1] &= ~0x3c;
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
+ if (blankclass) classbits[1] &= ~0x3c;
}
}
@@ -2048,9 +2423,10 @@ for (;; ptr++)
if (c == '\\')
{
c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
- if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
- if (-c == ESC_Q) /* Handle start of quoted string */
+ if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
+ else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
+ else if (-c == ESC_Q) /* Handle start of quoted string */
{
if (ptr[1] == '\\' && ptr[2] == 'E')
{
@@ -2060,38 +2436,54 @@ for (;; ptr++)
continue;
}
- else if (c < 0)
+ if (c < 0)
{
register const uschar *cbits = cd->cbits;
- class_charcount = 10; /* Greater than 1 is what matters */
+ class_charcount += 2; /* Greater than 1 is what matters */
switch (-c)
{
case ESC_d:
- for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
continue;
case ESC_D:
- for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
continue;
case ESC_w:
- for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
continue;
case ESC_W:
- for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
continue;
case ESC_s:
- for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
- class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
+ classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
continue;
case ESC_S:
- for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
- class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
+ classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
continue;
+#ifdef SUPPORT_UCP
+ case ESC_p:
+ case ESC_P:
+ {
+ BOOL negated;
+ int property = get_ucp(&ptr, &negated, errorptr);
+ if (property < 0) goto FAILED;
+ class_utf8 = TRUE;
+ *class_utf8data++ = ((-c == ESC_p) != negated)?
+ XCL_PROP : XCL_NOTPROP;
+ *class_utf8data++ = property;
+ class_charcount -= 2; /* Not a < 256 character */
+ }
+ continue;
+#endif
+
/* Unrecognized escapes are faulted if PCRE is running in its
strict mode. By default, for compatibility with Perl, they are
treated as literals. */
@@ -2102,7 +2494,8 @@ for (;; ptr++)
*errorptr = ERR7;
goto FAILED;
}
- c = *ptr; /* The final character */
+ c = *ptr; /* The final character */
+ class_charcount -= 2; /* Undo the default count from above */
}
}
@@ -2127,7 +2520,7 @@ for (;; ptr++)
}
else
#endif
- d = *ptr;
+ d = *ptr; /* Not UTF-8 mode */
/* The second part of a range can be a single-character escape, but
not any of the other escapes. Perl 5.6 treats a hyphen as a literal
@@ -2138,11 +2531,13 @@ for (;; ptr++)
const uschar *oldptr = ptr;
d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
- /* \b is backslash; any other special means the '-' was literal */
+ /* \b is backslash; \X is literal X; any other special means the '-'
+ was literal */
if (d < 0)
{
- if (d == -ESC_b) d = '\b'; else
+ if (d == -ESC_b) d = '\b';
+ else if (d == -ESC_X) d = 'X'; else
{
ptr = oldptr - 2;
goto LONE_SINGLE_CHARACTER; /* A few lines below */
@@ -2150,47 +2545,95 @@ for (;; ptr++)
}
}
- /* Check that the two values are in the correct order */
+ /* The check that the two values are in the correct order happens in
+ the pre-pass. Optimize one-character ranges */
- if (d < c)
- {
- *errorptr = ERR8;
- goto FAILED;
- }
+ if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
- /* If d is greater than 255, we can't just use the bit map, so set up
- for the UTF-8 supporting class type. If we are not caseless, we can
- just set up a single range. If we are caseless, the characters < 256
- are handled with a bitmap, in order to get the case-insensitive
- handling. */
+ /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
+ matching, we have to use an XCLASS with extra data items. Caseless
+ matching for characters > 127 is available only if UCP support is
+ available. */
#ifdef SUPPORT_UTF8
- if (d > 255)
+ if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
{
class_utf8 = TRUE;
- *class_utf8data++ = XCL_RANGE;
- if ((options & PCRE_CASELESS) == 0)
+
+ /* With UCP support, we can find the other case equivalents of
+ the relevant characters. There may be several ranges. Optimize how
+ they fit with the basic range. */
+
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_CASELESS) != 0)
{
- class_utf8data += ord2utf8(c, class_utf8data);
- class_utf8data += ord2utf8(d, class_utf8data);
- continue; /* Go get the next char in the class */
+ int occ, ocd;
+ int cc = c;
+ int origd = d;
+ while (get_othercase_range(&cc, origd, &occ, &ocd))
+ {
+ if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
+
+ if (occ < c && ocd >= c - 1) /* Extend the basic range */
+ { /* if there is overlap, */
+ c = occ; /* noting that if occ < c */
+ continue; /* we can't have ocd > d */
+ } /* because a subrange is */
+ if (ocd > d && occ <= d + 1) /* always shorter than */
+ { /* the basic range. */
+ d = ocd;
+ continue;
+ }
+
+ if (occ == ocd)
+ {
+ *class_utf8data++ = XCL_SINGLE;
+ }
+ else
+ {
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += ord2utf8(occ, class_utf8data);
+ }
+ class_utf8data += ord2utf8(ocd, class_utf8data);
+ }
}
- class_utf8data += ord2utf8(256, class_utf8data);
+#endif /* SUPPORT_UCP */
+
+ /* Now record the original range, possibly modified for UCP caseless
+ overlapping ranges. */
+
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += ord2utf8(c, class_utf8data);
class_utf8data += ord2utf8(d, class_utf8data);
- d = 255;
- /* Fall through */
+
+ /* With UCP support, we are done. Without UCP support, there is no
+ caseless matching for UTF-8 characters > 127; we can use the bit map
+ for the smaller ones. */
+
+#ifdef SUPPORT_UCP
+ continue; /* With next character in the class */
+#else
+ if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
+
+ /* Adjust upper limit and fall through to set up the map */
+
+ d = 127;
+
+#endif /* SUPPORT_UCP */
}
-#endif
- /* We use the bit map if the range is entirely < 255, or if part of it
- is < 255 and matching is caseless. */
+#endif /* SUPPORT_UTF8 */
+
+ /* We use the bit map for all cases when not in UTF-8 mode; else
+ ranges that lie entirely within 0-127 when there is UCP support; else
+ for partial ranges without UCP support. */
for (; c <= d; c++)
{
- class[c/8] |= (1 << (c&7));
+ classbits[c/8] |= (1 << (c&7));
if ((options & PCRE_CASELESS) != 0)
{
int uc = cd->fcc[c]; /* flip case */
- class[uc/8] |= (1 << (uc&7));
+ classbits[uc/8] |= (1 << (uc&7));
}
class_charcount++; /* in case a one-char range */
class_lastchar = c;
@@ -2200,28 +2643,44 @@ for (;; ptr++)
}
/* Handle a lone single character - we can get here for a normal
- non-escape char, or after \ that introduces a single character. */
+ non-escape char, or after \ that introduces a single character or for an
+ apparent range that isn't. */
LONE_SINGLE_CHARACTER:
- /* Handle a multibyte character */
+ /* Handle a character that cannot go in the bit map */
#ifdef SUPPORT_UTF8
- if (utf8 && c > 255)
+ if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
{
class_utf8 = TRUE;
*class_utf8data++ = XCL_SINGLE;
class_utf8data += ord2utf8(c, class_utf8data);
+
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ int chartype;
+ int othercase;
+ if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
+ {
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += ord2utf8(othercase, class_utf8data);
+ }
+ }
+#endif /* SUPPORT_UCP */
+
}
else
-#endif
+#endif /* SUPPORT_UTF8 */
+
/* Handle a single-byte character */
{
- class [c/8] |= (1 << (c&7));
+ classbits[c/8] |= (1 << (c&7));
if ((options & PCRE_CASELESS) != 0)
{
c = cd->fcc[c]; /* flip case */
- class[c/8] |= (1 << (c&7));
+ classbits[c/8] |= (1 << (c&7));
}
class_charcount++;
class_lastchar = c;
@@ -2233,12 +2692,15 @@ for (;; ptr++)
while ((c = *(++ptr)) != ']' || inescq);
- /* If class_charcount is 1, we saw precisely one character with a value <
- 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
- the one character is < 128. In non-UTF-8 mode we can always optimize.
+ /* If class_charcount is 1, we saw precisely one character whose value is
+ less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
+ can optimize the negative case only if there were no characters >= 128
+ because OP_NOT and the related opcodes like OP_NOTSTAR operate on
+ single-bytes only. This is an historical hangover. Maybe one day we can
+ tidy these opcodes to handle multi-byte characters.
The optimization throws away the bit map. We turn the item into a
- 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
+ 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
that OP_NOT does not support multibyte characters. In the positive case, it
can cause firstbyte to be set. Otherwise, there can be no first char if
this item is first, whatever repeat count may follow. In the case of
@@ -2247,40 +2709,44 @@ for (;; ptr++)
#ifdef SUPPORT_UTF8
if (class_charcount == 1 &&
(!utf8 ||
- (!class_utf8 && class_lastchar < 128)))
+ (!class_utf8 && (!negate_class || class_lastchar < 128))))
+
#else
if (class_charcount == 1)
#endif
{
zeroreqbyte = reqbyte;
+
+ /* The OP_NOT opcode works on one-byte characters only. */
+
if (negate_class)
{
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
zerofirstbyte = firstbyte;
*code++ = OP_NOT;
+ *code++ = class_lastchar;
+ break;
}
+
+ /* For a single, positive character, get the value into mcbuffer, and
+ then we can handle this with the normal one-character code. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && class_lastchar > 127)
+ mclength = ord2utf8(class_lastchar, mcbuffer);
else
+#endif
{
- if (firstbyte == REQ_UNSET)
- {
- zerofirstbyte = REQ_NONE;
- firstbyte = class_lastchar | req_caseopt;
- }
- else
- {
- zerofirstbyte = firstbyte;
- reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
- }
- *code++ = OP_CHARS;
- *code++ = 1;
+ mcbuffer[0] = class_lastchar;
+ mclength = 1;
}
- *code++ = class_lastchar;
- break; /* End of class handling */
- } /* End of 1-byte optimization */
+ goto ONE_CHAR;
+ } /* End of 1-char optimization */
- /* Otherwise, if this is the first thing in the branch, there can be no
- first char setting, whatever the repeat count. Any reqbyte setting must
- remain unchanged after any kind of repeat. */
+ /* The general case - not the one-char optimization. If this is the first
+ thing in the branch, there can be no first char setting, whatever the
+ repeat count. Any reqbyte setting must remain unchanged after any kind of
+ repeat. */
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
zerofirstbyte = firstbyte;
@@ -2304,7 +2770,7 @@ for (;; ptr++)
if (class_charcount > 0)
{
*code++ |= XCL_MAP;
- memcpy(code, class, 32);
+ memcpy(code, classbits, 32);
code = class_utf8data;
}
@@ -2332,20 +2798,21 @@ for (;; ptr++)
if (negate_class)
{
*code++ = OP_NCLASS;
- for (c = 0; c < 32; c++) code[c] = ~class[c];
+ for (c = 0; c < 32; c++) code[c] = ~classbits[c];
}
else
{
*code++ = OP_CLASS;
- memcpy(code, class, 32);
+ memcpy(code, classbits, 32);
}
code += 32;
break;
- /* Various kinds of repeat */
+ /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
+ has been tested above. */
case '{':
- if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
+ if (!is_quantifier) goto NORMAL_CHAR;
ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
if (*errorptr != NULL) goto FAILED;
goto REPEAT;
@@ -2422,14 +2889,13 @@ for (;; ptr++)
code += 1 + LINK_SIZE;
}
- /* If previous was a string of characters, chop off the last one and use it
- as the subject of the repeat. If there was only one character, we can
- abolish the previous item altogether. If a one-char item has a minumum of
- more than one, ensure that it is set in reqbyte - it might not be if a
- sequence such as x{3} is the first thing in a branch because the x will
- have gone into firstbyte instead. */
+ /* If previous was a character match, abolish the item and generate a
+ repeat item instead. If a char item has a minumum of more than one, ensure
+ that it is set in reqbyte - it might not be if a sequence such as x{3} is
+ the first thing in a branch because the x will have gone into firstbyte
+ instead. */
- if (*previous == OP_CHARS)
+ if (*previous == OP_CHAR || *previous == OP_CHARNC)
{
/* Deal with UTF-8 characters that take up more than one byte. It's
easier to write this out separately than try to macrify it. Use c to
@@ -2443,16 +2909,6 @@ for (;; ptr++)
while((*lastchar & 0xc0) == 0x80) lastchar--;
c = code - lastchar; /* Length of UTF-8 character */
memcpy(utf8_char, lastchar, c); /* Save the char */
- if (lastchar == previous + 2) /* There was only one character */
- {
- code = previous; /* Abolish the previous item */
- }
- else
- {
- previous[1] -= c; /* Adjust length of previous */
- code = lastchar; /* Lost char off the end */
- tempcode = code; /* Adjust position to be moved for '+' */
- }
c |= 0x80; /* Flag c as a length */
}
else
@@ -2462,17 +2918,8 @@ for (;; ptr++)
with UTF-8 disabled, or for a UTF-8 character < 128. */
{
- c = *(--code);
- if (code == previous + 2) /* There was only one character */
- {
- code = previous; /* Abolish the previous item */
- if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
- }
- else
- {
- previous[1]--; /* adjust length */
- tempcode = code; /* Adjust position to be moved for '+' */
- }
+ c = code[-1];
+ if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
}
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
@@ -2487,27 +2934,40 @@ for (;; ptr++)
{
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
c = previous[1];
- code = previous;
goto OUTPUT_SINGLE_REPEAT;
}
/* If previous was a character type match (\d or similar), abolish it and
create a suitable repeat item. The code is shared with single-character
- repeats by setting op_type to add a suitable offset into repeat_type. */
+ repeats by setting op_type to add a suitable offset into repeat_type. Note
+ the the Unicode property types will be present only when SUPPORT_UCP is
+ defined, but we don't wrap the little bits of code here because it just
+ makes it horribly messy. */
else if (*previous < OP_EODN)
{
+ uschar *oldcode;
+ int prop_type;
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
c = *previous;
- code = previous;
OUTPUT_SINGLE_REPEAT:
+ prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
+ previous[1] : -1;
+
+ oldcode = code;
+ code = previous; /* Usually overwrite previous item */
/* If the maximum is zero then the minimum must also be zero; Perl allows
this case, so we do too - by simply omitting the item altogether. */
if (repeat_max == 0) goto END_REPEAT;
+ /* All real repeats make it impossible to handle partial matching (maybe
+ one day we will be able to remove this restriction). */
+
+ if (repeat_max != 1) cd->nopartial = TRUE;
+
/* Combine the op_type with the repeat_type */
repeat_type += op_type;
@@ -2526,51 +2986,37 @@ for (;; ptr++)
}
}
- /* The case {1,} is handled as the special case + */
-
- else if (repeat_min == 1 && repeat_max == -1)
- *code++ = OP_PLUS + repeat_type;
-
- /* The case {n,n} is just an EXACT, while the general case {n,m} is
- handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
+ /* A repeat minimum of 1 is optimized into some special cases. If the
+ maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
+ left in place and, if the maximum is greater than 1, we use OP_UPTO with
+ one less than the maximum. */
- else
+ else if (repeat_min == 1)
{
- if (repeat_min != 1)
- {
- *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
- PUT2INC(code, 0, repeat_min);
- }
-
- /* If the mininum is 1 and the previous item was a character string,
- we either have to put back the item that got cancelled if the string
- length was 1, or add the character back onto the end of a longer
- string. For a character type nothing need be done; it will just get
- put back naturally. Note that the final character is always going to
- get added below, so we leave code ready for its insertion. */
-
- else if (*previous == OP_CHARS)
+ if (repeat_max == -1)
+ *code++ = OP_PLUS + repeat_type;
+ else
{
- if (code == previous) code += 2; else
-
- /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
- bit set as a flag. The length will always be between 2 and 6. */
-
-#ifdef SUPPORT_UTF8
- if (utf8 && c >= 128) previous[1] += c & 7; else
-#endif
- previous[1]++;
+ code = oldcode; /* leave previous item in place */
+ if (repeat_max == 1) goto END_REPEAT;
+ *code++ = OP_UPTO + repeat_type;
+ PUT2INC(code, 0, repeat_max - 1);
}
+ }
- /* For a single negated character we also have to put back the
- item that got cancelled. At present this applies only to single byte
- characters in any mode. */
+ /* The case {n,n} is just an EXACT, while the general case {n,m} is
+ handled as an EXACT followed by an UPTO. */
- else if (*previous == OP_NOT) code++;
+ else
+ {
+ *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
+ PUT2INC(code, 0, repeat_min);
/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
- we have to insert the character for the previous code. In UTF-8 mode,
- long characters have their length in c, with the 0x80 bit as a flag. */
+ we have to insert the character for the previous code. For a repeated
+ Unicode property match, there is an extra byte that defines the
+ required property. In UTF-8 mode, long characters have their length in
+ c, with the 0x80 bit as a flag. */
if (repeat_max < 0)
{
@@ -2582,7 +3028,10 @@ for (;; ptr++)
}
else
#endif
- *code++ = c;
+ {
+ *code++ = c;
+ if (prop_type >= 0) *code++ = prop_type;
+ }
*code++ = OP_STAR + repeat_type;
}
@@ -2600,6 +3049,7 @@ for (;; ptr++)
else
#endif
*code++ = c;
+ if (prop_type >= 0) *code++ = prop_type;
repeat_max -= repeat_min;
*code++ = OP_UPTO + repeat_type;
PUT2INC(code, 0, repeat_max);
@@ -2616,8 +3066,14 @@ for (;; ptr++)
}
else
#endif
-
*code++ = c;
+
+ /* For a repeated Unicode property match, there is an extra byte that
+ defines the required property. */
+
+#ifdef SUPPORT_UCP
+ if (prop_type >= 0) *code++ = prop_type;
+#endif
}
/* If previous was a character class or a back reference, we put the repeat
@@ -2635,6 +3091,12 @@ for (;; ptr++)
code = previous;
goto END_REPEAT;
}
+
+ /* All real repeats make it impossible to handle partial matching (maybe
+ one day we will be able to remove this restriction). */
+
+ if (repeat_max != 1) cd->nopartial = TRUE;
+
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
else if (repeat_min == 1 && repeat_max == -1)
@@ -2937,9 +3399,11 @@ for (;; ptr++)
ptr++;
break;
- case 'C': /* Callout - may be followed by digits */
- *code++ = OP_CALLOUT;
- {
+ case 'C': /* Callout - may be followed by digits; */
+ previous_callout = code; /* Save for later completion */
+ after_manual_callout = 1; /* Skip one item before completing */
+ *code++ = OP_CALLOUT; /* Already checked that the terminating */
+ { /* closing parenthesis is present. */
int n = 0;
while ((digitab[*(++ptr)] & ctype_digit) != 0)
n = n * 10 + *ptr - '0';
@@ -2949,6 +3413,9 @@ for (;; ptr++)
goto FAILED;
}
*code++ = n;
+ PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
+ PUT(code, LINK_SIZE, 0); /* Default length */
+ code += 2 * LINK_SIZE;
}
previous = NULL;
continue;
@@ -3339,215 +3806,107 @@ for (;; ptr++)
*code++ = OP_REF;
PUT2INC(code, 0, number);
}
- else
- {
- previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
- *code++ = -c;
- }
- continue;
- }
- /* Data character: reset and fall through */
+ /* So are Unicode property matches, if supported. We know that get_ucp
+ won't fail because it was tested in the pre-pass. */
- ptr = tempptr;
- c = '\\';
-
- /* Handle a run of data characters until a metacharacter is encountered.
- The first character is guaranteed not to be whitespace or # when the
- extended flag is set. */
-
- NORMAL_CHAR:
- default:
- previous = code;
- *code = OP_CHARS;
- code += 2;
- length = 0;
-
- do
- {
- /* If in \Q...\E, check for the end; if not, we always have a literal */
-
- if (inescq)
+#ifdef SUPPORT_UCP
+ else if (-c == ESC_P || -c == ESC_p)
{
- if (c == '\\' && ptr[1] == 'E')
- {
- inescq = FALSE;
- ptr++;
- }
- else
- {
- *code++ = c;
- length++;
- }
- continue;
+ BOOL negated;
+ int value = get_ucp(&ptr, &negated, errorptr);
+ previous = code;
+ *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
+ *code++ = value;
}
+#endif
- /* Skip white space and comments for /x patterns */
+ /* For the rest, we can obtain the OP value by negating the escape
+ value */
- if ((options & PCRE_EXTENDED) != 0)
+ else
{
- if ((cd->ctypes[c] & ctype_space) != 0) continue;
- if (c == '#')
- {
- /* The space before the ; is to avoid a warning on a silly compiler
- on the Macintosh. */
- while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
- if (c == 0) break;
- continue;
- }
+ previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
+ *code++ = -c;
}
+ continue;
+ }
- /* Backslash may introduce a data char or a metacharacter. Escaped items
- are checked for validity in the pre-compiling pass. Stop the string
- before a metaitem. */
-
- if (c == '\\')
- {
- tempptr = ptr;
- c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
- if (c < 0) { ptr = tempptr; break; }
-
- /* If a character is > 127 in UTF-8 mode, we have to turn it into
- two or more bytes in the UTF-8 encoding. */
+ /* We have a data character whose value is in c. In UTF-8 mode it may have
+ a value > 127. We set its representation in the length/buffer, and then
+ handle it as a data character. */
#ifdef SUPPORT_UTF8
- if (utf8 && c > 127)
- {
- uschar buffer[8];
- int len = ord2utf8(c, buffer);
- for (c = 0; c < len; c++) *code++ = buffer[c];
- length += len;
- continue;
- }
+ if (utf8 && c > 127)
+ mclength = ord2utf8(c, mcbuffer);
+ else
#endif
- }
-
- /* Ordinary character or single-char escape */
- *code++ = c;
- length++;
- }
+ {
+ mcbuffer[0] = c;
+ mclength = 1;
+ }
- /* This "while" is the end of the "do" above. */
+ goto ONE_CHAR;
- while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
+ /* Handle a literal character. It is guaranteed not to be whitespace or #
+ when the extended flag is set. If we are in UTF-8 mode, it may be a
+ multi-byte literal character. */
- /* Update the first and last requirements. These are always bytes, even in
- UTF-8 mode. However, there is a special case to be considered when there
- are only one or two characters. Because this gets messy in UTF-8 mode, the
- code is kept separate. When we get here "length" contains the number of
- bytes. */
+ default:
+ NORMAL_CHAR:
+ mclength = 1;
+ mcbuffer[0] = c;
#ifdef SUPPORT_UTF8
- if (utf8 && length > 1)
+ if (utf8 && (c & 0xc0) == 0xc0)
{
- uschar *t = previous + 3; /* After this code, t */
- while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
-
- /* Handle the case when there is only one multibyte character. It must
- have at least two bytes because of the "length > 1" test above. */
-
- if (t == code)
- {
- /* If no previous first byte, set it from this character, but revert to
- none on a zero repeat. */
-
- if (firstbyte == REQ_UNSET)
- {
- zerofirstbyte = REQ_NONE;
- firstbyte = previous[2];
- }
+ while ((ptr[1] & 0xc0) == 0x80)
+ mcbuffer[mclength++] = *(++ptr);
+ }
+#endif
- /* Otherwise, leave the first byte value alone, and don't change it on
- a zero repeat */
+ /* At this point we have the character's bytes in mcbuffer, and the length
+ in mclength. When not in UTF-8 mode, the length is always 1. */
- else zerofirstbyte = firstbyte;
+ ONE_CHAR:
+ previous = code;
+ *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
+ for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
- /* In both cases, a zero repeat resets the previous required byte */
+ /* Set the first and required bytes appropriately. If no previous first
+ byte, set it from this character, but revert to none on a zero repeat.
+ Otherwise, leave the firstbyte value alone, and don't change it on a zero
+ repeat. */
- zeroreqbyte = reqbyte;
- }
+ if (firstbyte == REQ_UNSET)
+ {
+ zerofirstbyte = REQ_NONE;
+ zeroreqbyte = reqbyte;
- /* Handle the case when there is more than one character. These may be
- single-byte or multibyte characters */
+ /* If the character is more than one byte long, we can set firstbyte
+ only if it is not to be matched caselessly. */
- else
+ if (mclength == 1 || req_caseopt == 0)
{
- t = code - 1; /* After this code, t is at the */
- while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
-
- /* If no previous first byte, set it from the first character, and
- retain it on a zero repeat (of the last character). The required byte
- is reset on a zero repeat, either to the byte before the last
- character, unless this is the first byte of the string. In that case,
- it reverts to its previous value. */
-
- if (firstbyte == REQ_UNSET)
- {
- zerofirstbyte = firstbyte = previous[2] | req_caseopt;
- zeroreqbyte = (t - 1 == previous + 2)?
- reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
- }
-
- /* If there was a previous first byte, leave it alone, and don't change
- it on a zero repeat. The required byte is reset on a zero repeat to the
- byte before the last character. */
-
- else
- {
- zerofirstbyte = firstbyte;
- zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
- }
+ firstbyte = mcbuffer[0] | req_caseopt;
+ if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
}
-
- /* In all cases (we know length > 1), the new required byte is the last
- byte of the string. */
-
- reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
+ else firstbyte = reqbyte = REQ_NONE;
}
- else /* End of UTF-8 coding */
-#endif
-
- /* This is the code for non-UTF-8 operation, either without UTF-8 support,
- or when UTF-8 is not enabled. */
+ /* firstbyte was previously set; we can set reqbyte only the length is
+ 1 or the matching is caseful. */
+ else
{
- /* firstbyte was not previously set; take it from this string */
-
- if (firstbyte == REQ_UNSET)
- {
- if (length == 1)
- {
- zerofirstbyte = REQ_NONE;
- firstbyte = previous[2] | req_caseopt;
- zeroreqbyte = reqbyte;
- }
- else
- {
- zerofirstbyte = firstbyte = previous[2] | req_caseopt;
- zeroreqbyte = (length > 2)?
- (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
- reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
- }
- }
-
- /* firstbyte was previously set */
-
- else
- {
- zerofirstbyte = firstbyte;
- zeroreqbyte = (length == 1)? reqbyte :
- code[-2] | req_caseopt | cd->req_varyopt;
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+ if (mclength == 1 || req_caseopt == 0)
reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
- }
}
- /* Set the length in the data vector, and advance to the next state. */
-
- previous[1] = length;
- if (length < MAXLIT) ptr--;
- break;
+ break; /* End of literal character handling */
}
} /* end of big loop */
@@ -3809,7 +4168,7 @@ is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
{
do {
const uschar *scode =
- first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
+ first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
register int op = *scode;
/* Capturing brackets */
@@ -3878,7 +4237,8 @@ is_startline(const uschar *code, unsigned int bracket_map,
unsigned int backref_map)
{
do {
- const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
+ const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
+ FALSE);
register int op = *scode;
/* Capturing brackets */
@@ -3897,7 +4257,7 @@ do {
else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
- /* .* is not anchored unless DOTALL is set and it isn't in brackets that
+ /* .* means "start at start or after \n" if it isn't in brackets that
may be referenced. */
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
@@ -3908,6 +4268,9 @@ do {
/* Check for explicit circumflex */
else if (op != OP_CIRC) return FALSE;
+
+ /* Move on to the next alternative */
+
code += GET(code, 1);
}
while (*code == OP_ALT); /* Loop for each alternative */
@@ -3943,7 +4306,7 @@ register int c = -1;
do {
int d;
const uschar *scode =
- first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
+ first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
register int op = *scode;
if (op >= OP_BRA) op = OP_BRA;
@@ -3963,11 +4326,10 @@ do {
break;
case OP_EXACT: /* Fall through */
- scode++;
-
- case OP_CHARS: /* Fall through */
- scode++;
+ scode += 2;
+ case OP_CHAR:
+ case OP_CHARNC:
case OP_PLUS:
case OP_MINPLUS:
if (!inassert) return -1;
@@ -4107,8 +4469,8 @@ int branch_newextra;
int item_count = -1;
int name_count = 0;
int max_name_size = 0;
+int lastitemlength = 0;
#ifdef SUPPORT_UTF8
-int lastcharlength = 0;
BOOL utf8;
BOOL class_utf8;
#endif
@@ -4199,7 +4561,11 @@ while ((c = *(++ptr)) != 0)
/* If we are inside a \Q...\E sequence, all chars are literal */
- if (inescq) goto NORMAL_CHAR;
+ if (inescq)
+ {
+ if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
+ goto NORMAL_CHAR;
+ }
/* Otherwise, first check for ignored whitespace and comments */
@@ -4218,24 +4584,40 @@ while ((c = *(++ptr)) != 0)
item_count++; /* Is zero for the first non-comment item */
+ /* Allow space for auto callout before every item except quantifiers. */
+
+ if ((options & PCRE_AUTO_CALLOUT) != 0 &&
+ c != '*' && c != '+' && c != '?' &&
+ (c != '{' || !is_counted_repeat(ptr + 1)))
+ length += 2 + 2*LINK_SIZE;
+
switch(c)
{
- /* A backslashed item may be an escaped "normal" character or a
- character type. For a "normal" character, put the pointers and
- character back so that tests for whitespace etc. in the input
- are done correctly. */
+ /* A backslashed item may be an escaped data character or it may be a
+ character type. */
case '\\':
+ c = check_escape(&ptr, errorptr, bracount, options, FALSE);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+
+ lastitemlength = 1; /* Default length of last item for repeats */
+
+ if (c >= 0) /* Data character */
{
- const uschar *save_ptr = ptr;
- c = check_escape(&ptr, errorptr, bracount, options, FALSE);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if (c >= 0)
+ length += 2; /* For a one-byte character */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && c > 127)
{
- ptr = save_ptr;
- c = '\\';
- goto NORMAL_CHAR;
+ int i;
+ for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
+ if (c <= utf8_table1[i]) break;
+ length += i;
+ lastitemlength += i;
}
+#endif
+
+ continue;
}
/* If \Q, enter "literal" mode */
@@ -4246,12 +4628,36 @@ while ((c = *(++ptr)) != 0)
continue;
}
- /* Other escapes need one byte, and are of length one for repeats */
+ /* \X is supported only if Unicode property support is compiled */
- length++;
-#ifdef SUPPORT_UTF8
- lastcharlength = 1;
+#ifndef SUPPORT_UCP
+ if (-c == ESC_X)
+ {
+ *errorptr = ERR45;
+ goto PCRE_ERROR_RETURN;
+ }
+#endif
+
+ /* \P and \p are for Unicode properties, but only when the support has
+ been compiled. Each item needs 2 bytes. */
+
+ else if (-c == ESC_P || -c == ESC_p)
+ {
+#ifdef SUPPORT_UCP
+ BOOL negated;
+ length += 2;
+ lastitemlength = 2;
+ if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
+ continue;
+#else
+ *errorptr = ERR45;
+ goto PCRE_ERROR_RETURN;
#endif
+ }
+
+ /* Other escapes need one byte */
+
+ length++;
/* A back reference needs an additional 2 bytes, plus either one or 5
bytes for a repeat. We also need to keep the value of the highest
@@ -4281,9 +4687,7 @@ while ((c = *(++ptr)) != 0)
case '.':
case '$':
length++;
-#ifdef SUPPORT_UTF8
- lastcharlength = 1;
-#endif
+ lastitemlength = 1;
continue;
case '*': /* These repeats won't be after brackets; */
@@ -4310,30 +4714,12 @@ while ((c = *(++ptr)) != 0)
else
{
-#ifdef SUPPORT_UTF8
- /* In UTF-8 mode, we should find the length in lastcharlength */
- if (utf8)
+ if (min != 1)
{
- if (min != 1)
- {
- length -= lastcharlength; /* Uncount the original char or metachar */
- if (min > 0) length += 3 + lastcharlength;
- }
- length += lastcharlength + ((max > 0)? 3 : 1);
- }
- else
-#endif
-
- /* Not UTF-8 mode: all characters are one byte */
- {
- if (min != 1)
- {
- length--; /* Uncount the original char or metachar */
- if (min > 0) length += 4;
- }
-
- length += (max > 0)? 4 : 2;
+ length -= lastitemlength; /* Uncount the original char or metachar */
+ if (min > 0) length += 3 + lastitemlength;
}
+ length += lastitemlength + ((max > 0)? 3 : 1);
}
if (ptr[1] == '?') ptr++; /* Needs no extra length */
@@ -4364,14 +4750,17 @@ while ((c = *(++ptr)) != 0)
where we can. (In UTF-8 mode we can do this only for chars < 128.) */
case '[':
- class_optcount = 0;
+ if (*(++ptr) == '^')
+ {
+ class_optcount = 10; /* Greater than one */
+ ptr++;
+ }
+ else class_optcount = 0;
#ifdef SUPPORT_UTF8
class_utf8 = FALSE;
#endif
- if (*(++ptr) == '^') ptr++;
-
/* Written as a "do" so that an initial ']' is taken as data */
if (*ptr != 0) do
@@ -4380,7 +4769,7 @@ while ((c = *(++ptr)) != 0)
if (inescq)
{
- if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
+ if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
inescq = FALSE;
ptr += 1;
continue;
@@ -4390,19 +4779,17 @@ while ((c = *(++ptr)) != 0)
if (*ptr == '\\')
{
-#ifdef SUPPORT_UTF8
- int prevchar = ptr[-1];
-#endif
- int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
+ c = check_escape(&ptr, errorptr, bracount, options, TRUE);
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- /* \b is backspace inside a class */
+ /* \b is backspace inside a class; \X is literal */
- if (-ch == ESC_b) ch = '\b';
+ if (-c == ESC_b) c = '\b';
+ else if (-c == ESC_X) c = 'X';
/* \Q enters quoting mode */
- if (-ch == ESC_Q)
+ else if (-c == ESC_Q)
{
inescq = TRUE;
continue;
@@ -4410,33 +4797,26 @@ while ((c = *(++ptr)) != 0)
/* Handle escapes that turn into characters */
- if (ch >= 0)
+ if (c >= 0) goto NON_SPECIAL_CHARACTER;
+
+ /* Escapes that are meta-things. The normal ones just affect the
+ bit map, but Unicode properties require an XCLASS extended item. */
+
+ else
{
+ class_optcount = 10; /* \d, \s etc; make sure > 1 */
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (-c == ESC_p || -c == ESC_P)
{
- if (ch > 127) class_optcount = 10; /* Ensure > 1 */
- if (ch > 255)
+ if (!class_utf8)
{
- uschar buffer[6];
- if (!class_utf8)
- {
- class_utf8 = TRUE;
- length += LINK_SIZE + 1 + 1;
- }
- length += 1 + ord2utf8(ch, buffer);
-
- /* If this wide character is preceded by '-', add an extra 2 to
- the length in case the previous character was < 128, because in
- this case the whole range will be put into the list. */
-
- if (prevchar == '-') length += 2;
+ class_utf8 = TRUE;
+ length += LINK_SIZE + 2;
}
+ length += 2;
}
#endif
- class_optcount++; /* for possible optimization */
}
- else class_optcount = 10; /* \d, \s etc; make sure > 1 */
}
/* Check the syntax for POSIX stuff. The bits we actually handle are
@@ -4448,42 +4828,155 @@ while ((c = *(++ptr)) != 0)
class_optcount = 10; /* Make sure > 1 */
}
- /* Anything else just increments the possible optimization count. If
- there are wide characters, we are going to have to use an XCLASS. */
+ /* Anything else increments the possible optimization count. We have to
+ detect ranges here so that we can compute the number of extra ranges for
+ caseless wide characters when UCP support is available. If there are wide
+ characters, we are going to have to use an XCLASS, even for single
+ characters. */
else
{
- NON_SPECIAL_CHARACTER:
- class_optcount++;
+ int d;
+
+ GET_ONE_CHARACTER:
#ifdef SUPPORT_UTF8
if (utf8)
{
- int ch;
int extra = 0;
- GETCHARLEN(ch, ptr, extra);
- if (ch > 127) class_optcount = 10; /* No optimization possible */
- if (ch > 255)
+ GETCHARLEN(c, ptr, extra);
+ ptr += extra;
+ }
+ else c = *ptr;
+#else
+ c = *ptr;
+#endif
+
+ /* Come here from handling \ above when it escapes to a char value */
+
+ NON_SPECIAL_CHARACTER:
+ class_optcount++;
+
+ d = -1;
+ if (ptr[1] == '-')
+ {
+ uschar const *hyptr = ptr++;
+ if (ptr[1] == '\\')
{
- if (!class_utf8)
+ ptr++;
+ d = check_escape(&ptr, errorptr, bracount, options, TRUE);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+ if (-d == ESC_b) d = '\b'; /* backspace */
+ else if (-d == ESC_X) d = 'X'; /* literal X in a class */
+ }
+ else if (ptr[1] != 0 && ptr[1] != ']')
+ {
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ int extra = 0;
+ GETCHARLEN(d, ptr, extra);
+ ptr += extra;
+ }
+ else
+#endif
+ d = *ptr;
+ }
+ if (d < 0) ptr = hyptr; /* go back to hyphen as data */
+ }
+
+ /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
+ 127 for caseless matching, we will need to use an XCLASS. */
+
+ if (d >= 0)
+ {
+ class_optcount = 10; /* Ensure > 1 */
+ if (d < c)
+ {
+ *errorptr = ERR8;
+ goto PCRE_ERROR_RETURN;
+ }
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+ {
+ uschar buffer[6];
+ if (!class_utf8) /* Allow for XCLASS overhead */
{
class_utf8 = TRUE;
- length += LINK_SIZE + 1 + 1;
+ length += LINK_SIZE + 2;
}
- length += 2 + extra;
- /* If this wide character is preceded by '-', add an extra 2 to
- the length in case the previous character was < 128, because in
- this case the whole range will be put into the list. */
+#ifdef SUPPORT_UCP
+ /* If we have UCP support, find out how many extra ranges are
+ needed to map the other case of characters within this range. We
+ have to mimic the range optimization here, because extending the
+ range upwards might push d over a boundary that makes is use
+ another byte in the UTF-8 representation. */
- if (ptr[-1] == '-') length += 2;
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ int occ, ocd;
+ int cc = c;
+ int origd = d;
+ while (get_othercase_range(&cc, origd, &occ, &ocd))
+ {
+ if (occ >= c && ocd <= d) continue; /* Skip embedded */
+
+ if (occ < c && ocd >= c - 1) /* Extend the basic range */
+ { /* if there is overlap, */
+ c = occ; /* noting that if occ < c */
+ continue; /* we can't have ocd > d */
+ } /* because a subrange is */
+ if (ocd > d && occ <= d + 1) /* always shorter than */
+ { /* the basic range. */
+ d = ocd;
+ continue;
+ }
+
+ /* An extra item is needed */
+
+ length += 1 + ord2utf8(occ, buffer) +
+ ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
+ }
+ }
+#endif /* SUPPORT_UCP */
- /* Advance to the end of this character */
+ /* The length of the (possibly extended) range */
- ptr += extra;
+ length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
}
+#endif /* SUPPORT_UTF8 */
+
+ }
+
+ /* We have a single character. There is nothing to be done unless we
+ are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
+ allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
+ support. */
+
+ else
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+ {
+ uschar buffer[6];
+ class_optcount = 10; /* Ensure > 1 */
+ if (!class_utf8) /* Allow for XCLASS overhead */
+ {
+ class_utf8 = TRUE;
+ length += LINK_SIZE + 2;
+ }
+#ifdef SUPPORT_UCP
+ length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
+ (1 + ord2utf8(c, buffer));
+#else /* SUPPORT_UCP */
+ length += 1 + ord2utf8(c, buffer);
+#endif /* SUPPORT_UCP */
+ }
+#endif /* SUPPORT_UTF8 */
}
-#endif
}
}
while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
@@ -4608,7 +5101,7 @@ while ((c = *(++ptr)) != 0)
*errorptr = ERR39;
goto PCRE_ERROR_RETURN;
}
- length += 2;
+ length += 2 + 2*LINK_SIZE;
continue;
/* Named subpatterns are an extension copied from Python */
@@ -4908,111 +5401,46 @@ while ((c = *(++ptr)) != 0)
}
continue;
- /* Non-special character. For a run of such characters the length required
- is the number of characters + 2, except that the maximum run length is
- MAXLIT. We won't get a skipped space or a non-data escape or the start of a
- # comment as the first character, so the length can't be zero. */
+ /* Non-special character. It won't be space or # in extended mode, so it is
+ always a genuine character. If we are in a \Q...\E sequence, check for the
+ end; if not, we have a literal. */
- NORMAL_CHAR:
default:
- length += 2;
- runlength = 0;
- do
- {
-#ifdef SUPPORT_UTF8
- lastcharlength = 1; /* Need length of last char for UTF-8 repeats */
-#endif
-
- /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
- if (inescq)
- {
- if (c == '\\' && ptr[1] == 'E')
- {
- inescq = FALSE;
- ptr++;
- }
- else runlength++;
- continue;
- }
-
- /* Skip whitespace and comments for /x */
-
- if ((options & PCRE_EXTENDED) != 0)
- {
- if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
- if (c == '#')
- {
- /* The space before the ; is to avoid a warning on a silly compiler
- on the Macintosh. */
- while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
- continue;
- }
- }
-
- /* Backslash may introduce a data char or a metacharacter; stop the
- string before the latter. */
-
- if (c == '\\')
- {
- const uschar *saveptr = ptr;
- c = check_escape(&ptr, errorptr, bracount, options, FALSE);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if (c < 0) { ptr = saveptr; break; }
-
- /* In UTF-8 mode, add on the number of additional bytes needed to
- encode this character, and save the total length in case this is a
- final char that is repeated. */
-
-#ifdef SUPPORT_UTF8
- if (utf8 && c > 127)
- {
- int i;
- for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
- if (c <= utf8_table1[i]) break;
- runlength += i;
- lastcharlength += i;
- }
-#endif
- }
-
- /* Ordinary character or single-char escape */
+ NORMAL_CHAR:
- runlength++;
+ if (inescq && c == '\\' && ptr[1] == 'E')
+ {
+ inescq = FALSE;
+ ptr++;
+ continue;
}
- /* This "while" is the end of the "do" above. */
-
- while (runlength < MAXLIT &&
- (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
-
- /* If we hit a meta-character, back off to point to it */
-
- if (runlength < MAXLIT) ptr--;
+ length += 2; /* For a one-byte character */
+ lastitemlength = 1; /* Default length of last item for repeats */
- /* If the last char in the string is a UTF-8 multibyte character, we must
- set lastcharlength correctly. If it was specified as an escape, this will
- already have been done above. However, we also have to support in-line
- UTF-8 characters, so check backwards from where we are. */
+ /* In UTF-8 mode, check for additional bytes. */
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf8 && (c & 0xc0) == 0xc0)
{
- const uschar *lastptr = ptr - 1;
- if ((*lastptr & 0x80) != 0)
- {
- while((*lastptr & 0xc0) == 0x80) lastptr--;
- lastcharlength = ptr - lastptr;
+ while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
+ { /* because the end is marked */
+ lastitemlength++; /* by a zero byte. */
+ length++;
+ ptr++;
}
}
#endif
- length += runlength;
continue;
}
}
length += 2 + LINK_SIZE; /* For final KET and END */
+if ((options & PCRE_AUTO_CALLOUT) != 0)
+ length += 2 + 2*LINK_SIZE; /* For final callout */
+
if (length > MAX_PATTERN_SIZE)
{
*errorptr = ERR20;
@@ -5031,24 +5459,32 @@ if (re == NULL)
return NULL;
}
-/* Put in the magic number, and save the size, options, and table pointer */
+/* Put in the magic number, and save the sizes, options, and character table
+pointer. NULL is used for the default character tables. The nullpad field is at
+the end; it's there to help in the case when a regex compiled on a system with
+4-byte pointers is run on another with 8-byte pointers. */
re->magic_number = MAGIC_NUMBER;
re->size = size;
re->options = options;
-re->tables = tables;
+re->dummy1 = re->dummy2 = 0;
+re->name_table_offset = sizeof(real_pcre);
re->name_entry_size = max_name_size + 3;
re->name_count = name_count;
+re->tables = (tables == pcre_default_tables)? NULL : tables;
+re->nullpad = NULL;
/* The starting points of the name/number translation table and of the code are
passed around in the compile data block. */
compile_block.names_found = 0;
compile_block.name_entry_size = max_name_size + 3;
-compile_block.name_table = (uschar *)re + sizeof(real_pcre);
+compile_block.name_table = (uschar *)re + re->name_table_offset;
codestart = compile_block.name_table + re->name_entry_size * re->name_count;
compile_block.start_code = codestart;
+compile_block.start_pattern = (const uschar *)pattern;
compile_block.req_varyopt = 0;
+compile_block.nopartial = FALSE;
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, *errorptr will be set non-NULL, so we don't need to look at the result
@@ -5063,6 +5499,8 @@ bracount = 0;
re->top_bracket = bracount;
re->top_backref = compile_block.top_backref;
+if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
+
/* If not reached end of pattern on success, there's an excess bracket. */
if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
@@ -5124,7 +5562,7 @@ if ((options & PCRE_ANCHORED) == 0)
/* For an anchored pattern, we use the "required byte" only if it follows a
variable length item in the regex. Remove the caseless flag for non-caseable
-chars. */
+bytes. */
if (reqbyte >= 0 &&
((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
@@ -5144,7 +5582,8 @@ printf("Length = %d top_bracket = %d top_backref = %d\n",
if (re->options != 0)
{
- printf("%s%s%s%s%s%s%s%s%s\n",
+ printf("%s%s%s%s%s%s%s%s%s%s\n",
+ ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
@@ -5276,27 +5715,46 @@ if (c < 256)
return !negated; /* char found */
}
-/* Now match against the list of large chars or ranges that end with a large
-char. First skip the bit map if present. */
+/* First skip the bit map if present. Then match against the list of Unicode
+properties or large chars or ranges that end with a large char. We won't ever
+encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
if ((*data++ & XCL_MAP) != 0) data += 32;
while ((t = *data++) != XCL_END)
{
int x, y;
- GETCHARINC(x, data);
if (t == XCL_SINGLE)
{
+ GETCHARINC(x, data);
if (c == x) return !negated;
}
- else
+ else if (t == XCL_RANGE)
{
+ GETCHARINC(x, data);
GETCHARINC(y, data);
if (c >= x && c <= y) return !negated;
}
+
+#ifdef SUPPORT_UCP
+ else /* XCL_PROP & XCL_NOTPROP */
+ {
+ int chartype, othercase;
+ int rqdtype = *data++;
+ int category = ucp_findchar(c, &chartype, &othercase);
+ if (rqdtype >= 128)
+ {
+ if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
+ }
+ else
+ {
+ if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
+ }
+ }
+#endif /* SUPPORT_UCP */
}
-return negated; /* char was not found */
+return negated; /* char did not match */
}
#endif
@@ -5336,7 +5794,7 @@ always used to.
/* These versions of the macros manage a private stack on the heap. Note
that the rd argument of RMATCH isn't actually used. It's the md argument of
-match(), which never actually changes. */
+match(), which never changes. */
#define REGISTER
@@ -5398,7 +5856,6 @@ typedef struct heapframe {
const uschar *Xcallpat;
const uschar *Xcharptr;
const uschar *Xdata;
- const uschar *Xlastptr;
const uschar *Xnext;
const uschar *Xpp;
const uschar *Xprev;
@@ -5413,6 +5870,16 @@ typedef struct heapframe {
unsigned long int Xoriginal_ims;
+#ifdef SUPPORT_UCP
+ int Xprop_type;
+ int Xprop_fail_result;
+ int Xprop_category;
+ int Xprop_chartype;
+ int Xprop_othercase;
+ int Xprop_test_against;
+ int *Xprop_test_variable;
+#endif
+
int Xctype;
int Xfc;
int Xfi;
@@ -5523,10 +5990,11 @@ HEAP_RECURSE:
/* Ditto for the local variables */
-#define callpat frame->Xcallpat
+#ifdef SUPPORT_UTF8
#define charptr frame->Xcharptr
+#endif
+#define callpat frame->Xcallpat
#define data frame->Xdata
-#define lastptr frame->Xlastptr
#define next frame->Xnext
#define pp frame->Xpp
#define prev frame->Xprev
@@ -5541,6 +6009,16 @@ HEAP_RECURSE:
#define original_ims frame->Xoriginal_ims
+#ifdef SUPPORT_UCP
+#define prop_type frame->Xprop_type
+#define prop_fail_result frame->Xprop_fail_result
+#define prop_category frame->Xprop_category
+#define prop_chartype frame->Xprop_chartype
+#define prop_othercase frame->Xprop_othercase
+#define prop_test_against frame->Xprop_test_against
+#define prop_test_variable frame->Xprop_test_variable
+#endif
+
#define ctype frame->Xctype
#define fc frame->Xfc
#define fi frame->Xfi
@@ -5566,24 +6044,36 @@ i, and fc and c, can be the same variables. */
#define fi i
#define fc c
-const uschar *callpat; /* Many of these variables are used ony */
+
+#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
const uschar *charptr; /* small blocks of the code. My normal */
-const uschar *data; /* style of coding would have declared */
-const uschar *lastptr; /* them within each of those blocks. */
-const uschar *next; /* However, in order to accommodate the */
-const uschar *pp; /* version of this code that uses an */
-const uschar *prev; /* external "stack" implemented on the */
-const uschar *saved_eptr; /* heap, it is easier to declare them */
- /* all here, so the declarations can */
-recursion_info new_recursive; /* be cut out in a block. The only */
- /* declarations within blocks below are */
-BOOL cur_is_word; /* for variables that do not have to */
-BOOL condition; /* be preserved over a recursive call */
-BOOL minimize; /* to RMATCH(). */
+#endif /* style of coding would have declared */
+const uschar *callpat; /* them within each of those blocks. */
+const uschar *data; /* However, in order to accommodate the */
+const uschar *next; /* version of this code that uses an */
+const uschar *pp; /* external "stack" implemented on the */
+const uschar *prev; /* heap, it is easier to declare them */
+const uschar *saved_eptr; /* all here, so the declarations can */
+ /* be cut out in a block. The only */
+recursion_info new_recursive; /* declarations within blocks below are */
+ /* for variables that do not have to */
+BOOL cur_is_word; /* be preserved over a recursive call */
+BOOL condition; /* to RMATCH(). */
+BOOL minimize;
BOOL prev_is_word;
unsigned long int original_ims;
+#ifdef SUPPORT_UCP
+int prop_type;
+int prop_fail_result;
+int prop_category;
+int prop_chartype;
+int prop_othercase;
+int prop_test_against;
+int *prop_test_variable;
+#endif
+
int ctype;
int length;
int max;
@@ -5598,6 +6088,14 @@ int stacksave[REC_STACK_SAVE_MAX];
eptrblock newptrb;
#endif
+/* These statements are here to stop the compiler complaining about unitialized
+variables. */
+
+#ifdef SUPPORT_UCP
+prop_fail_result = 0;
+prop_test_against = 0;
+prop_test_variable = NULL;
+#endif
/* OK, now we can get on with the real code of the function. Recursion is
specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
@@ -5629,6 +6127,14 @@ for (;;)
op = *ecode;
minimize = FALSE;
+ /* For partial matching, remember if we ever hit the end of the subject after
+ matching at least one subject character. */
+
+ if (md->partial &&
+ eptr >= md->end_subject &&
+ eptr > md->start_match)
+ md->hitend = TRUE;
+
/* Opening capturing bracket. If there is space in the offset vector, save
the current subject position in the working slot at the top of the vector. We
mustn't change the current values of the data slot, because they may be set
@@ -5883,20 +6389,22 @@ for (;;)
if (pcre_callout != NULL)
{
pcre_callout_block cb;
- cb.version = 0; /* Version 0 of the callout block */
+ cb.version = 1; /* Version 1 of the callout block */
cb.callout_number = ecode[1];
cb.offset_vector = md->offset_vector;
cb.subject = (const char *)md->start_subject;
cb.subject_length = md->end_subject - md->start_subject;
cb.start_match = md->start_match - md->start_subject;
cb.current_position = eptr - md->start_subject;
+ cb.pattern_position = GET(ecode, 2);
+ cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
cb.capture_last = md->capture_last;
cb.callout_data = md->callout_data;
if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
- ecode += 2;
+ ecode += 2 + 2*LINK_SIZE;
break;
/* Recursion either matches the current regex, or some subexpression. The
@@ -6297,7 +6805,7 @@ for (;;)
{
if (eptr == md->start_subject) prev_is_word = FALSE; else
{
- lastptr = eptr - 1;
+ const uschar *lastptr = eptr - 1;
while((*lastptr & 0xc0) == 0x80) lastptr--;
GETCHAR(c, lastptr);
prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
@@ -6427,6 +6935,63 @@ for (;;)
ecode++;
break;
+#ifdef SUPPORT_UCP
+ /* Check the next character by Unicode property. We will get here only
+ if the support is in the binary; otherwise a compile-time error occurs. */
+
+ case OP_PROP:
+ case OP_NOTPROP:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ {
+ int chartype, rqdtype;
+ int othercase;
+ int category = ucp_findchar(c, &chartype, &othercase);
+
+ rqdtype = *(++ecode);
+ ecode++;
+
+ if (rqdtype >= 128)
+ {
+ if ((rqdtype - 128 != category) == (op == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if ((rqdtype != chartype) == (op == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ break;
+
+ /* Match an extended Unicode sequence. We will get here only if the support
+ is in the binary; otherwise a compile-time error occurs. */
+
+ case OP_EXTUNI:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ {
+ int chartype;
+ int othercase;
+ int category = ucp_findchar(c, &chartype, &othercase);
+ if (category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ category = ucp_findchar(c, &chartype, &othercase);
+ if (category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ ecode++;
+ break;
+#endif
+
+
/* Match a back reference, possibly repeatedly. Look past the end of the
item to see if there is repeat information following. The code is similar
to that for character classes, but repeated for efficiency. Then obey
@@ -6438,7 +7003,7 @@ for (;;)
case OP_REF:
{
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3; /* Advance past item */
+ ecode += 3; /* Advance past item */
/* If the reference is unset, set the length to be longer than the amount
of subject left; this ensures that every attempt at a match fails. We
@@ -6540,9 +7105,11 @@ for (;;)
/* Match a bit-mapped character class, possibly repeatedly. This op code is
- used when all the characters in the class have values in the range 0-255.
- The only difference between OP_CLASS and OP_NCLASS occurs when a data
- character outside the range is encountered.
+ used when all the characters in the class have values in the range 0-255,
+ and either the matching is caseful, or the characters are in the range
+ 0-127 when UTF-8 processing is enabled. The only difference between
+ OP_CLASS and OP_NCLASS occurs when a data character outside the range is
+ encountered.
First, look past the end of the item to see if there is repeat information
following. Then obey similar code to character type repeats - written out
@@ -6814,38 +7381,82 @@ for (;;)
}
#endif /* End of XCLASS */
- /* Match a run of characters */
+ /* Match a single character, casefully */
+
+ case OP_CHAR:
+#ifdef SUPPORT_UTF8
+ if (md->utf8)
+ {
+ length = 1;
+ ecode++;
+ GETCHARLEN(fc, ecode, length);
+ if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+ }
+ else
+#endif
- case OP_CHARS:
+ /* Non-UTF-8 mode */
{
- register int slen = ecode[1];
+ if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
+ if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
ecode += 2;
+ }
+ break;
-#ifdef DEBUG /* Sigh. Some compilers never learn. */
- if (eptr >= md->end_subject)
- printf("matching subject <null> against pattern ");
- else
- {
- printf("matching subject ");
- pchars(eptr, slen, TRUE, md);
- printf(" against pattern ");
- }
- pchars(ecode, slen, FALSE, md);
- printf("\n");
-#endif
+ /* Match a single character, caselessly */
- if (slen > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- if ((ims & PCRE_CASELESS) != 0)
+ case OP_CHARNC:
+#ifdef SUPPORT_UTF8
+ if (md->utf8)
+ {
+ length = 1;
+ ecode++;
+ GETCHARLEN(fc, ecode, length);
+
+ if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+
+ /* If the pattern character's value is < 128, we have only one byte, and
+ can use the fast lookup table. */
+
+ if (fc < 128)
{
- while (slen-- > 0)
- if (md->lcc[*ecode++] != md->lcc[*eptr++])
- RRETURN(MATCH_NOMATCH);
+ if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
}
+
+ /* Otherwise we must pick up the subject character */
+
else
{
- while (slen-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+ int dc;
+ GETCHARINC(dc, eptr);
+ ecode += length;
+
+ /* If we have Unicode property support, we can use it to test the other
+ case of the character, if there is one. The result of ucp_findchar() is
+ < 0 if the char isn't found, and othercase is returned as zero if there
+ isn't one. */
+
+ if (fc != dc)
+ {
+#ifdef SUPPORT_UCP
+ int chartype;
+ int othercase;
+ if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
+#endif
+ RRETURN(MATCH_NOMATCH);
+ }
}
}
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Non-UTF-8 mode */
+ {
+ if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ ecode += 2;
+ }
break;
/* Match a single character repeatedly; different opcodes share code. */
@@ -6889,15 +7500,33 @@ for (;;)
if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
ecode += length;
- /* Handle multibyte character matching specially here. There is no
- support for any kind of casing for multibyte characters. */
+ /* Handle multibyte character matching specially here. There is
+ support for caseless matching if UCP support is present. */
if (length > 1)
{
+ int oclength = 0;
+ uschar occhars[8];
+
+#ifdef SUPPORT_UCP
+ int othercase;
+ int chartype;
+ if ((ims & PCRE_CASELESS) != 0 &&
+ ucp_findchar(fc, &chartype, &othercase) >= 0 &&
+ othercase > 0)
+ oclength = ord2utf8(othercase, occhars);
+#endif /* SUPPORT_UCP */
+
for (i = 1; i <= min; i++)
{
- if (memcmp(eptr, charptr, length) != 0) RRETURN(MATCH_NOMATCH);
- eptr += length;
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+ /* Need braces because of following else */
+ else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
+ eptr += oclength;
+ }
}
if (min == max) continue;
@@ -6908,11 +7537,15 @@ for (;;)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max ||
- eptr >= md->end_subject ||
- memcmp(eptr, charptr, length) != 0)
- RRETURN(MATCH_NOMATCH);
- eptr += length;
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+ /* Need braces because of following else */
+ else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
+ eptr += oclength;
+ }
}
/* Control never gets here */
}
@@ -6921,10 +7554,14 @@ for (;;)
pp = eptr;
for (i = min; i < max; i++)
{
- if (eptr > md->end_subject - length ||
- memcmp(eptr, charptr, length) != 0)
- break;
- eptr += length;
+ if (eptr > md->end_subject - length) break;
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+ else if (oclength == 0) break;
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) break;
+ eptr += oclength;
+ }
}
while (eptr >= pp)
{
@@ -6942,7 +7579,7 @@ for (;;)
value of fc will always be < 128. */
}
else
-#endif
+#endif /* SUPPORT_UTF8 */
/* When not in UTF-8 mode, load a single-byte character. */
{
@@ -7088,9 +7725,9 @@ for (;;)
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-character (less than 255) matches.
- We can give up quickly if there are fewer than the minimum number of
- characters left in the subject. */
+ /* Common code for all repeated single-byte matches. We can give up quickly
+ if there are fewer than the minimum number of bytes left in the
+ subject. */
REPEATNOTCHAR:
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
@@ -7358,16 +7995,77 @@ for (;;)
REPEATTYPE:
ctype = *ecode++; /* Code for the character type */
+#ifdef SUPPORT_UCP
+ if (ctype == OP_PROP || ctype == OP_NOTPROP)
+ {
+ prop_fail_result = ctype == OP_NOTPROP;
+ prop_type = *ecode++;
+ if (prop_type >= 128)
+ {
+ prop_test_against = prop_type - 128;
+ prop_test_variable = &prop_category;
+ }
+ else
+ {
+ prop_test_against = prop_type;
+ prop_test_variable = &prop_chartype;
+ }
+ }
+ else prop_type = -1;
+#endif
+
/* First, ensure the minimum number of matches are present. Use inline
code for maximizing the speed, and do the type test once at the start
(i.e. keep it out of the loop). Also we can test that there are at least
the minimum number of bytes before we start. This isn't as effective in
UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
- is tidier. */
+ is tidier. Also separate the UCP code, which can be the same for both UTF-8
+ and single-bytes. */
if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
if (min > 0)
{
+#ifdef SUPPORT_UCP
+ if (prop_type > 0)
+ {
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINC(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if ((*prop_test_variable == prop_test_against) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINCTEST(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
+
+/* Handle all other cases when the coding is UTF-8 */
+
#ifdef SUPPORT_UTF8
if (md->utf8) switch(ctype)
{
@@ -7390,7 +8088,7 @@ for (;;)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
- if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
+ if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
RRETURN(MATCH_NOMATCH);
}
break;
@@ -7444,11 +8142,16 @@ for (;;)
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
- }
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ } /* End switch(ctype) */
+
else
-#endif
+#endif /* SUPPORT_UTF8 */
- /* Code for the non-UTF-8 case for minimum matching */
+ /* Code for the non-UTF-8 case for minimum matching of operators other
+ than OP_PROP and OP_NOTPROP. */
switch(ctype)
{
@@ -7496,6 +8199,9 @@ for (;;)
if ((md->ctypes[*eptr++] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
}
}
@@ -7504,10 +8210,56 @@ for (;;)
if (min == max) continue;
/* If minimizing, we have to test the rest of the pattern before each
- subsequent match. Again, separate the UTF-8 case for speed. */
+ subsequent match. Again, separate the UTF-8 case for speed, and also
+ separate the UCP cases. */
if (minimize)
{
+#ifdef SUPPORT_UCP
+ if (prop_type > 0)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if ((*prop_test_variable == prop_test_against) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
+
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
if (md->utf8)
@@ -7557,6 +8309,9 @@ for (;;)
if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
}
}
}
@@ -7602,6 +8357,9 @@ for (;;)
case OP_WORDCHAR:
if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
}
}
}
@@ -7610,11 +8368,85 @@ for (;;)
/* If maximizing it is worth using inline code for speed, doing the type
test once at the start (i.e. keep it out of the loop). Again, keep the
- UTF-8 stuff separate. */
+ UTF-8 and UCP stuff separate. */
else
{
- pp = eptr;
+ pp = eptr; /* Remember where we started */
+
+#ifdef SUPPORT_UCP
+ if (prop_type > 0)
+ {
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if ((*prop_test_variable == prop_test_against) == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ for(;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ GETCHARINCTEST(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category == ucp_M) break;
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ for(;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ for (;;) /* Move back over one extended */
+ {
+ int len = 1;
+ BACKCHAR(eptr);
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category != ucp_M) break;
+ eptr--;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
#ifdef SUPPORT_UTF8
/* UTF-8 mode */
@@ -7745,6 +8577,9 @@ for (;;)
eptr+= len;
}
break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
}
/* eptr is now past the end of the maximum run */
@@ -7835,6 +8670,9 @@ for (;;)
eptr++;
}
break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
}
/* eptr is now past the end of the maximum run */
@@ -7889,7 +8727,6 @@ Undefine all the macros that were defined above to handle this. */
#undef callpat
#undef charptr
#undef data
-#undef lastptr
#undef next
#undef pp
#undef prev
@@ -7940,7 +8777,7 @@ portions of the string if it matches. Two elements in the vector are set for
each substring: the offsets to the start and end of the substring.
Arguments:
- external_re points to the compiled expression
+ argument_re points to the compiled expression
extra_data points to extra data or is NULL
subject points to the subject string
length length of subject string (may contain binary zeros)
@@ -7956,7 +8793,7 @@ Returns: > 0 => success; value is the number of elements filled in
*/
EXPORT int
-pcre_exec(const pcre *external_re, const pcre_extra *extra_data,
+pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
const char *subject, int length, int start_offset, int options, int *offsets,
int offsetcount)
{
@@ -7971,18 +8808,25 @@ BOOL startline;
BOOL first_byte_caseless = FALSE;
BOOL req_byte_caseless = FALSE;
match_data match_block;
+const uschar *tables;
const uschar *start_bits = NULL;
const uschar *start_match = (const uschar *)subject + start_offset;
const uschar *end_subject;
const uschar *req_byte_ptr = start_match - 1;
+
+pcre_study_data internal_study;
const pcre_study_data *study;
-const real_pcre *re = (const real_pcre *)external_re;
+
+real_pcre internal_re;
+const real_pcre *external_re = (const real_pcre *)argument_re;
+const real_pcre *re = external_re;
/* Plausibility checks */
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
if (re == NULL || subject == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
+if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
/* Fish out the optional data from the extra_data structure, first setting
the default values. */
@@ -7991,6 +8835,10 @@ study = NULL;
match_block.match_limit = MATCH_LIMIT;
match_block.callout_data = NULL;
+/* The table pointer is always in native byte order. */
+
+tables = external_re->tables;
+
if (extra_data != NULL)
{
register unsigned int flags = extra_data->flags;
@@ -8000,17 +8848,37 @@ if (extra_data != NULL)
match_block.match_limit = extra_data->match_limit;
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
match_block.callout_data = extra_data->callout_data;
+ if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
}
-/* Now we have re supposedly pointing to the regex */
+/* If the exec call supplied NULL for tables, use the inbuilt ones. This
+is a feature that makes it possible to save compiled regex and re-use them
+in other programs later. */
-if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
+if (tables == NULL) tables = pcre_default_tables;
+
+/* Check that the first field in the block is the magic number. If it is not,
+test for a regex that was compiled on a host of opposite endianness. If this is
+the case, flipped values are put in internal_re and internal_study if there was
+study data too. */
+
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = try_flipped(re, &internal_re, study, &internal_study);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ if (study != NULL) study = &internal_study;
+ }
+
+/* Set up other data */
anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
startline = (re->options & PCRE_STARTLINE) != 0;
-match_block.start_code =
- (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;
+/* The code starts after the real_pcre block and the capture name table. */
+
+match_block.start_code = (const uschar *)external_re + re->name_table_offset +
+ re->name_count * re->name_entry_size;
+
match_block.start_subject = (const uschar *)subject;
match_block.start_offset = start_offset;
match_block.end_subject = match_block.start_subject + length;
@@ -8022,11 +8890,19 @@ match_block.utf8 = (re->options & PCRE_UTF8) != 0;
match_block.notbol = (options & PCRE_NOTBOL) != 0;
match_block.noteol = (options & PCRE_NOTEOL) != 0;
match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
+match_block.partial = (options & PCRE_PARTIAL) != 0;
+match_block.hitend = FALSE;
match_block.recursive = NULL; /* No recursion at top level */
-match_block.lcc = re->tables + lcc_offset;
-match_block.ctypes = re->tables + ctypes_offset;
+match_block.lcc = tables + lcc_offset;
+match_block.ctypes = tables + ctypes_offset;
+
+/* Partial matching is supported only for a restricted set of regexes at the
+moment. */
+
+if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
+ return PCRE_ERROR_BADPARTIAL;
/* Check a UTF-8 string if required. Unfortunately there's no way of passing
back the character offset. */
@@ -8055,7 +8931,7 @@ restoring at the exit of a group is easy. */
ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
/* If the expression has got more back references than the offsets supplied can
-hold, we get a temporary bit of working store to use during the matching.
+hold, we get a temporary chunk of working store to use during the matching.
Otherwise, we can use the vector supplied, rounding down its size to a multiple
of 3. */
@@ -8121,7 +8997,7 @@ if ((re->options & PCRE_REQCHSET) != 0)
{
req_byte = re->req_byte & 255;
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
- req_byte2 = (re->tables + fcc_offset)[req_byte]; /* case flipped */
+ req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
}
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
@@ -8129,12 +9005,14 @@ the loop runs just once. */
do
{
- register int *iptr = match_block.offset_vector;
- register int *iend = iptr + resetcount;
-
/* Reset the maximum number of extractions we might see. */
- while (iptr < iend) *iptr++ = -1;
+ if (match_block.offset_vector != NULL)
+ {
+ register int *iptr = match_block.offset_vector;
+ register int *iend = iptr + resetcount;
+ while (iptr < iend) *iptr++ = -1;
+ }
/* Advance to a unique first char if possible */
@@ -8166,7 +9044,7 @@ do
{
while (start_match < end_subject)
{
- register int c = *start_match;
+ register unsigned int c = *start_match;
if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
}
}
@@ -8188,9 +9066,14 @@ do
HOWEVER: when the subject string is very, very long, searching to its end can
take a long time, and give bad performance on quite ordinary patterns. This
showed up when somebody was matching /^C/ on a 32-megabyte string... so we
- don't do this when the string is sufficiently long. */
+ don't do this when the string is sufficiently long.
+
+ ALSO: this processing is disabled when partial matching is requested.
+ */
- if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
+ if (req_byte >= 0 &&
+ end_subject - start_match < REQ_BYTE_MAX &&
+ !match_block.partial)
{
register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
@@ -8245,7 +9128,8 @@ do
start_match++;
#ifdef SUPPORT_UTF8
if (match_block.utf8)
- while((*start_match & 0xc0) == 0x80) start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
#endif
continue;
}
@@ -8296,9 +9180,16 @@ if (using_temporary_offsets)
(pcre_free)(match_block.offset_vector);
}
-DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
-
-return PCRE_ERROR_NOMATCH;
+if (match_block.partial && match_block.hitend)
+ {
+ DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
+ return PCRE_ERROR_PARTIAL;
+ }
+else
+ {
+ DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
+ return PCRE_ERROR_NOMATCH;
+ }
}
/* End of pcre.c */