summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-11-28 20:39:30 +0000
committerzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-11-28 20:39:30 +0000
commit00cc776fe74e502bc0774ceca2bb3f11283e189a (patch)
tree1f19206b0bbf8f9f4d2af9a8a34d0198c8a70808
parent4d715f1b6035e095635067d977ad56948ff4e4c2 (diff)
downloadpcre-00cc776fe74e502bc0774ceca2bb3f11283e189a.tar.gz
Make character ranges 16 bit friendly
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@770 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--Makefile.am3
-rw-r--r--pcre16_xclass.c45
-rw-r--r--pcre_compile.c266
-rw-r--r--pcre_dfa_exec.c2
-rw-r--r--pcre_exec.c67
-rw-r--r--pcre_internal.h25
-rw-r--r--pcre_jit_compile.c46
-rw-r--r--pcre_printint.src14
-rw-r--r--pcre_study.c18
-rw-r--r--pcre_xclass.c7
10 files changed, 319 insertions, 174 deletions
diff --git a/Makefile.am b/Makefile.am
index 440d699..7d5de86 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -219,7 +219,8 @@ libpcre16_la_SOURCES = \
pcre16_tables.c \
pcre16_try_flipped.c \
pcre16_utf16_utils.c \
- pcre16_valid_utf16.c
+ pcre16_valid_utf16.c \
+ pcre16_xclass.c
## This file is generated as part of the building process, so don't distribute.
nodist_libpcre16_la_SOURCES = \
diff --git a/pcre16_xclass.c b/pcre16_xclass.c
new file mode 100644
index 0000000..acb5631
--- /dev/null
+++ b/pcre16_xclass.c
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_xclass.c"
+
+/* End of pcre16_xclass.c */
diff --git a/pcre_compile.c b/pcre_compile.c
index 1664506..46d881d 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1764,15 +1764,15 @@ for (;;)
/* Check a class for variable quantification */
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
case OP_XCLASS:
- cc += GET(cc, 1) - 33;
+ cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
/* Fall through */
#endif
case OP_CLASS:
case OP_NCLASS:
- cc += 33;
+ cc += PRIV(OP_lengths)[OP_CLASS];
switch (*cc)
{
@@ -2310,7 +2310,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
case OP_CLASS:
case OP_NCLASS:
- ccode = code + 33;
+ ccode = code + PRIV(OP_lengths)[OP_CLASS];
#ifdef SUPPORT_UTF8
CHECK_CLASS_REPEAT:
@@ -3299,22 +3299,27 @@ const pcre_uchar *nestptr = NULL;
pcre_uchar *previous = NULL;
pcre_uchar *previous_callout = NULL;
pcre_uchar *save_hwm = NULL;
-pcre_uchar classbits[32];
+pcre_uint8 classbits[32];
/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
must not do this for other options (e.g. PCRE_EXTENDED) because they may change
dynamically as we process the pattern. */
#ifdef SUPPORT_UTF8
-BOOL class_utf8;
BOOL utf8 = (options & PCRE_UTF8) != 0;
-pcre_uint8 *class_utf8data;
-pcre_uint8 *class_utf8data_base;
pcre_uint8 utf8_char[6];
#else
BOOL utf8 = FALSE;
#endif
+/* Helper variables for OP_XCLASS opcode (for characters > 255). */
+
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+BOOL xclass;
+pcre_uchar *class_uchardata;
+pcre_uchar *class_uchardata_base;
+#endif
+
#ifdef PCRE_DEBUG
if (lengthptr != NULL) DPRINTF((">> start branch\n"));
#endif
@@ -3620,8 +3625,7 @@ for (;; ptr++)
{
if (ptr[1] == CHAR_E)
ptr++;
- else if (STRNCMP_UC_C8(ptr + 1,
- STR_Q STR_BACKSLASH STR_E, 3) == 0)
+ else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
ptr += 3;
else
break;
@@ -3665,10 +3669,10 @@ for (;; ptr++)
memset(classbits, 0, 32 * sizeof(pcre_uint8));
-#ifdef SUPPORT_UTF8
- class_utf8 = FALSE; /* No chars >= 256 */
- class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
- class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ xclass = FALSE; /* No chars >= 256 */
+ class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
+ class_uchardata_base = class_uchardata; /* For resetting in pass 1 */
#endif
/* Process characters until ] is reached. By writing this as a "do" it
@@ -3684,18 +3688,19 @@ for (;; ptr++)
{ /* Braces are required because the */
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
}
+#endif
- /* In the pre-compile phase, accumulate the length of any UTF-8 extra
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ /* In the pre-compile phase, accumulate the length of any extra
data and reset the pointer. This is so that very large classes that
- contain a zillion UTF-8 characters no longer overwrite the work space
+ contain a zillion characters > 255 no longer overwrite the work space
(which is on the stack). */
if (lengthptr != NULL)
{
- *lengthptr += class_utf8data - class_utf8data_base;
- class_utf8data = class_utf8data_base;
+ *lengthptr += class_uchardata - class_uchardata_base;
+ class_uchardata = class_uchardata_base;
}
-
#endif
/* Inside \Q...\E everything is literal except \E */
@@ -3896,23 +3901,23 @@ for (;; ptr++)
SETBIT(classbits, 0x09); /* VT */
SETBIT(classbits, 0x20); /* SPACE */
SETBIT(classbits, 0xa0); /* NSBP */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf8)
{
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += PRIV(ord2utf8)(0x1680, class_utf8data);
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += PRIV(ord2utf8)(0x180e, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x2000, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x200A, class_utf8data);
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += PRIV(ord2utf8)(0x202f, class_utf8data);
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += PRIV(ord2utf8)(0x205f, class_utf8data);
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += PRIV(ord2utf8)(0x3000, class_utf8data);
+ xclass = TRUE;
+ *class_uchardata++ = XCL_SINGLE;
+ class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata);
+ *class_uchardata++ = XCL_SINGLE;
+ class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata);
+ *class_uchardata++ = XCL_SINGLE;
+ class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata);
+ *class_uchardata++ = XCL_SINGLE;
+ class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata);
+ *class_uchardata++ = XCL_SINGLE;
+ class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata);
}
#endif
continue;
@@ -3931,31 +3936,31 @@ for (;; ptr++)
classbits[c] |= x;
}
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf8)
{
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x0100, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x167f, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x1681, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x180d, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x180f, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x1fff, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x200B, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x202e, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x2030, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x205e, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x2060, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x2fff, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x3001, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x7fffffff, class_utf8data);
+ xclass = TRUE;
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x167f, class_uchardata);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
}
#endif
continue;
@@ -3966,13 +3971,13 @@ for (;; ptr++)
SETBIT(classbits, 0x0c); /* FF */
SETBIT(classbits, 0x0d); /* CR */
SETBIT(classbits, 0x85); /* NEL */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf8)
{
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x2028, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x2029, class_utf8data);
+ xclass = TRUE;
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
}
#endif
continue;
@@ -3994,16 +3999,16 @@ for (;; ptr++)
classbits[c] |= x;
}
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf8)
{
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x0100, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x2027, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(0x2029, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(0x7fffffff, class_utf8data);
+ xclass = TRUE;
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
}
#endif
continue;
@@ -4016,11 +4021,11 @@ for (;; ptr++)
int pdata;
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
if (ptype < 0) goto FAILED;
- class_utf8 = TRUE;
- *class_utf8data++ = ((-c == ESC_p) != negated)?
+ xclass = TRUE;
+ *class_uchardata++ = ((-c == ESC_p) != negated)?
XCL_PROP : XCL_NOTPROP;
- *class_utf8data++ = ptype;
- *class_utf8data++ = pdata;
+ *class_uchardata++ = ptype;
+ *class_uchardata++ = pdata;
class_charcount -= 2; /* Not a < 256 character */
continue;
}
@@ -4042,7 +4047,7 @@ for (;; ptr++)
}
/* Fall through if we have a single character (c >= 0). This may be
- greater than 256 in UTF-8 mode. */
+ greater than 256 in UTF or 16-bit mode. */
} /* End of backslash handling */
@@ -4140,10 +4145,15 @@ for (;; ptr++)
matching for characters > 127 is available only if UCP support is
available. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+#endif
+#ifndef COMPILE_PCRE8
+ if (d > 255)
+#endif
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
{
- class_utf8 = TRUE;
+ xclass = TRUE;
/* With UCP support, we can find the other case equivalents of
the relevant characters. There may be several ranges. Optimize how
@@ -4176,14 +4186,14 @@ for (;; ptr++)
if (occ == ocd)
{
- *class_utf8data++ = XCL_SINGLE;
+ *class_uchardata++ = XCL_SINGLE;
}
else
{
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(occ, class_utf8data);
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf8)(occ, class_uchardata);
}
- class_utf8data += PRIV(ord2utf8)(ocd, class_utf8data);
+ class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata);
}
}
#endif /* SUPPORT_UCP */
@@ -4191,30 +4201,38 @@ for (;; ptr++)
/* Now record the original range, possibly modified for UCP caseless
overlapping ranges. */
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += PRIV(ord2utf8)(c, class_utf8data);
- class_utf8data += PRIV(ord2utf8)(d, class_utf8data);
+ *class_uchardata++ = XCL_RANGE;
+#ifdef SUPPORT_UTF
+ class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
+ class_uchardata += PRIV(ord2utf8)(d, class_uchardata);
+#else
+ *class_uchardata++ = c;
+ *class_uchardata++ = d;
+#endif
/* With UCP support, we are done. Without UCP support, there is no
- caseless matching for UTF-8 characters > 127; we can use the bit map
- for the smaller ones. */
+ caseless matching for UTF characters > 127; we can use the bit map
+ for the smaller ones. As for 16 bit characters without UTF, we
+ can still use the bit map for the part of the range below 256. */
#ifdef SUPPORT_UCP
continue; /* With next character in the class */
#else
+#ifdef SUPPORT_UTF
if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
-
/* Adjust upper limit and fall through to set up the map */
-
d = 127;
-
+#else
+ if (c > 255) continue;
+ /* Adjust upper limit and fall through to set up the map */
+ d = 255;
+#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UCP */
}
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF || COMPILE_PCRE16 */
- /* We use the bit map for all cases when not in UTF-8 mode; else
- ranges that lie entirely within 0-127 when there is UCP support; else
- for partial ranges without UCP support. */
+ /* We use the bit map in 8 bit mode, or when the characters fall
+ partially or entirely within the [0-255] ([0-127] for UCP) ranges. */
class_charcount += d - c + 1;
class_lastchar = d;
@@ -4242,12 +4260,21 @@ for (;; ptr++)
/* Handle a character that cannot go in the bit map */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+#endif
+#ifndef COMPILE_PCRE8
+ if (c > 255)
+#endif
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
{
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += PRIV(ord2utf8)(c, class_utf8data);
+ xclass = TRUE;
+ *class_uchardata++ = XCL_SINGLE;
+#ifdef SUPPORT_UTF
+ class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
+#else
+ *class_uchardata++ = c;
+#endif
#ifdef SUPPORT_UCP
if ((options & PCRE_CASELESS) != 0)
@@ -4255,8 +4282,8 @@ for (;; ptr++)
unsigned int othercase;
if ((othercase = UCD_OTHERCASE(c)) != c)
{
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += PRIV(ord2utf8)(othercase, class_utf8data);
+ *class_uchardata++ = XCL_SINGLE;
+ class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata);
}
}
#endif /* SUPPORT_UCP */
@@ -4312,11 +4339,13 @@ for (;; ptr++)
char if this item is first, whatever repeat count may follow. In the case
of reqbyte, save the previous value for reinstating. */
-#ifdef SUPPORT_UTF8
- if (class_charcount == 1 && !class_utf8 &&
+#ifdef SUPPORT_UTF
+ if (class_charcount == 1 && !xclass &&
(!utf8 || !negate_class || class_lastchar < 128))
-#else
+#elif defined COMPILE_PCRE8
if (class_charcount == 1)
+#else
+ if (class_charcount == 1 && !xclass)
#endif
{
zeroreqbyte = reqbyte;
@@ -4364,13 +4393,18 @@ for (;; ptr++)
be listed) there are no characters < 256, we can omit the bitmap in the
actual compiled code. */
-#ifdef SUPPORT_UTF8
- if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
+#ifdef SUPPORT_UTF
+ if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
+#endif
+#ifndef COMPILE_PCRE8
+ if (xclass && !should_flip_negation)
+#endif
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
{
- *class_utf8data++ = XCL_END; /* Marks the end of extra data */
+ *class_uchardata++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
code += LINK_SIZE;
- *code = negate_class? XCL_NOT : 0;
+ *code = negate_class? XCL_NOT:0;
/* If the map is required, move up the extra data to make room for it;
otherwise just move the code pointer to the end of the extra data. */
@@ -4378,11 +4412,12 @@ for (;; ptr++)
if (class_charcount > 0)
{
*code++ |= XCL_MAP;
- memmove(code + 32, code, class_utf8data - code);
+ memmove(code + (32 / sizeof(pcre_uchar)), code,
+ IN_UCHARS(class_uchardata - code));
memcpy(code, classbits, 32);
- code = class_utf8data + 32;
+ code = class_uchardata + (32 / sizeof(pcre_uchar));
}
- else code = class_utf8data;
+ else code = class_uchardata;
/* Now fill in the complete length of the item */
@@ -4398,16 +4433,13 @@ for (;; ptr++)
negating it if necessary. */
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
- if (negate_class)
- {
- if (lengthptr == NULL) /* Save time in the pre-compile phase */
- for (c = 0; c < 32; c++) code[c] = ~classbits[c];
- }
- else
+ if (lengthptr == NULL) /* Save time in the pre-compile phase */
{
+ if (negate_class)
+ for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
memcpy(code, classbits, 32);
}
- code += 32;
+ code += 32 / sizeof(pcre_uchar);
break;
@@ -4761,7 +4793,7 @@ for (;; ptr++)
else if (*previous == OP_CLASS ||
*previous == OP_NCLASS ||
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8
*previous == OP_XCLASS ||
#endif
*previous == OP_REF ||
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index ea5b00c..0793897 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -2426,7 +2426,7 @@ for (;;)
if (codevalue != OP_XCLASS)
{
- ecode = code + 33;
+ ecode = code + 1 + (32 / sizeof(pcre_uchar));
if (clen > 0)
{
isinclass = (c > 255)? (codevalue == OP_NCLASS) :
diff --git a/pcre_exec.c b/pcre_exec.c
index 41a2482..e532513 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -2706,8 +2706,11 @@ for (;;)
case OP_NCLASS:
case OP_CLASS:
{
+ /* The data variable is saved across frames, so the byte map needs to
+ be stored there. */
+#define BYTE_MAP ((pcre_uint8 *)data)
data = ecode + 1; /* Save for matching */
- ecode += 33; /* Advance past the item */
+ ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
switch (*ecode)
{
@@ -2740,7 +2743,7 @@ for (;;)
/* First, ensure the minimum number of matches are present. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
/* UTF-8 mode */
if (utf8)
{
@@ -2757,9 +2760,7 @@ for (;;)
if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
}
else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
- }
+ if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
else
@@ -2774,7 +2775,14 @@ for (;;)
MRRETURN(MATCH_NOMATCH);
}
c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
+#ifndef COMPILE_PCRE8
+ if (c > 255)
+ {
+ if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
+ }
+ else
+#endif
+ if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
@@ -2788,7 +2796,7 @@ for (;;)
if (minimize)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
/* UTF-8 mode */
if (utf8)
{
@@ -2808,9 +2816,7 @@ for (;;)
if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
}
else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
- }
+ if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
else
@@ -2828,7 +2834,14 @@ for (;;)
MRRETURN(MATCH_NOMATCH);
}
c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
+#ifndef COMPILE_PCRE8
+ if (c > 255)
+ {
+ if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
+ }
+ else
+#endif
+ if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -2840,8 +2853,8 @@ for (;;)
{
pp = eptr;
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
+#ifdef SUPPORT_UTF
+ /* UTF mode */
if (utf8)
{
for (i = min; i < max; i++)
@@ -2858,9 +2871,7 @@ for (;;)
if (op == OP_CLASS) break;
}
else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- }
+ if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
eptr += len;
}
for (;;)
@@ -2873,7 +2884,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (i = min; i < max; i++)
{
@@ -2883,7 +2894,14 @@ for (;;)
break;
}
c = *eptr;
- if ((data[c/8] & (1 << (c&7))) == 0) break;
+#ifndef COMPILE_PCRE8
+ if (c > 255)
+ {
+ if (op == OP_CLASS) break;
+ }
+ else
+#endif
+ if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
eptr++;
}
while (eptr >= pp)
@@ -2896,6 +2914,7 @@ for (;;)
MRRETURN(MATCH_NOMATCH);
}
+#undef BYTE_MAP
}
/* Control never gets here */
@@ -2904,7 +2923,7 @@ for (;;)
when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8
mode, because Unicode properties are supported in non-UTF-8 mode. */
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
case OP_XCLASS:
{
data = ecode + 1 + LINK_SIZE; /* Save for matching */
@@ -2991,7 +3010,11 @@ for (;;)
SCHECK_PARTIAL();
break;
}
+#ifdef SUPPORT_UTF
GETCHARLENTEST(c, eptr, len);
+#else
+ c = *eptr;
+#endif
if (!PRIV(xclass)(c, data)) break;
eptr += len;
}
@@ -3000,7 +3023,9 @@ for (;;)
RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
+#ifdef SUPPORT_UTF
if (utf8) BACKCHAR(eptr);
+#endif
}
MRRETURN(MATCH_NOMATCH);
}
@@ -6353,7 +6378,11 @@ for(;;)
{
while (start_match < end_subject)
{
+#ifdef COMPILE_PCRE8
register unsigned int c = *start_match;
+#else
+ register unsigned int c = *start_match & 0xff;
+#endif
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{
start_match++;
diff --git a/pcre_internal.h b/pcre_internal.h
index 0228207..b9f8dd4 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -71,6 +71,21 @@ script prevents both being selected, but not everybody uses "configure". */
#define SUPPORT_UTF8 1
#endif
+/* If SUPPORT_UCP is defined, SUPPORT_UTF16 must also be defined. The
+"configure" script ensures this, but not everybody uses "configure". */
+
+#if defined SUPPORT_UCP && defined COMPILE_PCRE16 && !defined SUPPORT_UTF16
+#define SUPPORT_UTF16 1
+#endif
+
+/* This macro is defined if either UTF-8 or UTF-16 support or both are
+enabled. */
+
+#if defined SUPPORT_UTF8 || defined SUPPORT_UTF16
+/* Unicode Transformation Format is enabled. */
+#define SUPPORT_UTF 1
+#endif
+
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
inline, and there are *still* stupid compilers about that don't like indented
pre-processor statements, or at least there were when I first wrote this. After
@@ -1325,7 +1340,7 @@ only. */
#define PT_WORD 8 /* Word - L plus N plus underscore */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
-contain UTF-8 characters with values greater than 255. */
+contain characters with values greater than 255. */
#define XCL_NOT 0x01 /* Flag: this is a negative class */
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
@@ -1522,8 +1537,8 @@ enum {
OP_CLASS, /* 106 Match a character class, chars < 256 only */
OP_NCLASS, /* 107 Same, but the bitmap was created from a negative
class - the difference is relevant only when a
- UTF-8 character > 255 is encountered. */
- OP_XCLASS, /* 108 Extended class for handling UTF-8 chars within the
+ character > 255 is encountered. */
+ OP_XCLASS, /* 108 Extended class for handling > 255 chars within the
class. This does both positive and negative. */
OP_REF, /* 109 Match a back reference, casefully */
OP_REFI, /* 110 Match a back reference, caselessly */
@@ -1704,8 +1719,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
/* Character class & ref repeats */ \
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \
- 33, /* CLASS */ \
- 33, /* NCLASS */ \
+ 1+(32/sizeof(pcre_uchar)), /* CLASS */ \
+ 1+(32/sizeof(pcre_uchar)), /* NCLASS */ \
0, /* XCLASS - variable length */ \
1+IMM2_SIZE, /* REF */ \
1+IMM2_SIZE, /* REFI */ \
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 5fed4a1..7a2c41d 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -592,9 +592,9 @@ switch(*cc)
case OP_CLASS:
case OP_NCLASS:
- return cc + 33;
+ return cc + 1 + 32 / sizeof(pcre_uchar);
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
case OP_XCLASS:
return cc + GET(cc, 1);
#endif
@@ -1879,11 +1879,14 @@ if (firstline)
start = LABEL();
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#ifdef SUPPORT_UTF8
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+#ifdef SUPPORT_UTF
if (common->utf8)
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
+#ifndef COMPILE_PCRE8
+OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xff);
+#endif
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), start_bits);
@@ -1891,11 +1894,11 @@ OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
found = JUMP(SLJIT_C_NOT_ZERO);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf8)
OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
#endif
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef SUPPORT_UTF8
if (common->utf8)
{
@@ -2435,7 +2438,7 @@ while (utf8length > 0);
return cc;
}
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
#define SET_TYPE_OFFSET(value) \
if ((value) != typeoffset) \
@@ -2482,8 +2485,12 @@ read_char(common);
if ((*cc++ & XCL_MAP) != 0)
{
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
+#ifndef COMPILE_PCRE8
+ jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF8
if (common->utf8)
jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#endif
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -2492,13 +2499,17 @@ if ((*cc++ & XCL_MAP) != 0)
OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
add_jump(compiler, list, JUMP(SLJIT_C_NOT_ZERO));
+#ifndef COMPILE_PCRE8
+ JUMPHERE(jump);
+#elif defined SUPPORT_UTF8
if (common->utf8)
JUMPHERE(jump);
+#endif
OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
#ifdef SUPPORT_UCP
charsaved = TRUE;
#endif
- cc += 32;
+ cc += 32 / sizeof(pcre_uchar);
}
/* Scanning the necessary info. */
@@ -3179,9 +3190,12 @@ switch(type)
case OP_NCLASS:
check_input_end(common, fallbacks);
read_char(common);
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
jump[0] = NULL;
+#ifdef SUPPORT_UTF8
+ /* This check can only be skipped in pure 8 bit mode. */
if (common->utf8)
+#endif
{
jump[0] = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
if (type == OP_CLASS)
@@ -3197,13 +3211,13 @@ switch(type)
OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
add_jump(compiler, fallbacks, JUMP(SLJIT_C_ZERO));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
if (jump[0] != NULL)
JUMPHERE(jump[0]);
#endif
- return cc + 32;
+ return cc + 32 / sizeof(pcre_uchar);
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
case OP_XCLASS:
compile_xclass_hotpath(common, cc + LINK_SIZE, fallbacks);
return cc + GET(cc, 0) - 1;
@@ -4725,7 +4739,7 @@ else
SLJIT_ASSERT(*opcode >= OP_CLASS || *opcode <= OP_XCLASS);
*type = *opcode;
cc++;
- class_len = (*type < OP_XCLASS) ? 33 : GET(cc, 0);
+ class_len = (*type < OP_XCLASS) ? (1 + (32 / sizeof(pcre_uchar))) : GET(cc, 0);
*opcode = cc[class_len - 1];
if (*opcode >= OP_CRSTAR && *opcode <= OP_CRMINQUERY)
{
@@ -5133,13 +5147,13 @@ while (cc < ccend)
case OP_CLASS:
case OP_NCLASS:
- if (cc[33] >= OP_CRSTAR && cc[33] <= OP_CRMINRANGE)
+ if (cc[1 + (32 / sizeof(pcre_uchar))] >= OP_CRSTAR && cc[1 + (32 / sizeof(pcre_uchar))] <= OP_CRMINRANGE)
cc = compile_iterator_hotpath(common, cc, parent);
else
cc = compile_char1_hotpath(common, *cc, cc + 1, parent->top != NULL ? &parent->top->nextfallbacks : &parent->topfallbacks);
break;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
case OP_XCLASS:
if (*(cc + GET(cc, 1)) >= OP_CRSTAR && *(cc + GET(cc, 1)) <= OP_CRMINRANGE)
cc = compile_iterator_hotpath(common, cc, parent);
@@ -5994,7 +6008,9 @@ while (current)
case OP_TYPEPOSUPTO:
case OP_CLASS:
case OP_NCLASS:
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
case OP_XCLASS:
+#endif
compile_iterator_fallbackpath(common, current);
break;
diff --git a/pcre_printint.src b/pcre_printint.src
index a5670e5..5a9f15d 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -471,9 +471,9 @@ for(;;)
fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1], code[2]));
break;
- /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
- having this code always here, and it makes it less messy without all those
- #ifdefs. */
+ /* OP_XCLASS can only occur in UTF or PCRE16 modes. However, there's no
+ harm in having this code always here, and it makes it less messy without
+ all those #ifdefs. */
case OP_CLASS:
case OP_NCLASS:
@@ -481,6 +481,7 @@ for(;;)
{
int i, min, max;
BOOL printmap;
+ pcre_uint8 *map;
fprintf(f, " [");
@@ -501,13 +502,14 @@ for(;;)
if (printmap)
{
+ map = (pcre_uint8 *)ccode;
for (i = 0; i < 256; i++)
{
- if ((ccode[i/8] & (1 << (i&7))) != 0)
+ if ((map[i/8] & (1 << (i&7))) != 0)
{
int j;
for (j = i+1; j < 256; j++)
- if ((ccode[j/8] & (1 << (j&7))) == 0) break;
+ if ((map[j/8] & (1 << (j&7))) == 0) break;
if (i == '-' || i == ']') fprintf(f, "\\");
if (PRINTABLE(i)) fprintf(f, "%c", i);
else fprintf(f, "\\x%02x", i);
@@ -521,7 +523,7 @@ for(;;)
i = j;
}
}
- ccode += 32;
+ ccode += 32 / sizeof(pcre_uchar);
}
/* For an XCLASS there is always some additional data */
diff --git a/pcre_study.c b/pcre_study.c
index 5253c49..661627d 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -322,15 +322,15 @@ for (;;)
/* Check a class for variable quantification */
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8
case OP_XCLASS:
- cc += GET(cc, 1) - 33;
+ cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
/* Fall through */
#endif
case OP_CLASS:
case OP_NCLASS:
- cc += 33;
+ cc += PRIV(OP_lengths)[OP_CLASS];
switch (*cc)
{
@@ -789,7 +789,9 @@ do
case OP_SOM:
case OP_THEN:
case OP_THEN_ARG:
+#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8
case OP_XCLASS:
+#endif
return SSB_FAIL;
/* We can ignore word boundary tests. */
@@ -1134,7 +1136,9 @@ do
case OP_CLASS:
{
+ pcre_uint8 *map;
tcode++;
+ map = (pcre_uint8 *)tcode;
/* In UTF-8 mode, the bits in a bit map correspond to character
values, not to byte values. However, the bit map we are constructing is
@@ -1145,10 +1149,10 @@ do
#ifdef SUPPORT_UTF8
if (utf8)
{
- for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
+ for (c = 0; c < 16; c++) start_bits[c] |= map[c];
for (c = 128; c < 256; c++)
{
- if ((tcode[c/8] && (1 << (c&7))) != 0)
+ if ((map[c/8] && (1 << (c&7))) != 0)
{
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
@@ -1162,13 +1166,13 @@ do
else
#endif
{
- for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+ for (c = 0; c < 32; c++) start_bits[c] |= map[c];
}
/* Advance past the bit map, and act on what follows. For a zero
minimum repeat, continue; otherwise stop processing. */
- tcode += 32;
+ tcode += 32 / sizeof(pcre_uchar);
switch (*tcode)
{
case OP_CRSTAR:
diff --git a/pcre_xclass.c b/pcre_xclass.c
index 024d71d..cdb9d07 100644
--- a/pcre_xclass.c
+++ b/pcre_xclass.c
@@ -75,15 +75,16 @@ additional data. */
if (c < 256)
{
- if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
- return !negated; /* char found */
+ if ((*data & XCL_MAP) != 0 &&
+ (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
+ return !negated; /* char found */
}
/* First skip the bit map if present. Then match against the list of Unicode
properties or large chars or ranges that end with a large char. We won't ever
encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
-if ((*data++ & XCL_MAP) != 0) data += 32;
+if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);
while ((t = *data++) != XCL_END)
{