summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-03 07:58:30 +0000
committerzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-03 07:58:30 +0000
commitad1a6e3a96050e61e6e2127d3a00ded77a1eb80c (patch)
tree4987dde0d6b3aee6401d3e89ce6ddc3acef49df3
parentc9fa02b130f1a9da7b17b915e75248f19afb6d7a (diff)
downloadpcre-ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c.tar.gz
renaming utf8 to utf, JIT compiler update, disallowing invalid utf chars
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@781 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--Makefile.am1
-rw-r--r--pcre16_ord2utf16.c95
-rw-r--r--pcre16_utf16_utils.c2
-rw-r--r--pcre16_valid_utf16.c4
-rw-r--r--pcre_compile.c293
-rw-r--r--pcre_dfa_exec.c46
-rw-r--r--pcre_exec.c130
-rw-r--r--pcre_internal.h26
-rw-r--r--pcre_jit_compile.c99
-rw-r--r--pcre_newline.c16
-rw-r--r--pcre_ord2utf8.c18
-rw-r--r--pcre_study.c61
-rw-r--r--pcre_valid_utf8.c2
-rw-r--r--pcreposix.c1
-rw-r--r--sljit/sljitConfigInternal.h4
-rw-r--r--sljit/sljitExecAllocator.c4
-rw-r--r--sljit/sljitLir.h11
-rw-r--r--sljit/sljitNativeARM_Thumb2.c1
-rw-r--r--sljit/sljitNativeARM_v5.c1
-rw-r--r--sljit/sljitNativeMIPS_common.c1
-rw-r--r--sljit/sljitNativePPC_common.c1
-rw-r--r--sljit/sljitNativeX86_common.c15
-rw-r--r--testdata/testinput106
-rw-r--r--testdata/testinput514
-rw-r--r--testdata/testoutput1031
-rw-r--r--testdata/testoutput547
26 files changed, 512 insertions, 418 deletions
diff --git a/Makefile.am b/Makefile.am
index 7d5de86..39cf574 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -214,6 +214,7 @@ libpcre16_la_SOURCES = \
pcre16_exec.c \
pcre16_jit_compile.c \
pcre16_newline.c \
+ pcre16_ord2utf16.c \
pcre16_string_utils.c \
pcre16_study.c \
pcre16_tables.c \
diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c
new file mode 100644
index 0000000..421c3a3
--- /dev/null
+++ b/pcre16_ord2utf16.c
@@ -0,0 +1,95 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This file contains a private PCRE function that converts an ordinal
+character value into a UTF-16 string. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+*       Convert character value to UTF-16        *
+*************************************************/
+
+/* This function takes an integer value in the range 0 - 0x10ffff
+and encodes it as a UTF-16 character in 1 to 2 pcre_uchars.
+
+A value that cannot be encoded - a surrogate code point (0xd800 - 0xdfff) or
+anything above 0x10ffff - is replaced by 0xfffe before encoding. This should
+never happen in practice.
+
+Arguments:
+  cvalue     the character value
+  buffer     pointer to buffer for result - at least 2 pcre_uchars long
+
+Returns:     number of pcre_uchars placed in the buffer (1 or 2), or 0 if
+             SUPPORT_UTF16 is not defined
+*/
+
+int
+PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer)
+{
+#ifdef SUPPORT_UTF16
+
+/* Checking invalid cvalue character, encoded as invalid UTF-16 character.
+Should never happen in practice. The surrogate test has to compare the whole
+value: masking with 0xf800 alone ignores the bits above bit 15, so valid
+supplementary characters such as 0x1d800 would be mistaken for surrogates. */
+if ((cvalue >= 0xd800 && cvalue <= 0xdfff) || cvalue >= 0x110000)
+  cvalue = 0xfffe;
+
+/* A value up to 0xffff is stored directly in a single pcre_uchar. */
+if (cvalue <= 0xffff)
+  {
+  *buffer = (pcre_uchar)cvalue;
+  return 1;
+  }
+
+/* Anything larger becomes a surrogate pair: subtract 0x10000, then put the
+top 10 bits in the high surrogate and the low 10 bits in the low surrogate. */
+cvalue -= 0x10000;
+*buffer++ = (pcre_uchar)(0xd800 | (cvalue >> 10));
+*buffer = (pcre_uchar)(0xdc00 | (cvalue & 0x3ff));
+return 2;
+
+#else
+
+(void)(cvalue);  /* Keep compiler happy; this function won't ever be */
+(void)(buffer);  /* called when SUPPORT_UTF16 is not defined. */
+return 0;
+
+#endif
+}
+
+/* End of pcre16_ord2utf16.c */
diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c
index cd82e26..5ff3953 100644
--- a/pcre16_utf16_utils.c
+++ b/pcre16_utf16_utils.c
@@ -57,7 +57,7 @@ any Byte Order Marks (BOMS). Returns with the remainig length. */
BOOL same_bo = TRUE;
PCRE_SPTR16 end = input + length;
/* The c variable must be unsigned. */
-register uschar c;
+register pcre_uchar c;
while (input < end)
{
diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c
index b64519e..c7c7507 100644
--- a/pcre16_valid_utf16.c
+++ b/pcre16_valid_utf16.c
@@ -78,11 +78,11 @@ Returns: = 0 if the string is a valid UTF-16 string
*/
int
-PRIV(valid_utf16)(PCRE_PUCHAR string, int length, int *erroroffset)
+PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
{
#ifdef SUPPORT_UTF16
register PCRE_PUCHAR p;
-register uschar c;
+register pcre_uchar c;
if (length < 0)
{
diff --git a/pcre_compile.c b/pcre_compile.c
index 0bdd0fd..da4ce22 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -470,6 +470,7 @@ static const char error_texts[] =
"\\k is not followed by a braced, angle-bracketed, or quoted name\0"
/* 70 */
"internal error: unknown opcode in find_fixedlength()\0"
+ "Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -538,7 +539,7 @@ static const pcre_uint8 digitab[] =
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
-static const pcre_unit8 digitab[] =
+static const pcre_uint8 digitab[] =
{
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
@@ -706,9 +707,11 @@ static int
check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
int options, BOOL isclass)
{
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
const pcre_uchar *ptr = *ptrptr + 1;
-int c, i;
+pcre_int32 c;
+int i;
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
ptr--; /* Set pointer back to the last byte */
@@ -940,12 +943,12 @@ else
c -= CHAR_0;
while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
c = c * 8 + *(++ptr) - CHAR_0;
- if (!utf8 && c > 0xff) *errorcodeptr = ERR51;
+ if (!utf && c > 0xff) *errorcodeptr = ERR51;
break;
/* \x is complicated. \x{ddd} is a character number which can be greater
- than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
- treated as a data character. */
+ than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
+ If not, { is treated as a data character. */
case CHAR_x:
if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
@@ -974,14 +977,12 @@ else
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
{
const pcre_uchar *pt = ptr + 2;
- int count = 0;
c = 0;
while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
{
register int cc = *pt++;
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
- count++;
#ifndef EBCDIC /* ASCII/UTF-8 coding */
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
@@ -990,17 +991,25 @@ else
if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
- }
- if (*pt == CHAR_RIGHT_CURLY_BRACKET)
- {
#ifdef COMPILE_PCRE8
- if (c < 0 || count > (utf8? 8:2)) *errorcodeptr = ERR34;
+ if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
#else
#ifdef COMPILE_PCRE16
- if (c < 0 || count > (utf8? 8:4)) *errorcodeptr = ERR34;
+ if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
#endif
#endif
+ }
+
+ if (c < 0)
+ {
+ while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
+ *errorcodeptr = ERR34;
+ }
+
+ if (*pt == CHAR_RIGHT_CURLY_BRACKET)
+ {
+ if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71;
ptr = pt;
break;
}
@@ -1281,7 +1290,7 @@ Arguments:
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
- utf8 TRUE if we are in UTF-8 mode
+ utf TRUE if we are in UTF-8 / UTF-16 mode
count pointer to the current capturing subpattern number (updated)
Returns: the number of the named subpattern, or -1 if not found
@@ -1289,7 +1298,7 @@ Returns: the number of the named subpattern, or -1 if not found
static int
find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
- BOOL xmode, BOOL utf8, int *count)
+ BOOL xmode, BOOL utf, int *count)
{
pcre_uchar *ptr = *ptrptr;
int start_count = *count;
@@ -1458,7 +1467,7 @@ for (; ptr < cd->end_pattern; ptr++)
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
ptr++;
#ifdef SUPPORT_UTF8
- if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+ if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
#endif
}
if (*ptr == 0) goto FAIL_EXIT;
@@ -1469,7 +1478,7 @@ for (; ptr < cd->end_pattern; ptr++)
if (*ptr == CHAR_LEFT_PARENTHESIS)
{
- int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
+ int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
if (rc > 0) return rc;
if (*ptr == 0) goto FAIL_EXIT;
}
@@ -1515,14 +1524,14 @@ Arguments:
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
- utf8 TRUE if we are in UTF-8 mode
+ utf TRUE if we are in UTF-8 / UTF-16 mode
Returns: the number of the found subpattern, or -1 if not found
*/
static int
find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
- BOOL utf8)
+ BOOL utf)
{
pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
int count = 0;
@@ -1535,7 +1544,7 @@ matching closing parens. That is why we have to have a loop. */
for (;;)
{
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
+ rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
if (rc > 0 || *ptr++ == 0) break;
}
@@ -1618,7 +1627,7 @@ and doing the check at the end; a flag specifies which mode we are running in.
Arguments:
code points to the start of the pattern (the bracket)
- utf8 TRUE in UTF-8 mode
+ utf TRUE in UTF-8 / UTF-16 mode
atend TRUE if called when the pattern is complete
cd the "compile data" structure
@@ -1630,7 +1639,7 @@ Returns: the fixed length,
*/
static int
-find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd)
+find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
{
int length = -1;
@@ -1657,7 +1666,7 @@ for (;;)
case OP_ONCE:
case OP_ONCE_NC:
case OP_COND:
- d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf8, atend, cd);
+ d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -1691,7 +1700,7 @@ for (;;)
cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
if (cc > cs && cc < ce) return -1; /* Recursion */
- d = find_fixedlength(cs + 2, utf8, atend, cd);
+ d = find_fixedlength(cs + 2, utf, atend, cd);
if (d < 0) return d;
branchlength += d;
cc += 1 + LINK_SIZE;
@@ -1751,7 +1760,7 @@ for (;;)
branchlength++;
cc += 2;
#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
break;
@@ -1765,7 +1774,7 @@ for (;;)
branchlength += GET2(cc,1);
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
break;
@@ -1945,14 +1954,14 @@ length.
Arguments:
code points to start of expression
- utf8 TRUE in UTF-8 mode
+ utf TRUE in UTF-8 / UTF-16 mode
number the required bracket number or negative to find a lookbehind
Returns: pointer to the opcode for the bracket, or NULL if not found
*/
const pcre_uchar *
-PRIV(find_bracket)(const pcre_uchar *code, BOOL utf8, int number)
+PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
{
for (;;)
{
@@ -2033,7 +2042,7 @@ for (;;)
arrange to skip the extra bytes. */
#ifdef SUPPORT_UTF8
- if (utf8) switch(c)
+ if (utf) switch(c)
{
case OP_CHAR:
case OP_CHARI:
@@ -2067,7 +2076,7 @@ for (;;)
break;
}
#else
- (void)(utf8); /* Keep compiler happy by referencing function argument */
+ (void)(utf); /* Keep compiler happy by referencing function argument */
#endif
}
}
@@ -2084,13 +2093,13 @@ instance of OP_RECURSE.
Arguments:
code points to start of expression
- utf8 TRUE in UTF-8 mode
+ utf TRUE in UTF-8 / UTF-16 mode
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
*/
static const pcre_uchar *
-find_recurse(const pcre_uchar *code, BOOL utf8)
+find_recurse(const pcre_uchar *code, BOOL utf)
{
for (;;)
{
@@ -2153,7 +2162,7 @@ for (;;)
to arrange to skip the extra bytes. */
#ifdef SUPPORT_UTF8
- if (utf8) switch(c)
+ if (utf) switch(c)
{
case OP_CHAR:
case OP_CHARI:
@@ -2187,7 +2196,7 @@ for (;;)
break;
}
#else
- (void)(utf8); /* Keep compiler happy by referencing function argument */
+ (void)(utf); /* Keep compiler happy by referencing function argument */
#endif
}
}
@@ -2210,7 +2219,7 @@ bracket whose current branch will already have been scanned.
Arguments:
code points to start of search
endcode points to where to stop
- utf8 TRUE if in UTF8 mode
+ utf TRUE if in UTF-8 / UTF-16 mode
cd contains pointers to tables etc.
Returns: TRUE if what is matched could be empty
@@ -2218,7 +2227,7 @@ Returns: TRUE if what is matched could be empty
static BOOL
could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
- BOOL utf8, compile_data *cd)
+ BOOL utf, compile_data *cd)
{
register int c;
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
@@ -2266,7 +2275,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
do
{
- if (could_be_empty_branch(scode, endcode, utf8, cd))
+ if (could_be_empty_branch(scode, endcode, utf, cd))
{
empty_branch = TRUE;
break;
@@ -2322,7 +2331,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
empty_branch = FALSE;
do
{
- if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
+ if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
empty_branch = TRUE;
code += GET(code, 1);
}
@@ -2456,7 +2465,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
case OP_MINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
- if (utf8 && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
+ if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
break;
case OP_UPTO:
@@ -2465,7 +2474,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
case OP_MINUPTOI:
case OP_POSUPTO:
case OP_POSUPTOI:
- if (utf8 && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
+ if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
break;
#endif
@@ -2509,7 +2518,7 @@ Arguments:
code points to start of the recursion
endcode points to where to stop (current RECURSE item)
bcptr points to the chain of current (unclosed) branch starts
- utf8 TRUE if in UTF-8 mode
+ utf TRUE if in UTF-8 / UTF-16 mode
cd pointers to tables etc
Returns: TRUE if what is matched could be empty
@@ -2517,11 +2526,11 @@ Returns: TRUE if what is matched could be empty
static BOOL
could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
- branch_chain *bcptr, BOOL utf8, compile_data *cd)
+ branch_chain *bcptr, BOOL utf, compile_data *cd)
{
while (bcptr != NULL && bcptr->current_branch >= code)
{
- if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
+ if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
return FALSE;
bcptr = bcptr->outer;
}
@@ -2656,7 +2665,7 @@ value in the reference (which is a group number).
Arguments:
group points to the start of the group
adjust the amount by which the group is to be moved
- utf8 TRUE in UTF-8 mode
+ utf TRUE in UTF-8 / UTF-16 mode
cd contains pointers to tables etc.
save_hwm the hwm forward reference pointer at the start of the group
@@ -2664,12 +2673,12 @@ Returns: nothing
*/
static void
-adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd,
+adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
pcre_uchar *save_hwm)
{
pcre_uchar *ptr = group;
-while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL)
+while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
{
int offset;
pcre_uchar *hc;
@@ -2875,7 +2884,7 @@ sense to automatically possessify the repeated item.
Arguments:
previous pointer to the repeated opcode
- utf8 TRUE in UTF-8 mode
+ utf TRUE in UTF-8 / UTF-16 mode
ptr next character in pattern
options options bits
cd contains pointers to tables etc.
@@ -2884,7 +2893,7 @@ Returns: TRUE if possessifying is wanted
*/
static BOOL
-check_auto_possessive(const pcre_uchar *previous, BOOL utf8,
+check_auto_possessive(const pcre_uchar *previous, BOOL utf,
const pcre_uchar *ptr, int options, compile_data *cd)
{
int c, next;
@@ -2905,7 +2914,7 @@ if ((options & PCRE_EXTENDED) != 0)
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
ptr++;
#ifdef SUPPORT_UTF8
- if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+ if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
#endif
}
}
@@ -2927,7 +2936,7 @@ if (*ptr == CHAR_BACKSLASH)
else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
{
#ifdef SUPPORT_UTF8
- if (utf8) { GETCHARINC(next, ptr); } else
+ if (utf) { GETCHARINC(next, ptr); } else
#endif
next = *ptr++;
}
@@ -2949,7 +2958,7 @@ if ((options & PCRE_EXTENDED) != 0)
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
ptr++;
#ifdef SUPPORT_UTF8
- if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+ if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
#endif
}
}
@@ -2988,7 +2997,7 @@ if (next >= 0) switch(op_code)
#endif
if (c == next) return FALSE;
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
unsigned int othercase;
if (next < 128) othercase = cd->fcc[next]; else
@@ -3013,7 +3022,7 @@ if (next >= 0) switch(op_code)
case OP_NOTI:
if ((c = *previous) == next) return TRUE;
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
unsigned int othercase;
if (next < 128) othercase = cd->fcc[next]; else
@@ -3348,10 +3357,11 @@ must not do this for other options (e.g. PCRE_EXTENDED) because they may change
dynamically as we process the pattern. */
#ifdef SUPPORT_UTF8
-BOOL utf8 = (options & PCRE_UTF8) != 0;
-pcre_uint8 utf8_char[6];
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
+pcre_uchar utf_chars[6];
#else
-BOOL utf8 = FALSE;
+BOOL utf = FALSE;
#endif
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
@@ -3459,8 +3469,8 @@ for (;; ptr++)
}
*lengthptr += (int)(code - last_code);
- DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
- c));
+ DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
+ (int)(code - last_code), c, c));
/* If "previous" is set and it is not at the start of the work space, move
it back to there, in order to avoid filling up the work space. Otherwise,
@@ -3547,7 +3557,7 @@ for (;; ptr++)
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
ptr++;
#ifdef SUPPORT_UTF8
- if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+ if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
#endif
}
if (*ptr != 0) continue;
@@ -3727,7 +3737,7 @@ for (;; ptr++)
const pcre_uchar *oldptr;
#ifdef SUPPORT_UTF8
- if (utf8 && c > 127)
+ if (utf && c > 127)
{ /* Braces are required because the */
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
}
@@ -3945,22 +3955,22 @@ for (;; ptr++)
SETBIT(classbits, 0x20); /* SPACE */
SETBIT(classbits, 0xa0); /* NSBP */
#ifdef SUPPORT_UTF
- if (utf8)
+ if (utf)
{
xclass = TRUE;
*class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
*class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x200A, class_uchardata);
*class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
*class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
*class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
}
#endif
continue;
@@ -3980,30 +3990,30 @@ for (;; ptr++)
}
#ifdef SUPPORT_UTF
- if (utf8)
+ if (utf)
{
xclass = TRUE;
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x167f, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x200B, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
}
#endif
continue;
@@ -4015,12 +4025,12 @@ for (;; ptr++)
SETBIT(classbits, 0x0d); /* CR */
SETBIT(classbits, 0x85); /* NEL */
#ifdef SUPPORT_UTF
- if (utf8)
+ if (utf)
{
xclass = TRUE;
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
}
#endif
continue;
@@ -4043,15 +4053,15 @@ for (;; ptr++)
}
#ifdef SUPPORT_UTF
- if (utf8)
+ if (utf)
{
xclass = TRUE;
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
}
#endif
continue;
@@ -4139,7 +4149,7 @@ for (;; ptr++)
}
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{ /* Braces are required because the */
GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
}
@@ -4189,7 +4199,7 @@ for (;; ptr++)
available. */
#ifdef SUPPORT_UTF
- if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+ if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
#endif
#ifndef COMPILE_PCRE8
if (d > 255)
@@ -4234,9 +4244,9 @@ for (;; ptr++)
else
{
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf8)(occ, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
}
- class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
}
}
#endif /* SUPPORT_UCP */
@@ -4246,8 +4256,8 @@ for (;; ptr++)
*class_uchardata++ = XCL_RANGE;
#ifdef SUPPORT_UTF
- class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
- class_uchardata += PRIV(ord2utf8)(d, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(c, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(d, class_uchardata);
#else
*class_uchardata++ = c;
*class_uchardata++ = d;
@@ -4304,7 +4314,7 @@ for (;; ptr++)
/* Handle a character that cannot go in the bit map */
#ifdef SUPPORT_UTF
- if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+ if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
#endif
#ifndef COMPILE_PCRE8
if (c > 255)
@@ -4314,7 +4324,7 @@ for (;; ptr++)
xclass = TRUE;
*class_uchardata++ = XCL_SINGLE;
#ifdef SUPPORT_UTF
- class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(c, class_uchardata);
#else
*class_uchardata++ = c;
#endif
@@ -4326,7 +4336,7 @@ for (;; ptr++)
if ((othercase = UCD_OTHERCASE(c)) != c)
{
*class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
}
}
#endif /* SUPPORT_UCP */
@@ -4384,11 +4394,9 @@ for (;; ptr++)
#ifdef SUPPORT_UTF
if (class_charcount == 1 && !xclass &&
- (!utf8 || !negate_class || class_lastchar < 128))
-#elif defined COMPILE_PCRE8
- if (class_charcount == 1)
+ (!utf || !negate_class || class_lastchar < 128))
#else
- if (class_charcount == 1 && !xclass)
+ if (class_charcount == 1)
#endif
{
zeroreqchar = reqchar;
@@ -4408,8 +4416,8 @@ for (;; ptr++)
then we can handle this with the normal one-character code. */
#ifdef SUPPORT_UTF8
- if (utf8 && class_lastchar > 127)
- mclength = PRIV(ord2utf8)(class_lastchar, mcbuffer);
+ if (utf && class_lastchar > 127)
+ mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);
else
#endif
{
@@ -4599,12 +4607,12 @@ for (;; ptr++)
length rather than a small character. */
#ifdef SUPPORT_UTF8
- if (utf8 && (code[-1] & 0x80) != 0)
+ if (utf && (code[-1] & 0x80) != 0)
{
pcre_uchar *lastchar = code - 1;
while((*lastchar & 0xc0) == 0x80) lastchar--;
c = code - lastchar; /* Length of UTF-8 character */
- memcpy(utf8_char, lastchar, c); /* Save the char */
+ memcpy(utf_chars, lastchar, c); /* Save the char */
c |= 0x80; /* Flag c as a length */
}
else
@@ -4625,7 +4633,7 @@ for (;; ptr++)
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
@@ -4646,7 +4654,7 @@ for (;; ptr++)
c = previous[1];
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
@@ -4670,7 +4678,7 @@ for (;; ptr++)
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
@@ -4755,9 +4763,9 @@ for (;; ptr++)
if (repeat_max < 0)
{
#ifdef SUPPORT_UTF8
- if (utf8 && c >= 128)
+ if (utf && c >= 128)
{
- memcpy(code, utf8_char, c & 7);
+ memcpy(code, utf_chars, c & 7);
code += c & 7;
}
else
@@ -4780,9 +4788,9 @@ for (;; ptr++)
else if (repeat_max != repeat_min)
{
#ifdef SUPPORT_UTF8
- if (utf8 && c >= 128)
+ if (utf && c >= 128)
{
- memcpy(code, utf8_char, c & 7);
+ memcpy(code, utf_chars, c & 7);
code += c & 7;
}
else
@@ -4810,9 +4818,9 @@ for (;; ptr++)
/* The character or character type itself comes last in all cases. */
#ifdef SUPPORT_UTF8
- if (utf8 && c >= 128)
+ if (utf && c >= 128)
{
- memcpy(code, utf8_char, c & 7);
+ memcpy(code, utf_chars, c & 7);
code += c & 7;
}
else
@@ -4939,7 +4947,7 @@ for (;; ptr++)
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
{
*code = OP_END;
- adjust_recurse(previous, 1, utf8, cd, save_hwm);
+ adjust_recurse(previous, 1, utf, cd, save_hwm);
memmove(previous + 1, previous, IN_UCHARS(len));
code++;
if (repeat_max == 0)
@@ -4963,7 +4971,7 @@ for (;; ptr++)
{
int offset;
*code = OP_END;
- adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
+ adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
code += 2 + LINK_SIZE;
*previous++ = OP_BRAZERO + repeat_type;
@@ -5165,7 +5173,7 @@ for (;; ptr++)
pcre_uchar *scode = bracode;
do
{
- if (could_be_empty_branch(scode, ketcode, utf8, cd))
+ if (could_be_empty_branch(scode, ketcode, utf, cd))
{
*bracode += OP_SBRA - OP_BRA;
break;
@@ -5188,7 +5196,7 @@ for (;; ptr++)
{
int nlen = (int)(code - bracode);
*code = OP_END;
- adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);
+ adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
code += 1 + LINK_SIZE;
nlen += 1 + LINK_SIZE;
@@ -5266,7 +5274,7 @@ for (;; ptr++)
{
tempcode += PRIV(OP_lengths)[*tempcode];
#ifdef SUPPORT_UTF8
- if (utf8 && tempcode[-1] >= 0xc0)
+ if (utf && tempcode[-1] >= 0xc0)
tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f];
#endif
}
@@ -5304,7 +5312,7 @@ for (;; ptr++)
default:
*code = OP_END;
- adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
+ adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
code += 1 + LINK_SIZE;
len += 1 + LINK_SIZE;
@@ -5613,7 +5621,7 @@ for (;; ptr++)
/* Search the pattern for a forward reference */
else if ((i = find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0, utf8)) > 0)
+ (options & PCRE_EXTENDED) != 0, utf)) > 0)
{
PUT2(code, 2+LINK_SIZE, i);
code[1+LINK_SIZE]++;
@@ -5958,7 +5966,7 @@ for (;; ptr++)
temp = cd->end_pattern;
cd->end_pattern = ptr;
recno = find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0, utf8);
+ (options & PCRE_EXTENDED) != 0, utf);
cd->end_pattern = temp;
if (recno < 0) recno = 0; /* Forward ref; set dummy number */
}
@@ -5985,7 +5993,7 @@ for (;; ptr++)
}
else if ((recno = /* Forward back reference */
find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
+ (options & PCRE_EXTENDED) != 0, utf)) <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
@@ -6089,14 +6097,14 @@ for (;; ptr++)
{
*code = OP_END;
if (recno != 0)
- called = PRIV(find_bracket)(cd->start_code, utf8, recno);
+ called = PRIV(find_bracket)(cd->start_code, utf, recno);
/* Forward reference */
if (called == NULL)
{
if (find_parens(cd, NULL, recno,
- (options & PCRE_EXTENDED) != 0, utf8) < 0)
+ (options & PCRE_EXTENDED) != 0, utf) < 0)
{
*errorcodeptr = ERR15;
goto FAILED;
@@ -6120,7 +6128,7 @@ for (;; ptr++)
conditional subpatterns will be picked up then. */
else if (GET(called, 1) == 0 && cond_depth <= 0 &&
- could_be_empty(called, code, bcptr, utf8, cd))
+ could_be_empty(called, code, bcptr, utf, cd))
{
*errorcodeptr = ERR40;
goto FAILED;
@@ -6618,7 +6626,7 @@ for (;; ptr++)
{
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
- *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
+ *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
}
}
continue;
@@ -6629,8 +6637,8 @@ for (;; ptr++)
handle it as a data character. */
#ifdef SUPPORT_UTF8
- if (utf8 && c > 127)
- mclength = PRIV(ord2utf8)(c, mcbuffer);
+ if (utf && c > 127)
+ mclength = PRIV(ord2utf)(c, mcbuffer);
else
#endif
@@ -6652,7 +6660,7 @@ for (;; ptr++)
mcbuffer[0] = c;
#ifdef SUPPORT_UTF8
- if (utf8 && c >= 0xc0)
+ if (utf && c >= 0xc0)
{
while ((ptr[1] & 0xc0) == 0x80)
mcbuffer[mclength++] = *(++ptr);
@@ -7360,7 +7368,7 @@ pcre_int32 firstchar, reqchar;
int newline;
int errorcode = 0;
int skipatstart = 0;
-BOOL utf8;
+BOOL utf;
size_t size;
pcre_uchar *code;
const pcre_uchar *codestart;
@@ -7458,22 +7466,23 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
else break;
}
-utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (options & PCRE_UTF8) != 0;
/* Can't support UTF8 unless PCRE has been compiled to include the code. The
-return of an error code from PRIV(valid_utf8)() is a new feature, introduced in
+return of an error code from PRIV(valid_utf)() is a new feature, introduced in
release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
not used here. */
#ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
- (errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
+ (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
{
errorcode = ERR44;
goto PCRE_EARLY_ERROR_RETURN2;
}
#else
-if (utf8)
+if (utf)
{
errorcode = ERR32;
goto PCRE_EARLY_ERROR_RETURN;
@@ -7688,7 +7697,7 @@ while (errorcode == 0 && cd->hwm > cworkspace)
cd->hwm -= LINK_SIZE;
offset = GET(cd->hwm, 0);
recno = GET(codestart, offset);
- groupptr = PRIV(find_bracket)(codestart, utf8, recno);
+ groupptr = PRIV(find_bracket)(codestart, utf, recno);
if (groupptr == NULL) errorcode = ERR53;
else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
}
@@ -7715,9 +7724,9 @@ if (cd->check_lookbehind)
of zero, but that is a pathological case, and it does no harm.) When we find
one, we temporarily terminate the branch it is in while we scan it. */
- for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf8, -1);
+ for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
cc != NULL;
- cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf8, -1))
+ cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
{
if (GET(cc, 1) == 0)
{
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 8fed9b3..8247f46 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -414,9 +414,9 @@ const pcre_uchar *end_subject = md->end_subject;
const pcre_uchar *start_code = md->start_code;
#ifdef SUPPORT_UTF8
-BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
+BOOL utf = (md->poptions & PCRE_UTF8) != 0;
#else
-BOOL utf8 = FALSE;
+BOOL utf = FALSE;
#endif
rlevel++;
@@ -474,7 +474,7 @@ if (*first_op == OP_REVERSE)
#ifdef SUPPORT_UTF8
/* In character mode we have to step back character by character */
- if (utf8)
+ if (utf)
{
for (gone_back = 0; gone_back < max_back; gone_back++)
{
@@ -606,7 +606,7 @@ for (;;)
{
clen = 1; /* Number of bytes in the character */
#ifdef SUPPORT_UTF8
- if (utf8) { GETCHARLEN(c, ptr, clen); } else
+ if (utf) { GETCHARLEN(c, ptr, clen); } else
#endif /* SUPPORT_UTF8 */
c = *ptr;
}
@@ -695,7 +695,7 @@ for (;;)
{
dlen = 1;
#ifdef SUPPORT_UTF8
- if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
+ if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
#endif /* SUPPORT_UTF8 */
d = code[coptable[codevalue]];
if (codevalue >= OP_TYPESTAR)
@@ -960,7 +960,7 @@ for (;;)
const pcre_uchar *temp = ptr - 1;
if (temp < md->start_used_ptr) md->start_used_ptr = temp;
#ifdef SUPPORT_UTF8
- if (utf8) BACKCHAR(temp);
+ if (utf) BACKCHAR(temp);
#endif
GETCHARTEST(d, temp);
#ifdef SUPPORT_UCP
@@ -1986,7 +1986,7 @@ for (;;)
if (clen == 0) break;
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{
@@ -2007,8 +2007,7 @@ for (;;)
}
else
#endif /* SUPPORT_UTF8 */
-
- /* Non-UTF-8 mode */
+ /* Not UTF mode */
{
if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
}
@@ -2211,7 +2210,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
@@ -2258,7 +2257,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
@@ -2303,7 +2302,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
@@ -2340,7 +2339,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
@@ -2384,7 +2383,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
+ if (utf && d >= 128)
{
#ifdef SUPPORT_UCP
otherd = UCD_OTHERCASE(d);
@@ -3005,7 +3004,7 @@ pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
real_pcre *re = (real_pcre *)argument_re;
dfa_match_data match_block;
dfa_match_data *md = &match_block;
-BOOL utf8, anchored, startline, firstline;
+BOOL utf, anchored, startline, firstline;
const pcre_uchar *current_subject, *end_subject;
const pcre_uint8 *lcc;
@@ -3073,9 +3072,10 @@ end_subject = (const unsigned char *)subject + length;
req_char_ptr = current_subject - 1;
#ifdef SUPPORT_UTF8
-utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (re->options & PCRE_UTF8) != 0;
#else
-utf8 = FALSE;
+utf = FALSE;
#endif
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
@@ -3147,10 +3147,10 @@ else
back the character offset. */
#ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
{
int erroroffset;
- int errorcode = PRIV(valid_utf8)((pcre_uchar *)subject, length, &erroroffset);
+ int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
if (errorcode != 0)
{
if (offsetcount >= 2)
@@ -3235,7 +3235,7 @@ for (;;)
{
PCRE_PUCHAR t = current_subject;
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
while (t < md->end_subject && !IS_NEWLINE(t))
{
@@ -3278,7 +3278,7 @@ for (;;)
if (current_subject > md->start_subject + start_offset)
{
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
while (current_subject < end_subject &&
!WAS_NEWLINE(current_subject))
@@ -3317,7 +3317,7 @@ for (;;)
{
current_subject++;
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
while(current_subject < end_subject &&
(*current_subject & 0xc0) == 0x80) current_subject++;
#endif
@@ -3426,7 +3426,7 @@ for (;;)
if (firstline && IS_NEWLINE(current_subject)) break;
current_subject++;
- if (utf8)
+ if (utf)
{
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
current_subject++;
diff --git a/pcre_exec.c b/pcre_exec.c
index 778a301..db013e6 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -183,7 +183,7 @@ if (caseless)
{
#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UCP
- if (md->utf8)
+ if (md->utf)
{
/* Match characters up to the end of the reference. NOTE: the number of
bytes matched may differ, because there are some characters whose upper and
@@ -385,7 +385,7 @@ typedef struct heapframe {
int Xprop_value;
int Xprop_fail_result;
int Xoclength;
- pcre_uint8 Xocchars[8];
+ pcre_uchar Xocchars[6];
#endif
int Xcodelink;
@@ -450,7 +450,7 @@ the subject. */
/* Performance note: It might be tempting to extract commonly used fields from
-the md structure (e.g. utf8, end_subject) into individual variables to improve
+the md structure (e.g. utf, end_subject) into individual variables to improve
performance. Tests using gcc on a SPARC disproved this; in the first case, it
made performance worse.
@@ -485,7 +485,7 @@ so they can be ordinary variables in all cases. Mark some of them with
register int rrc; /* Returns from recursive calls */
register int i; /* Used for loops not involving calls to RMATCH() */
register unsigned int c; /* Character values not kept over RMATCH() calls */
-register BOOL utf8; /* Local copy of UTF-8 flag for speed */
+register BOOL utf; /* Local copy of UTF flag for speed */
BOOL minimize, possessive; /* Quantifier options */
BOOL caseless;
@@ -606,7 +606,7 @@ int prop_type;
int prop_value;
int prop_fail_result;
int oclength;
-pcre_uint8 occhars[8];
+pcre_uchar occhars[6];
#endif
int codelink;
@@ -660,9 +660,9 @@ complicated macro. It has to be used in one particular way. This shouldn't,
however, impact performance when true recursion is being used. */
#ifdef SUPPORT_UTF8
-utf8 = md->utf8; /* Local copy of the flag */
+utf = md->utf; /* Local copy of the flag */
#else
-utf8 = FALSE;
+utf = FALSE;
#endif
/* First check that we haven't called match() too many times, or that we
@@ -1597,7 +1597,7 @@ for (;;)
case OP_REVERSE:
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
i = GET(ecode, 1);
while (i-- > 0)
@@ -2070,7 +2070,7 @@ for (;;)
partial matching. */
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
/* Get status of previous character */
@@ -2189,7 +2189,7 @@ for (;;)
MRRETURN(MATCH_NOMATCH);
}
eptr++;
- if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
ecode++;
break;
@@ -2546,7 +2546,7 @@ for (;;)
while (eptr < md->end_subject)
{
int len = 1;
- if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+ if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
if (UCD_CATEGORY(c) != ucp_M) break;
eptr += len;
}
@@ -2744,8 +2744,7 @@ for (;;)
/* First, ensure the minimum number of matches are present. */
#ifdef SUPPORT_UTF
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
for (i = 1; i <= min; i++)
{
@@ -2765,7 +2764,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (i = 1; i <= min; i++)
{
@@ -2797,8 +2796,7 @@ for (;;)
if (minimize)
{
#ifdef SUPPORT_UTF
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
for (fi = min;; fi++)
{
@@ -2821,7 +2819,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (fi = min;; fi++)
{
@@ -2854,8 +2852,7 @@ for (;;)
pp = eptr;
#ifdef SUPPORT_UTF
- /* UTF mode */
- if (utf8)
+ if (utf)
{
for (i = min; i < max; i++)
{
@@ -3024,7 +3021,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
#ifdef SUPPORT_UTF
- if (utf8) BACKCHAR(eptr);
+ if (utf) BACKCHAR(eptr);
#endif
}
MRRETURN(MATCH_NOMATCH);
@@ -3038,7 +3035,7 @@ for (;;)
case OP_CHAR:
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
length = 1;
ecode++;
@@ -3052,8 +3049,7 @@ for (;;)
}
else
#endif
-
- /* Non-UTF-8 mode */
+ /* Not UTF mode */
{
if (md->end_subject - eptr < 1)
{
@@ -3069,7 +3065,7 @@ for (;;)
case OP_CHARI:
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
length = 1;
ecode++;
@@ -3112,7 +3108,7 @@ for (;;)
else
#endif /* SUPPORT_UTF8 */
- /* Non-UTF-8 mode */
+ /* Not UTF mode */
{
if (md->end_subject - eptr < 1)
{
@@ -3193,7 +3189,7 @@ for (;;)
REPEATCHAR:
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
length = 1;
charptr = ecode;
@@ -3209,7 +3205,7 @@ for (;;)
unsigned int othercase;
if (op >= OP_STARI && /* Caseless */
(othercase = UCD_OTHERCASE(fc)) != fc)
- oclength = PRIV(ord2utf8)(othercase, occhars);
+ oclength = PRIV(ord2utf)(othercase, occhars);
else oclength = 0;
#endif /* SUPPORT_UCP */
@@ -3220,7 +3216,7 @@ for (;;)
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
- memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+ memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
#endif /* SUPPORT_UCP */
else
{
@@ -3243,7 +3239,7 @@ for (;;)
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
- memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+ memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
#endif /* SUPPORT_UCP */
else
{
@@ -3264,7 +3260,7 @@ for (;;)
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
- memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+ memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
#endif /* SUPPORT_UCP */
else
{
@@ -3548,8 +3544,7 @@ for (;;)
fc = md->lcc[fc];
#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
register unsigned int d;
for (i = 1; i <= min; i++)
@@ -3566,8 +3561,7 @@ for (;;)
}
else
#endif
-
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (i = 1; i <= min; i++)
{
@@ -3585,8 +3579,7 @@ for (;;)
if (minimize)
{
#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
register unsigned int d;
for (fi = min;; fi++)
@@ -3606,7 +3599,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (fi = min;; fi++)
{
@@ -3631,8 +3624,7 @@ for (;;)
pp = eptr;
#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
register unsigned int d;
for (i = min; i < max; i++)
@@ -3659,7 +3651,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (i = min; i < max; i++)
{
@@ -3690,8 +3682,7 @@ for (;;)
else
{
#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
register unsigned int d;
for (i = 1; i <= min; i++)
@@ -3707,7 +3698,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (i = 1; i <= min; i++)
{
@@ -3725,8 +3716,7 @@ for (;;)
if (minimize)
{
#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
register unsigned int d;
for (fi = min;; fi++)
@@ -3745,7 +3735,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (fi = min;; fi++)
{
@@ -3770,8 +3760,7 @@ for (;;)
pp = eptr;
#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
register unsigned int d;
for (i = min; i < max; i++)
@@ -3797,7 +3786,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (i = min; i < max; i++)
{
@@ -4073,7 +4062,7 @@ for (;;)
while (eptr < md->end_subject)
{
int len = 1;
- if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+ if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
if (UCD_CATEGORY(c) != ucp_M) break;
eptr += len;
}
@@ -4086,7 +4075,7 @@ for (;;)
/* Handle all other cases when the coding is UTF-8 */
#ifdef SUPPORT_UTF8
- if (utf8) switch(ctype)
+ if (utf) switch(ctype)
{
case OP_ANY:
for (i = 1; i <= min; i++)
@@ -4794,7 +4783,7 @@ for (;;)
while (eptr < md->end_subject)
{
int len = 1;
- if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+ if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
if (UCD_CATEGORY(c) != ucp_M) break;
eptr += len;
}
@@ -4804,8 +4793,7 @@ for (;;)
#endif /* SUPPORT_UCP */
#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
+ if (utf)
{
for (fi = min;; fi++)
{
@@ -4968,7 +4956,7 @@ for (;;)
}
else
#endif
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
for (fi = min;; fi++)
{
@@ -5267,7 +5255,7 @@ for (;;)
RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
- if (utf8) BACKCHAR(eptr);
+ if (utf) BACKCHAR(eptr);
}
}
@@ -5284,13 +5272,13 @@ for (;;)
SCHECK_PARTIAL();
break;
}
- if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+ if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
if (UCD_CATEGORY(c) == ucp_M) break;
eptr += len;
while (eptr < md->end_subject)
{
len = 1;
- if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+ if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
if (UCD_CATEGORY(c) != ucp_M) break;
eptr += len;
}
@@ -5307,7 +5295,7 @@ for (;;)
if (eptr-- == pp) break; /* Stop if tried at original pos */
for (;;) /* Move back over one extended */
{
- if (!utf8) c = *eptr; else
+ if (!utf) c = *eptr; else
{
BACKCHAR(eptr);
GETCHAR(c, eptr);
@@ -5322,9 +5310,7 @@ for (;;)
#endif /* SUPPORT_UCP */
#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
-
- if (utf8)
+ if (utf)
{
switch(ctype)
{
@@ -5607,8 +5593,7 @@ for (;;)
}
else
#endif /* SUPPORT_UTF8 */
-
- /* Not UTF-8 mode */
+ /* Not UTF mode */
{
switch(ctype)
{
@@ -5969,7 +5954,7 @@ BOOL using_temporary_offsets = FALSE;
BOOL anchored;
BOOL startline;
BOOL firstline;
-BOOL utf8;
+BOOL utf;
BOOL has_first_char = FALSE;
BOOL has_req_char = FALSE;
pcre_uchar first_char = 0;
@@ -6005,7 +5990,8 @@ follows immediately afterwards. Other values in the md block are used only
during "normal" pcre_exec() processing, not when the JIT support is in use,
so they are set up later. */
-utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = md->utf = (re->options & PCRE_UTF8) != 0;
md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
@@ -6013,10 +5999,10 @@ md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
code for an invalid string if a results vector is available. */
#ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
{
int erroroffset;
- int errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)subject, length, &erroroffset);
+ int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
if (errorcode != 0)
{
if (offsetcount >= 2)
@@ -6306,7 +6292,7 @@ for(;;)
{
PCRE_PUCHAR t = start_match;
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
while (t < md->end_subject && !IS_NEWLINE(t))
{
@@ -6348,7 +6334,7 @@ for(;;)
if (start_match > md->start_subject + start_offset)
{
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
while (start_match < end_subject && !WAS_NEWLINE(start_match))
{
@@ -6389,7 +6375,7 @@ for(;;)
{
start_match++;
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
start_match++;
#endif
@@ -6521,7 +6507,7 @@ for(;;)
case MATCH_THEN:
new_start_match = start_match + 1;
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
new_start_match++;
#endif
diff --git a/pcre_internal.h b/pcre_internal.h
index 9dbaf05..637565b 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -292,8 +292,8 @@ start/end of string field names are. */
#define IS_NEWLINE(p) \
((NLBLOCK->nltype != NLTYPE_FIXED)? \
((p) < NLBLOCK->PSEND && \
- PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\
- utf8)) \
+ PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
+ &(NLBLOCK->nllen), utf)) \
: \
((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
(p)[0] == NLBLOCK->nl[0] && \
@@ -307,7 +307,7 @@ start/end of string field names are. */
((NLBLOCK->nltype != NLTYPE_FIXED)? \
((p) > NLBLOCK->PSSTART && \
PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
- &(NLBLOCK->nllen), utf8)) \
+ &(NLBLOCK->nllen), utf)) \
: \
((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
(p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
@@ -581,7 +581,7 @@ pointer. */
#define GETCHARTEST(c, eptr) \
c = *eptr; \
- if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
+ if (utf && c >= 0xc0) GETUTF8(c, eptr);
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. */
@@ -629,7 +629,7 @@ This is called when we don't know if we are in UTF-8 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
- if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
+ if (utf && c >= 0xc0) GETUTF8INC(c, eptr);
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer, incrementing the length. */
@@ -681,7 +681,7 @@ do not know if we are in UTF-8 mode. */
#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
- if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
+ if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
@@ -1366,7 +1366,7 @@ value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
-used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In
+used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In
non-DOTALL mode, "." behaves like \N.
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
@@ -1784,7 +1784,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
- ERR70, ERRCOUNT };
+ ERR70, ERR71, ERRCOUNT };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
@@ -1934,7 +1934,7 @@ typedef struct match_data {
BOOL offset_overflow; /* Set if too many extractions */
BOOL notbol; /* NOTBOL flag */
BOOL noteol; /* NOTEOL flag */
- BOOL utf8; /* UTF8 flag */
+ BOOL utf; /* UTF-8 / UTF-16 flag */
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
BOOL use_ucp; /* PCRE_UCP flag */
BOOL endonly; /* Dollar not before final \n */
@@ -2103,14 +2103,10 @@ extern unsigned int PRIV(strlen_uc)(const pcre_uchar *str);
extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int);
extern BOOL PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR,
int *, BOOL);
-extern int PRIV(ord2utf8)(int, pcre_uint8 *);
+extern int PRIV(ord2utf)(pcre_uint32, pcre_uchar *);
extern real_pcre *PRIV(try_flipped)(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
-#ifndef COMPILE_PCRE16
-extern int PRIV(valid_utf8)(PCRE_PUCHAR, int, int *);
-#else
-extern int PRIV(valid_utf16)(PCRE_PUCHAR, int, int *);
-#endif
+extern int PRIV(valid_utf)(PCRE_PUCHAR, int, int *);
extern BOOL PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR,
int *, BOOL);
extern BOOL PRIV(xclass)(int, const pcre_uchar *);
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 03833e0..16611f1 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -298,7 +298,7 @@ typedef struct compiler_common {
jump_list *caselesscmp;
BOOL jscript_compat;
#ifdef SUPPORT_UTF8
- BOOL utf8;
+ BOOL utf;
#ifdef SUPPORT_UCP
BOOL useucp;
#endif
@@ -497,7 +497,7 @@ switch(*cc)
case OP_ANYBYTE:
#ifdef SUPPORT_UTF8
- if (common->utf8) return NULL;
+ if (common->utf) return NULL;
#endif
return cc + 1;
@@ -544,7 +544,7 @@ switch(*cc)
case OP_NOTPOSQUERYI:
cc += 2;
#ifdef SUPPORT_UTF8
- if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
return cc;
@@ -566,7 +566,7 @@ switch(*cc)
case OP_NOTPOSUPTOI:
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UTF8
- if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
return cc;
@@ -1264,7 +1264,7 @@ static SLJIT_INLINE BOOL char_has_othercase(compiler_common *common, pcre_uchar*
unsigned int c;
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
GETCHAR(c, cc);
if (c > 127)
@@ -1286,7 +1286,7 @@ static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigne
{
/* Returns with the othercase. */
#ifdef SUPPORT_UTF8
-if (common->utf8 && c > 127)
+if (common->utf && c > 127)
{
#ifdef SUPPORT_UCP
return UCD_OTHERCASE(c);
@@ -1307,7 +1307,7 @@ int n;
#endif
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
GETCHAR(c, cc);
if (c <= 127)
@@ -1343,7 +1343,7 @@ if (!ispowerof2(bit))
return 0;
#ifdef SUPPORT_UTF8
-if (common->utf8 && c > 127)
+if (common->utf && c > 127)
{
n = PRIV(utf8_table4)[*cc & 0x3f];
while ((bit & 0x3f) == 0)
@@ -1374,7 +1374,7 @@ struct sljit_jump *jump;
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
@@ -1395,7 +1395,7 @@ struct sljit_jump *jump;
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
@@ -1414,7 +1414,7 @@ struct sljit_jump *jump;
#endif
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -1439,7 +1439,7 @@ DEFINE_COMPILER;
#ifdef SUPPORT_UTF8
struct sljit_label *label;
-if (common->utf8)
+if (common->utf)
{
label = LABEL();
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -1697,7 +1697,7 @@ mainloop = LABEL();
/* Increasing the STR_PTR here requires one less jump in the most common case. */
#ifdef SUPPORT_UTF8
-if (common->utf8) readuchar = TRUE;
+if (common->utf) readuchar = TRUE;
#endif
if (newlinecheck) readuchar = TRUE;
@@ -1709,7 +1709,7 @@ if (newlinecheck)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -1771,7 +1771,7 @@ else
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -1882,7 +1882,7 @@ start = LABEL();
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF
-if (common->utf8)
+if (common->utf)
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
#ifndef COMPILE_PCRE8
@@ -1896,12 +1896,12 @@ OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
found = JUMP(SLJIT_C_NOT_ZERO);
#ifdef SUPPORT_UTF
-if (common->utf8)
+if (common->utf)
OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -2051,7 +2051,7 @@ else
#ifdef SUPPORT_UTF8
/* Here LOCALS1 has already been zeroed. */
jump = NULL;
- if (common->utf8)
+ if (common->utf)
jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
#endif
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
@@ -2090,7 +2090,7 @@ else
#ifdef SUPPORT_UTF8
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
jump = NULL;
- if (common->utf8)
+ if (common->utf)
jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
#endif
OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes);
@@ -2119,7 +2119,7 @@ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
@@ -2143,7 +2143,7 @@ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20);
COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xa0);
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x1680);
@@ -2177,7 +2177,7 @@ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
@@ -2289,7 +2289,7 @@ DEFINE_COMPILER;
unsigned int othercasebit = 0;
pcre_uchar *othercasechar = NULL;
#ifdef SUPPORT_UTF8
-int utf8length;
+int utflength;
#endif
if (caseless && char_has_othercase(common, cc))
@@ -2336,9 +2336,9 @@ if (context->sourcereg == -1)
}
#ifdef SUPPORT_UTF8
-utf8length = 1;
-if (common->utf8 && *cc >= 0xc0)
- utf8length += PRIV(utf8_table4)[*cc & 0x3f];
+utflength = 1;
+if (common->utf && *cc >= 0xc0)
+ utflength += PRIV(utf8_table4)[*cc & 0x3f];
do
{
@@ -2432,9 +2432,9 @@ do
cc++;
#ifdef SUPPORT_UTF8
- utf8length--;
+ utflength--;
}
-while (utf8length > 0);
+while (utflength > 0);
#endif
return cc;
@@ -2480,7 +2480,7 @@ unsigned int typeoffset;
int invertcmp, numberofcmps;
unsigned int charoffset;
-/* Although SUPPORT_UTF8 must be defined, we are not necessary in utf8 mode. */
+/* Although SUPPORT_UTF must be defined, we are not necessarily in utf mode. */
check_input_end(common, fallbacks);
read_char(common);
@@ -2490,7 +2490,7 @@ if ((*cc++ & XCL_MAP) != 0)
#ifndef COMPILE_PCRE8
jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
#elif defined SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
#endif
@@ -2504,7 +2504,7 @@ if ((*cc++ & XCL_MAP) != 0)
#ifndef COMPILE_PCRE8
JUMPHERE(jump);
#elif defined SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
JUMPHERE(jump);
#endif
OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
@@ -2524,7 +2524,7 @@ while (*cc != XCL_END)
{
cc += 2;
#ifdef SUPPORT_UTF8
- if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
#ifdef SUPPORT_UCP
needschar = TRUE;
@@ -2534,11 +2534,11 @@ while (*cc != XCL_END)
{
cc += 2;
#ifdef SUPPORT_UTF8
- if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
cc++;
#ifdef SUPPORT_UTF8
- if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
#ifdef SUPPORT_UCP
needschar = TRUE;
@@ -2639,7 +2639,7 @@ while (*cc != XCL_END)
{
cc ++;
#ifdef SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
{
GETCHARINC(c, cc);
}
@@ -2670,7 +2670,7 @@ while (*cc != XCL_END)
{
cc ++;
#ifdef SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
{
GETCHARINC(c, cc);
}
@@ -2679,7 +2679,7 @@ while (*cc != XCL_END)
c = *cc++;
SET_CHAR_OFFSET(c);
#ifdef SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
{
GETCHARINC(c, cc);
}
@@ -2876,7 +2876,7 @@ switch(type)
case OP_ALLANY:
check_input_end(common, fallbacks);
#ifdef SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
{
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -3096,7 +3096,7 @@ switch(type)
case OP_CHARI:
length = 1;
#ifdef SUPPORT_UTF8
- if (common->utf8 && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+ if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
#endif
if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
{
@@ -3113,7 +3113,7 @@ switch(type)
check_input_end(common, fallbacks);
read_char(common);
#ifdef SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
{
GETCHAR(c, cc);
}
@@ -3130,7 +3130,7 @@ switch(type)
case OP_NOT:
case OP_NOTI:
#ifdef SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
{
length = 1;
if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
@@ -3196,7 +3196,7 @@ switch(type)
jump[0] = NULL;
#ifdef SUPPORT_UTF8
/* This check can only be skipped in pure 8 bit mode. */
- if (common->utf8)
+ if (common->utf)
#endif
{
jump[0] = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
@@ -3231,7 +3231,7 @@ switch(type)
OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
#ifdef SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
{
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, length);
label = LABEL();
@@ -3269,7 +3269,7 @@ do
{
size = 1;
#ifdef SUPPORT_UTF8
- if (common->utf8 && cc[1] >= 0xc0)
+ if (common->utf && cc[1] >= 0xc0)
size += PRIV(utf8_table4)[cc[1] & 0x3f];
#endif
}
@@ -3277,7 +3277,7 @@ do
{
size = 1;
#ifdef SUPPORT_UTF8
- if (common->utf8)
+ if (common->utf)
{
if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)
size = 0;
@@ -3381,7 +3381,7 @@ if (withchecks && !common->jscript_compat)
#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UCP
-if (common->utf8 && *cc == OP_REFI)
+if (common->utf && *cc == OP_REFI)
{
SLJIT_ASSERT(TMP1 == SLJIT_TEMPORARY_REG1 && STACK_TOP == SLJIT_TEMPORARY_REG2 && TMP2 == SLJIT_TEMPORARY_REG3);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), OVECTOR(offset + 1));
@@ -4787,7 +4787,7 @@ if (end != NULL)
{
*end = cc + 1;
#ifdef SUPPORT_UTF8
- if (common->utf8 && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
+ if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
#endif
}
return cc;
@@ -6254,7 +6254,8 @@ common->casefulcmp = NULL;
common->caselesscmp = NULL;
common->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
#ifdef SUPPORT_UTF8
-common->utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+common->utf = (re->options & PCRE_UTF8) != 0;
#ifdef SUPPORT_UCP
common->useucp = (re->options & PCRE_UCP) != 0;
#endif
diff --git a/pcre_newline.c b/pcre_newline.c
index 92b81d1..0c2ddcd 100644
--- a/pcre_newline.c
+++ b/pcre_newline.c
@@ -67,17 +67,17 @@ Arguments:
type the newline type
endptr pointer to the end of the string
lenptr where to return the length
- utf8 TRUE if in utf8 mode
+ utf TRUE if in utf mode
Returns: TRUE or FALSE
*/
BOOL
PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr,
- BOOL utf8)
+ BOOL utf)
{
int c;
-if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
+if (utf) { GETCHAR(c, ptr); } else c = *ptr;
if (type == NLTYPE_ANYCRLF) switch(c)
{
@@ -96,7 +96,7 @@ else switch(c)
case 0x000c: *lenptr = 1; return TRUE; /* FF */
case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
return TRUE; /* CR */
- case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
+ case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 3; return TRUE; /* PS */
default: return FALSE;
@@ -117,19 +117,19 @@ Arguments:
type the newline type
startptr pointer to the start of the string
lenptr where to return the length
- utf8 TRUE if in utf8 mode
+ utf TRUE if in utf mode
Returns: TRUE or FALSE
*/
BOOL
PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr,
- BOOL utf8)
+ BOOL utf)
{
int c;
ptr--;
#ifdef SUPPORT_UTF8
-if (utf8)
+if (utf)
{
BACKCHAR(ptr);
GETCHAR(c, ptr);
@@ -154,7 +154,7 @@ else switch(c)
case 0x000b: /* VT */
case 0x000c: /* FF */
case 0x000d: *lenptr = 1; return TRUE; /* CR */
- case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
+ case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 3; return TRUE; /* PS */
default: return FALSE;
diff --git a/pcre_ord2utf8.c b/pcre_ord2utf8.c
index 354adc0..b374987 100644
--- a/pcre_ord2utf8.c
+++ b/pcre_ord2utf8.c
@@ -52,21 +52,28 @@ character value into a UTF8 string. */
* Convert character value to UTF-8 *
*************************************************/
-/* This function takes an integer value in the range 0 - 0x7fffffff
-and encodes it as a UTF-8 character in 0 to 6 bytes.
+/* This function takes an integer value in the range 0 - 0x10ffff
+and encodes it as a UTF-8 character in 1 to 4 pcre_uchars.
Arguments:
cvalue the character value
- buffer pointer to buffer for result - at least 6 bytes long
+ buffer pointer to buffer for result - at least 6 pcre_uchars long
Returns: number of characters placed in the buffer
*/
int
-PRIV(ord2utf8)(int cvalue, pcre_uint8 *buffer)
+PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer)
{
#ifdef SUPPORT_UTF8
+
register int i, j;
+
+/* Replace an invalid cvalue (a surrogate code point or a value above 0x10ffff)
+with 0xfffe before encoding. This should never happen in practice. */
+if ((cvalue & 0xf800) == 0xd800 || cvalue >= 0x110000)
+ cvalue = 0xfffe;
+
for (i = 0; i < PRIV(utf8_table1_size); i++)
if (cvalue <= PRIV(utf8_table1)[i]) break;
buffer += i;
@@ -77,10 +84,13 @@ for (j = i; j > 0; j--)
}
*buffer = PRIV(utf8_table2)[i] | cvalue;
return i + 1;
+
#else
+
(void)(cvalue); /* Keep compiler happy; this function won't ever be */
(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
return 0;
+
#endif
}
diff --git a/pcre_study.c b/pcre_study.c
index 661627d..098980d 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -82,7 +82,8 @@ find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
int recurse_depth)
{
int length = -1;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
BOOL had_recurse = FALSE;
register int branchlength = 0;
register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
@@ -224,7 +225,7 @@ for (;;)
branchlength++;
cc += 2;
#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
break;
@@ -245,7 +246,7 @@ for (;;)
branchlength += GET2(cc,1);
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
break;
@@ -293,7 +294,7 @@ for (;;)
case OP_ANYBYTE:
#ifdef SUPPORT_UTF8
- if (utf8) return -1;
+ if (utf) return -1;
#endif
branchlength++;
cc++;
@@ -374,7 +375,7 @@ for (;;)
case OP_REFI:
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
{
- ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf8, GET2(cc, 1));
+ ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce)
@@ -486,7 +487,7 @@ for (;;)
cc += PRIV(OP_lengths)[op];
#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
break;
@@ -537,29 +538,29 @@ Arguments:
p points to the character
caseless the caseless flag
cd the block with char table pointers
- utf8 TRUE for UTF-8 mode
+ utf TRUE for UTF-8 / UTF-16 mode
Returns: pointer after the character
*/
static const pcre_uchar *
set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
- compile_data *cd, BOOL utf8)
+ compile_data *cd, BOOL utf)
{
unsigned int c = *p;
SET_BIT(c);
#ifdef SUPPORT_UTF8
-if (utf8 && c > 127)
+if (utf && c > 127)
{
GETCHARINC(c, p);
#ifdef SUPPORT_UCP
if (caseless)
{
- pcre_uint8 buff[8];
+ pcre_uchar buff[6];
c = UCD_OTHERCASE(c);
- (void)PRIV(ord2utf8)(c, buff);
+ (void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
#endif
@@ -607,8 +608,8 @@ for (c = 128; c < 256; c++)
{
if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
{
- pcre_uint8 buff[8];
- (void)PRIV(ord2utf8)(c, buff);
+ pcre_uchar buff[6];
+ (void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
}
@@ -663,7 +664,7 @@ function fails unless the result is SSB_DONE.
Arguments:
code points to an expression
start_bits points to a 32-byte table, initialized to 0
- utf8 TRUE if in UTF-8 mode
+ utf TRUE if in UTF-8 / UTF-16 mode
cd the block with char table pointers
Returns: SSB_FAIL => Failed to find any starting bytes
@@ -673,12 +674,12 @@ Returns: SSB_FAIL => Failed to find any starting bytes
*/
static int
-set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf8,
+set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
compile_data *cd)
{
register int c;
int yield = SSB_DONE;
-int table_limit = utf8? 16:32;
+int table_limit = utf? 16:32;
#if 0
/* ========================================================================= */
@@ -817,7 +818,7 @@ do
case OP_ONCE:
case OP_ONCE_NC:
case OP_ASSERT:
- rc = set_start_bits(tcode, start_bits, utf8, cd);
+ rc = set_start_bits(tcode, start_bits, utf, cd);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
if (rc == SSB_DONE) try_next = FALSE; else
{
@@ -864,7 +865,7 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
- rc = set_start_bits(++tcode, start_bits, utf8, cd);
+ rc = set_start_bits(++tcode, start_bits, utf, cd);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
/* =========================================================================
See the comment at the head of this function concerning the next line,
@@ -891,7 +892,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
break;
case OP_STARI:
@@ -900,7 +901,7 @@ do
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
- tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
break;
/* Single-char upto sets the bit and tries the next */
@@ -908,13 +909,13 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
break;
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
- tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
break;
/* At least one single char sets the bit and stops */
@@ -926,7 +927,7 @@ do
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
+ (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
try_next = FALSE;
break;
@@ -937,7 +938,7 @@ do
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
- (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
+ (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
try_next = FALSE;
break;
@@ -950,7 +951,7 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- if (utf8)
+ if (utf)
{
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
@@ -967,7 +968,7 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- if (utf8)
+ if (utf)
{
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
@@ -1057,7 +1058,7 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- if (utf8)
+ if (utf)
{
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
@@ -1073,7 +1074,7 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- if (utf8)
+ if (utf)
{
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
@@ -1126,7 +1127,7 @@ do
case OP_NCLASS:
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
@@ -1147,7 +1148,7 @@ do
characters in the range 128 - 255. */
#ifdef SUPPORT_UTF8
- if (utf8)
+ if (utf)
{
for (c = 0; c < 16; c++) start_bits[c] |= map[c];
for (c = 128; c < 256; c++)
diff --git a/pcre_valid_utf8.c b/pcre_valid_utf8.c
index 8d13014..bbab87f 100644
--- a/pcre_valid_utf8.c
+++ b/pcre_valid_utf8.c
@@ -103,7 +103,7 @@ Returns: = 0 if the string is a valid UTF-8 string
*/
int
-PRIV(valid_utf8)(PCRE_PUCHAR string, int length, int *erroroffset)
+PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
{
#ifdef SUPPORT_UTF8
register PCRE_PUCHAR p;
diff --git a/pcreposix.c b/pcreposix.c
index 648254b..2dc1561 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -155,6 +155,7 @@ static const int eint[] = {
REG_BADPAT, /* \k is not followed by a braced, angle-bracketed, or quoted name */
/* 70 */
REG_BADPAT, /* internal error: unknown opcode in find_fixedlength() */
+ REG_BADPAT, /* Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) */
};
/* Table of texts corresponding to POSIX error codes */
diff --git a/sljit/sljitConfigInternal.h b/sljit/sljitConfigInternal.h
index 3f771d8..ad0be19 100644
--- a/sljit/sljitConfigInternal.h
+++ b/sljit/sljitConfigInternal.h
@@ -354,8 +354,8 @@ typedef long int sljit_w;
#endif /* !SLJIT_UNALIGNED */
#if (defined SLJIT_EXECUTABLE_ALLOCATOR && SLJIT_EXECUTABLE_ALLOCATOR)
-static void* sljit_malloc_exec(sljit_uw size);
-static void sljit_free_exec(void* ptr);
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size);
+SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr);
#define SLJIT_MALLOC_EXEC(size) sljit_malloc_exec(size)
#define SLJIT_FREE_EXEC(ptr) sljit_free_exec(ptr)
#endif
diff --git a/sljit/sljitExecAllocator.c b/sljit/sljitExecAllocator.c
index bfe8eb1..cdea346 100644
--- a/sljit/sljitExecAllocator.c
+++ b/sljit/sljitExecAllocator.c
@@ -163,7 +163,7 @@ static SLJIT_INLINE void sljit_remove_free_block(struct free_block *free_block)
}
}
-static void* sljit_malloc_exec(sljit_uw size)
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
{
struct block_header *header;
struct block_header *next_header;
@@ -231,7 +231,7 @@ static void* sljit_malloc_exec(sljit_uw size)
return MEM_START(header);
}
-static void sljit_free_exec(void* ptr)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
{
struct block_header *header;
struct free_block* free_block;
diff --git a/sljit/sljitLir.h b/sljit/sljitLir.h
index 2a82968..54906bc 100644
--- a/sljit/sljitLir.h
+++ b/sljit/sljitLir.h
@@ -195,6 +195,8 @@ struct sljit_compiler {
int local_size;
/* Code size. */
sljit_uw size;
+ /* For statistical purposes. */
+ sljit_uw executable_size;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
int args;
@@ -291,6 +293,15 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_compiler_verbose(struct sljit_compiler *comp
SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler);
SLJIT_API_FUNC_ATTRIBUTE void sljit_free_code(void* code);
+/*
+ After the code generation we can retrieve the allocated executable memory size,
+ although this area may not be fully filled with instructions depending on some
+ optimizations. This function is useful only for statistical purposes.
+
+ Before a successful code generation, this function returns with 0.
+*/
+static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler *compiler) { return compiler->executable_size; }
+
/* Instruction generation. Returns with error code. */
/*
diff --git a/sljit/sljitNativeARM_Thumb2.c b/sljit/sljitNativeARM_Thumb2.c
index c476711..3764aeb 100644
--- a/sljit/sljitNativeARM_Thumb2.c
+++ b/sljit/sljitNativeARM_Thumb2.c
@@ -416,6 +416,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
SLJIT_CACHE_FLUSH(code, code_ptr);
compiler->error = SLJIT_ERR_COMPILED;
+ compiler->executable_size = compiler->size * sizeof(sljit_uh);
/* Set thumb mode flag. */
return (void*)((sljit_uw)code | 0x1);
}
diff --git a/sljit/sljitNativeARM_v5.c b/sljit/sljitNativeARM_v5.c
index 1b40afa..99584cf 100644
--- a/sljit/sljitNativeARM_v5.c
+++ b/sljit/sljitNativeARM_v5.c
@@ -788,6 +788,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
SLJIT_CACHE_FLUSH(code, code_ptr);
compiler->error = SLJIT_ERR_COMPILED;
+ compiler->executable_size = size * sizeof(sljit_uw);
return code;
}
diff --git a/sljit/sljitNativeMIPS_common.c b/sljit/sljitNativeMIPS_common.c
index c4fe152..7fcb6d6 100644
--- a/sljit/sljitNativeMIPS_common.c
+++ b/sljit/sljitNativeMIPS_common.c
@@ -397,6 +397,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
}
compiler->error = SLJIT_ERR_COMPILED;
+ compiler->executable_size = compiler->size * sizeof(sljit_ins);
#ifndef __GNUC__
SLJIT_CACHE_FLUSH(code, code_ptr);
#else
diff --git a/sljit/sljitNativePPC_common.c b/sljit/sljitNativePPC_common.c
index af14b75..28afd9e 100644
--- a/sljit/sljitNativePPC_common.c
+++ b/sljit/sljitNativePPC_common.c
@@ -354,6 +354,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
SLJIT_CACHE_FLUSH(code, code_ptr);
compiler->error = SLJIT_ERR_COMPILED;
+ compiler->executable_size = compiler->size * sizeof(sljit_ins);
#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
if (((sljit_w)code_ptr) & 0x4)
diff --git a/sljit/sljitNativeX86_common.c b/sljit/sljitNativeX86_common.c
index c6661bc..cc215a2 100644
--- a/sljit/sljitNativeX86_common.c
+++ b/sljit/sljitNativeX86_common.c
@@ -357,22 +357,22 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
while (jump) {
if (jump->flags & PATCH_MB) {
SLJIT_ASSERT((sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))) >= -128 && (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))) <= 127);
- *(sljit_ub*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_b));
+ *(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_b)));
} else if (jump->flags & PATCH_MW) {
if (jump->flags & JUMP_LABEL) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
- *(sljit_w*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_w));
+ *(sljit_w*)jump->addr = (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_w)));
#else
SLJIT_ASSERT((sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))) >= -0x80000000ll && (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))) <= 0x7fffffffll);
- *(sljit_hw*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_hw));
+ *(sljit_hw*)jump->addr = (sljit_hw)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw)));
#endif
}
else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
- *(sljit_w*)jump->addr = jump->u.target - (jump->addr + sizeof(sljit_w));
+ *(sljit_w*)jump->addr = (sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_w)));
#else
SLJIT_ASSERT((sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_hw))) >= -0x80000000ll && (sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_hw))) <= 0x7fffffffll);
- *(sljit_hw*)jump->addr = jump->u.target - (jump->addr + sizeof(sljit_hw));
+ *(sljit_hw*)jump->addr = (sljit_hw)(jump->u.target - (jump->addr + sizeof(sljit_hw)));
#endif
}
}
@@ -387,6 +387,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
/* Maybe we waste some space because of short jumps. */
SLJIT_ASSERT(code_ptr <= code + compiler->size);
compiler->error = SLJIT_ERR_COMPILED;
+ compiler->executable_size = compiler->size;
return (void*)code;
}
@@ -1360,7 +1361,7 @@ static int emit_mul(struct sljit_compiler *compiler,
code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
FAIL_IF(!code);
INC_CSIZE(4);
- *(sljit_hw*)code = src1w;
+ *(sljit_hw*)code = (sljit_hw)src1w;
}
else {
EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
@@ -1403,7 +1404,7 @@ static int emit_mul(struct sljit_compiler *compiler,
code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
FAIL_IF(!code);
INC_CSIZE(4);
- *(sljit_hw*)code = src2w;
+ *(sljit_hw*)code = (sljit_hw)src2w;
}
else {
EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
diff --git a/testdata/testinput10 b/testdata/testinput10
index 7210cc5..8e70c70 100644
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -52,11 +52,9 @@ is required for these tests. --/
/\x{100000}/8BM
-/\x{1000000}/8BM
+/\x{10ffff}/8BM
-/\x{4000000}/8BM
-
-/\x{7fffFFFF}/8BM
+/\x{110000}/8BM
/[\x{ff}]/8BM
diff --git a/testdata/testinput5 b/testdata/testinput5
index ca7eb54..9ba5b4b 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -9,11 +9,9 @@
/\x{100000}/8DZ
-/\x{1000000}/8DZ
+/\x{10ffff}/8DZ
-/\x{4000000}/8DZ
-
-/\x{7fffFFFF}/8DZ
+/\x{110000}/8DZ
/[\x{ff}]/8DZ
@@ -23,6 +21,14 @@
/\x{100000000}/8
+/\x{d800}/8
+
+/\x{dfff}/8
+
+/\x{d7ff}/8
+
+/\x{e000}/8
+
/^\x{100}a\x{1234}/8
\x{100}a\x{1234}bcd
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index f66a12a..47a2a97 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -317,32 +317,17 @@ Memory allocation (code space): 12
11 End
------------------------------------------------------------------
-/\x{1000000}/8BM
-Memory allocation (code space): 13
-------------------------------------------------------------------
- 0 9 Bra
- 3 \x{1000000}
- 9 9 Ket
- 12 End
-------------------------------------------------------------------
-
-/\x{4000000}/8BM
-Memory allocation (code space): 14
+/\x{10ffff}/8BM
+Memory allocation (code space): 12
------------------------------------------------------------------
- 0 10 Bra
- 3 \x{4000000}
- 10 10 Ket
- 13 End
+ 0 8 Bra
+ 3 \x{10ffff}
+ 8 8 Ket
+ 11 End
------------------------------------------------------------------
-/\x{7fffFFFF}/8BM
-Memory allocation (code space): 14
-------------------------------------------------------------------
- 0 10 Bra
- 3 \x{7fffffff}
- 10 10 Ket
- 13 End
-------------------------------------------------------------------
+/\x{110000}/8BM
+Failed: character value in \x{...} sequence is too large at offset 9
/[\x{ff}]/8BM
Memory allocation (code space): 10
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 1eaab47..b63934d 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -49,42 +49,21 @@ Options: utf8
First char = 244
Need char = 128
-/\x{1000000}/8DZ
+/\x{10ffff}/8DZ
------------------------------------------------------------------
Bra
- \x{1000000}
+ \x{10ffff}
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 0
Options: utf8
-First char = 249
-Need char = 128
-
-/\x{4000000}/8DZ
-------------------------------------------------------------------
- Bra
- \x{4000000}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 252
-Need char = 128
-
-/\x{7fffFFFF}/8DZ
-------------------------------------------------------------------
- Bra
- \x{7fffffff}
- Ket
- End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 253
+First char = 244
Need char = 191
+/\x{110000}/8DZ
+Failed: character value in \x{...} sequence is too large at offset 9
+
/[\x{ff}]/8DZ
------------------------------------------------------------------
Bra
@@ -115,6 +94,16 @@ Failed: character value in \x{...} sequence is too large at offset 11
/\x{100000000}/8
Failed: character value in \x{...} sequence is too large at offset 12
+/\x{d800}/8
+Failed: Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
+/\x{dfff}/8
+Failed: Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
+/\x{d7ff}/8
+
+/\x{e000}/8
+
/^\x{100}a\x{1234}/8
\x{100}a\x{1234}bcd
0: \x{100}a\x{1234}
@@ -1436,7 +1425,7 @@ No match
/[\H]/8BZ
------------------------------------------------------------------
Bra
- [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{7fffffff}]
+ [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
Ket
End
------------------------------------------------------------------
@@ -1444,7 +1433,7 @@ No match
/[\V]/8BZ
------------------------------------------------------------------
Bra
- [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{2029}-\x{7fffffff}]
+ [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{2029}-\x{10ffff}]
Ket
End
------------------------------------------------------------------