diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-03 07:58:30 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-03 07:58:30 +0000 |
commit | ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c (patch) | |
tree | 4987dde0d6b3aee6401d3e89ce6ddc3acef49df3 | |
parent | c9fa02b130f1a9da7b17b915e75248f19afb6d7a (diff) | |
download | pcre-ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c.tar.gz |
renaming utf8 to utf, JIT compiler update, disallowing invalid utf chars
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@781 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | Makefile.am | 1 | ||||
-rw-r--r-- | pcre16_ord2utf16.c | 95 | ||||
-rw-r--r-- | pcre16_utf16_utils.c | 2 | ||||
-rw-r--r-- | pcre16_valid_utf16.c | 4 | ||||
-rw-r--r-- | pcre_compile.c | 293 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 46 | ||||
-rw-r--r-- | pcre_exec.c | 130 | ||||
-rw-r--r-- | pcre_internal.h | 26 | ||||
-rw-r--r-- | pcre_jit_compile.c | 99 | ||||
-rw-r--r-- | pcre_newline.c | 16 | ||||
-rw-r--r-- | pcre_ord2utf8.c | 18 | ||||
-rw-r--r-- | pcre_study.c | 61 | ||||
-rw-r--r-- | pcre_valid_utf8.c | 2 | ||||
-rw-r--r-- | pcreposix.c | 1 | ||||
-rw-r--r-- | sljit/sljitConfigInternal.h | 4 | ||||
-rw-r--r-- | sljit/sljitExecAllocator.c | 4 | ||||
-rw-r--r-- | sljit/sljitLir.h | 11 | ||||
-rw-r--r-- | sljit/sljitNativeARM_Thumb2.c | 1 | ||||
-rw-r--r-- | sljit/sljitNativeARM_v5.c | 1 | ||||
-rw-r--r-- | sljit/sljitNativeMIPS_common.c | 1 | ||||
-rw-r--r-- | sljit/sljitNativePPC_common.c | 1 | ||||
-rw-r--r-- | sljit/sljitNativeX86_common.c | 15 | ||||
-rw-r--r-- | testdata/testinput10 | 6 | ||||
-rw-r--r-- | testdata/testinput5 | 14 | ||||
-rw-r--r-- | testdata/testoutput10 | 31 | ||||
-rw-r--r-- | testdata/testoutput5 | 47 |
26 files changed, 512 insertions, 418 deletions
diff --git a/Makefile.am b/Makefile.am index 7d5de86..39cf574 100644 --- a/Makefile.am +++ b/Makefile.am @@ -214,6 +214,7 @@ libpcre16_la_SOURCES = \ pcre16_exec.c \ pcre16_jit_compile.c \ pcre16_newline.c \ + pcre16_ord2utf16.c \ pcre16_string_utils.c \ pcre16_study.c \ pcre16_tables.c \ diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c new file mode 100644 index 0000000..421c3a3 --- /dev/null +++ b/pcre16_ord2utf16.c @@ -0,0 +1,95 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2008 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This file contains a private PCRE function that converts an ordinal +character value into a UTF-16 string. */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre_internal.h" + + +/************************************************* +* Convert character value to UTF-16 * +*************************************************/ + +/* This function takes an integer value in the range 0 - 0x10ffff +and encodes it as a UTF-16 character in 1 to 2 pcre_uchars. + +Arguments: + cvalue the character value + buffer pointer to buffer for result - at least 2 pcre_uchars long + +Returns: number of pcre_uchars placed in the buffer +*/ + +int +PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer) +{ +#ifdef SUPPORT_UTF16 + +/* Checking invalid cvalue character, encoded as invalid UTF-16 character. +Should never happen in practice. NOTE(review): the 0xf800 mask ignores bits above 15, so supplementary code points such as 0x1d800 are also replaced by 0xfffe. */ +if ((cvalue & 0xf800) == 0xd800 || cvalue >= 0x110000) + cvalue = 0xfffe; + +if (cvalue <= 0xffff) + { + *buffer = (pcre_uchar)cvalue; + return 1; + } + +cvalue -= 0x10000; +*buffer++ = 0xd800 | (cvalue >> 10); +*buffer = 0xdc00 | (cvalue & 0x3ff); +return 2; + +#else + +(void)(cvalue); /* Keep compiler happy; this function won't ever be */ +(void)(buffer); /* called when SUPPORT_UTF16 is not defined. 
*/ +return 0; + +#endif +} + +/* End of pcre16_ord2utf16.c */ diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c index cd82e26..5ff3953 100644 --- a/pcre16_utf16_utils.c +++ b/pcre16_utf16_utils.c @@ -57,7 +57,7 @@ any Byte Order Marks (BOMS). Returns with the remainig length. */ BOOL same_bo = TRUE; PCRE_SPTR16 end = input + length; /* The c variable must be unsigned. */ -register uschar c; +register pcre_uchar c; while (input < end) { diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c index b64519e..c7c7507 100644 --- a/pcre16_valid_utf16.c +++ b/pcre16_valid_utf16.c @@ -78,11 +78,11 @@ Returns: = 0 if the string is a valid UTF-16 string */ int -PRIV(valid_utf16)(PCRE_PUCHAR string, int length, int *erroroffset) +PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) { #ifdef SUPPORT_UTF16 register PCRE_PUCHAR p; -register uschar c; +register pcre_uchar c; if (length < 0) { diff --git a/pcre_compile.c b/pcre_compile.c index 0bdd0fd..da4ce22 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -470,6 +470,7 @@ static const char error_texts[] = "\\k is not followed by a braced, angle-bracketed, or quoted name\0" /* 70 */ "internal error: unknown opcode in find_fixedlength()\0" + "Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0" ; /* Table to identify digits and hex digits. This is used when compiling @@ -538,7 +539,7 @@ static const pcre_uint8 digitab[] = /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ -static const pcre_unit8 digitab[] = +static const pcre_uint8 digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ @@ -706,9 +707,11 @@ static int check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass) { -BOOL utf8 = (options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. 
*/ +BOOL utf = (options & PCRE_UTF8) != 0; const pcre_uchar *ptr = *ptrptr + 1; -int c, i; +pcre_int32 c; +int i; GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ ptr--; /* Set pointer back to the last byte */ @@ -940,12 +943,12 @@ else c -= CHAR_0; while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) c = c * 8 + *(++ptr) - CHAR_0; - if (!utf8 && c > 0xff) *errorcodeptr = ERR51; + if (!utf && c > 0xff) *errorcodeptr = ERR51; break; /* \x is complicated. \x{ddd} is a character number which can be greater - than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is - treated as a data character. */ + than 0xff in utf or non-8bit mode, but only if the ddd are hex digits. + If not, { is treated as a data character. */ case CHAR_x: if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) @@ -974,14 +977,12 @@ else if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { const pcre_uchar *pt = ptr + 2; - int count = 0; c = 0; while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) { register int cc = *pt++; if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ - count++; #ifndef EBCDIC /* ASCII/UTF-8 coding */ if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ @@ -990,17 +991,25 @@ else if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif - } - if (*pt == CHAR_RIGHT_CURLY_BRACKET) - { #ifdef COMPILE_PCRE8 - if (c < 0 || count > (utf8? 8:2)) *errorcodeptr = ERR34; + if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; } #else #ifdef COMPILE_PCRE16 - if (c < 0 || count > (utf8? 8:4)) *errorcodeptr = ERR34; + if (c > (utf ? 
0x10ffff : 0xffff)) { c = -1; break; } #endif #endif + } + + if (c < 0) + { + while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++; + *errorcodeptr = ERR34; + } + + if (*pt == CHAR_RIGHT_CURLY_BRACKET) + { + if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71; ptr = pt; break; } @@ -1281,7 +1290,7 @@ Arguments: name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode - utf8 TRUE if we are in UTF-8 mode + utf TRUE if we are in UTF-8 / UTF-16 mode count pointer to the current capturing subpattern number (updated) Returns: the number of the named subpattern, or -1 if not found @@ -1289,7 +1298,7 @@ Returns: the number of the named subpattern, or -1 if not found static int find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn, - BOOL xmode, BOOL utf8, int *count) + BOOL xmode, BOOL utf, int *count) { pcre_uchar *ptr = *ptrptr; int start_count = *count; @@ -1458,7 +1467,7 @@ for (; ptr < cd->end_pattern; ptr++) if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } ptr++; #ifdef SUPPORT_UTF8 - if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; + if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; #endif } if (*ptr == 0) goto FAIL_EXIT; @@ -1469,7 +1478,7 @@ for (; ptr < cd->end_pattern; ptr++) if (*ptr == CHAR_LEFT_PARENTHESIS) { - int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count); + int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count); if (rc > 0) return rc; if (*ptr == 0) goto FAIL_EXIT; } @@ -1515,14 +1524,14 @@ Arguments: name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode - utf8 TRUE if we are in UTF-8 mode + utf TRUE if we are in UTF-8 / UTF-16 mode Returns: the number of the found subpattern, or -1 if not found */ static int find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode, - BOOL 
utf8) + BOOL utf) { pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern; int count = 0; @@ -1535,7 +1544,7 @@ matching closing parens. That is why we have to have a loop. */ for (;;) { - rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count); + rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count); if (rc > 0 || *ptr++ == 0) break; } @@ -1618,7 +1627,7 @@ and doing the check at the end; a flag specifies which mode we are running in. Arguments: code points to the start of the pattern (the bracket) - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode atend TRUE if called when the pattern is complete cd the "compile data" structure @@ -1630,7 +1639,7 @@ Returns: the fixed length, */ static int -find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd) +find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd) { int length = -1; @@ -1657,7 +1666,7 @@ for (;;) case OP_ONCE: case OP_ONCE_NC: case OP_COND: - d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf8, atend, cd); + d = find_fixedlength(cc + ((op == OP_CBRA)? 
IMM2_SIZE : 0), utf, atend, cd); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); @@ -1691,7 +1700,7 @@ for (;;) cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */ do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ if (cc > cs && cc < ce) return -1; /* Recursion */ - d = find_fixedlength(cs + 2, utf8, atend, cd); + d = find_fixedlength(cs + 2, utf, atend, cd); if (d < 0) return d; branchlength += d; cc += 1 + LINK_SIZE; @@ -1751,7 +1760,7 @@ for (;;) branchlength++; cc += 2; #ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif break; @@ -1765,7 +1774,7 @@ for (;;) branchlength += GET2(cc,1); cc += 2 + IMM2_SIZE; #ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif break; @@ -1945,14 +1954,14 @@ length. Arguments: code points to start of expression - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode number the required bracket number or negative to find a lookbehind Returns: pointer to the opcode for the bracket, or NULL if not found */ const pcre_uchar * -PRIV(find_bracket)(const pcre_uchar *code, BOOL utf8, int number) +PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number) { for (;;) { @@ -2033,7 +2042,7 @@ for (;;) arrange to skip the extra bytes. */ #ifdef SUPPORT_UTF8 - if (utf8) switch(c) + if (utf) switch(c) { case OP_CHAR: case OP_CHARI: @@ -2067,7 +2076,7 @@ for (;;) break; } #else - (void)(utf8); /* Keep compiler happy by referencing function argument */ + (void)(utf); /* Keep compiler happy by referencing function argument */ #endif } } @@ -2084,13 +2093,13 @@ instance of OP_RECURSE. 
Arguments: code points to start of expression - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode Returns: pointer to the opcode for OP_RECURSE, or NULL if not found */ static const pcre_uchar * -find_recurse(const pcre_uchar *code, BOOL utf8) +find_recurse(const pcre_uchar *code, BOOL utf) { for (;;) { @@ -2153,7 +2162,7 @@ for (;;) to arrange to skip the extra bytes. */ #ifdef SUPPORT_UTF8 - if (utf8) switch(c) + if (utf) switch(c) { case OP_CHAR: case OP_CHARI: @@ -2187,7 +2196,7 @@ for (;;) break; } #else - (void)(utf8); /* Keep compiler happy by referencing function argument */ + (void)(utf); /* Keep compiler happy by referencing function argument */ #endif } } @@ -2210,7 +2219,7 @@ bracket whose current branch will already have been scanned. Arguments: code points to start of search endcode points to where to stop - utf8 TRUE if in UTF8 mode + utf TRUE if in UTF-8 / UTF-16 mode cd contains pointers to tables etc. Returns: TRUE if what is matched could be empty @@ -2218,7 +2227,7 @@ Returns: TRUE if what is matched could be empty static BOOL could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, - BOOL utf8, compile_data *cd) + BOOL utf, compile_data *cd) { register int c; for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); @@ -2266,7 +2275,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); do { - if (could_be_empty_branch(scode, endcode, utf8, cd)) + if (could_be_empty_branch(scode, endcode, utf, cd)) { empty_branch = TRUE; break; @@ -2322,7 +2331,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); empty_branch = FALSE; do { - if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd)) + if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd)) empty_branch = TRUE; code += GET(code, 1); } @@ -2456,7 +2465,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); case OP_MINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: - 
if (utf8 && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f]; + if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f]; break; case OP_UPTO: @@ -2465,7 +2474,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); case OP_MINUPTOI: case OP_POSUPTO: case OP_POSUPTOI: - if (utf8 && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f]; + if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f]; break; #endif @@ -2509,7 +2518,7 @@ Arguments: code points to start of the recursion endcode points to where to stop (current RECURSE item) bcptr points to the chain of current (unclosed) branch starts - utf8 TRUE if in UTF-8 mode + utf TRUE if in UTF-8 / UTF-16 mode cd pointers to tables etc Returns: TRUE if what is matched could be empty @@ -2517,11 +2526,11 @@ Returns: TRUE if what is matched could be empty static BOOL could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode, - branch_chain *bcptr, BOOL utf8, compile_data *cd) + branch_chain *bcptr, BOOL utf, compile_data *cd) { while (bcptr != NULL && bcptr->current_branch >= code) { - if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd)) + if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd)) return FALSE; bcptr = bcptr->outer; } @@ -2656,7 +2665,7 @@ value in the reference (which is a group number). Arguments: group points to the start of the group adjust the amount by which the group is to be moved - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode cd contains pointers to tables etc. 
save_hwm the hwm forward reference pointer at the start of the group @@ -2664,12 +2673,12 @@ Returns: nothing */ static void -adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd, +adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd, pcre_uchar *save_hwm) { pcre_uchar *ptr = group; -while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL) +while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) { int offset; pcre_uchar *hc; @@ -2875,7 +2884,7 @@ sense to automatically possessify the repeated item. Arguments: previous pointer to the repeated opcode - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode ptr next character in pattern options options bits cd contains pointers to tables etc. @@ -2884,7 +2893,7 @@ Returns: TRUE if possessifying is wanted */ static BOOL -check_auto_possessive(const pcre_uchar *previous, BOOL utf8, +check_auto_possessive(const pcre_uchar *previous, BOOL utf, const pcre_uchar *ptr, int options, compile_data *cd) { int c, next; @@ -2905,7 +2914,7 @@ if ((options & PCRE_EXTENDED) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } ptr++; #ifdef SUPPORT_UTF8 - if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; + if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; #endif } } @@ -2927,7 +2936,7 @@ if (*ptr == CHAR_BACKSLASH) else if ((cd->ctypes[*ptr] & ctype_meta) == 0) { #ifdef SUPPORT_UTF8 - if (utf8) { GETCHARINC(next, ptr); } else + if (utf) { GETCHARINC(next, ptr); } else #endif next = *ptr++; } @@ -2949,7 +2958,7 @@ if ((options & PCRE_EXTENDED) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } ptr++; #ifdef SUPPORT_UTF8 - if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; + if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; #endif } } @@ -2988,7 +2997,7 @@ if (next >= 0) switch(op_code) #endif if (c == next) return FALSE; #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else @@ -3013,7 +3022,7 @@ if (next >= 0) switch(op_code) 
case OP_NOTI: if ((c = *previous) == next) return TRUE; #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else @@ -3348,10 +3357,11 @@ must not do this for other options (e.g. PCRE_EXTENDED) because they may change dynamically as we process the pattern. */ #ifdef SUPPORT_UTF8 -BOOL utf8 = (options & PCRE_UTF8) != 0; -pcre_uint8 utf8_char[6]; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +BOOL utf = (options & PCRE_UTF8) != 0; +pcre_uchar utf_chars[6]; #else -BOOL utf8 = FALSE; +BOOL utf = FALSE; #endif /* Helper variables for OP_XCLASS opcode (for characters > 255). */ @@ -3459,8 +3469,8 @@ for (;; ptr++) } *lengthptr += (int)(code - last_code); - DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code), - c)); + DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr, + (int)(code - last_code), c, c)); /* If "previous" is set and it is not at the start of the work space, move it back to there, in order to avoid filling up the work space. 
Otherwise, @@ -3547,7 +3557,7 @@ for (;; ptr++) if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } ptr++; #ifdef SUPPORT_UTF8 - if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; + if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; #endif } if (*ptr != 0) continue; @@ -3727,7 +3737,7 @@ for (;; ptr++) const pcre_uchar *oldptr; #ifdef SUPPORT_UTF8 - if (utf8 && c > 127) + if (utf && c > 127) { /* Braces are required because the */ GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ } @@ -3945,22 +3955,22 @@ for (;; ptr++) SETBIT(classbits, 0x20); /* SPACE */ SETBIT(classbits, 0xa0); /* NSBP */ #ifdef SUPPORT_UTF - if (utf8) + if (utf) { xclass = TRUE; *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata); *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata); *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x200A, class_uchardata); *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata); *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata); *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata); } #endif continue; @@ -3980,30 +3990,30 @@ for (;; ptr++) } #ifdef SUPPORT_UTF - if (utf8) + if (utf) { xclass = TRUE; *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x167f, 
class_uchardata); + class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata); *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata); *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata); *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x200B, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata); *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata); *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata); *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); } #endif continue; @@ -4015,12 +4025,12 @@ for (;; ptr++) SETBIT(classbits, 0x0d); /* CR */ SETBIT(classbits, 0x85); /* NEL */ #ifdef SUPPORT_UTF - if (utf8) + if (utf) { xclass = TRUE; 
*class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata); } #endif continue; @@ -4043,15 +4053,15 @@ for (;; ptr++) } #ifdef SUPPORT_UTF - if (utf8) + if (utf) { xclass = TRUE; *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata); *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata); - class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); } #endif continue; @@ -4139,7 +4149,7 @@ for (;; ptr++) } #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { /* Braces are required because the */ GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ } @@ -4189,7 +4199,7 @@ for (;; ptr++) available. 
*/ #ifdef SUPPORT_UTF - if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) + if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) #endif #ifndef COMPILE_PCRE8 if (d > 255) @@ -4234,9 +4244,9 @@ for (;; ptr++) else { *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf8)(occ, class_uchardata); + class_uchardata += PRIV(ord2utf)(occ, class_uchardata); } - class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata); + class_uchardata += PRIV(ord2utf)(ocd, class_uchardata); } } #endif /* SUPPORT_UCP */ @@ -4246,8 +4256,8 @@ for (;; ptr++) *class_uchardata++ = XCL_RANGE; #ifdef SUPPORT_UTF - class_uchardata += PRIV(ord2utf8)(c, class_uchardata); - class_uchardata += PRIV(ord2utf8)(d, class_uchardata); + class_uchardata += PRIV(ord2utf)(c, class_uchardata); + class_uchardata += PRIV(ord2utf)(d, class_uchardata); #else *class_uchardata++ = c; *class_uchardata++ = d; @@ -4304,7 +4314,7 @@ for (;; ptr++) /* Handle a character that cannot go in the bit map */ #ifdef SUPPORT_UTF - if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) + if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) #endif #ifndef COMPILE_PCRE8 if (c > 255) @@ -4314,7 +4324,7 @@ for (;; ptr++) xclass = TRUE; *class_uchardata++ = XCL_SINGLE; #ifdef SUPPORT_UTF - class_uchardata += PRIV(ord2utf8)(c, class_uchardata); + class_uchardata += PRIV(ord2utf)(c, class_uchardata); #else *class_uchardata++ = c; #endif @@ -4326,7 +4336,7 @@ for (;; ptr++) if ((othercase = UCD_OTHERCASE(c)) != c) { *class_uchardata++ = XCL_SINGLE; - class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata); + class_uchardata += PRIV(ord2utf)(othercase, class_uchardata); } } #endif /* SUPPORT_UCP */ @@ -4384,11 +4394,9 @@ for (;; ptr++) #ifdef SUPPORT_UTF if (class_charcount == 1 && !xclass && - (!utf8 || !negate_class || class_lastchar < 128)) -#elif defined COMPILE_PCRE8 - if (class_charcount == 1) + (!utf || !negate_class || class_lastchar < 128)) #else 
- if (class_charcount == 1 && !xclass) + if (class_charcount == 1) #endif { zeroreqchar = reqchar; @@ -4408,8 +4416,8 @@ for (;; ptr++) then we can handle this with the normal one-character code. */ #ifdef SUPPORT_UTF8 - if (utf8 && class_lastchar > 127) - mclength = PRIV(ord2utf8)(class_lastchar, mcbuffer); + if (utf && class_lastchar > 127) + mclength = PRIV(ord2utf)(class_lastchar, mcbuffer); else #endif { @@ -4599,12 +4607,12 @@ for (;; ptr++) length rather than a small character. */ #ifdef SUPPORT_UTF8 - if (utf8 && (code[-1] & 0x80) != 0) + if (utf && (code[-1] & 0x80) != 0) { pcre_uchar *lastchar = code - 1; while((*lastchar & 0xc0) == 0x80) lastchar--; c = code - lastchar; /* Length of UTF-8 character */ - memcpy(utf8_char, lastchar, c); /* Save the char */ + memcpy(utf_chars, lastchar, c); /* Save the char */ c |= 0x80; /* Flag c as a length */ } else @@ -4625,7 +4633,7 @@ for (;; ptr++) if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(previous, utf8, ptr + 1, options, cd)) + check_auto_possessive(previous, utf, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -4646,7 +4654,7 @@ for (;; ptr++) c = previous[1]; if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(previous, utf8, ptr + 1, options, cd)) + check_auto_possessive(previous, utf, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -4670,7 +4678,7 @@ for (;; ptr++) if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(previous, utf8, ptr + 1, options, cd)) + check_auto_possessive(previous, utf, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -4755,9 +4763,9 @@ for (;; ptr++) if (repeat_max < 0) { #ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) + if (utf && c >= 128) { - memcpy(code, utf8_char, c & 7); + memcpy(code, utf_chars, c & 7); code += c & 7; } else @@ -4780,9 +4788,9 @@ for (;; ptr++) else if (repeat_max 
!= repeat_min) { #ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) + if (utf && c >= 128) { - memcpy(code, utf8_char, c & 7); + memcpy(code, utf_chars, c & 7); code += c & 7; } else @@ -4810,9 +4818,9 @@ for (;; ptr++) /* The character or character type itself comes last in all cases. */ #ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) + if (utf && c >= 128) { - memcpy(code, utf8_char, c & 7); + memcpy(code, utf_chars, c & 7); code += c & 7; } else @@ -4939,7 +4947,7 @@ for (;; ptr++) if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; - adjust_recurse(previous, 1, utf8, cd, save_hwm); + adjust_recurse(previous, 1, utf, cd, save_hwm); memmove(previous + 1, previous, IN_UCHARS(len)); code++; if (repeat_max == 0) @@ -4963,7 +4971,7 @@ for (;; ptr++) { int offset; *code = OP_END; - adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); + adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm); memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len)); code += 2 + LINK_SIZE; *previous++ = OP_BRAZERO + repeat_type; @@ -5165,7 +5173,7 @@ for (;; ptr++) pcre_uchar *scode = bracode; do { - if (could_be_empty_branch(scode, ketcode, utf8, cd)) + if (could_be_empty_branch(scode, ketcode, utf, cd)) { *bracode += OP_SBRA - OP_BRA; break; @@ -5188,7 +5196,7 @@ for (;; ptr++) { int nlen = (int)(code - bracode); *code = OP_END; - adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm); + adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm); memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen)); code += 1 + LINK_SIZE; nlen += 1 + LINK_SIZE; @@ -5266,7 +5274,7 @@ for (;; ptr++) { tempcode += PRIV(OP_lengths)[*tempcode]; #ifdef SUPPORT_UTF8 - if (utf8 && tempcode[-1] >= 0xc0) + if (utf && tempcode[-1] >= 0xc0) tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f]; #endif } @@ -5304,7 +5312,7 @@ for (;; ptr++) default: *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm); + adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, 
save_hwm); memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; @@ -5613,7 +5621,7 @@ for (;; ptr++) /* Search the pattern for a forward reference */ else if ((i = find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0, utf8)) > 0) + (options & PCRE_EXTENDED) != 0, utf)) > 0) { PUT2(code, 2+LINK_SIZE, i); code[1+LINK_SIZE]++; @@ -5958,7 +5966,7 @@ for (;; ptr++) temp = cd->end_pattern; cd->end_pattern = ptr; recno = find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0, utf8); + (options & PCRE_EXTENDED) != 0, utf); cd->end_pattern = temp; if (recno < 0) recno = 0; /* Forward ref; set dummy number */ } @@ -5985,7 +5993,7 @@ for (;; ptr++) } else if ((recno = /* Forward back reference */ find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0, utf8)) <= 0) + (options & PCRE_EXTENDED) != 0, utf)) <= 0) { *errorcodeptr = ERR15; goto FAILED; @@ -6089,14 +6097,14 @@ for (;; ptr++) { *code = OP_END; if (recno != 0) - called = PRIV(find_bracket)(cd->start_code, utf8, recno); + called = PRIV(find_bracket)(cd->start_code, utf, recno); /* Forward reference */ if (called == NULL) { if (find_parens(cd, NULL, recno, - (options & PCRE_EXTENDED) != 0, utf8) < 0) + (options & PCRE_EXTENDED) != 0, utf) < 0) { *errorcodeptr = ERR15; goto FAILED; @@ -6120,7 +6128,7 @@ for (;; ptr++) conditional subpatterns will be picked up then. */ else if (GET(called, 1) == 0 && cond_depth <= 0 && - could_be_empty(called, code, bcptr, utf8, cd)) + could_be_empty(called, code, bcptr, utf, cd)) { *errorcodeptr = ERR40; goto FAILED; @@ -6618,7 +6626,7 @@ for (;; ptr++) { previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; - *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c; + *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c; } } continue; @@ -6629,8 +6637,8 @@ for (;; ptr++) handle it as a data character. 
*/ #ifdef SUPPORT_UTF8 - if (utf8 && c > 127) - mclength = PRIV(ord2utf8)(c, mcbuffer); + if (utf && c > 127) + mclength = PRIV(ord2utf)(c, mcbuffer); else #endif @@ -6652,7 +6660,7 @@ for (;; ptr++) mcbuffer[0] = c; #ifdef SUPPORT_UTF8 - if (utf8 && c >= 0xc0) + if (utf && c >= 0xc0) { while ((ptr[1] & 0xc0) == 0x80) mcbuffer[mclength++] = *(++ptr); @@ -7360,7 +7368,7 @@ pcre_int32 firstchar, reqchar; int newline; int errorcode = 0; int skipatstart = 0; -BOOL utf8; +BOOL utf; size_t size; pcre_uchar *code; const pcre_uchar *codestart; @@ -7458,22 +7466,23 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && else break; } -utf8 = (options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +utf = (options & PCRE_UTF8) != 0; /* Can't support UTF8 unless PCRE has been compiled to include the code. The -return of an error code from PRIV(valid_utf8)() is a new feature, introduced in +return of an error code from PRIV(valid_utf)() is a new feature, introduced in release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is not used here. */ #ifdef SUPPORT_UTF8 -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && - (errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) +if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 && + (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) { errorcode = ERR44; goto PCRE_EARLY_ERROR_RETURN2; } #else -if (utf8) +if (utf) { errorcode = ERR32; goto PCRE_EARLY_ERROR_RETURN; @@ -7688,7 +7697,7 @@ while (errorcode == 0 && cd->hwm > cworkspace) cd->hwm -= LINK_SIZE; offset = GET(cd->hwm, 0); recno = GET(codestart, offset); - groupptr = PRIV(find_bracket)(codestart, utf8, recno); + groupptr = PRIV(find_bracket)(codestart, utf, recno); if (groupptr == NULL) errorcode = ERR53; else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart)); } @@ -7715,9 +7724,9 @@ if (cd->check_lookbehind) of zero, but that is a pathological case, and it does no harm.) 
When we find one, we temporarily terminate the branch it is in while we scan it. */ - for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf8, -1); + for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1); cc != NULL; - cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf8, -1)) + cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1)) { if (GET(cc, 1) == 0) { diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index 8fed9b3..8247f46 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -414,9 +414,9 @@ const pcre_uchar *end_subject = md->end_subject; const pcre_uchar *start_code = md->start_code; #ifdef SUPPORT_UTF8 -BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; +BOOL utf = (md->poptions & PCRE_UTF8) != 0; #else -BOOL utf8 = FALSE; +BOOL utf = FALSE; #endif rlevel++; @@ -474,7 +474,7 @@ if (*first_op == OP_REVERSE) #ifdef SUPPORT_UTF8 /* In character mode we have to step back character by character */ - if (utf8) + if (utf) { for (gone_back = 0; gone_back < max_back; gone_back++) { @@ -606,7 +606,7 @@ for (;;) { clen = 1; /* Number of bytes in the character */ #ifdef SUPPORT_UTF8 - if (utf8) { GETCHARLEN(c, ptr, clen); } else + if (utf) { GETCHARLEN(c, ptr, clen); } else #endif /* SUPPORT_UTF8 */ c = *ptr; } @@ -695,7 +695,7 @@ for (;;) { dlen = 1; #ifdef SUPPORT_UTF8 - if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else + if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else #endif /* SUPPORT_UTF8 */ d = code[coptable[codevalue]]; if (codevalue >= OP_TYPESTAR) @@ -960,7 +960,7 @@ for (;;) const pcre_uchar *temp = ptr - 1; if (temp < md->start_used_ptr) md->start_used_ptr = temp; #ifdef SUPPORT_UTF8 - if (utf8) BACKCHAR(temp); + if (utf) BACKCHAR(temp); #endif GETCHARTEST(d, temp); #ifdef SUPPORT_UCP @@ -1986,7 +1986,7 @@ for (;;) if (clen == 0) break; #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else { @@ -2007,8 +2007,7 @@ for (;;) } else #endif /* SUPPORT_UTF8 */ - - /* Non-UTF-8 
mode */ + /* Not UTF mode */ { if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } } @@ -2211,7 +2210,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); @@ -2258,7 +2257,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); @@ -2303,7 +2302,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); @@ -2340,7 +2339,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); @@ -2384,7 +2383,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UTF8 - if (utf8 && d >= 128) + if (utf && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); @@ -3005,7 +3004,7 @@ pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, real_pcre *re = (real_pcre *)argument_re; dfa_match_data match_block; dfa_match_data *md = &match_block; -BOOL utf8, anchored, startline, firstline; +BOOL utf, anchored, startline, firstline; const pcre_uchar *current_subject, *end_subject; const pcre_uint8 *lcc; @@ -3073,9 +3072,10 @@ end_subject = (const unsigned char *)subject + length; req_char_ptr = current_subject - 1; #ifdef SUPPORT_UTF8 -utf8 = (re->options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +utf = (re->options & PCRE_UTF8) != 0; #else -utf8 = FALSE; +utf = FALSE; #endif anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || @@ -3147,10 +3147,10 @@ else back the character offset. 
*/ #ifdef SUPPORT_UTF8 -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) +if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) { int erroroffset; - int errorcode = PRIV(valid_utf8)((pcre_uchar *)subject, length, &erroroffset); + int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset); if (errorcode != 0) { if (offsetcount >= 2) @@ -3235,7 +3235,7 @@ for (;;) { PCRE_PUCHAR t = current_subject; #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { while (t < md->end_subject && !IS_NEWLINE(t)) { @@ -3278,7 +3278,7 @@ for (;;) if (current_subject > md->start_subject + start_offset) { #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) @@ -3317,7 +3317,7 @@ for (;;) { current_subject++; #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) while(current_subject < end_subject && (*current_subject & 0xc0) == 0x80) current_subject++; #endif @@ -3426,7 +3426,7 @@ for (;;) if (firstline && IS_NEWLINE(current_subject)) break; current_subject++; - if (utf8) + if (utf) { while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) current_subject++; diff --git a/pcre_exec.c b/pcre_exec.c index 778a301..db013e6 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -183,7 +183,7 @@ if (caseless) { #ifdef SUPPORT_UTF8 #ifdef SUPPORT_UCP - if (md->utf8) + if (md->utf) { /* Match characters up to the end of the reference. NOTE: the number of bytes matched may differ, because there are some characters whose upper and @@ -385,7 +385,7 @@ typedef struct heapframe { int Xprop_value; int Xprop_fail_result; int Xoclength; - pcre_uint8 Xocchars[8]; + pcre_uchar Xocchars[6]; #endif int Xcodelink; @@ -450,7 +450,7 @@ the subject. */ /* Performance note: It might be tempting to extract commonly used fields from -the md structure (e.g. utf8, end_subject) into individual variables to improve +the md structure (e.g. utf, end_subject) into individual variables to improve performance. 
Tests using gcc on a SPARC disproved this; in the first case, it made performance worse. @@ -485,7 +485,7 @@ so they can be ordinary variables in all cases. Mark some of them with register int rrc; /* Returns from recursive calls */ register int i; /* Used for loops not involving calls to RMATCH() */ register unsigned int c; /* Character values not kept over RMATCH() calls */ -register BOOL utf8; /* Local copy of UTF-8 flag for speed */ +register BOOL utf; /* Local copy of UTF flag for speed */ BOOL minimize, possessive; /* Quantifier options */ BOOL caseless; @@ -606,7 +606,7 @@ int prop_type; int prop_value; int prop_fail_result; int oclength; -pcre_uint8 occhars[8]; +pcre_uchar occhars[6]; #endif int codelink; @@ -660,9 +660,9 @@ complicated macro. It has to be used in one particular way. This shouldn't, however, impact performance when true recursion is being used. */ #ifdef SUPPORT_UTF8 -utf8 = md->utf8; /* Local copy of the flag */ +utf = md->utf; /* Local copy of the flag */ #else -utf8 = FALSE; +utf = FALSE; #endif /* First check that we haven't called match() too many times, or that we @@ -1597,7 +1597,7 @@ for (;;) case OP_REVERSE: #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { i = GET(ecode, 1); while (i-- > 0) @@ -2070,7 +2070,7 @@ for (;;) partial matching. */ #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { /* Get status of previous character */ @@ -2189,7 +2189,7 @@ for (;;) MRRETURN(MATCH_NOMATCH); } eptr++; - if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; ecode++; break; @@ -2546,7 +2546,7 @@ for (;;) while (eptr < md->end_subject) { int len = 1; - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) != ucp_M) break; eptr += len; } @@ -2744,8 +2744,7 @@ for (;;) /* First, ensure the minimum number of matches are present. 
*/ #ifdef SUPPORT_UTF - /* UTF-8 mode */ - if (utf8) + if (utf) { for (i = 1; i <= min; i++) { @@ -2765,7 +2764,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = 1; i <= min; i++) { @@ -2797,8 +2796,7 @@ for (;;) if (minimize) { #ifdef SUPPORT_UTF - /* UTF-8 mode */ - if (utf8) + if (utf) { for (fi = min;; fi++) { @@ -2821,7 +2819,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (fi = min;; fi++) { @@ -2854,8 +2852,7 @@ for (;;) pp = eptr; #ifdef SUPPORT_UTF - /* UTF mode */ - if (utf8) + if (utf) { for (i = min; i < max; i++) { @@ -3024,7 +3021,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ #ifdef SUPPORT_UTF - if (utf8) BACKCHAR(eptr); + if (utf) BACKCHAR(eptr); #endif } MRRETURN(MATCH_NOMATCH); @@ -3038,7 +3035,7 @@ for (;;) case OP_CHAR: #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { length = 1; ecode++; @@ -3052,8 +3049,7 @@ for (;;) } else #endif - - /* Non-UTF-8 mode */ + /* Not UTF mode */ { if (md->end_subject - eptr < 1) { @@ -3069,7 +3065,7 @@ for (;;) case OP_CHARI: #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { length = 1; ecode++; @@ -3112,7 +3108,7 @@ for (;;) else #endif /* SUPPORT_UTF8 */ - /* Non-UTF-8 mode */ + /* Not UTF mode */ { if (md->end_subject - eptr < 1) { @@ -3193,7 +3189,7 @@ for (;;) REPEATCHAR: #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { length = 1; charptr = ecode; @@ -3209,7 +3205,7 @@ for (;;) unsigned int othercase; if (op >= OP_STARI && /* Caseless */ (othercase = UCD_OTHERCASE(fc)) != fc) - oclength = PRIV(ord2utf8)(othercase, occhars); + oclength = PRIV(ord2utf)(othercase, occhars); else oclength = 0; #endif /* SUPPORT_UCP */ @@ -3220,7 +3216,7 @@ for (;;) #ifdef SUPPORT_UCP else if (oclength > 0 && eptr <= md->end_subject - oclength && - memcmp(eptr, occhars, oclength) == 0) eptr += oclength; + memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; #endif /* SUPPORT_UCP */ else { @@ 
-3243,7 +3239,7 @@ for (;;) #ifdef SUPPORT_UCP else if (oclength > 0 && eptr <= md->end_subject - oclength && - memcmp(eptr, occhars, oclength) == 0) eptr += oclength; + memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; #endif /* SUPPORT_UCP */ else { @@ -3264,7 +3260,7 @@ for (;;) #ifdef SUPPORT_UCP else if (oclength > 0 && eptr <= md->end_subject - oclength && - memcmp(eptr, occhars, oclength) == 0) eptr += oclength; + memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; #endif /* SUPPORT_UCP */ else { @@ -3548,8 +3544,7 @@ for (;;) fc = md->lcc[fc]; #ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) + if (utf) { register unsigned int d; for (i = 1; i <= min; i++) @@ -3566,8 +3561,7 @@ for (;;) } else #endif - - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = 1; i <= min; i++) { @@ -3585,8 +3579,7 @@ for (;;) if (minimize) { #ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) + if (utf) { register unsigned int d; for (fi = min;; fi++) @@ -3606,7 +3599,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (fi = min;; fi++) { @@ -3631,8 +3624,7 @@ for (;;) pp = eptr; #ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) + if (utf) { register unsigned int d; for (i = min; i < max; i++) @@ -3659,7 +3651,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = min; i < max; i++) { @@ -3690,8 +3682,7 @@ for (;;) else { #ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) + if (utf) { register unsigned int d; for (i = 1; i <= min; i++) @@ -3707,7 +3698,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = 1; i <= min; i++) { @@ -3725,8 +3716,7 @@ for (;;) if (minimize) { #ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) + if (utf) { register unsigned int d; for (fi = min;; fi++) @@ -3745,7 +3735,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (fi = min;; fi++) { @@ -3770,8 +3760,7 @@ for (;;) pp = eptr; #ifdef SUPPORT_UTF8 - /* 
UTF-8 mode */ - if (utf8) + if (utf) { register unsigned int d; for (i = min; i < max; i++) @@ -3797,7 +3786,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = min; i < max; i++) { @@ -4073,7 +4062,7 @@ for (;;) while (eptr < md->end_subject) { int len = 1; - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) != ucp_M) break; eptr += len; } @@ -4086,7 +4075,7 @@ for (;;) /* Handle all other cases when the coding is UTF-8 */ #ifdef SUPPORT_UTF8 - if (utf8) switch(ctype) + if (utf) switch(ctype) { case OP_ANY: for (i = 1; i <= min; i++) @@ -4794,7 +4783,7 @@ for (;;) while (eptr < md->end_subject) { int len = 1; - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) != ucp_M) break; eptr += len; } @@ -4804,8 +4793,7 @@ for (;;) #endif /* SUPPORT_UCP */ #ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) + if (utf) { for (fi = min;; fi++) { @@ -4968,7 +4956,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (fi = min;; fi++) { @@ -5267,7 +5255,7 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ - if (utf8) BACKCHAR(eptr); + if (utf) BACKCHAR(eptr); } } @@ -5284,13 +5272,13 @@ for (;;) SCHECK_PARTIAL(); break; } - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) == ucp_M) break; eptr += len; while (eptr < md->end_subject) { len = 1; - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) != ucp_M) break; eptr += len; } @@ -5307,7 +5295,7 @@ for (;;) if (eptr-- == pp) break; /* Stop if tried at original pos */ for (;;) /* Move back over one extended */ { - if (!utf8) c = 
*eptr; else + if (!utf) c = *eptr; else { BACKCHAR(eptr); GETCHAR(c, eptr); @@ -5322,9 +5310,7 @@ for (;;) #endif /* SUPPORT_UCP */ #ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - - if (utf8) + if (utf) { switch(ctype) { @@ -5607,8 +5593,7 @@ for (;;) } else #endif /* SUPPORT_UTF8 */ - - /* Not UTF-8 mode */ + /* Not UTF mode */ { switch(ctype) { @@ -5969,7 +5954,7 @@ BOOL using_temporary_offsets = FALSE; BOOL anchored; BOOL startline; BOOL firstline; -BOOL utf8; +BOOL utf; BOOL has_first_char = FALSE; BOOL has_req_char = FALSE; pcre_uchar first_char = 0; @@ -6005,7 +5990,8 @@ follows immediately afterwards. Other values in the md block are used only during "normal" pcre_exec() processing, not when the JIT support is in use, so they are set up later. */ -utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +utf = md->utf = (re->options & PCRE_UTF8) != 0; md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; @@ -6013,10 +5999,10 @@ md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : code for an invalid string if a results vector is available. 
*/ #ifdef SUPPORT_UTF8 -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) +if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) { int erroroffset; - int errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)subject, length, &erroroffset); + int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset); if (errorcode != 0) { if (offsetcount >= 2) @@ -6306,7 +6292,7 @@ for(;;) { PCRE_PUCHAR t = start_match; #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { while (t < md->end_subject && !IS_NEWLINE(t)) { @@ -6348,7 +6334,7 @@ for(;;) if (start_match > md->start_subject + start_offset) { #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { while (start_match < end_subject && !WAS_NEWLINE(start_match)) { @@ -6389,7 +6375,7 @@ for(;;) { start_match++; #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) while(start_match < end_subject && (*start_match & 0xc0) == 0x80) start_match++; #endif @@ -6521,7 +6507,7 @@ for(;;) case MATCH_THEN: new_start_match = start_match + 1; #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) new_start_match++; #endif diff --git a/pcre_internal.h b/pcre_internal.h index 9dbaf05..637565b 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -292,8 +292,8 @@ start/end of string field names are. */ #define IS_NEWLINE(p) \ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ ((p) < NLBLOCK->PSEND && \ - PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\ - utf8)) \ + PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \ + &(NLBLOCK->nllen), utf)) \ : \ ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ (p)[0] == NLBLOCK->nl[0] && \ @@ -307,7 +307,7 @@ start/end of string field names are. */ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ ((p) > NLBLOCK->PSSTART && \ PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \ - &(NLBLOCK->nllen), utf8)) \ + &(NLBLOCK->nllen), utf)) \ : \ ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ @@ -581,7 +581,7 @@ pointer. 
*/ #define GETCHARTEST(c, eptr) \ c = *eptr; \ - if (utf8 && c >= 0xc0) GETUTF8(c, eptr); + if (utf && c >= 0xc0) GETUTF8(c, eptr); /* Base macro to pick up the remaining bytes of a UTF-8 character, advancing the pointer. */ @@ -629,7 +629,7 @@ This is called when we don't know if we are in UTF-8 mode. */ #define GETCHARINCTEST(c, eptr) \ c = *eptr++; \ - if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr); + if (utf && c >= 0xc0) GETUTF8INC(c, eptr); /* Base macro to pick up the remaining bytes of a UTF-8 character, not advancing the pointer, incrementing the length. */ @@ -681,7 +681,7 @@ do not know if we are in UTF-8 mode. */ #define GETCHARLENTEST(c, eptr, len) \ c = *eptr; \ - if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len); + if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len); /* If the pointer is not at the start of a character, move it back until it is. This is called only in UTF-8 mode - we don't put a test within the macro @@ -1366,7 +1366,7 @@ value such as \n. They must have non-zero values, as check_escape() returns their negation. Also, they must appear in the same order as in the opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it corresponds to "." in DOTALL mode rather than an escape sequence. It is also -used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In +used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves like \N. The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. 
@@ -1784,7 +1784,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, - ERR70, ERRCOUNT }; + ERR70, ERR71, ERRCOUNT }; /* The real format of the start of the pcre block; the index of names and the code vector run on as long as necessary after the end. We store an explicit @@ -1934,7 +1934,7 @@ typedef struct match_data { BOOL offset_overflow; /* Set if too many extractions */ BOOL notbol; /* NOTBOL flag */ BOOL noteol; /* NOTEOL flag */ - BOOL utf8; /* UTF8 flag */ + BOOL utf; /* UTF-8 / UTF-16 flag */ BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ BOOL use_ucp; /* PCRE_UCP flag */ BOOL endonly; /* Dollar not before final \n */ @@ -2103,14 +2103,10 @@ extern unsigned int PRIV(strlen_uc)(const pcre_uchar *str); extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int); extern BOOL PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, int *, BOOL); -extern int PRIV(ord2utf8)(int, pcre_uint8 *); +extern int PRIV(ord2utf)(pcre_uint32, pcre_uchar *); extern real_pcre *PRIV(try_flipped)(const real_pcre *, real_pcre *, const pcre_study_data *, pcre_study_data *); -#ifndef COMPILE_PCRE16 -extern int PRIV(valid_utf8)(PCRE_PUCHAR, int, int *); -#else -extern int PRIV(valid_utf16)(PCRE_PUCHAR, int, int *); -#endif +extern int PRIV(valid_utf)(PCRE_PUCHAR, int, int *); extern BOOL PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, int *, BOOL); extern BOOL PRIV(xclass)(int, const pcre_uchar *); diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index 03833e0..16611f1 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -298,7 +298,7 @@ typedef struct compiler_common { jump_list *caselesscmp; BOOL jscript_compat; #ifdef SUPPORT_UTF8 - BOOL utf8; + BOOL utf; #ifdef SUPPORT_UCP BOOL useucp; #endif @@ -497,7 +497,7 @@ switch(*cc) case 
OP_ANYBYTE: #ifdef SUPPORT_UTF8 - if (common->utf8) return NULL; + if (common->utf) return NULL; #endif return cc + 1; @@ -544,7 +544,7 @@ switch(*cc) case OP_NOTPOSQUERYI: cc += 2; #ifdef SUPPORT_UTF8 - if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif return cc; @@ -566,7 +566,7 @@ switch(*cc) case OP_NOTPOSUPTOI: cc += 2 + IMM2_SIZE; #ifdef SUPPORT_UTF8 - if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif return cc; @@ -1264,7 +1264,7 @@ static SLJIT_INLINE BOOL char_has_othercase(compiler_common *common, pcre_uchar* unsigned int c; #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { GETCHAR(c, cc); if (c > 127) @@ -1286,7 +1286,7 @@ static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigne { /* Returns with the othercase. */ #ifdef SUPPORT_UTF8 -if (common->utf8 && c > 127) +if (common->utf && c > 127) { #ifdef SUPPORT_UCP return UCD_OTHERCASE(c); @@ -1307,7 +1307,7 @@ int n; #endif #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { GETCHAR(c, cc); if (c <= 127) @@ -1343,7 +1343,7 @@ if (!ispowerof2(bit)) return 0; #ifdef SUPPORT_UTF8 -if (common->utf8 && c > 127) +if (common->utf && c > 127) { n = PRIV(utf8_table4)[*cc & 0x3f]; while ((bit & 0x3f) == 0) @@ -1374,7 +1374,7 @@ struct sljit_jump *jump; OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL)); @@ -1395,7 +1395,7 @@ struct sljit_jump *jump; OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL)); @@ -1414,7 
+1414,7 @@ struct sljit_jump *jump; #endif #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); @@ -1439,7 +1439,7 @@ DEFINE_COMPILER; #ifdef SUPPORT_UTF8 struct sljit_label *label; -if (common->utf8) +if (common->utf) { label = LABEL(); OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); @@ -1697,7 +1697,7 @@ mainloop = LABEL(); /* Increasing the STR_PTR here requires one less jump in the most common case. */ #ifdef SUPPORT_UTF8 -if (common->utf8) readuchar = TRUE; +if (common->utf) readuchar = TRUE; #endif if (newlinecheck) readuchar = TRUE; @@ -1709,7 +1709,7 @@ if (newlinecheck) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); @@ -1771,7 +1771,7 @@ else OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); @@ -1882,7 +1882,7 @@ start = LABEL(); leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); #ifdef SUPPORT_UTF -if (common->utf8) +if (common->utf) OP1(SLJIT_MOV, TMP3, 0, TMP1, 0); #endif #ifndef COMPILE_PCRE8 @@ -1896,12 +1896,12 @@ OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); found = JUMP(SLJIT_C_NOT_ZERO); #ifdef SUPPORT_UTF -if (common->utf8) +if (common->utf) OP1(SLJIT_MOV, TMP1, 0, TMP3, 0); #endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); @@ 
-2051,7 +2051,7 @@ else #ifdef SUPPORT_UTF8 /* Here LOCALS1 has already been zeroed. */ jump = NULL; - if (common->utf8) + if (common->utf) jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); #endif OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes); @@ -2090,7 +2090,7 @@ else #ifdef SUPPORT_UTF8 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); jump = NULL; - if (common->utf8) + if (common->utf) jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); #endif OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes); @@ -2119,7 +2119,7 @@ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a); COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL); OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a); #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1); @@ -2143,7 +2143,7 @@ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20); COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL); OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xa0); #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL); OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x1680); @@ -2177,7 +2177,7 @@ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a); COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL); OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a); #ifdef SUPPORT_UTF8 -if (common->utf8) +if (common->utf) { COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1); @@ -2289,7 +2289,7 @@ DEFINE_COMPILER; unsigned int othercasebit = 0; pcre_uchar *othercasechar = NULL; #ifdef SUPPORT_UTF8 -int utf8length; +int utflength; #endif if (caseless && char_has_othercase(common, cc)) @@ -2336,9 +2336,9 @@ if (context->sourcereg == -1) } #ifdef SUPPORT_UTF8 
-utf8length = 1; -if (common->utf8 && *cc >= 0xc0) - utf8length += PRIV(utf8_table4)[*cc & 0x3f]; +utflength = 1; +if (common->utf && *cc >= 0xc0) + utflength += PRIV(utf8_table4)[*cc & 0x3f]; do { @@ -2432,9 +2432,9 @@ do cc++; #ifdef SUPPORT_UTF8 - utf8length--; + utflength--; } -while (utf8length > 0); +while (utflength > 0); #endif return cc; @@ -2480,7 +2480,7 @@ unsigned int typeoffset; int invertcmp, numberofcmps; unsigned int charoffset; -/* Although SUPPORT_UTF8 must be defined, we are not necessary in utf8 mode. */ +/* Although SUPPORT_UTF must be defined, we are not necessary in utf mode. */ check_input_end(common, fallbacks); read_char(common); @@ -2490,7 +2490,7 @@ if ((*cc++ & XCL_MAP) != 0) #ifndef COMPILE_PCRE8 jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); #elif defined SUPPORT_UTF8 - if (common->utf8) + if (common->utf) jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); #endif @@ -2504,7 +2504,7 @@ if ((*cc++ & XCL_MAP) != 0) #ifndef COMPILE_PCRE8 JUMPHERE(jump); #elif defined SUPPORT_UTF8 - if (common->utf8) + if (common->utf) JUMPHERE(jump); #endif OP1(SLJIT_MOV, TMP1, 0, TMP3, 0); @@ -2524,7 +2524,7 @@ while (*cc != XCL_END) { cc += 2; #ifdef SUPPORT_UTF8 - if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif #ifdef SUPPORT_UCP needschar = TRUE; @@ -2534,11 +2534,11 @@ while (*cc != XCL_END) { cc += 2; #ifdef SUPPORT_UTF8 - if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif cc++; #ifdef SUPPORT_UTF8 - if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif #ifdef SUPPORT_UCP needschar = TRUE; @@ -2639,7 +2639,7 @@ while (*cc != XCL_END) { cc ++; #ifdef SUPPORT_UTF8 - if (common->utf8) + if (common->utf) { GETCHARINC(c, 
cc); } @@ -2670,7 +2670,7 @@ while (*cc != XCL_END) { cc ++; #ifdef SUPPORT_UTF8 - if (common->utf8) + if (common->utf) { GETCHARINC(c, cc); } @@ -2679,7 +2679,7 @@ while (*cc != XCL_END) c = *cc++; SET_CHAR_OFFSET(c); #ifdef SUPPORT_UTF8 - if (common->utf8) + if (common->utf) { GETCHARINC(c, cc); } @@ -2876,7 +2876,7 @@ switch(type) case OP_ALLANY: check_input_end(common, fallbacks); #ifdef SUPPORT_UTF8 - if (common->utf8) + if (common->utf) { OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); @@ -3096,7 +3096,7 @@ switch(type) case OP_CHARI: length = 1; #ifdef SUPPORT_UTF8 - if (common->utf8 && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f]; + if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f]; #endif if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0) { @@ -3113,7 +3113,7 @@ switch(type) check_input_end(common, fallbacks); read_char(common); #ifdef SUPPORT_UTF8 - if (common->utf8) + if (common->utf) { GETCHAR(c, cc); } @@ -3130,7 +3130,7 @@ switch(type) case OP_NOT: case OP_NOTI: #ifdef SUPPORT_UTF8 - if (common->utf8) + if (common->utf) { length = 1; if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f]; @@ -3196,7 +3196,7 @@ switch(type) jump[0] = NULL; #ifdef SUPPORT_UTF8 /* This check can only be skipped in pure 8 bit mode. 
*/ - if (common->utf8) + if (common->utf) #endif { jump[0] = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); @@ -3231,7 +3231,7 @@ switch(type) OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); #ifdef SUPPORT_UTF8 - if (common->utf8) + if (common->utf) { OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, length); label = LABEL(); @@ -3269,7 +3269,7 @@ do { size = 1; #ifdef SUPPORT_UTF8 - if (common->utf8 && cc[1] >= 0xc0) + if (common->utf && cc[1] >= 0xc0) size += PRIV(utf8_table4)[cc[1] & 0x3f]; #endif } @@ -3277,7 +3277,7 @@ do { size = 1; #ifdef SUPPORT_UTF8 - if (common->utf8) + if (common->utf) { if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0) size = 0; @@ -3381,7 +3381,7 @@ if (withchecks && !common->jscript_compat) #ifdef SUPPORT_UTF8 #ifdef SUPPORT_UCP -if (common->utf8 && *cc == OP_REFI) +if (common->utf && *cc == OP_REFI) { SLJIT_ASSERT(TMP1 == SLJIT_TEMPORARY_REG1 && STACK_TOP == SLJIT_TEMPORARY_REG2 && TMP2 == SLJIT_TEMPORARY_REG3); OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), OVECTOR(offset + 1)); @@ -4787,7 +4787,7 @@ if (end != NULL) { *end = cc + 1; #ifdef SUPPORT_UTF8 - if (common->utf8 && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f]; + if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f]; #endif } return cc; @@ -6254,7 +6254,8 @@ common->casefulcmp = NULL; common->caselesscmp = NULL; common->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; #ifdef SUPPORT_UTF8 -common->utf8 = (re->options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. 
*/ +common->utf = (re->options & PCRE_UTF8) != 0; #ifdef SUPPORT_UCP common->useucp = (re->options & PCRE_UCP) != 0; #endif diff --git a/pcre_newline.c b/pcre_newline.c index 92b81d1..0c2ddcd 100644 --- a/pcre_newline.c +++ b/pcre_newline.c @@ -67,17 +67,17 @@ Arguments: type the newline type endptr pointer to the end of the string lenptr where to return the length - utf8 TRUE if in utf8 mode + utf TRUE if in utf mode Returns: TRUE or FALSE */ BOOL PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr, - BOOL utf8) + BOOL utf) { int c; -if (utf8) { GETCHAR(c, ptr); } else c = *ptr; +if (utf) { GETCHAR(c, ptr); } else c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) { @@ -96,7 +96,7 @@ else switch(c) case 0x000c: *lenptr = 1; return TRUE; /* FF */ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; return TRUE; /* CR */ - case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ + case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ default: return FALSE; @@ -117,19 +117,19 @@ Arguments: type the newline type startptr pointer to the start of the string lenptr where to return the length - utf8 TRUE if in utf8 mode + utf TRUE if in utf mode Returns: TRUE or FALSE */ BOOL PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr, - BOOL utf8) + BOOL utf) { int c; ptr--; #ifdef SUPPORT_UTF8 -if (utf8) +if (utf) { BACKCHAR(ptr); GETCHAR(c, ptr); @@ -154,7 +154,7 @@ else switch(c) case 0x000b: /* VT */ case 0x000c: /* FF */ case 0x000d: *lenptr = 1; return TRUE; /* CR */ - case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ + case 0x0085: *lenptr = utf? 
2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ default: return FALSE; diff --git a/pcre_ord2utf8.c b/pcre_ord2utf8.c index 354adc0..b374987 100644 --- a/pcre_ord2utf8.c +++ b/pcre_ord2utf8.c @@ -52,21 +52,28 @@ character value into a UTF8 string. */ * Convert character value to UTF-8 * *************************************************/ -/* This function takes an integer value in the range 0 - 0x7fffffff -and encodes it as a UTF-8 character in 0 to 6 bytes. +/* This function takes an integer value in the range 0 - 0x10ffff +and encodes it as a UTF-8 character in 1 to 6 pcre_uchars. Arguments: cvalue the character value - buffer pointer to buffer for result - at least 6 bytes long + buffer pointer to buffer for result - at least 6 pcre_uchars long Returns: number of characters placed in the buffer */ int -PRIV(ord2utf8)(int cvalue, pcre_uint8 *buffer) +PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer) { #ifdef SUPPORT_UTF8 + register int i, j; + +/* Checking invalid cvalue character, encoded as invalid UTF-16 character. +Should never happen in practice. */ +if ((cvalue & 0xf800) == 0xd800 || cvalue >= 0x110000) + cvalue = 0xfffe; + for (i = 0; i < PRIV(utf8_table1_size); i++) if (cvalue <= PRIV(utf8_table1)[i]) break; buffer += i; @@ -77,10 +84,13 @@ for (j = i; j > 0; j--) } *buffer = PRIV(utf8_table2)[i] | cvalue; return i + 1; + #else + (void)(cvalue); /* Keep compiler happy; this function won't ever be */ (void)(buffer); /* called when SUPPORT_UTF8 is not defined. */ return 0; + #endif } diff --git a/pcre_study.c b/pcre_study.c index 661627d..098980d 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -82,7 +82,8 @@ find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options, int recurse_depth) { int length = -1; -BOOL utf8 = (options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. 
*/ +BOOL utf = (options & PCRE_UTF8) != 0; BOOL had_recurse = FALSE; register int branchlength = 0; register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; @@ -224,7 +225,7 @@ for (;;) branchlength++; cc += 2; #ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif break; @@ -245,7 +246,7 @@ for (;;) branchlength += GET2(cc,1); cc += 2 + IMM2_SIZE; #ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif break; @@ -293,7 +294,7 @@ for (;;) case OP_ANYBYTE: #ifdef SUPPORT_UTF8 - if (utf8) return -1; + if (utf) return -1; #endif branchlength++; cc++; @@ -374,7 +375,7 @@ for (;;) case OP_REFI: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) { - ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf8, GET2(cc, 1)); + ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); if (cs == NULL) return -2; do ce += GET(ce, 1); while (*ce == OP_ALT); if (cc > cs && cc < ce) @@ -486,7 +487,7 @@ for (;;) cc += PRIV(OP_lengths)[op]; #ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; #endif break; @@ -537,29 +538,29 @@ Arguments: p points to the character caseless the caseless flag cd the block with char table pointers - utf8 TRUE for UTF-8 mode + utf TRUE for UTF-8 / UTF-16 mode Returns: pointer after the character */ static const pcre_uchar * set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, - compile_data *cd, BOOL utf8) + compile_data *cd, BOOL utf) { unsigned int c = *p; SET_BIT(c); #ifdef SUPPORT_UTF8 -if (utf8 && c > 127) +if (utf && c > 127) { GETCHARINC(c, p); #ifdef SUPPORT_UCP if (caseless) { - pcre_uint8 buff[8]; + pcre_uchar buff[6]; c = UCD_OTHERCASE(c); - (void)PRIV(ord2utf8)(c, buff); + 
(void)PRIV(ord2utf)(c, buff); SET_BIT(buff[0]); } #endif @@ -607,8 +608,8 @@ for (c = 128; c < 256; c++) { if ((cd->cbits[c/8] & (1 << (c&7))) != 0) { - pcre_uint8 buff[8]; - (void)PRIV(ord2utf8)(c, buff); + pcre_uchar buff[6]; + (void)PRIV(ord2utf)(c, buff); SET_BIT(buff[0]); } } @@ -663,7 +664,7 @@ function fails unless the result is SSB_DONE. Arguments: code points to an expression start_bits points to a 32-byte table, initialized to 0 - utf8 TRUE if in UTF-8 mode + utf TRUE if in UTF-8 / UTF-16 mode cd the block with char table pointers Returns: SSB_FAIL => Failed to find any starting bytes @@ -673,12 +674,12 @@ Returns: SSB_FAIL => Failed to find any starting bytes */ static int -set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf8, +set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, compile_data *cd) { register int c; int yield = SSB_DONE; -int table_limit = utf8? 16:32; +int table_limit = utf? 16:32; #if 0 /* ========================================================================= */ @@ -817,7 +818,7 @@ do case OP_ONCE: case OP_ONCE_NC: case OP_ASSERT: - rc = set_start_bits(tcode, start_bits, utf8, cd); + rc = set_start_bits(tcode, start_bits, utf, cd); if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; if (rc == SSB_DONE) try_next = FALSE; else { @@ -864,7 +865,7 @@ do case OP_BRAZERO: case OP_BRAMINZERO: case OP_BRAPOSZERO: - rc = set_start_bits(++tcode, start_bits, utf8, cd); + rc = set_start_bits(++tcode, start_bits, utf, cd); if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; /* ========================================================================= See the comment at the head of this function concerning the next line, @@ -891,7 +892,7 @@ do case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: - tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8); + tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); break; case OP_STARI: @@ -900,7 +901,7 @@ do case OP_QUERYI: case OP_MINQUERYI: case 
OP_POSQUERYI: - tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8); + tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); break; /* Single-char upto sets the bit and tries the next */ @@ -908,13 +909,13 @@ do case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: - tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf8); + tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf); break; case OP_UPTOI: case OP_MINUPTOI: case OP_POSUPTOI: - tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf8); + tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf); break; /* At least one single char sets the bit and stops */ @@ -926,7 +927,7 @@ do case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: - (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8); + (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); try_next = FALSE; break; @@ -937,7 +938,7 @@ do case OP_PLUSI: case OP_MINPLUSI: case OP_POSPLUSI: - (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8); + (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); try_next = FALSE; break; @@ -950,7 +951,7 @@ do case OP_HSPACE: SET_BIT(0x09); SET_BIT(0x20); - if (utf8) + if (utf) { SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ @@ -967,7 +968,7 @@ do SET_BIT(0x0B); SET_BIT(0x0C); SET_BIT(0x0D); - if (utf8) + if (utf) { SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ @@ -1057,7 +1058,7 @@ do case OP_HSPACE: SET_BIT(0x09); SET_BIT(0x20); - if (utf8) + if (utf) { SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ @@ -1073,7 +1074,7 @@ do SET_BIT(0x0B); SET_BIT(0x0C); SET_BIT(0x0D); - if (utf8) + if (utf) { SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ @@ -1126,7 +1127,7 @@ do case OP_NCLASS: #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 
0xff */ @@ -1147,7 +1148,7 @@ do characters in the range 128 - 255. */ #ifdef SUPPORT_UTF8 - if (utf8) + if (utf) { for (c = 0; c < 16; c++) start_bits[c] |= map[c]; for (c = 128; c < 256; c++) diff --git a/pcre_valid_utf8.c b/pcre_valid_utf8.c index 8d13014..bbab87f 100644 --- a/pcre_valid_utf8.c +++ b/pcre_valid_utf8.c @@ -103,7 +103,7 @@ Returns: = 0 if the string is a valid UTF-8 string */ int -PRIV(valid_utf8)(PCRE_PUCHAR string, int length, int *erroroffset) +PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) { #ifdef SUPPORT_UTF8 register PCRE_PUCHAR p; diff --git a/pcreposix.c b/pcreposix.c index 648254b..2dc1561 100644 --- a/pcreposix.c +++ b/pcreposix.c @@ -155,6 +155,7 @@ static const int eint[] = { REG_BADPAT, /* \k is not followed by a braced, angle-bracketed, or quoted name */ /* 70 */ REG_BADPAT, /* internal error: unknown opcode in find_fixedlength() */ + REG_BADPAT, /* Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) */ }; /* Table of texts corresponding to POSIX error codes */ diff --git a/sljit/sljitConfigInternal.h b/sljit/sljitConfigInternal.h index 3f771d8..ad0be19 100644 --- a/sljit/sljitConfigInternal.h +++ b/sljit/sljitConfigInternal.h @@ -354,8 +354,8 @@ typedef long int sljit_w; #endif /* !SLJIT_UNALIGNED */ #if (defined SLJIT_EXECUTABLE_ALLOCATOR && SLJIT_EXECUTABLE_ALLOCATOR) -static void* sljit_malloc_exec(sljit_uw size); -static void sljit_free_exec(void* ptr); +SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size); +SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr); #define SLJIT_MALLOC_EXEC(size) sljit_malloc_exec(size) #define SLJIT_FREE_EXEC(ptr) sljit_free_exec(ptr) #endif diff --git a/sljit/sljitExecAllocator.c b/sljit/sljitExecAllocator.c index bfe8eb1..cdea346 100644 --- a/sljit/sljitExecAllocator.c +++ b/sljit/sljitExecAllocator.c @@ -163,7 +163,7 @@ static SLJIT_INLINE void sljit_remove_free_block(struct free_block *free_block) } } -static void* sljit_malloc_exec(sljit_uw size) 
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size) { struct block_header *header; struct block_header *next_header; @@ -231,7 +231,7 @@ static void* sljit_malloc_exec(sljit_uw size) return MEM_START(header); } -static void sljit_free_exec(void* ptr) +SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr) { struct block_header *header; struct free_block* free_block; diff --git a/sljit/sljitLir.h b/sljit/sljitLir.h index 2a82968..54906bc 100644 --- a/sljit/sljitLir.h +++ b/sljit/sljitLir.h @@ -195,6 +195,8 @@ struct sljit_compiler { int local_size; /* Code size. */ sljit_uw size; + /* For statistical purposes. */ + sljit_uw executable_size; #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) int args; @@ -291,6 +293,15 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_compiler_verbose(struct sljit_compiler *comp SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler); SLJIT_API_FUNC_ATTRIBUTE void sljit_free_code(void* code); +/* + After the code generation we can retrieve the allocated executable memory size, + although this area may not be fully filled with instructions depending on some + optimizations. This function is useful only for statistical purposes. + + Before a successful code generation, this function returns with 0. +*/ +static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler *compiler) { return compiler->executable_size; } + /* Instruction generation. Returns with error code. */ /* diff --git a/sljit/sljitNativeARM_Thumb2.c b/sljit/sljitNativeARM_Thumb2.c index c476711..3764aeb 100644 --- a/sljit/sljitNativeARM_Thumb2.c +++ b/sljit/sljitNativeARM_Thumb2.c @@ -416,6 +416,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil SLJIT_CACHE_FLUSH(code, code_ptr); compiler->error = SLJIT_ERR_COMPILED; + compiler->executable_size = compiler->size * sizeof(sljit_uh); /* Set thumb mode flag. 
*/ return (void*)((sljit_uw)code | 0x1); } diff --git a/sljit/sljitNativeARM_v5.c b/sljit/sljitNativeARM_v5.c index 1b40afa..99584cf 100644 --- a/sljit/sljitNativeARM_v5.c +++ b/sljit/sljitNativeARM_v5.c @@ -788,6 +788,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil SLJIT_CACHE_FLUSH(code, code_ptr); compiler->error = SLJIT_ERR_COMPILED; + compiler->executable_size = size * sizeof(sljit_uw); return code; } diff --git a/sljit/sljitNativeMIPS_common.c b/sljit/sljitNativeMIPS_common.c index c4fe152..7fcb6d6 100644 --- a/sljit/sljitNativeMIPS_common.c +++ b/sljit/sljitNativeMIPS_common.c @@ -397,6 +397,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil } compiler->error = SLJIT_ERR_COMPILED; + compiler->executable_size = compiler->size * sizeof(sljit_ins); #ifndef __GNUC__ SLJIT_CACHE_FLUSH(code, code_ptr); #else diff --git a/sljit/sljitNativePPC_common.c b/sljit/sljitNativePPC_common.c index af14b75..28afd9e 100644 --- a/sljit/sljitNativePPC_common.c +++ b/sljit/sljitNativePPC_common.c @@ -354,6 +354,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil SLJIT_CACHE_FLUSH(code, code_ptr); compiler->error = SLJIT_ERR_COMPILED; + compiler->executable_size = compiler->size * sizeof(sljit_ins); #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) if (((sljit_w)code_ptr) & 0x4) diff --git a/sljit/sljitNativeX86_common.c b/sljit/sljitNativeX86_common.c index c6661bc..cc215a2 100644 --- a/sljit/sljitNativeX86_common.c +++ b/sljit/sljitNativeX86_common.c @@ -357,22 +357,22 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil while (jump) { if (jump->flags & PATCH_MB) { SLJIT_ASSERT((sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))) >= -128 && (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))) <= 127); - *(sljit_ub*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_b)); + *(sljit_ub*)jump->addr = 
(sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))); } else if (jump->flags & PATCH_MW) { if (jump->flags & JUMP_LABEL) { #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - *(sljit_w*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_w)); + *(sljit_w*)jump->addr = (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_w))); #else SLJIT_ASSERT((sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))) >= -0x80000000ll && (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))) <= 0x7fffffffll); - *(sljit_hw*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_hw)); + *(sljit_hw*)jump->addr = (sljit_hw)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))); #endif } else { #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - *(sljit_w*)jump->addr = jump->u.target - (jump->addr + sizeof(sljit_w)); + *(sljit_w*)jump->addr = (sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_w))); #else SLJIT_ASSERT((sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_hw))) >= -0x80000000ll && (sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_hw))) <= 0x7fffffffll); - *(sljit_hw*)jump->addr = jump->u.target - (jump->addr + sizeof(sljit_hw)); + *(sljit_hw*)jump->addr = (sljit_hw)(jump->u.target - (jump->addr + sizeof(sljit_hw))); #endif } } @@ -387,6 +387,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil /* Maybe we waste some space because of short jumps. 
*/ SLJIT_ASSERT(code_ptr <= code + compiler->size); compiler->error = SLJIT_ERR_COMPILED; + compiler->executable_size = compiler->size; return (void*)code; } @@ -1360,7 +1361,7 @@ static int emit_mul(struct sljit_compiler *compiler, code = (sljit_ub*)ensure_buf(compiler, 1 + 4); FAIL_IF(!code); INC_CSIZE(4); - *(sljit_hw*)code = src1w; + *(sljit_hw*)code = (sljit_hw)src1w; } else { EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w); @@ -1403,7 +1404,7 @@ static int emit_mul(struct sljit_compiler *compiler, code = (sljit_ub*)ensure_buf(compiler, 1 + 4); FAIL_IF(!code); INC_CSIZE(4); - *(sljit_hw*)code = src2w; + *(sljit_hw*)code = (sljit_hw)src2w; } else { EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w); diff --git a/testdata/testinput10 b/testdata/testinput10 index 7210cc5..8e70c70 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -52,11 +52,9 @@ is required for these tests. --/ /\x{100000}/8BM -/\x{1000000}/8BM +/\x{10ffff}/8BM -/\x{4000000}/8BM - -/\x{7fffFFFF}/8BM +/\x{110000}/8BM /[\x{ff}]/8BM diff --git a/testdata/testinput5 b/testdata/testinput5 index ca7eb54..9ba5b4b 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -9,11 +9,9 @@ /\x{100000}/8DZ -/\x{1000000}/8DZ +/\x{10ffff}/8DZ -/\x{4000000}/8DZ - -/\x{7fffFFFF}/8DZ +/\x{110000}/8DZ /[\x{ff}]/8DZ @@ -23,6 +21,14 @@ /\x{100000000}/8 +/\x{d800}/8 + +/\x{dfff}/8 + +/\x{d7ff}/8 + +/\x{e000}/8 + /^\x{100}a\x{1234}/8 \x{100}a\x{1234}bcd diff --git a/testdata/testoutput10 b/testdata/testoutput10 index f66a12a..47a2a97 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -317,32 +317,17 @@ Memory allocation (code space): 12 11 End ------------------------------------------------------------------ -/\x{1000000}/8BM -Memory allocation (code space): 13 ------------------------------------------------------------------- - 0 9 Bra - 3 \x{1000000} - 9 9 Ket - 12 End ------------------------------------------------------------------- - -/\x{4000000}/8BM -Memory allocation (code 
space): 14 +/\x{10ffff}/8BM +Memory allocation (code space): 12 ------------------------------------------------------------------ - 0 10 Bra - 3 \x{4000000} - 10 10 Ket - 13 End + 0 8 Bra + 3 \x{10ffff} + 8 8 Ket + 11 End ------------------------------------------------------------------ -/\x{7fffFFFF}/8BM -Memory allocation (code space): 14 ------------------------------------------------------------------- - 0 10 Bra - 3 \x{7fffffff} - 10 10 Ket - 13 End ------------------------------------------------------------------- +/\x{110000}/8BM +Failed: character value in \x{...} sequence is too large at offset 9 /[\x{ff}]/8BM Memory allocation (code space): 10 diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 1eaab47..b63934d 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -49,42 +49,21 @@ Options: utf8 First char = 244 Need char = 128 -/\x{1000000}/8DZ +/\x{10ffff}/8DZ ------------------------------------------------------------------ Bra - \x{1000000} + \x{10ffff} Ket End ------------------------------------------------------------------ Capturing subpattern count = 0 Options: utf8 -First char = 249 -Need char = 128 - -/\x{4000000}/8DZ ------------------------------------------------------------------- - Bra - \x{4000000} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 252 -Need char = 128 - -/\x{7fffFFFF}/8DZ ------------------------------------------------------------------- - Bra - \x{7fffffff} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 253 +First char = 244 Need char = 191 +/\x{110000}/8DZ +Failed: character value in \x{...} sequence is too large at offset 9 + /[\x{ff}]/8DZ ------------------------------------------------------------------ Bra @@ -115,6 +94,16 @@ Failed: character value in \x{...} sequence is too large at offset 11 
/\x{100000000}/8 Failed: character value in \x{...} sequence is too large at offset 12 +/\x{d800}/8 +Failed: Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) at offset 7 + +/\x{dfff}/8 +Failed: Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) at offset 7 + +/\x{d7ff}/8 + +/\x{e000}/8 + /^\x{100}a\x{1234}/8 \x{100}a\x{1234}bcd 0: \x{100}a\x{1234} @@ -1436,7 +1425,7 @@ No match /[\H]/8BZ ------------------------------------------------------------------ Bra - [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{7fffffff}] + [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}] Ket End ------------------------------------------------------------------ @@ -1444,7 +1433,7 @@ No match /[\V]/8BZ ------------------------------------------------------------------ Bra - [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{2029}-\x{7fffffff}] + [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{2029}-\x{10ffff}] Ket End ------------------------------------------------------------------ |