diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-03 23:58:37 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-03 23:58:37 +0000 |
commit | 216818740b54b629e7bd59cd49f783c72e244e23 (patch) | |
tree | 35603a12be962c35a4e39e879a1a8e021f53d765 | |
parent | ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c (diff) | |
download | pcre-216818740b54b629e7bd59cd49f783c72e244e23.tar.gz |
Start working on UTF-16. Updating macros and adding new ones.
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@782 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | Makefile.am | 1 | ||||
-rw-r--r-- | pcre16_ord2utf16.c | 4 | ||||
-rw-r--r-- | pcre16_ucd.c | 45 | ||||
-rw-r--r-- | pcre16_utf16_utils.c | 3 | ||||
-rw-r--r-- | pcre16_valid_utf16.c | 3 | ||||
-rw-r--r-- | pcre_compile.c | 63 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 35 | ||||
-rw-r--r-- | pcre_exec.c | 60 | ||||
-rw-r--r-- | pcre_internal.h | 126 | ||||
-rw-r--r-- | pcre_jit_compile.c | 373 | ||||
-rw-r--r-- | pcre_printint.src | 63 | ||||
-rw-r--r-- | pcre_study.c | 6 | ||||
-rw-r--r-- | pcre_tables.c | 8 |
13 files changed, 574 insertions, 216 deletions
diff --git a/Makefile.am b/Makefile.am index 39cf574..c939f9f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -219,6 +219,7 @@ libpcre16_la_SOURCES = \ pcre16_study.c \ pcre16_tables.c \ pcre16_try_flipped.c \ + pcre16_ucd.c \ pcre16_utf16_utils.c \ pcre16_valid_utf16.c \ pcre16_xclass.c diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c index 421c3a3..b02ccc2 100644 --- a/pcre16_ord2utf16.c +++ b/pcre16_ord2utf16.c @@ -45,8 +45,10 @@ character value into a UTF16 string. */ #include "config.h" #endif -#include "pcre_internal.h" +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 +#include "pcre_internal.h" /************************************************* * Convert character value to UTF-16 * diff --git a/pcre16_ucd.c b/pcre16_ucd.c new file mode 100644 index 0000000..962ed46 --- /dev/null +++ b/pcre16_ucd.c @@ -0,0 +1,45 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2011 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 + +#include "pcre_ucd.c" + +/* End of pcre16_ucd.c */ diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c index 5ff3953..ddd96b9 100644 --- a/pcre16_utf16_utils.c +++ b/pcre16_utf16_utils.c @@ -46,6 +46,9 @@ strings to host byte order. */ #include "config.h" #endif +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 + #include "pcre_internal.h" int diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c index c7c7507..cc3e50e 100644 --- a/pcre16_valid_utf16.c +++ b/pcre16_valid_utf16.c @@ -46,6 +46,9 @@ strings. */ #include "config.h" #endif +/* Generate code with 16 bit character support. 
*/ +#define COMPILE_PCRE16 + #include "pcre_internal.h" diff --git a/pcre_compile.c b/pcre_compile.c index da4ce22..3461dbd 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -1466,8 +1466,8 @@ for (; ptr < cd->end_pattern; ptr++) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } ptr++; -#ifdef SUPPORT_UTF8 - if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; +#ifdef SUPPORT_UTF + if (utf) FORWARDCHAR(ptr); #endif } if (*ptr == 0) goto FAIL_EXIT; @@ -1759,8 +1759,8 @@ for (;;) case OP_NOTI: branchlength++; cc += 2; -#ifdef SUPPORT_UTF8 - if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -1773,8 +1773,8 @@ for (;;) case OP_NOTEXACTI: branchlength += GET2(cc,1); cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UTF8 - if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -2041,7 +2041,7 @@ for (;;) a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) switch(c) { case OP_CHAR: @@ -2072,7 +2072,7 @@ for (;;) case OP_MINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: - if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f]; + if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); break; } #else @@ -2161,7 +2161,7 @@ for (;;) by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. 
*/ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) switch(c) { case OP_CHAR: @@ -2192,7 +2192,7 @@ for (;;) case OP_MINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: - if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f]; + if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); break; } #else @@ -2452,7 +2452,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, MINUPTO, and POSUPTO may be followed by a multibyte character */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF case OP_STAR: case OP_STARI: case OP_MINSTAR: @@ -2465,7 +2465,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); case OP_MINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: - if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f]; + if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]); break; case OP_UPTO: @@ -2474,7 +2474,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); case OP_MINUPTOI: case OP_POSUPTO: case OP_POSUPTOI: - if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f]; + if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]); break; #endif @@ -2913,8 +2913,8 @@ if ((options & PCRE_EXTENDED) != 0) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } ptr++; -#ifdef SUPPORT_UTF8 - if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; +#ifdef SUPPORT_UTF + if (utf) FORWARDCHAR(ptr); #endif } } @@ -2957,8 +2957,8 @@ if ((options & PCRE_EXTENDED) != 0) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } ptr++; -#ifdef SUPPORT_UTF8 - if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; +#ifdef SUPPORT_UTF + if (utf) FORWARDCHAR(ptr); #endif } } @@ -3424,7 +3424,7 @@ for (;; ptr++) int tempbracount; pcre_uchar mcbuffer[8]; - /* Get next byte in the pattern */ + /* Get next character in the pattern */ c = *ptr; @@ -3556,8 +3556,8 @@ for (;; ptr++) { if 
(IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } ptr++; -#ifdef SUPPORT_UTF8 - if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; +#ifdef SUPPORT_UTF + if (utf) FORWARDCHAR(ptr); #endif } if (*ptr != 0) continue; @@ -4601,7 +4601,7 @@ for (;; ptr++) { op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR; - /* Deal with UTF-8 characters that take up more than one byte. It's + /* Deal with UTF characters that take up more than one character. It's easier to write this out separately than try to macrify it. Use c to hold the length of the character in bytes, plus 0x80 to flag that it's a length rather than a small character. */ @@ -4610,16 +4610,16 @@ for (;; ptr++) if (utf && (code[-1] & 0x80) != 0) { pcre_uchar *lastchar = code - 1; - while((*lastchar & 0xc0) == 0x80) lastchar--; + BACKCHAR(lastchar); c = code - lastchar; /* Length of UTF-8 character */ - memcpy(utf_chars, lastchar, c); /* Save the char */ + memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */ c |= 0x80; /* Flag c as a length */ } else #endif - /* Handle the case of a single byte - either with no UTF8 support, or - with UTF-8 disabled, or for a UTF-8 character < 128. */ + /* Handle the case of a single character - either with no UTF support, or + with UTF disabled, or for a single character UTF character. 
*/ { c = code[-1]; @@ -5273,9 +5273,9 @@ for (;; ptr++) else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT) { tempcode += PRIV(OP_lengths)[*tempcode]; -#ifdef SUPPORT_UTF8 - if (utf && tempcode[-1] >= 0xc0) - tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(tempcode[-1])) + tempcode += GET_EXTRALEN(tempcode[-1]); #endif } @@ -6659,11 +6659,10 @@ for (;; ptr++) mclength = 1; mcbuffer[0] = c; -#ifdef SUPPORT_UTF8 - if (utf && c >= 0xc0) +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(c)) { - while ((ptr[1] & 0xc0) == 0x80) - mcbuffer[mclength++] = *(++ptr); + INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); } #endif diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index 8247f46..d7b292d 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -480,9 +480,7 @@ if (*first_op == OP_REVERSE) { if (current_subject <= start_subject) break; current_subject--; - while (current_subject > start_subject && - (*current_subject & 0xc0) == 0x80) - current_subject--; + INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--); } } else @@ -3161,9 +3159,17 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)? 
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; } +#ifdef COMPILE_PCRE8 if (start_offset > 0 && start_offset < length && (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80) return PCRE_ERROR_BADUTF8_OFFSET; +#else +#ifdef COMPILE_PCRE16 + if (start_offset > 0 && start_offset < length && + (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00) + return PCRE_ERROR_BADUTF8_OFFSET; +#endif /* COMPILE_PCRE16 */ +#endif /* COMPILE_PCRE8 */ } #endif @@ -3234,13 +3240,13 @@ for (;;) if (firstline) { PCRE_PUCHAR t = current_subject; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { while (t < md->end_subject && !IS_NEWLINE(t)) { t++; - while (t < end_subject && (*t & 0xc0) == 0x80) t++; + INTERNALCHAR(t < end_subject, *t, t++); } } else @@ -3277,16 +3283,15 @@ for (;;) { if (current_subject > md->start_subject + start_offset) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) { current_subject++; - while(current_subject < end_subject && - (*current_subject & 0xc0) == 0x80) - current_subject++; + INTERNALCHAR(current_subject < end_subject, *current_subject, + current_subject++); } } else @@ -3316,10 +3321,10 @@ for (;;) if ((start_bits[c/8] & (1 << (c&7))) == 0) { current_subject++; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) - while(current_subject < end_subject && - (*current_subject & 0xc0) == 0x80) current_subject++; + INTERNALCHAR(current_subject < end_subject, *current_subject, + current_subject++); #endif } else break; @@ -3426,11 +3431,13 @@ for (;;) if (firstline && IS_NEWLINE(current_subject)) break; current_subject++; +#ifdef SUPPORT_UTF if (utf) { - while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) - current_subject++; + INTERNALCHAR(current_subject < end_subject, *current_subject, + current_subject++); } +#endif if (current_subject > end_subject) break; /* If we have just passed a CR and we are now at a LF, and the pattern does diff --git a/pcre_exec.c 
b/pcre_exec.c index db013e6..6761598 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -2077,7 +2077,7 @@ for (;;) if (eptr == md->start_subject) prev_is_word = FALSE; else { PCRE_PUCHAR lastptr = eptr - 1; - while((*lastptr & 0xc0) == 0x80) lastptr--; + BACKCHAR(lastptr); if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; GETCHAR(c, lastptr); #ifdef SUPPORT_UCP @@ -2189,7 +2189,9 @@ for (;;) MRRETURN(MATCH_NOMATCH); } eptr++; - if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; +#ifdef SUPPORT_UTF + if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); +#endif ecode++; break; @@ -4074,7 +4076,7 @@ for (;;) /* Handle all other cases when the coding is UTF-8 */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) switch(ctype) { case OP_ANY: @@ -4087,7 +4089,7 @@ for (;;) } if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4100,7 +4102,7 @@ for (;;) MRRETURN(MATCH_NOMATCH); } eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4298,7 +4300,8 @@ for (;;) } if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH); - while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); + eptr++; + INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4326,7 +4329,8 @@ for (;;) } if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH); - while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); + eptr++; + INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -5309,7 +5313,7 @@ for (;;) else #endif /* SUPPORT_UCP */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { switch(ctype) @@ -5326,7 +5330,7 @@ for (;;) } if (IS_NEWLINE(eptr)) break; eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + INTERNALCHAR(eptr < 
md->end_subject, *eptr, eptr++); } } @@ -5343,7 +5347,7 @@ for (;;) } if (IS_NEWLINE(eptr)) break; eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); } } break; @@ -5359,7 +5363,7 @@ for (;;) break; } eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); } } else @@ -6014,10 +6018,18 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; } - /* Check that a start_offset points to the start of a UTF-8 character. */ + /* Check that a start_offset points to the start of a UTF character. */ +#ifdef COMPILE_PCRE8 if (start_offset > 0 && start_offset < length && (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80) return PCRE_ERROR_BADUTF8_OFFSET; +#else +#ifdef COMPILE_PCRE16 + if (start_offset > 0 && start_offset < length && + (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00) + return PCRE_ERROR_BADUTF8_OFFSET; +#endif /* COMPILE_PCRE16 */ +#endif /* COMPILE_PCRE8 */ } #endif @@ -6291,13 +6303,13 @@ for(;;) if (firstline) { PCRE_PUCHAR t = start_match; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { while (t < md->end_subject && !IS_NEWLINE(t)) { t++; - while (t < end_subject && (*t & 0xc0) == 0x80) t++; + INTERNALCHAR(t < end_subject, *t, t++); } } else @@ -6333,14 +6345,14 @@ for(;;) { if (start_match > md->start_subject + start_offset) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { while (start_match < end_subject && !WAS_NEWLINE(start_match)) { start_match++; - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) - start_match++; + INTERNALCHAR(start_match < end_subject, *start_match, + start_match++); } } else @@ -6366,7 +6378,7 @@ for(;;) { while (start_match < end_subject) { -#ifdef COMPILE_PCRE8 +#ifdef COMPILE_PCRE register unsigned int c = *start_match; #else register unsigned int c = *start_match & 0xff; @@ -6374,10 +6386,10 @@ 
for(;;) if ((start_bits[c/8] & (1 << (c&7))) == 0) { start_match++; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) - start_match++; + INTERNALCHAR(start_match < end_subject, *start_match, + start_match++); #endif } else break; @@ -6506,10 +6518,10 @@ for(;;) case MATCH_PRUNE: case MATCH_THEN: new_start_match = start_match + 1; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) - while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) - new_start_match++; + INTERNALCHAR(new_start_match < end_subject, *new_start_match, + new_start_match++); #endif break; diff --git a/pcre_internal.h b/pcre_internal.h index 637565b..7642b91 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -531,7 +531,9 @@ not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should never be called in byte mode. To make sure they can never even appear when UTF-8 support is omitted, we don't even define them. */ -#ifndef SUPPORT_UTF8 +/* #define HAS_EXTRALEN(c) */ +/* #define GET_EXTRALEN(c) */ +#ifndef SUPPORT_UTF #define GETCHAR(c, eptr) c = *eptr; #define GETCHARTEST(c, eptr) c = *eptr; #define GETCHARINC(c, eptr) c = *eptr++; @@ -539,14 +541,27 @@ UTF-8 support is omitted, we don't even define them. */ #define GETCHARLEN(c, eptr, len) c = *eptr; /* #define GETCHARLENTEST(c, eptr, len) */ /* #define BACKCHAR(eptr) */ +/* #define FORWARDCHAR(eptr) */ +/* #define INTERNALCHAR(condition, eptr, action) */ + +#else /* SUPPORT_UTF */ -#else /* SUPPORT_UTF8 */ +#ifdef COMPILE_PCRE8 /* These macros were originally written in the form of loops that used data from the tables whose names start with PRIV(utf8_table). They were rewritten by a user so as not to use loops, because in some environments this gives a significant performance advantage, and it seems never to do any harm. */ +/* Tests whether the code point needs extra characters to decode. 
*/ + +#define HAS_EXTRALEN(c) ((c) >= 0xc0) + +/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE. +Otherwise it has an undefined behaviour. */ + +#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f]) + +/* Base macro to pick up the remaining bytes of a UTF-8 character, not advancing the pointer. */ @@ -689,7 +704,107 @@ because almost all calls are already within a block of UTF-8 only code. */ #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- -#endif /* SUPPORT_UTF8 */ +/* Same as above, just in the other direction. */ +#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++ + +/* Same as above, but it allows a fully customizable form. */ +#define INTERNALCHAR(condition, eptr, action) \ + while((condition) && ((eptr) & 0xc0) == 0x80) action + +#else /* COMPILE_PCRE8 */ + +#ifdef COMPILE_PCRE16 + +/* Tests whether the code point needs extra characters to decode. */ + +#define HAS_EXTRALEN(c) (((c) & 0xfc00) == 0xd800) + +/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE. +Otherwise it has an undefined behaviour. */ + +#define GET_EXTRALEN(c) 1 + +/* Base macro to pick up the low surrogate of a UTF-16 character, not +advancing the pointer. */ + +#define GETUTF16(c, eptr) \ + { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; } + +/* Get the next UTF-16 character, not advancing the pointer. This is called when +we know we are in UTF-16 mode. */ + +#define GETCHAR(c, eptr) \ + c = *eptr; \ + if ((c & 0xfc00) == 0xd800) GETUTF16(c, eptr); + +/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the +pointer. */ + +#define GETCHARTEST(c, eptr) \ + c = *eptr; \ + if (utf && (c & 0xfc00) == 0xd800) GETUTF16(c, eptr); + +/* Base macro to pick up the low surrogate of a UTF-16 character, advancing +the pointer. 
*/ + +#define GETUTF16INC(c, eptr) \ + { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; eptr++; } + +/* Get the next UTF-16 character, advancing the pointer. This is called when we +know we are in UTF-16 mode. */ + +#define GETCHARINC(c, eptr) \ + c = *eptr++; \ + if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr); + +/* Get the next character, testing for UTF-16 mode, and advancing the pointer. +This is called when we don't know if we are in UTF-16 mode. */ + +#define GETCHARINCTEST(c, eptr) \ + c = *eptr++; \ + if (utf && (c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr); + +/* Base macro to pick up the low surrogate of a UTF-16 character, not +advancing the pointer, incrementing the length. */ + +#define GETUTF16LEN(c, eptr, len) \ + { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; len++; } + +/* Get the next UTF-16 character, not advancing the pointer, incrementing +length if there is a low surrogate. This is called when we know we are in +UTF-16 mode. */ + +#define GETCHARLEN(c, eptr, len) \ + c = *eptr; \ + if ((c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len); + +/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the +pointer, incrementing length if there is a low surrogate. This is called when +we do not know if we are in UTF-16 mode. */ + +#define GETCHARLENTEST(c, eptr, len) \ + c = *eptr; \ + if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len); + +/* If the pointer is not at the start of a character, move it back until +it is. This is called only in UTF-16 mode - we don't put a test within the +macro because almost all calls are already within a block of UTF-16 only +code. */ + +#define BACKCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr-- + +/* Same as above, just in the other direction. */ +#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++ + +/* Same as above, but it allows a fully customizable form. 
*/ +#define INTERNALCHAR(condition, eptr, action) \ + if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action + +#endif + +#endif /* COMPILE_PCRE8 */ + +#endif /* SUPPORT_UTF */ /* In case there is no definition of offsetof() provided - though any proper @@ -2043,12 +2158,15 @@ of the exported public functions. They have to be "external" in the C sense, but are not part of the PCRE public API. The data for these tables is in the pcre_tables.c module. */ +#ifdef COMPILE_PCRE8 + extern const int PRIV(utf8_table1)[]; +extern const int PRIV(utf8_table1_size); extern const int PRIV(utf8_table2)[]; extern const int PRIV(utf8_table3)[]; extern const pcre_uint8 PRIV(utf8_table4)[]; -extern const int PRIV(utf8_table1_size); +#endif /* COMPILE_PCRE8 */ extern const char PRIV(utt_names)[]; extern const ucp_type_table PRIV(utt)[]; diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index 16611f1..03c7b2c 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -302,9 +302,11 @@ typedef struct compiler_common { #ifdef SUPPORT_UCP BOOL useucp; #endif - jump_list *utf8readchar; - jump_list *utf8readtype8; + jump_list *utfreadchar; +#ifdef COMPILE_PCRE8 + jump_list *utfreadtype8; #endif +#endif /* SUPPORT_UTF8 */ #ifdef SUPPORT_UCP jump_list *getucd; #endif @@ -543,8 +545,8 @@ switch(*cc) case OP_NOTPOSPLUSI: case OP_NOTPOSQUERYI: cc += 2; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif return cc; @@ -565,8 +567,8 @@ switch(*cc) case OP_NOTEXACTI: case OP_NOTPOSUPTOI: cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif return cc; @@ -1285,7 +1287,7 @@ return MAX_255(c) ? 
common->fcc[c] != c : FALSE; static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigned int c) { /* Returns with the othercase. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf && c > 127) { #ifdef SUPPORT_UCP @@ -1302,11 +1304,11 @@ static unsigned int char_get_othercase_bit(compiler_common *common, pcre_uchar* { /* Detects if the character and its othercase has only 1 bit difference. */ unsigned int c, oc, bit; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF8 && defined COMPILE_PCRE8 int n; #endif -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { GETCHAR(c, cc); @@ -1324,11 +1326,11 @@ if (common->utf) else { c = *cc; - oc = common->fcc[c]; + oc = TABLE_GET(c, common->fcc, c); } #else c = *cc; -oc = common->fcc[c]; +oc = TABLE_GET(c, common->fcc, c); #endif SLJIT_ASSERT(c != oc); @@ -1342,10 +1344,12 @@ if (c <= 127 && bit == 0x20) if (!ispowerof2(bit)) return 0; +#ifdef COMPILE_PCRE8 + #ifdef SUPPORT_UTF8 if (common->utf && c > 127) { - n = PRIV(utf8_table4)[*cc & 0x3f]; + n = GET_EXTRALEN(*cc); while ((bit & 0x3f) == 0) { n--; @@ -1353,8 +1357,25 @@ if (common->utf && c > 127) } return (n << 8) | bit; } -#endif +#endif /* SUPPORT_UTF8 */ return (0 << 8) | bit; + +#else /* COMPILE_PCRE8 */ + +#ifdef COMPILE_PCRE16 +#ifdef SUPPORT_UTF16 +if (common->utf && c > 65535) + { + if (bit >= (1 << 10)) + bit >>= 10; + else + return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8)); + } +#endif /* SUPPORT_UTF16 */ +return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8)); +#endif /* COMPILE_PCRE16 */ + +#endif /* COMPILE_PCRE8 */ } static SLJIT_INLINE void check_input_end(compiler_common *common, jump_list **fallbacks) @@ -1368,16 +1389,22 @@ static void read_char(compiler_common *common) /* Reads the character into TMP1, updates STR_PTR. Does not check STR_END. TMP2 Destroyed. 
*/ DEFINE_COMPILER; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF struct sljit_jump *jump; #endif OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { +#ifdef COMPILE_PCRE8 jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL)); +#else +#ifdef COMPILE_PCRE16 + jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800); +#endif +#endif /* COMPILE_PCRE8 */ + add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); JUMPHERE(jump); } #endif @@ -1389,16 +1416,22 @@ static void peek_char(compiler_common *common) /* Reads the character into TMP1, keeps STR_PTR. Does not check STR_END. TMP2 Destroyed. */ DEFINE_COMPILER; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF struct sljit_jump *jump; #endif OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { +#ifdef COMPILE_PCRE8 jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL)); +#else +#ifdef COMPILE_PCRE16 + jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800); +#endif +#endif /* COMPILE_PCRE8 */ + add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); JUMPHERE(jump); } @@ -1409,46 +1442,83 @@ static void read_char8_type(compiler_common *common) { /* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. 
*/ DEFINE_COMPILER; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 struct sljit_jump *jump; #endif -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +#ifdef COMPILE_PCRE8 /* This can be an extra read in some situations, but hopefully - it is a clever early read in most cases. */ + it is needed in most cases. */ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0); - add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL)); + add_jump(compiler, &common->utfreadtype8, JUMP(SLJIT_FAST_CALL)); JUMPHERE(jump); +#else +#ifdef COMPILE_PCRE16 + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); + jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff); + OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); + JUMPHERE(jump); + /* Skip low surrogate if necessary. */ + OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); +#endif +#endif /* COMPILE_PCRE8 */ return; } #endif -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes); +#ifdef COMPILE_PCRE16 +/* The ctypes array contains only 255 values. 
*/ +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); +jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff); +#endif +OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); +#ifdef COMPILE_PCRE16 +JUMPHERE(jump); +#endif } static void skip_char_back(compiler_common *common) { /* Goes one character back. Only affects STR_PTR. Does not check begin. */ DEFINE_COMPILER; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 struct sljit_label *label; if (common->utf) { label = LABEL(); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, label); return; } #endif +#if defined SUPPORT_UTF && defined COMPILE_PCRE16 +if (common->utf) + { + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + /* Skip low surrogate if necessary. */ + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + return; + } +#endif OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); } @@ -1477,10 +1547,12 @@ else } } -#ifdef SUPPORT_UTF8 -static void do_utf8readchar(compiler_common *common) +#ifdef SUPPORT_UTF + +#ifdef COMPILE_PCRE8 +static void do_utfreadchar(compiler_common *common) { -/* Fast decoding an utf8 character. TMP1 contains the first byte +/* Fast decoding a UTF-8 character. TMP1 contains the first byte of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. 
*/ DEFINE_COMPILER; struct sljit_jump *jump; @@ -1489,82 +1561,57 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize); /* Searching for the first zero. */ OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20); jump = JUMP(SLJIT_C_NOT_ZERO); -/* 2 byte sequence */ -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +/* Two byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1f); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(jump); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10); jump = JUMP(SLJIT_C_NOT_ZERO); -/* 3 byte sequence */ -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); +/* Three byte sequence. 
*/ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0f); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 12); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 2); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 2); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(2)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(jump); -OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x08); -jump = JUMP(SLJIT_C_NOT_ZERO); -/* 4 byte sequence */ -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); +/* Four byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x07); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 18); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 3); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(3)); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 3); -sljit_emit_fast_return(compiler, RETURN_ADDR, 0); -JUMPHERE(jump); - -/* 
5 byte sequence */ -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); -OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x03); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 24); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 4); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 4); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 4); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(3)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); } -static void do_utf8readtype8(compiler_common *common) +static void do_utfreadtype8(compiler_common *common) { -/* Fast decoding an utf8 character type. TMP2 contains the first byte -of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */ +/* Fast decoding a UTF-8 character type. TMP2 contains the first byte +of the character (>= 0xc0). Return value in TMP1. */ DEFINE_COMPILER; struct sljit_jump *jump; struct sljit_jump *compare; @@ -1573,9 +1620,9 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x20); jump = JUMP(SLJIT_C_NOT_ZERO); -/* 2 byte sequence */ -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +/* Two byte sequence. 
*/ +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x1f); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); @@ -1596,7 +1643,38 @@ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); } -#endif +#else /* COMPILE_PCRE8 */ + +#ifdef COMPILE_PCRE16 +static void do_utfreadchar(compiler_common *common) +{ +/* Fast decoding a UTF-16 character. TMP1 contains the first 16 bit char +of the character (>= 0xd800). Return char value in TMP1, length - 1 in TMP2. */ +DEFINE_COMPILER; +struct sljit_jump *jump; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize); +jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xdc00); +/* Do nothing, only return. */ +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump); +/* Combine two 16 bit characters. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10); +OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3ff); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} +#endif /* COMPILE_PCRE16 */ + +#endif /* COMPILE_PCRE8 */ + +#endif /* SUPPORT_UTF */ #ifdef SUPPORT_UCP @@ -1634,8 +1712,8 @@ struct sljit_label *newlinelabel = NULL; struct sljit_jump *start; struct sljit_jump *end = NULL; struct sljit_jump *nl = NULL; -#ifdef SUPPORT_UTF8 -struct sljit_jump *singlebyte; +#ifdef SUPPORT_UTF +struct sljit_jump *singlechar; #endif jump_list *newline = NULL; BOOL newlinecheck = FALSE; @@ -1708,13 +1786,25 @@ if (newlinecheck) CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, 
newlinelabel); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (common->utf) { - singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); + singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - JUMPHERE(singlebyte); + JUMPHERE(singlechar); + } +#endif +#if defined SUPPORT_UTF && defined COMPILE_PCRE16 +if (common->utf) + { + singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800); + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + JUMPHERE(singlechar); } #endif JUMPHERE(start); @@ -1770,7 +1860,7 @@ else } OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (common->utf) { CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); @@ -1778,6 +1868,17 @@ if (common->utf) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); } #endif +#if defined SUPPORT_UTF && defined COMPILE_PCRE16 +if (common->utf) + { + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start); + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + } +#endif JUMPTO(SLJIT_JUMP, start); JUMPHERE(found); JUMPHERE(leave); @@ -1900,7 +2001,7 @@ if (common->utf) OP1(SLJIT_MOV, TMP1, 0, TMP3, 0); #endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if 
(common->utf) { CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); @@ -1908,6 +2009,17 @@ if (common->utf) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); } #endif +#if defined SUPPORT_UTF && defined COMPILE_PCRE16 +if (common->utf) + { + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start); + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + } +#endif JUMPTO(SLJIT_JUMP, start); JUMPHERE(found); JUMPHERE(leave); @@ -2335,10 +2447,10 @@ if (context->sourcereg == -1) context->sourcereg = TMP2; } -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF utflength = 1; -if (common->utf && *cc >= 0xc0) - utflength += PRIV(utf8_table4)[*cc & 0x3f]; +if (common->utf && HAS_EXTRALEN(*cc)) + utflength += GET_EXTRALEN(*cc); do { @@ -2523,8 +2635,8 @@ while (*cc != XCL_END) if (*cc == XCL_SINGLE) { cc += 2; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif #ifdef SUPPORT_UCP needschar = TRUE; @@ -2533,12 +2645,12 @@ while (*cc != XCL_END) else if (*cc == XCL_RANGE) { cc += 2; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif cc++; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif #ifdef SUPPORT_UCP needschar = TRUE; @@ -2875,24 +2987,35 @@ switch(type) case OP_ALLANY: check_input_end(common, fallbacks); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - 
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +#ifdef COMPILE_PCRE8 jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); +#else /* COMPILE_PCRE8 */ +#ifdef COMPILE_PCRE16 + jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800); + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); +#endif /* COMPILE_PCRE16 */ +#endif /* COMPILE_PCRE8 */ JUMPHERE(jump[0]); return cc; } #endif - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); return cc; case OP_ANYBYTE: check_input_end(common, fallbacks); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); return cc; #ifdef SUPPORT_UTF8 @@ -3095,8 +3218,8 @@ switch(type) case OP_CHAR: case OP_CHARI: length = 1; -#ifdef SUPPORT_UTF8 - if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc); #endif if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0) { @@ -3129,11 +3252,11 @@ switch(type) case OP_NOT: case OP_NOTI: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { length = 1; - if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f]; + if (HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc); check_input_end(common, fallbacks); GETCHAR(c, cc); @@ -3152,7 +3275,9 @@ switch(type) /* Skip the variable-length character. 
*/ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); +#ifdef COMPILE_PCRE8 OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); +#endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); JUMPHERE(jump[0]); return cc + length; @@ -3268,21 +3393,21 @@ do if (*cc == OP_CHAR) { size = 1; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[1] >= 0xc0) - size += PRIV(utf8_table4)[cc[1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[1])) + size += GET_EXTRALEN(cc[1]); #endif } else if (*cc == OP_CHARI) { size = 1; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0) size = 0; - else if (cc[1] >= 0xc0) - size += PRIV(utf8_table4)[cc[1] & 0x3f]; + else if (HAS_EXTRALEN(cc[1])) + size += GET_EXTRALEN(cc[1]); } else #endif @@ -4786,8 +4911,8 @@ if (*type == 0) if (end != NULL) { *end = cc + 1; -#ifdef SUPPORT_UTF8 - if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(*cc)) *end += GET_EXTRALEN(*cc); #endif } return cc; @@ -6259,9 +6384,11 @@ common->utf = (re->options & PCRE_UTF8) != 0; #ifdef SUPPORT_UCP common->useucp = (re->options & PCRE_UCP) != 0; #endif -common->utf8readchar = NULL; -common->utf8readtype8 = NULL; +common->utfreadchar = NULL; +#ifdef COMPILE_PCRE8 +common->utfreadtype8 = NULL; #endif +#endif /* SUPPORT_UTF8 */ #ifdef SUPPORT_UCP common->getucd = NULL; #endif @@ -6487,18 +6614,20 @@ if (common->caselesscmp != NULL) set_jumps(common->caselesscmp, LABEL()); do_caselesscmp(common); } -#ifdef SUPPORT_UTF8 -if (common->utf8readchar != NULL) +#ifdef SUPPORT_UTF +if (common->utfreadchar != NULL) { - set_jumps(common->utf8readchar, LABEL()); - do_utf8readchar(common); + set_jumps(common->utfreadchar, LABEL()); + do_utfreadchar(common); } -if (common->utf8readtype8 != NULL) +#ifdef COMPILE_PCRE8 +if 
(common->utfreadtype8 != NULL) { - set_jumps(common->utf8readtype8, LABEL()); - do_utf8readtype8(common); + set_jumps(common->utfreadtype8, LABEL()); + do_utfreadtype8(common); } #endif +#endif /* COMPILE_PCRE8 */ #ifdef SUPPORT_UCP if (common->getucd != NULL) { diff --git a/pcre_printint.src b/pcre_printint.src index 5a9f15d..2922e54 100644 --- a/pcre_printint.src +++ b/pcre_printint.src @@ -72,17 +72,20 @@ static const char *OP_names[] = { OP_NAME_LIST }; *************************************************/ static int -print_char(FILE *f, pcre_uchar *ptr, BOOL utf8) +print_char(FILE *f, pcre_uchar *ptr, BOOL utf) { int c = *ptr; -#ifndef SUPPORT_UTF8 -(void)utf8; /* Avoid compiler warning */ +#ifndef SUPPORT_UTF +(void)utf; /* Avoid compiler warning */ if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); return 0; #else -if (!utf8 || (c & 0xc0) != 0xc0) + +#ifdef COMPILE_PCRE8 + +if (!utf || (c & 0xc0) != 0xc0) { if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); return 0; @@ -110,14 +113,45 @@ else s -= 6; c |= (ptr[i] & 0x3f) << s; } - if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c); + fprintf(f, "\\x{%x}", c); return a; } -#endif + +#else + +#ifdef COMPILE_PCRE16 + +if (!utf || (c & 0xfc00) != 0xd800) + { + if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); + return 0; + } +else + { + /* This is a check for malformed UTF-16; it should only occur if the sanity + check has been turned off. Rather than swallow a low surrogate, just stop if + we hit a bad one. Print it with \X instead of \x as an indication. 
*/ + + if ((ptr[1] & 0xfc00) != 0xdc00) + { + fprintf(f, "\\X{%x}", c); + return 0; + } + + c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000; + fprintf(f, "\\x{%x}", c); + return 1; + } + +#endif /* COMPILE_PCRE16 */ + +#endif /* COMPILE_PCRE8 */ + +#endif /* SUPPORT_UTF */ } /************************************************* -* Print uchar string (regardless of utf8) * +* Print uchar string (regardless of utf) * *************************************************/ static void @@ -168,7 +202,7 @@ pcre_printint(pcre *external_re, FILE *f, BOOL print_lengths) { real_pcre *re = (real_pcre *)external_re; pcre_uchar *codestart, *code; -BOOL utf8; +BOOL utf; unsigned int options = re->options; int offset = re->name_table_offset; @@ -187,7 +221,8 @@ if (re->magic_number != MAGIC_NUMBER) } code = codestart = (pcre_uchar *)re + offset + count * size; -utf8 = (options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +utf = (options & PCRE_UTF8) != 0; for(;;) { @@ -232,7 +267,7 @@ for(;;) do { code++; - code += 1 + print_char(f, code, utf8); + code += 1 + print_char(f, code, utf); } while (*code == OP_CHAR); fprintf(f, "\n"); @@ -243,7 +278,7 @@ for(;;) do { code++; - code += 1 + print_char(f, code, utf8); + code += 1 + print_char(f, code, utf); } while (*code == OP_CHARI); fprintf(f, "\n"); @@ -349,7 +384,7 @@ for(;;) extra = 2; } } - else extra = print_char(f, code+1, utf8); + else extra = print_char(f, code+1, utf); fprintf(f, "%s", OP_names[*code]); break; @@ -364,7 +399,7 @@ for(;;) case OP_MINUPTO: case OP_POSUPTO: fprintf(f, " %s ", flag); - extra = print_char(f, code + 1 + IMM2_SIZE, utf8); + extra = print_char(f, code + 1 + IMM2_SIZE, utf); fprintf(f, "{"); if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,"); fprintf(f, "%d}", GET2(code,1)); @@ -557,7 +592,7 @@ for(;;) } } - /* Indicate a non-UTF8 class which was created by negation */ + /* Indicate a non-UTF class which was created by negation */ fprintf(f, "]%s", (*code == 
OP_NCLASS)? " (neg)" : ""); diff --git a/pcre_study.c b/pcre_study.c index 098980d..1e10397 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -225,7 +225,7 @@ for (;;) branchlength++; cc += 2; #ifdef SUPPORT_UTF8 - if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -246,7 +246,7 @@ for (;;) branchlength += GET2(cc,1); cc += 2 + IMM2_SIZE; #ifdef SUPPORT_UTF8 - if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -487,7 +487,7 @@ for (;;) cc += PRIV(OP_lengths)[op]; #ifdef SUPPORT_UTF8 - if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; diff --git a/pcre_tables.c b/pcre_tables.c index 7c52961..b8cabf3 100644 --- a/pcre_tables.c +++ b/pcre_tables.c @@ -65,7 +65,9 @@ const pcre_uint8 PRIV(OP_lengths)[] = { OP_LENGTHS }; /* These are the breakpoints for different numbers of bytes in a UTF-8 character. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF + +#ifdef COMPILE_PCRE8 const int PRIV(utf8_table1)[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; @@ -87,6 +89,8 @@ const pcre_uint8 PRIV(utf8_table4)[] = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; +#endif /* COMPILE_PCRE8 */ + /* Table to translate from particular type value to the general value. */ const int PRIV(ucp_gentype)[] = { @@ -554,6 +558,6 @@ const ucp_type_table PRIV(utt)[] = { const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ /* End of pcre_tables.c */ |