diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-05 20:12:24 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-05 20:12:24 +0000 |
commit | a9839b968cee5828bf35dbcb05a31859a49ab7a2 (patch) | |
tree | 836125e6c0ea7958e295ccda9f7d060b05102430 | |
parent | 216818740b54b629e7bd59cd49f783c72e244e23 (diff) | |
download | pcre-a9839b968cee5828bf35dbcb05a31859a49ab7a2.tar.gz |
Improving UTF-16 support by fixing a lot of issues.
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@785 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | Makefile.am | 3 | ||||
-rw-r--r-- | pcre.h.in | 10 | ||||
-rw-r--r-- | pcre16_fullinfo.c | 45 | ||||
-rw-r--r-- | pcre16_info.c | 45 | ||||
-rw-r--r-- | pcre16_version.c | 45 | ||||
-rw-r--r-- | pcre_compile.c | 98 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 29 | ||||
-rw-r--r-- | pcre_exec.c | 49 | ||||
-rw-r--r-- | pcre_fullinfo.c | 6 | ||||
-rw-r--r-- | pcre_info.c | 11 | ||||
-rw-r--r-- | pcre_internal.h | 8 | ||||
-rw-r--r-- | pcre_jit_compile.c | 206 | ||||
-rw-r--r-- | pcre_newline.c | 30 | ||||
-rw-r--r-- | pcre_printint.src | 4 | ||||
-rw-r--r-- | pcre_study.c | 113 | ||||
-rw-r--r-- | pcre_version.c | 5 |
16 files changed, 543 insertions, 164 deletions
diff --git a/Makefile.am b/Makefile.am index c939f9f..817b01a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -212,6 +212,8 @@ libpcre16_la_SOURCES = \ pcre16_chartables.c \ pcre16_compile.c \ pcre16_exec.c \ + pcre16_fullinfo.c \ + pcre16_info.c \ pcre16_jit_compile.c \ pcre16_newline.c \ pcre16_ord2utf16.c \ @@ -222,6 +224,7 @@ libpcre16_la_SOURCES = \ pcre16_ucd.c \ pcre16_utf16_utils.c \ pcre16_valid_utf16.c \ + pcre16_version.c \ pcre16_xclass.c ## This file is generated as part of the building process, so don't distribute. @@ -367,6 +367,8 @@ PCRE_EXP_DECL void pcre_free_substring(const char *); PCRE_EXP_DECL void pcre_free_substring_list(const char **); PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int, void *); +PCRE_EXP_DECL int pcre16_fullinfo(const pcre *, const pcre_extra *, int, + void *); PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *, int *, int, const char *, const char **); PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *); @@ -377,15 +379,19 @@ PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int, PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int, const char ***); PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *); +PCRE_EXP_DECL int pcre16_info(const pcre *, int *, int *); PCRE_EXP_DECL const unsigned char *pcre_maketables(void); PCRE_EXP_DECL int pcre_refcount(pcre *, int); -PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *, - PCRE_SPTR16, int, int); PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **); PCRE_EXP_DECL pcre_extra *pcre16_study(const pcre *, int, const char **); PCRE_EXP_DECL void pcre_free_study(pcre_extra *); PCRE_EXP_DECL void pcre16_free_study(pcre_extra *); PCRE_EXP_DECL const char *pcre_version(void); +PCRE_EXP_DECL const char *pcre16_version(void); + +/* Utility functions. */ +PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *, + PCRE_SPTR16, int, int); /* JIT compiler related functions. */ diff --git a/pcre16_fullinfo.c b/pcre16_fullinfo.c new file mode 100644 index 0000000..0e67deb --- /dev/null +++ b/pcre16_fullinfo.c @@ -0,0 +1,45 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2011 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 + +#include "pcre_fullinfo.c" + +/* End of pcre16_fullinfo.c */ diff --git a/pcre16_info.c b/pcre16_info.c new file mode 100644 index 0000000..b4b221a --- /dev/null +++ b/pcre16_info.c @@ -0,0 +1,45 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2011 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 + +#include "pcre_info.c" + +/* End of pcre16_info.c */ diff --git a/pcre16_version.c b/pcre16_version.c new file mode 100644 index 0000000..d4a3329 --- /dev/null +++ b/pcre16_version.c @@ -0,0 +1,45 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2011 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 + +#include "pcre_version.c" + +/* End of pcre16_version.c */ diff --git a/pcre_compile.c b/pcre_compile.c index 3461dbd..da22f59 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -102,6 +102,10 @@ overrun before it actually does run off the end of the data block. */ #define REQ_CASELESS 0x10000000l /* Indicates caselessness */ #define REQ_VARY 0x20000000l /* Reqchar followed non-literal item */ +/* Repeated character flags. */ + +#define UTF_LENGTH 0x10000000l /* The char contains its length. */ + /* Table for handling escaped characters in the range '0'-'z'. Positive returns are simple data values; negative values are for special things like \d and so on. Zero means further processing is needed (for things like \x), or the escape @@ -2896,7 +2900,7 @@ static BOOL check_auto_possessive(const pcre_uchar *previous, BOOL utf, const pcre_uchar *ptr, int options, compile_data *cd) { -int c, next; +pcre_int32 c, next; int op_code = *previous++; /* Skip whitespace and comments in extended mode */ @@ -2932,15 +2936,13 @@ if (*ptr == CHAR_BACKSLASH) if (temperrorcode != 0) return FALSE; ptr++; /* Point after the escape sequence */ } - -else if ((cd->ctypes[*ptr] & ctype_meta) == 0) +else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { GETCHARINC(next, ptr); } else #endif next = *ptr++; } - else return FALSE; /* Skip whitespace and comments in extended mode */ @@ -4603,20 +4605,25 @@ for (;; ptr++) /* Deal with UTF characters that take up more than one character. It's easier to write this out separately than try to macrify it. Use c to - hold the length of the character in bytes, plus 0x80 to flag that it's a - length rather than a small character. */ + hold the length of the character in bytes, plus UTF_LENGTH to flag that + it's a length rather than a small character. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF +#ifdef COMPILE_PCRE8 if (utf && (code[-1] & 0x80) != 0) +#endif /* COMPILE_PCRE8 */ +#ifdef COMPILE_PCRE16 + if (utf && (code[-1] & 0xfc00) == 0xdc00) +#endif /* COMPILE_PCRE8 */ { pcre_uchar *lastchar = code - 1; BACKCHAR(lastchar); c = code - lastchar; /* Length of UTF-8 character */ memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */ - c |= 0x80; /* Flag c as a length */ + c |= UTF_LENGTH; /* Flag c as a length */ } else -#endif +#endif /* SUPPORT_UTF */ /* Handle the case of a single charater - either with no UTF support, or with UTF disabled, or for a single character UTF character. */ @@ -4758,14 +4765,14 @@ for (;; ptr++) we have to insert the character for the previous code. For a repeated Unicode property match, there are two extra bytes that define the required property. In UTF-8 mode, long characters have their length in - c, with the 0x80 bit as a flag. */ + c, with the UTF_LENGTH bit as a flag. */ if (repeat_max < 0) { -#ifdef SUPPORT_UTF8 - if (utf && c >= 128) +#ifdef SUPPORT_UTF + if (utf && (c & UTF_LENGTH) != 0) { - memcpy(code, utf_chars, c & 7); + memcpy(code, utf_chars, IN_UCHARS(c & 7)); code += c & 7; } else @@ -4787,10 +4794,10 @@ for (;; ptr++) else if (repeat_max != repeat_min) { -#ifdef SUPPORT_UTF8 - if (utf && c >= 128) +#ifdef SUPPORT_UTF + if (utf && (c & UTF_LENGTH) != 0) { - memcpy(code, utf_chars, c & 7); + memcpy(code, utf_chars, IN_UCHARS(c & 7)); code += c & 7; } else @@ -4817,10 +4824,10 @@ for (;; ptr++) /* The character or character type itself comes last in all cases. */ -#ifdef SUPPORT_UTF8 - if (utf && c >= 128) +#ifdef SUPPORT_UTF + if (utf && (c & UTF_LENGTH) != 0) { - memcpy(code, utf_chars, c & 7); + memcpy(code, utf_chars, IN_UCHARS(c & 7)); code += c & 7; } else @@ -6661,9 +6668,7 @@ for (;; ptr++) #ifdef SUPPORT_UTF if (utf && HAS_EXTRALEN(c)) - { - INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); - } + ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); #endif /* At this point we have the character's bytes in mcbuffer, and the length @@ -7789,9 +7794,27 @@ if ((re->options & PCRE_ANCHORED) == 0) re->first_char = firstchar & 0xffff; #endif #endif - if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char) - && cd->fcc[re->first_char] != re->first_char) - re->flags |= PCRE_FCH_CASELESS; + if ((firstchar & REQ_CASELESS) != 0) + { +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + /* We ignore non-ASCII first chars in 8 bit mode. */ + if (utf) + { + if (re->first_char < 128) + { + if (cd->fcc[re->first_char] != re->first_char) + re->flags |= PCRE_FCH_CASELESS; + } + else if ((options & PCRE_UCP) != 0 + && UCD_OTHERCASE(re->first_char) != re->first_char) + re->flags |= PCRE_FCH_CASELESS; + } + else +#endif + if (MAX_255(re->first_char) + && cd->fcc[re->first_char] != re->first_char) + re->flags |= PCRE_FCH_CASELESS; + } re->flags |= PCRE_FIRSTSET; } @@ -7814,9 +7837,26 @@ if (reqchar >= 0 && re->req_char = reqchar & 0xffff; #endif #endif - if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char) - && cd->fcc[re->req_char] != re->req_char) - re->flags |= PCRE_RCH_CASELESS; + if ((reqchar & REQ_CASELESS) != 0) + { +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + /* We ignore non-ASCII first chars in 8 bit mode. */ + if (utf) + { + if (re->first_char < 128) + { + if (cd->fcc[re->first_char] != re->first_char) + re->flags |= PCRE_RCH_CASELESS; + } + else if ((options & PCRE_UCP) != 0 + && UCD_OTHERCASE(re->first_char) != re->first_char) + re->flags |= PCRE_RCH_CASELESS; + } + else +#endif + if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char) + re->flags |= PCRE_RCH_CASELESS; + } re->flags |= PCRE_REQCHSET; } diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index d7b292d..1bc96c1 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -480,7 +480,7 @@ if (*first_op == OP_REVERSE) { if (current_subject <= start_subject) break; current_subject--; - INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--); + ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--); } } else @@ -3199,7 +3199,13 @@ if (!anchored) has_first_char = TRUE; first_char = first_char2 = re->first_char; if ((re->flags & PCRE_FCH_CASELESS) != 0) + { first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (first_char > 127 && utf && md->use_ucp) + first_char2 = UCD_OTHERCASE(first_char); +#endif + } } else { @@ -3217,7 +3223,13 @@ if ((re->flags & PCRE_REQCHSET) != 0) has_req_char = TRUE; req_char = req_char2 = re->req_char; if ((re->flags & PCRE_RCH_CASELESS) != 0) + { req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (req_char > 127 && utf && md->use_ucp) + req_char2 = UCD_OTHERCASE(req_char); +#endif + } } /* Call the main matching function, looping for a non-anchored regex after a @@ -3246,7 +3258,7 @@ for (;;) while (t < md->end_subject && !IS_NEWLINE(t)) { t++; - INTERNALCHAR(t < end_subject, *t, t++); + ACROSSCHAR(t < end_subject, *t, t++); } } else @@ -3290,7 +3302,7 @@ for (;;) !WAS_NEWLINE(current_subject)) { current_subject++; - INTERNALCHAR(current_subject < end_subject, *current_subject, + ACROSSCHAR(current_subject < end_subject, *current_subject, current_subject++); } } @@ -3318,12 +3330,17 @@ for (;;) while (current_subject < end_subject) { register unsigned int c = *current_subject; +#ifndef COMPILE_PCRE8 + if (c > 255) c = 255; +#endif if ((start_bits[c/8] & (1 << (c&7))) == 0) { current_subject++; -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 + /* In non 8-bit mode, the iteration will stop for + characters > 255 at the beginning or not stop at all. */ if (utf) - INTERNALCHAR(current_subject < end_subject, *current_subject, + ACROSSCHAR(current_subject < end_subject, *current_subject, current_subject++); #endif } @@ -3434,7 +3451,7 @@ for (;;) #ifdef SUPPORT_UTF if (utf) { - INTERNALCHAR(current_subject < end_subject, *current_subject, + ACROSSCHAR(current_subject < end_subject, *current_subject, current_subject++); } #endif diff --git a/pcre_exec.c b/pcre_exec.c index 6761598..bb1b60a 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -2069,7 +2069,7 @@ for (;;) be "non-word" characters. Remember the earliest consulted character for partial matching. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { /* Get status of previous character */ @@ -2190,7 +2190,7 @@ for (;;) } eptr++; #ifdef SUPPORT_UTF - if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); + if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); #endif ecode++; break; @@ -3066,7 +3066,7 @@ for (;;) /* Match a single character, caselessly */ case OP_CHARI: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { length = 1; @@ -4089,7 +4089,7 @@ for (;;) } if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); eptr++; - INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4102,7 +4102,7 @@ for (;;) MRRETURN(MATCH_NOMATCH); } eptr++; - INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4301,7 +4301,7 @@ for (;;) if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH); eptr++; - INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4330,7 +4330,7 @@ for (;;) if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH); eptr++; - INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -5330,7 +5330,7 @@ for (;;) } if (IS_NEWLINE(eptr)) break; eptr++; - INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } } @@ -5347,7 +5347,7 @@ for (;;) } if (IS_NEWLINE(eptr)) break; eptr++; - INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } } break; @@ -5363,7 +5363,7 @@ for (;;) break; } eptr++; - INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++); + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } } else @@ -6264,7 +6264,13 @@ if (!anchored) has_first_char = TRUE; first_char = first_char2 = re->first_char; if ((re->flags & PCRE_FCH_CASELESS) != 0) + { first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (first_char > 127 && utf && md->use_ucp) + first_char2 = UCD_OTHERCASE(first_char); +#endif + } } else if (!startline && study != NULL && @@ -6280,7 +6286,13 @@ if ((re->flags & PCRE_REQCHSET) != 0) has_req_char = TRUE; req_char = req_char2 = re->req_char; if ((re->flags & PCRE_RCH_CASELESS) != 0) + { req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (req_char > 127 && utf && md->use_ucp) + req_char2 = UCD_OTHERCASE(req_char); +#endif + } } @@ -6309,7 +6321,7 @@ for(;;) while (t < md->end_subject && !IS_NEWLINE(t)) { t++; - INTERNALCHAR(t < end_subject, *t, t++); + ACROSSCHAR(t < end_subject, *t, t++); } } else @@ -6351,7 +6363,7 @@ for(;;) while (start_match < end_subject && !WAS_NEWLINE(start_match)) { start_match++; - INTERNALCHAR(start_match < end_subject, *start_match, + ACROSSCHAR(start_match < end_subject, *start_match, start_match++); } } @@ -6378,17 +6390,18 @@ for(;;) { while (start_match < end_subject) { -#ifdef COMPILE_PCRE register unsigned int c = *start_match; -#else - register unsigned int c = *start_match & 0xff; +#ifndef COMPILE_PCRE8 + if (c > 255) c = 255; #endif if ((start_bits[c/8] & (1 << (c&7))) == 0) { start_match++; -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 + /* In non 8-bit mode, the iteration will stop for + characters > 255 at the beginning or not stop at all. */ if (utf) - INTERNALCHAR(start_match < end_subject, *start_match, + ACROSSCHAR(start_match < end_subject, *start_match, start_match++); #endif } @@ -6520,7 +6533,7 @@ for(;;) new_start_match = start_match + 1; #ifdef SUPPORT_UTF if (utf) - INTERNALCHAR(new_start_match < end_subject, *new_start_match, + ACROSSCHAR(new_start_match < end_subject, *new_start_match, new_start_match++); #endif break; diff --git a/pcre_fullinfo.c b/pcre_fullinfo.c index 6c89121..2bdf24b 100644 --- a/pcre_fullinfo.c +++ b/pcre_fullinfo.c @@ -65,9 +65,15 @@ Arguments: Returns: 0 if data returned, negative on error */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, void *where) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, + void *where) +#endif { real_pcre internal_re; pcre_study_data internal_study; diff --git a/pcre_info.c b/pcre_info.c index 9211df4..e7b3730 100644 --- a/pcre_info.c +++ b/pcre_info.c @@ -72,8 +72,13 @@ Returns: number of capturing subpatterns or negative values on error */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION -pcre_info(const pcre *argument_re, int *optptr, int *first_byte) +pcre_info(const pcre *argument_re, int *optptr, int *first_char) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_info(const pcre *argument_re, int *optptr, int *first_char) +#endif { real_pcre internal_re; const real_pcre *re = (const real_pcre *)argument_re; @@ -84,8 +89,8 @@ if (re->magic_number != MAGIC_NUMBER) if (re == NULL) return PCRE_ERROR_BADMAGIC; } if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS); -if (first_byte != NULL) - *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char : +if (first_char != NULL) + *first_char = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char : ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; return re->top_bracket; } diff --git a/pcre_internal.h b/pcre_internal.h index 7642b91..4046e41 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -542,7 +542,7 @@ UTF-8 support is omitted, we don't even define them. */ /* #define GETCHARLENTEST(c, eptr, len) */ /* #define BACKCHAR(eptr) */ /* #define FORWARDCHAR(eptr) */ -/* #define INTERNALCHAR(condition, eptr, action) */ +/* #define ACROSSCHAR(condition, eptr, action) */ #else /* SUPPORT_UTF */ @@ -708,7 +708,7 @@ because almost all calls are already within a block of UTF-8 only code. */ #define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++ /* Same as above, but it allows a fully customizable form. */ -#define INTERNALCHAR(condition, eptr, action) \ +#define ACROSSCHAR(condition, eptr, action) \ while((condition) && ((eptr) & 0xc0) == 0x80) action #else /* COMPILE_PCRE8 */ @@ -748,7 +748,7 @@ pointer. */ the pointer. */ #define GETUTF16INC(c, eptr) \ - { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; eptr++; } + { c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; } /* Get the next UTF-16 character, advancing the pointer. This is called when we know we are in UTF-16 mode. */ @@ -797,7 +797,7 @@ code. */ #define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++ /* Same as above, but it allows a fully customizable form. */ -#define INTERNALCHAR(condition, eptr, action) \ +#define ACROSSCHAR(condition, eptr, action) \ if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action #endif diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index 03c7b2c..df158be 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -300,7 +300,7 @@ typedef struct compiler_common { #ifdef SUPPORT_UTF8 BOOL utf; #ifdef SUPPORT_UCP - BOOL useucp; + BOOL use_ucp; #endif jump_list *utfreadchar; #ifdef COMPILE_PCRE8 @@ -390,10 +390,12 @@ the start pointers when the end of the capturing group has not yet reached. */ #define PRIV_DATA(cc) (common->localptrs[(cc) - common->start]) #ifdef COMPILE_PCRE8 -#define MOV_UCHAR SLJIT_MOV_UB +#define MOV_UCHAR SLJIT_MOV_UB +#define MOVU_UCHAR SLJIT_MOVU_UB #else #ifdef COMPILE_PCRE16 -#define MOV_UCHAR SLJIT_MOV_UH +#define MOV_UCHAR SLJIT_MOV_UH +#define MOVU_UCHAR SLJIT_MOVU_UH #else #error Unsupported compiling mode #endif @@ -1369,10 +1371,10 @@ if (common->utf && c > 65535) if (bit >= (1 << 10)) bit >>= 10; else - return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8)); + return (bit < 256) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8)); } #endif /* SUPPORT_UTF16 */ -return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8)); +return (bit < 256) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8)); #endif /* COMPILE_PCRE16 */ #endif /* COMPILE_PCRE8 */ @@ -1420,7 +1422,7 @@ DEFINE_COMPILER; struct sljit_jump *jump; #endif -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); #ifdef SUPPORT_UTF if (common->utf) { @@ -1461,7 +1463,7 @@ if (common->utf) #else #ifdef COMPILE_PCRE16 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); - jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff); + jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); JUMPHERE(jump); /* Skip low surrogate if necessary. */ @@ -1478,9 +1480,9 @@ if (common->utf) OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); #ifdef COMPILE_PCRE16 -/* The ctypes array contains only 255 values. */ +/* The ctypes array contains only 256 values. */ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); -jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff); +jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255); #endif OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); #ifdef COMPILE_PCRE16 @@ -1542,7 +1544,7 @@ else if (nltype == NLTYPE_ANYCRLF) } else { - SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline <= 255); + SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256); add_jump(compiler, fallbacks, CMP(jumpiftrue ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline)); } } @@ -1660,7 +1662,7 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(jump); /* Combine two 16 bit characters. */ -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10); @@ -1818,7 +1820,7 @@ if (newlinecheck) return mainloop; } -static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar firstchar, BOOL caseless, BOOL firstline) +static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar first_char, BOOL caseless, BOOL firstline) { DEFINE_COMPILER; struct sljit_label *start; @@ -1836,22 +1838,28 @@ start = LABEL(); leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -oc = firstchar; +oc = first_char; if (caseless) - oc = TABLE_GET(firstchar, common->fcc, firstchar); -if (firstchar == oc) - found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, firstchar); + { + oc = TABLE_GET(first_char, common->fcc, first_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (first_char > 127 && common->utf && common->use_ucp) + oc = UCD_OTHERCASE(first_char); +#endif + } +if (first_char == oc) + found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, first_char); else { - bit = firstchar ^ oc; + bit = first_char ^ oc; if (ispowerof2(bit)) { OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, bit); - found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, firstchar | bit); + found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, first_char | bit); } else { - OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, firstchar); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, first_char); COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL); OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc); COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL); @@ -1912,16 +1920,19 @@ if (common->nltype == NLTYPE_FIXED && common->newline > 255) OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); firstchar = CMP(SLJIT_C_LESS_EQUAL, STR_PTR, 0, TMP2, 0); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); + OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(2)); OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0); COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_GREATER_EQUAL); +#ifdef COMPILE_PCRE16 + OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); +#endif OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); loop = LABEL(); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2); - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); + OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, loop); CMPTO(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, loop); @@ -1952,9 +1963,12 @@ if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF) leave = JUMP(SLJIT_JUMP); JUMPHERE(foundcr); notfoundnl = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL); COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); +#ifdef COMPILE_PCRE16 + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); +#endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); JUMPHERE(notfoundnl); JUMPHERE(leave); @@ -1972,6 +1986,9 @@ DEFINE_COMPILER; struct sljit_label *start; struct sljit_jump *leave; struct sljit_jump *found; +#ifndef COMPILE_PCRE8 +struct sljit_jump *jump; +#endif if (firstline) { @@ -1987,7 +2004,9 @@ if (common->utf) OP1(SLJIT_MOV, TMP3, 0, TMP1, 0); #endif #ifndef COMPILE_PCRE8 -OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xff); +jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 255); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 255); +JUMPHERE(jump); #endif OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); @@ -2028,7 +2047,7 @@ if (firstline) OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE0); } -static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar reqchar, BOOL caseless, BOOL has_firstchar) +static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar req_char, BOOL caseless, BOOL has_firstchar) { DEFINE_COMPILER; struct sljit_label *loop; @@ -2045,34 +2064,40 @@ toolong = CMP(SLJIT_C_LESS, TMP1, 0, STR_END, 0); alreadyfound = CMP(SLJIT_C_LESS, STR_PTR, 0, TMP2, 0); if (has_firstchar) - OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); else OP1(SLJIT_MOV, TMP1, 0, STR_PTR, 0); loop = LABEL(); notfound = CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, STR_END, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), 0); -oc = reqchar; +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(TMP1), 0); +oc = req_char; if (caseless) - oc = TABLE_GET(reqchar, common->fcc, reqchar); -if (reqchar == oc) - found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar); + { + oc = TABLE_GET(req_char, common->fcc, req_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (req_char > 127 && common->utf && common->use_ucp) + oc = UCD_OTHERCASE(req_char); +#endif + } +if (req_char == oc) + found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char); else { - bit = reqchar ^ oc; + bit = req_char ^ oc; if (ispowerof2(bit)) { OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, bit); - found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar | bit); + found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char | bit); } else { - found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar); + found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char); foundoc = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, oc); } } -OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); JUMPTO(SLJIT_JUMP, loop); JUMPHERE(found); @@ -2126,7 +2151,7 @@ static void check_wordboundary(compiler_common *common) { DEFINE_COMPILER; struct sljit_jump *beginend; -#ifdef SUPPORT_UTF8 +#if !(defined COMPILE_PCRE8) || defined SUPPORT_UTF struct sljit_jump *jump; #endif @@ -2143,7 +2168,7 @@ read_char(common); /* Testing char type. */ #ifdef SUPPORT_UCP -if (common->useucp) +if (common->use_ucp) { OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1); jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE); @@ -2160,20 +2185,24 @@ if (common->useucp) else #endif { -#ifdef SUPPORT_UTF8 +#ifndef COMPILE_PCRE8 + jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); +#elif defined SUPPORT_UTF /* Here LOCALS1 has already been zeroed. */ jump = NULL; if (common->utf) jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); -#endif +#endif /* COMPILE_PCRE8 */ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, TMP1, 0); -#ifdef SUPPORT_UTF8 +#ifndef COMPILE_PCRE8 + JUMPHERE(jump); +#elif defined SUPPORT_UTF if (jump != NULL) JUMPHERE(jump); -#endif +#endif /* COMPILE_PCRE8 */ } JUMPHERE(beginend); @@ -2183,7 +2212,7 @@ peek_char(common); /* Testing char type. This is a code duplication. */ #ifdef SUPPORT_UCP -if (common->useucp) +if (common->use_ucp) { OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1); jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE); @@ -2199,7 +2228,11 @@ if (common->useucp) else #endif { -#ifdef SUPPORT_UTF8 +#ifndef COMPILE_PCRE8 + /* TMP2 may be destroyed by peek_char. */ + OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); + jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255); +#elif defined SUPPORT_UTF OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); jump = NULL; if (common->utf) @@ -2208,10 +2241,12 @@ else OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes); OP2(SLJIT_LSHR, TMP2, 0, TMP2, 0, SLJIT_IMM, 4 /* ctype_word */); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); -#ifdef SUPPORT_UTF8 +#ifndef COMPILE_PCRE8 + JUMPHERE(jump); +#elif defined SUPPORT_UTF if (jump != NULL) JUMPHERE(jump); -#endif +#endif /* COMPILE_PCRE8 */ } JUMPHERE(beginend); @@ -2314,18 +2349,18 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize); OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); OP1(SLJIT_MOV, TMP3, 0, CHAR1, 0); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR2, 0); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); label = LABEL(); -OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1); -OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1); +OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1)); +OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0); -OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); +OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); JUMPTO(SLJIT_C_NOT_ZERO, label); JUMPHERE(jump); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP1(SLJIT_MOV, CHAR1, 0, TMP3, 0); OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); @@ -2346,20 +2381,30 @@ OP1(SLJIT_MOV, TMP3, 0, LCC_TABLE, 0); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR1, 0); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, CHAR2, 0); OP1(SLJIT_MOV, LCC_TABLE, 0, SLJIT_IMM, common->lcc); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1)); +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); label = LABEL(); -OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1); -OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1); +OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1)); +OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); +#ifndef COMPILE_PCRE8 +jump = CMP(SLJIT_C_GREATER, CHAR1, 0, SLJIT_IMM, 255); +#endif OP1(SLJIT_MOV_UB, CHAR1, 0, SLJIT_MEM2(LCC_TABLE, CHAR1), 0); +#ifndef COMPILE_PCRE8 +JUMPHERE(jump); +jump = CMP(SLJIT_C_GREATER, CHAR2, 0, SLJIT_IMM, 255); +#endif OP1(SLJIT_MOV_UB, CHAR2, 0, SLJIT_MEM2(LCC_TABLE, CHAR2), 0); +#ifndef COMPILE_PCRE8 +JUMPHERE(jump); +#endif jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0); -OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); +OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); JUMPTO(SLJIT_C_NOT_ZERO, label); JUMPHERE(jump); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP1(SLJIT_MOV, LCC_TABLE, 0, TMP3, 0); OP1(SLJIT_MOV, CHAR1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0); OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1); @@ -2378,7 +2423,7 @@ static const pcre_uchar *SLJIT_CALL do_utf_caselesscmp(pcre_uchar *src1, jit_arg /* This function would be ineffective to do in JIT level. */ int c1, c2; const pcre_uchar *src2 = args->ptr; -const pcre_uchar *end2 = (pcre_uchar *)args->end; +const pcre_uchar *end2 = args->end; while (src1 < end1) { @@ -2976,7 +3021,7 @@ switch(type) { jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff); jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff)); JUMPHERE(jump[1]); JUMPHERE(jump[0]); @@ -3037,9 +3082,9 @@ switch(type) read_char(common); jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); jump[2] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); jump[3] = JUMP(SLJIT_JUMP); JUMPHERE(jump[0]); check_newlinechar(common, common->bsr_nltype, fallbacks, FALSE); @@ -3089,36 +3134,37 @@ switch(type) jump[0] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); if (common->nltype == NLTYPE_FIXED && common->newline > 255) { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0)); - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); + OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff)); } else if (common->nltype == NLTYPE_FIXED) { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline)); } else { - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); jump[1] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2); + OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP2, 0, STR_END, 0); jump[2] = JUMP(SLJIT_C_GREATER); add_jump(compiler, fallbacks, JUMP(SLJIT_C_LESS)); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 1); + /* Equal. */ + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); jump[3] = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL); add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP)); JUMPHERE(jump[1]); if (common->nltype == NLTYPE_ANYCRLF) { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, STR_END, 0)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL)); } @@ -3158,15 +3204,13 @@ switch(type) jump[0] = JUMP(SLJIT_JUMP); JUMPHERE(jump[1]); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, end)); - add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, STR_PTR, 0)); - + add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, STR_PTR, 0, STR_END, 0)); if (common->nltype == NLTYPE_FIXED && common->newline > 255) { - OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2); + OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, TMP1, 0)); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2); - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); + OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff)); } @@ -3200,10 +3244,10 @@ switch(type) if (common->nltype == NLTYPE_FIXED && common->newline > 255) { - OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2); + OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); add_jump(compiler, fallbacks, CMP(SLJIT_C_GREATER, TMP2, 0, STR_END, 0)); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); + OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff)); } @@ -6382,7 +6426,7 @@ common->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; /* PCRE_UTF16 has the same value as PCRE_UTF8. */ common->utf = (re->options & PCRE_UTF8) != 0; #ifdef SUPPORT_UCP -common->useucp = (re->options & PCRE_UCP) != 0; +common->use_ucp = (re->options & PCRE_UCP) != 0; #endif common->utfreadchar = NULL; #ifdef COMPILE_PCRE8 diff --git a/pcre_newline.c b/pcre_newline.c index 0c2ddcd..d618b80 100644 --- a/pcre_newline.c +++ b/pcre_newline.c @@ -77,7 +77,15 @@ PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr, BOOL utf) { int c; -if (utf) { GETCHAR(c, ptr); } else c = *ptr; +(void)utf; +#ifdef SUPPORT_UTF +if (utf) + { + GETCHAR(c, ptr); + } +else +#endif /* SUPPORT_UTF8 */ + c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) { @@ -96,9 +104,15 @@ else switch(c) case 0x000c: *lenptr = 1; return TRUE; /* FF */ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; return TRUE; /* CR */ +#ifdef COMPILE_PCRE8 case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ +#else + case 0x0085: /* NEL */ + case 0x2028: /* LS */ + case 0x2029: *lenptr = 1; return TRUE; /* PS */ +#endif /* COMPILE_PCRE8 */ default: return FALSE; } } @@ -127,17 +141,17 @@ PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr, BOOL utf) { int c; +(void)utf; ptr--; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { BACKCHAR(ptr); GETCHAR(c, ptr); } -else c = *ptr; -#else /* no UTF-8 support */ -c = *ptr; +else #endif /* SUPPORT_UTF8 */ + c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) { @@ -154,9 +168,15 @@ else switch(c) case 0x000b: /* VT */ case 0x000c: /* FF */ case 0x000d: *lenptr = 1; return TRUE; /* CR */ +#ifdef COMPILE_PCRE8 case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ +#else + case 0x0085: /* NEL */ + case 0x2028: /* LS */ + case 0x2029: *lenptr = 1; return TRUE; /* PS */ +#endif /* COMPILE_PCRE8 */ default: return FALSE; } } diff --git a/pcre_printint.src b/pcre_printint.src index 2922e54..d30619e 100644 --- a/pcre_printint.src +++ b/pcre_printint.src @@ -123,7 +123,9 @@ else if (!utf || (c & 0xfc00) != 0xd800) { - if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); + if (PRINTABLE(c)) fprintf(f, "%c", c); + else if (c <= 0xff) fprintf(f, "\\x%02x", c); + else fprintf(f, "\\x{%x}", c); return 0; } else diff --git a/pcre_study.c b/pcre_study.c index 1e10397..3f25c3a 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -224,7 +224,7 @@ for (;;) case OP_NOTPOSPLUSI: branchlength++; cc += 2; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -245,7 +245,7 @@ for (;;) case OP_NOTEXACTI: branchlength += GET2(cc,1); cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -293,7 +293,7 @@ for (;;) appear, but leave the code, just in case.) */ case OP_ANYBYTE: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) return -1; #endif branchlength++; @@ -486,7 +486,7 @@ for (;;) case OP_NOTPOSQUERYI: cc += PRIV(OP_lengths)[op]; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -549,9 +549,10 @@ set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, { unsigned int c = *p; +#ifdef COMPILE_PCRE8 SET_BIT(c); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && c > 127) { GETCHARINC(c, p); @@ -572,6 +573,33 @@ if (utf && c > 127) if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); return p + 1; +#endif + +#ifdef COMPILE_PCRE16 +if (c > 0xff) + c = 0xff; +SET_BIT(c); + +#ifdef SUPPORT_UTF +if (utf && c > 127) + { + GETCHARINC(c, p); +#ifdef SUPPORT_UCP + if (caseless) + { + c = UCD_OTHERCASE(c); + if (c > 0xff) + c = 0xff; + SET_BIT(c); + } +#endif + return p; + } +#endif + +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); +return p + 1; +#endif } @@ -602,7 +630,7 @@ set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit, { register int c; for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (table_limit == 32) return; for (c = 128; c < 256; c++) { @@ -644,7 +672,9 @@ set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit, { register int c; for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; +#endif } @@ -679,7 +709,11 @@ set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, { register int c; int yield = SSB_DONE; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 int table_limit = utf? 16:32; +#else +int table_limit = 32; +#endif #if 0 /* ========================================================================= */ @@ -951,14 +985,23 @@ do case OP_HSPACE: SET_BIT(0x09); SET_BIT(0x20); +#ifdef SUPPORT_UTF if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ +#endif +#ifdef COMPILE_PCRE16 + SET_BIT(0xA0); + SET_BIT(0xFF); /* For characters > 255 */ +#endif } - else SET_BIT(0xA0); + else +#endif /* SUPPORT_UTF */ + SET_BIT(0xA0); try_next = FALSE; break; @@ -968,12 +1011,21 @@ do SET_BIT(0x0B); SET_BIT(0x0C); SET_BIT(0x0D); +#ifdef SUPPORT_UTF if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ +#endif +#ifdef COMPILE_PCRE16 + SET_BIT(0x85); + SET_BIT(0xFF); /* For characters > 255 */ +#endif } - else SET_BIT(0x85); + else +#endif /* SUPPORT_UTF */ + SET_BIT(0x85); try_next = FALSE; break; @@ -1058,14 +1110,23 @@ do case OP_HSPACE: SET_BIT(0x09); SET_BIT(0x20); +#ifdef COMPILE_PCRE8 if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ +#endif +#ifdef COMPILE_PCRE16 + SET_BIT(0xA0); + SET_BIT(0xFF); /* For characters > 255 */ +#endif } - else SET_BIT(0xA0); + else +#endif /* SUPPORT_UTF */ + SET_BIT(0xA0); break; case OP_ANYNL: @@ -1074,12 +1135,21 @@ do SET_BIT(0x0B); SET_BIT(0x0C); SET_BIT(0x0D); +#ifdef COMPILE_PCRE8 if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ +#endif +#ifdef COMPILE_PCRE16 + SET_BIT(0x85); + SET_BIT(0xFF); /* For characters > 255 */ +#endif } - else SET_BIT(0x85); + else +#endif /* SUPPORT_UTF */ + SET_BIT(0x85); break; case OP_NOT_DIGIT: @@ -1126,13 +1196,16 @@ do character with a value > 255. */ case OP_NCLASS: -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (utf) { start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ } #endif +#ifdef COMPILE_PCRE16 + SET_BIT(0xFF); /* For characters > 255 */ +#endif /* Fall through */ case OP_CLASS: @@ -1147,7 +1220,7 @@ do value is > 127. In fact, there are only two possible starting bytes for characters in the range 128 - 255. */ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (utf) { for (c = 0; c < 16; c++) start_bits[c] |= map[c]; @@ -1161,12 +1234,10 @@ do } } } - - /* In non-UTF-8 mode, the two bit maps are completely compatible. */ - else #endif { + /* In non-UTF-8 mode, the two bit maps are completely compatible. */ for (c = 0; c < 32; c++) start_bits[c] |= map[c]; } @@ -1342,6 +1413,18 @@ if (bits_set || min > 0 memcpy(study->start_bits, start_bits, sizeof(start_bits)); } +#ifdef PCRE_DEBUG + if (bits_set) + { + pcre_uint8 *ptr = (pcre_uint32 *)start_bits; + int i; + + printf("Start bits:\n"); + for (i = 0; i < 32; i++) + printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n"); + } +#endif + /* Always set the minlength value in the block, because the JIT compiler makes use of it. However, don't set the bit unless the length is greater than zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time diff --git a/pcre_version.c b/pcre_version.c index 7067cd4..2269d4f 100644 --- a/pcre_version.c +++ b/pcre_version.c @@ -79,8 +79,13 @@ I could find no way of detecting that a macro is defined as an empty string at pre-processor time. This hack uses a standard trick for avoiding calling the STRING macro with an empty argument when doing the test. */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION pcre_version(void) +#else +PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION +pcre16_version(void) +#endif { return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)? XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) : |