diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-10 02:20:06 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-10 02:20:06 +0000 |
commit | 669e6f0bbc3b07f6df3b0d0cafba3555e39e433c (patch) | |
tree | 37c97c3fa732981cf8d2dbed54d27cca37fa8fac | |
parent | 24054b0ee8c34e475c8ecc21938f7139f1ca6d2c (diff) | |
download | pcre-669e6f0bbc3b07f6df3b0d0cafba3555e39e433c.tar.gz |
extending the 16 bit API, mode check, and fixes
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@795 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | pcre.h.in | 22 | ||||
-rw-r--r-- | pcre16_dfa_exec.c | 45 | ||||
-rw-r--r-- | pcre16_get.c | 45 | ||||
-rw-r--r-- | pcre16_ord2utf16.c | 2 | ||||
-rw-r--r-- | pcre_compile.c | 110 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 72 | ||||
-rw-r--r-- | pcre_exec.c | 7 | ||||
-rw-r--r-- | pcre_fullinfo.c | 1 | ||||
-rw-r--r-- | pcre_get.c | 125 | ||||
-rw-r--r-- | pcre_info.c | 1 | ||||
-rw-r--r-- | pcre_internal.h | 32 | ||||
-rw-r--r-- | pcre_jit_compile.c | 2 | ||||
-rw-r--r-- | pcre_jit_test.c | 107 | ||||
-rw-r--r-- | pcre_newline.c | 4 | ||||
-rw-r--r-- | pcre_ord2utf8.c | 4 | ||||
-rw-r--r-- | pcre_study.c | 21 | ||||
-rw-r--r-- | pcre_valid_utf8.c | 4 | ||||
-rw-r--r-- | pcre_xclass.c | 31 | ||||
-rw-r--r-- | pcreposix.c | 1 |
20 files changed, 484 insertions, 154 deletions
diff --git a/Makefile.am b/Makefile.am index b64ccd5..ac2c675 100644 --- a/Makefile.am +++ b/Makefile.am @@ -212,8 +212,10 @@ libpcre16_la_SOURCES = \ pcre16_chartables.c \ pcre16_compile.c \ pcre16_config.c \ + pcre16_dfa_exec.c \ pcre16_exec.c \ pcre16_fullinfo.c \ + pcre16_get.c \ pcre16_info.c \ pcre16_jit_compile.c \ pcre16_newline.c \ @@ -166,6 +166,7 @@ compile-time only bits for runtime options, or vice versa. */ #define PCRE_ERROR_SHORTUTF8 (-25) #define PCRE_ERROR_RECURSELOOP (-26) #define PCRE_ERROR_JIT_STACKLIMIT (-27) +#define PCRE_ERROR_BADMODE (-28) /* Specific error codes for UTF-8 validity checks */ @@ -357,29 +358,46 @@ PCRE_EXP_DECL int pcre_config(int, void *); PCRE_EXP_DECL int pcre16_config(int, void *); PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *, int *, int, const char *, char *, int); -PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *, - int); +PCRE_EXP_DECL int pcre16_copy_named_substring(const pcre *, PCRE_SPTR16, + int *, int, PCRE_SPTR16, PCRE_SCHAR16 *, int); +PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, + char *, int); +PCRE_EXP_DECL int pcre16_copy_substring(PCRE_SPTR16, int *, int, int, + PCRE_SCHAR16 *, int); PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *, const char *, int, int, int, int *, int , int *, int); +PCRE_EXP_DECL int pcre16_dfa_exec(const pcre *, const pcre_extra *, + PCRE_SPTR16, int, int, int, int *, int , int *, int); PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR, int, int, int, int *, int); PCRE_EXP_DECL int pcre16_exec(const pcre *, const pcre_extra *, PCRE_SPTR16, int, int, int, int *, int); PCRE_EXP_DECL void pcre_free_substring(const char *); +PCRE_EXP_DECL void pcre16_free_substring(PCRE_SPTR16); PCRE_EXP_DECL void pcre_free_substring_list(const char **); +PCRE_EXP_DECL void pcre16_free_substring_list(PCRE_SPTR16 *); PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int, void *); PCRE_EXP_DECL int pcre16_fullinfo(const pcre *, const pcre_extra *, int, void *); PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *, int *, int, const char *, const char **); +PCRE_EXP_DECL int pcre16_get_named_substring(const pcre *, PCRE_SPTR16, + int *, int, PCRE_SPTR16, PCRE_SPTR16 *); PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *); +PCRE_EXP_DECL int pcre16_get_stringnumber(const pcre *, PCRE_SPTR16); PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *, char **, char **); +PCRE_EXP_DECL int pcre16_get_stringtable_entries(const pcre *, PCRE_SPTR16, + PCRE_SCHAR16 **, PCRE_SCHAR16 **); PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int, const char **); +PCRE_EXP_DECL int pcre16_get_substring(PCRE_SPTR16, int *, int, int, + PCRE_SPTR16 *); PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int, const char ***); +PCRE_EXP_DECL int pcre16_get_substring_list(PCRE_SPTR16, int *, int, + PCRE_SPTR16 **); PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *); PCRE_EXP_DECL int pcre16_info(const pcre *, int *, int *); PCRE_EXP_DECL const unsigned char *pcre_maketables(void); diff --git a/pcre16_dfa_exec.c b/pcre16_dfa_exec.c new file mode 100644 index 0000000..dc6ea49 --- /dev/null +++ b/pcre16_dfa_exec.c @@ -0,0 +1,45 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2011 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 + +#include "pcre_dfa_exec.c" + +/* End of pcre16_dfa_exec.c */ diff --git a/pcre16_get.c b/pcre16_get.c new file mode 100644 index 0000000..0b9bd61 --- /dev/null +++ b/pcre16_get.c @@ -0,0 +1,45 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2011 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* Generate code with 16 bit character support. */ +#define COMPILE_PCRE16 + +#include "pcre_get.c" + +/* End of pcre16_get.c */ diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c index 99bed29..c0d3ee5 100644 --- a/pcre16_ord2utf16.c +++ b/pcre16_ord2utf16.c @@ -87,7 +87,7 @@ return 2; #else /* SUPPORT_UTF */ (void)(cvalue); /* Keep compiler happy; this function won't ever be */ -(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */ +(void)(buffer); /* called when SUPPORT_UTF is not defined. */ return 0; #endif /* SUPPORT_UTF */ } diff --git a/pcre_compile.c b/pcre_compile.c index 24a7b1c..3fa7c67 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -2357,7 +2357,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); actual length is stored in the compiled code, so we must update "code" here. */ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: ccode = code += GET(code, 1); goto CHECK_CLASS_REPEAT; @@ -2367,7 +2367,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); case OP_NCLASS: ccode = code + PRIV(OP_lengths)[OP_CLASS]; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 CHECK_CLASS_REPEAT: #endif @@ -2980,7 +2980,7 @@ the next item is a character. */ if (next >= 0) switch(op_code) { case OP_CHAR: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF GETCHARTEST(c, previous); #else c = *previous; @@ -2992,13 +2992,13 @@ if (next >= 0) switch(op_code) high-valued characters. */ case OP_CHARI: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF GETCHARTEST(c, previous); #else c = *previous; #endif if (c == next) return FALSE; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { unsigned int othercase; @@ -3011,7 +3011,7 @@ if (next >= 0) switch(op_code) return (unsigned int)c != othercase; } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ return (c != cd->fcc[next]); /* Non-UTF-8 mode */ /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These @@ -3023,7 +3023,7 @@ if (next >= 0) switch(op_code) case OP_NOTI: if ((c = *previous) == next) return TRUE; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { unsigned int othercase; @@ -3036,7 +3036,7 @@ if (next >= 0) switch(op_code) return (unsigned int)c == othercase; } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ return (c == cd->fcc[next]); /* Non-UTF-8 mode */ /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. @@ -3128,7 +3128,7 @@ switch(op_code) { case OP_CHAR: case OP_CHARI: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF GETCHARTEST(c, previous); #else c = *previous; @@ -3358,7 +3358,7 @@ pcre_uint8 classbits[32]; must not do this for other options (e.g. PCRE_EXTENDED) because they may change dynamically as we process the pattern. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF /* PCRE_UTF16 has the same value as PCRE_UTF8. */ BOOL utf = (options & PCRE_UTF8) != 0; pcre_uchar utf_chars[6]; @@ -4150,7 +4150,7 @@ for (;; ptr++) goto LONE_SINGLE_CHARACTER; } -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { /* Braces are required because the */ GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ @@ -4200,7 +4200,9 @@ for (;; ptr++) matching for characters > 127 is available only if UCP support is available. */ -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8) + if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127))) +#elif defined SUPPORT_UTF if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) #elif !(defined COMPILE_PCRE8) if (d > 255) @@ -4214,7 +4216,11 @@ for (;; ptr++) they fit with the basic range. */ #ifdef SUPPORT_UCP +#ifndef COMPILE_PCRE8 + if (utf && (options & PCRE_CASELESS) != 0) +#else if ((options & PCRE_CASELESS) != 0) +#endif { unsigned int occ, ocd; unsigned int cc = c; @@ -4257,12 +4263,25 @@ for (;; ptr++) *class_uchardata++ = XCL_RANGE; #ifdef SUPPORT_UTF +#ifndef COMPILE_PCRE8 + if (utf) + { + class_uchardata += PRIV(ord2utf)(c, class_uchardata); + class_uchardata += PRIV(ord2utf)(d, class_uchardata); + } + else + { + *class_uchardata++ = c; + *class_uchardata++ = d; + } +#else class_uchardata += PRIV(ord2utf)(c, class_uchardata); class_uchardata += PRIV(ord2utf)(d, class_uchardata); -#else +#endif +#else /* SUPPORT_UTF */ *class_uchardata++ = c; *class_uchardata++ = d; -#endif +#endif /* SUPPORT_UTF */ /* With UCP support, we are done. Without UCP support, there is no caseless matching for UTF characters > 127; we can use the bit map @@ -4270,9 +4289,26 @@ for (;; ptr++) can still use */ #ifdef SUPPORT_UCP - continue; /* With next character in the class */ -#else -#ifdef SUPPORT_UTF +#ifndef COMPILE_PCRE8 + if (utf) +#endif + continue; /* With next character in the class */ +#endif /* SUPPORT_UCP */ + +#if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8) + if (utf) + { + if ((options & PCRE_CASELESS) == 0 || c > 127) continue; + /* Adjust upper limit and fall through to set up the map */ + d = 127; + } + else + { + if (c > 255) continue; + /* Adjust upper limit and fall through to set up the map */ + d = 255; + } +#elif defined SUPPORT_UTF && !defined(SUPPORT_UCP) if ((options & PCRE_CASELESS) == 0 || c > 127) continue; /* Adjust upper limit and fall through to set up the map */ d = 127; @@ -4280,10 +4316,9 @@ for (;; ptr++) if (c > 255) continue; /* Adjust upper limit and fall through to set up the map */ d = 255; -#endif /* SUPPORT_UTF */ -#endif /* SUPPORT_UCP */ +#endif /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */ } -#endif /* SUPPORT_UTF8 || COMPILE_PCRE16 */ +#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */ /* We use the bit map for 8 bit mode, or when the characters fall partially or entirely to [0-255] ([0-127] for UCP) ranges. */ @@ -4314,7 +4349,9 @@ for (;; ptr++) /* Handle a character that cannot go in the bit map */ -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8) + if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127))) +#elif defined SUPPORT_UTF if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) #elif !(defined COMPILE_PCRE8) if (c > 255) @@ -4324,13 +4361,26 @@ for (;; ptr++) xclass = TRUE; *class_uchardata++ = XCL_SINGLE; #ifdef SUPPORT_UTF - class_uchardata += PRIV(ord2utf)(c, class_uchardata); -#else - *class_uchardata++ = c; +#ifndef COMPILE_PCRE8 + /* In non 8 bit mode, we can get here even + if we are not in UTF mode. */ + if (!utf) + *class_uchardata++ = c; + else #endif + class_uchardata += PRIV(ord2utf)(c, class_uchardata); +#else /* SUPPORT_UTF */ + *class_uchardata++ = c; +#endif /* SUPPORT_UTF */ #ifdef SUPPORT_UCP +#ifdef COMPILE_PCRE8 if ((options & PCRE_CASELESS) != 0) +#else + /* In non 8 bit mode, we can get here even + if we are not in UTF mode. */ + if (utf && (options & PCRE_CASELESS) != 0) +#endif { unsigned int othercase; if ((othercase = UCD_OTHERCASE(c)) != c) @@ -4415,7 +4465,7 @@ for (;; ptr++) /* For a single, positive character, get the value into mcbuffer, and then we can handle this with the normal one-character code. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && class_lastchar > 127) mclength = PRIV(ord2utf)(class_lastchar, mcbuffer); else @@ -4843,7 +4893,7 @@ for (;; ptr++) else if (*previous == OP_CLASS || *previous == OP_NCLASS || -#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 *previous == OP_XCLASS || #endif *previous == OP_REF || @@ -6635,7 +6685,7 @@ for (;; ptr++) a value > 127. We set its representation in the length/buffer, and then handle it as a data character. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && c > 127) mclength = PRIV(ord2utf)(c, mcbuffer); else @@ -7471,12 +7521,12 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && /* PCRE_UTF16 has the same value as PCRE_UTF8. */ utf = (options & PCRE_UTF8) != 0; -/* Can't support UTF8 unless PCRE has been compiled to include the code. The +/* Can't support UTF unless PCRE has been compiled to include the code. The return of an error code from PRIV(valid_utf)() is a new feature, introduced in release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is not used here. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 && (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) { @@ -7673,7 +7723,7 @@ code = (pcre_uchar *)codestart; &firstchar, &reqchar, NULL, cd, NULL); re->top_bracket = cd->bracount; re->top_backref = cd->top_backref; -re->flags = cd->external_flags; +re->flags = cd->external_flags | PCRE_MODE; if (cd->had_accept) reqchar = REQ_NONE; /* Must disable after (*ACCEPT) */ diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index a5bc745..58197ce 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -413,7 +413,7 @@ const pcre_uchar *start_subject = md->start_subject; const pcre_uchar *end_subject = md->end_subject; const pcre_uchar *start_code = md->start_code; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF BOOL utf = (md->poptions & PCRE_UTF8) != 0; #else BOOL utf = FALSE; @@ -471,7 +471,7 @@ if (*first_op == OP_REVERSE) /* If we can't go back the amount required for the longest lookbehind pattern, go back as far as we can; some alternatives may still be viable. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF /* In character mode we have to step back character by character */ if (utf) @@ -603,9 +603,9 @@ for (;;) if (ptr < end_subject) { clen = 1; /* Number of bytes in the character */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { GETCHARLEN(c, ptr, clen); } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ c = *ptr; } else @@ -692,9 +692,9 @@ for (;;) if (coptable[codevalue] > 0) { dlen = 1; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ d = code[coptable[codevalue]]; if (codevalue >= OP_TYPESTAR) { @@ -957,8 +957,8 @@ for (;;) { const pcre_uchar *temp = ptr - 1; if (temp < md->start_used_ptr) md->start_used_ptr = temp; -#ifdef SUPPORT_UTF8 - if (utf) BACKCHAR(temp); +#ifdef SUPPORT_UTF + if (utf) { BACKCHAR(temp); } #endif GETCHARTEST(d, temp); #ifdef SUPPORT_UCP @@ -1983,28 +1983,28 @@ for (;;) case OP_CHARI: if (clen == 0) break; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) { if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else { unsigned int othercase; - if (c < 128) othercase = fcc[c]; else - - /* If we have Unicode property support, we can use it to test the - other case of the character. */ - + if (c < 128) + othercase = fcc[c]; + else + /* If we have Unicode property support, we can use it to test the + other case of the character. */ #ifdef SUPPORT_UCP - othercase = UCD_OTHERCASE(c); + othercase = UCD_OTHERCASE(c); #else - othercase = NOTACHAR; + othercase = NOTACHAR; #endif if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } } } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } @@ -2207,7 +2207,7 @@ for (;;) unsigned int otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && d >= 128) { #ifdef SUPPORT_UCP @@ -2215,7 +2215,7 @@ for (;;) #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) @@ -2254,7 +2254,7 @@ for (;;) unsigned int otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && d >= 128) { #ifdef SUPPORT_UCP @@ -2262,7 +2262,7 @@ for (;;) #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) @@ -2299,7 +2299,7 @@ for (;;) unsigned int otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && d >= 128) { #ifdef SUPPORT_UCP @@ -2307,7 +2307,7 @@ for (;;) #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) @@ -2336,7 +2336,7 @@ for (;;) unsigned int otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && d >= 128) { #ifdef SUPPORT_UCP @@ -2344,7 +2344,7 @@ for (;;) #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) @@ -2380,7 +2380,7 @@ for (;;) unsigned int otherd = NOTACHAR; if (caseless) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && d >= 128) { #ifdef SUPPORT_UCP @@ -2388,7 +2388,7 @@ for (;;) #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) @@ -2438,7 +2438,7 @@ for (;;) else { ecode = code + GET(code, 1); - if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE); + if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); } /* At this point, isinclass is set for all kinds of class, and ecode @@ -2994,10 +2994,17 @@ Returns: > 0 => number of match offset pairs placed in offsets < -1 => some kind of unexpected problem */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, + PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, + int offsetcount, int *workspace, int wscount) +#endif { real_pcre *re = (real_pcre *)argument_re; dfa_match_data match_block; @@ -3062,14 +3069,15 @@ if (re->magic_number != MAGIC_NUMBER) if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } +if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; /* Set some local values */ -current_subject = (const unsigned char *)subject + start_offset; -end_subject = (const unsigned char *)subject + length; +current_subject = (const pcre_uchar *)subject + start_offset; +end_subject = (const pcre_uchar *)subject + length; req_char_ptr = current_subject - 1; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF /* PCRE_UTF16 has the same value as PCRE_UTF8. */ utf = (re->options & PCRE_UTF8) != 0; #else @@ -3083,7 +3091,7 @@ anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || md->start_code = (const pcre_uchar *)argument_re + re->name_table_offset + re->name_count * re->name_entry_size; -md->start_subject = (const unsigned char *)subject; +md->start_subject = (const pcre_uchar *)subject; md->end_subject = end_subject; md->start_offset = start_offset; md->moptions = options; diff --git a/pcre_exec.c b/pcre_exec.c index c5932f7..9aa07a7 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -2968,7 +2968,7 @@ for (;;) MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH); + if (!PRIV(xclass)(c, data, utf)) MRRETURN(MATCH_NOMATCH); } /* If max == min we can continue with the main loop without the @@ -2992,7 +2992,7 @@ for (;;) MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH); + if (!PRIV(xclass)(c, data, utf)) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -3015,7 +3015,7 @@ for (;;) #else c = *eptr; #endif - if (!PRIV(xclass)(c, data)) break; + if (!PRIV(xclass)(c, data, utf)) break; eptr += len; } for(;;) @@ -6113,6 +6113,7 @@ if (re->magic_number != MAGIC_NUMBER) if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } +if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; /* Set up other data */ diff --git a/pcre_fullinfo.c b/pcre_fullinfo.c index 2bdf24b..078f5fd 100644 --- a/pcre_fullinfo.c +++ b/pcre_fullinfo.c @@ -91,6 +91,7 @@ if (re->magic_number != MAGIC_NUMBER) if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } +if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; switch (what) { @@ -65,8 +65,13 @@ Returns: the number of the named parentheses, or a negative number (PCRE_ERROR_NOSUBSTRING) if not found */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_stringnumber(const pcre *code, const char *stringname) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_get_stringnumber(const pcre *code, PCRE_SPTR16 stringname) +#endif { int rc; int entrysize; @@ -87,7 +92,8 @@ while (top > bot) { int mid = (top + bot) / 2; pcre_uchar *entry = nametable + entrysize*mid; - int c = strcmp(stringname, (char *)(entry + 2)); + int c = STRCMP_UC_UC((pcre_uchar *)stringname, + (pcre_uchar *)(entry + IMM2_SIZE)); if (c == 0) return (entry[0] << 8) + entry[1]; if (c > 0) bot = mid + 1; else top = mid; } @@ -114,9 +120,15 @@ Returns: the length of each entry, or a negative number (PCRE_ERROR_NOSUBSTRING) if not found */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_stringtable_entries(const pcre *code, const char *stringname, char **firstptr, char **lastptr) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_get_stringtable_entries(const pcre *code, PCRE_SPTR16 stringname, + PCRE_SCHAR16 **firstptr, PCRE_SCHAR16 **lastptr) +#endif { int rc; int entrysize; @@ -138,23 +150,31 @@ while (top > bot) { int mid = (top + bot) / 2; pcre_uchar *entry = nametable + entrysize*mid; - int c = strcmp(stringname, (char *)(entry + 2)); + int c = STRCMP_UC_UC((pcre_uchar *)stringname, + (pcre_uchar *)(entry + IMM2_SIZE)); if (c == 0) { pcre_uchar *first = entry; pcre_uchar *last = entry; while (first > nametable) { - if (strcmp(stringname, (char *)(first - entrysize + 2)) != 0) break; + if (STRCMP_UC_UC((pcre_uchar *)stringname, + (pcre_uchar *)(first - entrysize + IMM2_SIZE)) != 0) break; first -= entrysize; } while (last < lastentry) { - if (strcmp(stringname, (char *)(last + entrysize + 2)) != 0) break; + if (STRCMP_UC_UC((pcre_uchar *)stringname, + (pcre_uchar *)(last + entrysize + IMM2_SIZE)) != 0) break; last += entrysize; } +#ifdef COMPILE_PCRE8 *firstptr = (char *)first; *lastptr = (char *)last; +#else + *firstptr = (PCRE_SCHAR16 *)first; + *lastptr = (PCRE_SCHAR16 *)last; +#endif return entrysize; } if (c > 0) bot = mid + 1; else top = mid; @@ -182,16 +202,29 @@ Returns: the number of the first that is set, or a negative number on error */ +#ifdef COMPILE_PCRE8 static int get_first_set(const pcre *code, const char *stringname, int *ovector) +#else +static int +get_first_set(const pcre *code, PCRE_SPTR16 stringname, int *ovector) +#endif { const real_pcre *re = (const real_pcre *)code; int entrysize; -char *first, *last; +pcre_uchar *first, *last; pcre_uchar *entry; +#ifdef COMPILE_PCRE8 if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0) return pcre_get_stringnumber(code, stringname); -entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last); +entrysize = pcre_get_stringtable_entries(code, stringname, + (char **)&first, (char **)&last); +#else +if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0) + return pcre16_get_stringnumber(code, stringname); +entrysize = pcre16_get_stringtable_entries(code, stringname, + (PCRE_SCHAR16 **)&first, (PCRE_SCHAR16 **)&last); +#endif if (entrysize <= 0) return entrysize; for (entry = (pcre_uchar *)first; entry <= (pcre_uchar *)last; entry += entrysize) { @@ -231,9 +264,15 @@ Returns: if successful: PCRE_ERROR_NOSUBSTRING (-7) no such captured substring */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_copy_substring(const char *subject, int *ovector, int stringcount, int stringnumber, char *buffer, int size) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_copy_substring(PCRE_SPTR16 subject, int *ovector, int stringcount, + int stringnumber, PCRE_SCHAR16 *buffer, int size) +#endif { int yield; if (stringnumber < 0 || stringnumber >= stringcount) @@ -241,7 +280,7 @@ if (stringnumber < 0 || stringnumber >= stringcount) stringnumber *= 2; yield = ovector[stringnumber+1] - ovector[stringnumber]; if (size < yield + 1) return PCRE_ERROR_NOMEMORY; -memcpy(buffer, subject + ovector[stringnumber], yield); +memcpy(buffer, subject + ovector[stringnumber], IN_UCHARS(yield)); buffer[yield] = 0; return yield; } @@ -276,13 +315,23 @@ Returns: if successful: PCRE_ERROR_NOSUBSTRING (-7) no such captured substring */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, char *buffer, int size) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_copy_named_substring(const pcre *code, PCRE_SPTR16 subject, int *ovector, + int stringcount, PCRE_SPTR16 stringname, PCRE_SCHAR16 *buffer, int size) +#endif { int n = get_first_set(code, stringname, ovector); if (n <= 0) return n; +#ifdef COMPILE_PCRE8 return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size); +#else +return pcre16_copy_substring(subject, ovector, stringcount, n, buffer, size); +#endif } @@ -308,29 +357,39 @@ Returns: if successful: 0 PCRE_ERROR_NOMEMORY (-6) failed to get store */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_substring_list(const char *subject, int *ovector, int stringcount, const char ***listptr) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_get_substring_list(PCRE_SPTR16 subject, int *ovector, int stringcount, + PCRE_SPTR16 **listptr) +#endif { int i; -int size = sizeof(char *); +int size = sizeof(pcre_uchar *); int double_count = stringcount * 2; -char **stringlist; -char *p; +pcre_uchar **stringlist; +pcre_uchar *p; for (i = 0; i < double_count; i += 2) - size += sizeof(char *) + ovector[i+1] - ovector[i] + 1; + size += sizeof(pcre_uchar *) + IN_UCHARS(ovector[i+1] - ovector[i] + 1); -stringlist = (char **)(pcre_malloc)(size); +stringlist = (pcre_uchar **)(pcre_malloc)(size); if (stringlist == NULL) return PCRE_ERROR_NOMEMORY; +#ifdef COMPILE_PCRE8 *listptr = (const char **)stringlist; -p = (char *)(stringlist + stringcount + 1); +#else +*listptr = (PCRE_SPTR16 *)stringlist; +#endif +p = (pcre_uchar *)(stringlist + stringcount + 1); for (i = 0; i < double_count; i += 2) { int len = ovector[i+1] - ovector[i]; - memcpy(p, subject + ovector[i], len); + memcpy(p, subject + ovector[i], IN_UCHARS(len)); *stringlist++ = p; p += len; *p++ = 0; @@ -353,8 +412,13 @@ Argument: the result of a previous pcre_get_substring_list() Returns: nothing */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN void PCRE_CALL_CONVENTION pcre_free_substring_list(const char **pointer) +#else +PCRE_EXP_DEFN void PCRE_CALL_CONVENTION +pcre16_free_substring_list(PCRE_SPTR16 *pointer) +#endif { (pcre_free)((void *)pointer); } @@ -386,21 +450,31 @@ Returns: if successful: PCRE_ERROR_NOSUBSTRING (-7) substring not present */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_substring(const char *subject, int *ovector, int stringcount, int stringnumber, const char **stringptr) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_get_substring(PCRE_SPTR16 subject, int *ovector, int stringcount, + int stringnumber, PCRE_SPTR16 *stringptr) +#endif { int yield; -char *substring; +pcre_uchar *substring; if (stringnumber < 0 || stringnumber >= stringcount) return PCRE_ERROR_NOSUBSTRING; stringnumber *= 2; yield = ovector[stringnumber+1] - ovector[stringnumber]; -substring = (char *)(pcre_malloc)(yield + 1); +substring = (pcre_uchar *)(pcre_malloc)(IN_UCHARS(yield + 1)); if (substring == NULL) return PCRE_ERROR_NOMEMORY; -memcpy(substring, subject + ovector[stringnumber], yield); +memcpy(substring, subject + ovector[stringnumber], IN_UCHARS(yield)); substring[yield] = 0; -*stringptr = substring; +#ifdef COMPILE_PCRE8 +*stringptr = (const char *)substring; +#else +*stringptr = (PCRE_SPTR16)substring; +#endif return yield; } @@ -433,13 +507,23 @@ Returns: if successful: PCRE_ERROR_NOSUBSTRING (-7) no such captured substring */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, const char **stringptr) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_get_named_substring(const pcre *code, PCRE_SPTR16 subject, int *ovector, + int stringcount, PCRE_SPTR16 stringname, PCRE_SPTR16 *stringptr) +#endif { int n = get_first_set(code, stringname, ovector); if (n <= 0) return n; +#ifdef COMPILE_PCRE8 return pcre_get_substring(subject, ovector, stringcount, n, stringptr); +#else +return pcre16_get_substring(subject, ovector, stringcount, n, stringptr); +#endif } @@ -456,8 +540,13 @@ Argument: the result of a previous pcre_get_substring() Returns: nothing */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN void PCRE_CALL_CONVENTION pcre_free_substring(const char *pointer) +#else +PCRE_EXP_DEFN void PCRE_CALL_CONVENTION +pcre16_free_substring(PCRE_SPTR16 pointer) +#endif { (pcre_free)((void *)pointer); } diff --git a/pcre_info.c b/pcre_info.c index e7b3730..52d593a 100644 --- a/pcre_info.c +++ b/pcre_info.c @@ -88,6 +88,7 @@ if (re->magic_number != MAGIC_NUMBER) re = PRIV(try_flipped)(re, &internal_re, NULL, NULL); if (re == NULL) return PCRE_ERROR_BADMAGIC; } +if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS); if (first_char != NULL) *first_char = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char : diff --git a/pcre_internal.h b/pcre_internal.h index e748809..9a20e73 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -832,15 +832,21 @@ are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as the restrictions on partial matching have been lifted. It remains for backwards compatibility. */ -#define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */ -#define PCRE_FIRSTSET 0x0002 /* first_char is set */ -#define PCRE_REQCHSET 0x0004 /* req_byte is set */ -#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */ -#define PCRE_JCHANGED 0x0010 /* j option used in regex */ -#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */ -#define PCRE_HASTHEN 0x0040 /* pattern contains (*THEN) */ -#define PCRE_FCH_CASELESS 0x0080 /* caseless first char */ -#define PCRE_RCH_CASELESS 0x0100 /* caseless requested char */ +#ifdef COMPILE_PCRE8 +#define PCRE_MODE 0x0001 /* compiled in 8 bit mode */ +#endif +#ifdef COMPILE_PCRE16 +#define PCRE_MODE 0x0002 /* compiled in 16 bit mode */ +#endif +#define PCRE_FIRSTSET 0x0010 /* first_char is set */ +#define PCRE_FCH_CASELESS 0x0020 /* caseless first char */ +#define PCRE_REQCHSET 0x0040 /* req_byte is set */ +#define PCRE_RCH_CASELESS 0x0080 /* caseless requested char */ +#define PCRE_STARTLINE 0x0100 /* start after \n for multiline */ +#define PCRE_NOPARTIAL 0x0200 /* can't use partial with this regex */ +#define PCRE_JCHANGED 0x0400 /* j option used in regex */ +#define PCRE_HASCRORLF 0x0800 /* explicit \r or \n in pattern */ +#define PCRE_HASTHEN 0x1000 /* pattern contains (*THEN) */ /* Flags for the "extra" block produced by pcre_study(). */ @@ -917,7 +923,7 @@ for) in a minority area (EBCDIC platforms), this is not sensible. Any application that did need both could compile two versions of the library, using macros to give the functions distinct names. */ -#ifndef SUPPORT_UTF8 +#ifndef SUPPORT_UTF /* UTF-8 support is not enabled; use the platform-dependent character literals so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ @@ -1186,7 +1192,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ #define STRING_UCP_RIGHTPAR "UCP)" #define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" -#else /* SUPPORT_UTF8 */ +#else /* SUPPORT_UTF */ /* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This works in both modes non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode @@ -1446,7 +1452,7 @@ only. */ #define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ /* Escape items that are just an encoding of a particular data value. */ @@ -2249,7 +2255,7 @@ extern real_pcre *PRIV(try_flipped)(const real_pcre *, real_pcre *, extern int PRIV(valid_utf)(PCRE_PUCHAR, int, int *); extern BOOL PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, int *, BOOL); -extern BOOL PRIV(xclass)(int, const pcre_uchar *); +extern BOOL PRIV(xclass)(int, const pcre_uchar *, BOOL); #ifdef SUPPORT_JIT extern void PRIV(jit_compile)(const real_pcre *, pcre_extra *); diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index 8c6b206..50376b9 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -1311,7 +1311,7 @@ static unsigned int char_get_othercase_bit(compiler_common *common, pcre_uchar* { /* Detects if the character and its othercase has only 1 bit difference. */ unsigned int c, oc, bit; -#if defined SUPPORT_UTF8 && defined COMPILE_PCRE8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 int n; #endif diff --git a/pcre_jit_test.c b/pcre_jit_test.c index d82af25..5b66bac 100644 --- a/pcre_jit_test.c +++ b/pcre_jit_test.c @@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE. Non-letter characters: \xc2\xa1 = 0xa1 = (Inverted Exclamation Mark) \xf3\xa9\xb7\x80 = 0xe9dc0 = 957888 + \xed\xa0\x80 = 55296 = 0xd800 (Invalid UTF character) + \xed\xb0\x80 = 56320 = 0xdc00 (Invalid UTF character) Newlines: \xc2\x85 = 0x85 = 133 (NExt Line = NEL) \xe2\x80\xa8 = 0x2028 = 8232 (Line Separator) @@ -99,13 +101,19 @@ int main(void) #error SUPPORT_PCRE8 or SUPPORT_PCRE16 must be defined #endif -#define MUA (PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANYCRLF) -#define MUAP (PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANYCRLF | PCRE_UCP) -#define CMUA (PCRE_CASELESS | PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANYCRLF) -#define CMUAP (PCRE_CASELESS | PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANYCRLF | PCRE_UCP) -#define MA (PCRE_MULTILINE | PCRE_NEWLINE_ANYCRLF) -#define MAP (PCRE_MULTILINE | PCRE_NEWLINE_ANYCRLF | PCRE_UCP) -#define CMA (PCRE_CASELESS | PCRE_MULTILINE | PCRE_NEWLINE_ANYCRLF) +#define MUA (PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANYCRLF) +#define MUAP (PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANYCRLF | PCRE_UCP) +#define CMUA (PCRE_CASELESS | PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANYCRLF) +#define CMUAP (PCRE_CASELESS | PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANYCRLF | PCRE_UCP) +#define MA (PCRE_MULTILINE | PCRE_NEWLINE_ANYCRLF) +#define MAP (PCRE_MULTILINE | PCRE_NEWLINE_ANYCRLF | PCRE_UCP) +#define CMA (PCRE_CASELESS | PCRE_MULTILINE | PCRE_NEWLINE_ANYCRLF) + +#define OFFSET_MASK 0xffff +#define F_DIFF 0x010000 +#define F_FORCECONV 0x020000 +#define F_NO8 0x100000 +#define F_NO16 0x200000 struct regression_test_case { int flags; @@ -521,7 +529,7 @@ static struct regression_test_case regression_test_cases[] = { { MUA, 0, "(?(?!(b))a*|b*)+k", "ababbalbbadabak" }, { MUA, 0, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" }, { MUA, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" }, - { MUA | PCRE_BUG, 0, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" }, + { MUA, 0 | F_DIFF, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" }, { MUA, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" }, { MUA, 0, "(?(?=a)a*|b*)+k", "ababbalbbadabak" }, { MUA, 0, "(?(?!b)a*|b*)+k", "ababbalbbadabak" }, @@ -535,11 +543,11 @@ static struct regression_test_case regression_test_cases[] = { { MUA, 0, "((?=\\w{5})\\w(?(?=\\w*k)\\d|[a-f_])*\\w\\s)+", "mol m10kk m088k _f_a_ mbkkl" }, { MUA, 0, "(c)?\?(?(1)a|b)", "cdcaa" }, { MUA, 0, "(c)?\?(?(1)a|b)", "cbb" }, - { MUA | PCRE_BUG, 0, "(?(?=(a))(aaaa|a?))+aak", "aaaaab aaaaak" }, + { MUA, 0 | F_DIFF, "(?(?=(a))(aaaa|a?))+aak", "aaaaab aaaaak" }, { MUA, 0, "(?(?=a)(aaaa|a?))+aak", "aaaaab aaaaak" }, { MUA, 0, "(?(?!(b))(aaaa|a?))+aak", "aaaaab aaaaak" }, { MUA, 0, "(?(?!b)(aaaa|a?))+aak", "aaaaab aaaaak" }, - { MUA | PCRE_BUG, 0, "(?(?=(a))a*)+aak", "aaaaab aaaaak" }, + { MUA, 0 | F_DIFF, "(?(?=(a))a*)+aak", "aaaaab aaaaak" }, { MUA, 0, "(?(?=a)a*)+aak", "aaaaab aaaaak" }, { MUA, 0, "(?(?!(b))a*)+aak", "aaaaab aaaaak" }, { MUA, 0, "(?(?!b)a*)+aak", "aaaaab aaaaak" }, @@ -601,6 +609,20 @@ static struct regression_test_case regression_test_cases[] = { { MUA, 0, "((?(R)(?:aaaa|a)|(?:(aaaa)|(a)))+)(?1)$", "aaaaaaaaaa aaaa" }, { MUA, 0, "(?P<Name>a(?(R&Name)a|b))(?1)", "aab abb abaa" }, + /* 16 bit specific tests. */ + { CMA, 0 | F_FORCECONV, "\xc3\xa1", "\xc3\x81\xc3\xa1" }, + { CMA, 0 | F_FORCECONV, "\xe1\xbd\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" }, + { CMA, 0 | F_FORCECONV, "[\xc3\xa1]", "\xc3\x81\xc3\xa1" }, + { CMA, 0 | F_FORCECONV, "[\xe1\xbd\xb8]", "\xe1\xbf\xb8\xe1\xbd\xb8" }, + { CMA, 0 | F_FORCECONV | F_NO8, "\xed\xa0\x80\\x{d800}\xed\xb0\x80\\x{dc00}", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80" }, + { CMA, 0 | F_FORCECONV | F_NO8, "[\xed\xa0\x80\\x{d800}]{1,2}?[\xed\xb0\x80\\x{dc00}]{1,2}?#", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80#" }, + { CMA, 0 | F_FORCECONV, "[\xed\xa0\x80\xed\xb0\x80#]{0,3}(?<=\xed\xb0\x80.)", "\xed\xa0\x80#\xed\xa0\x80##\xed\xb0\x80\xed\xa0\x80" }, + { CMA, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\x9f\xbf\xed\xa0\x83" }, + { CMA, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\xb4\x80\xed\xb3\xb0" }, + { CMA, 0 | F_FORCECONV | F_NO8, "[\\x{d800}-\\x{dcff}]", "\xed\x9f\xbf\xed\xa0\x83" }, + { CMA, 0 | F_FORCECONV | F_NO8, "[\\x{d800}-\\x{dcff}]", "\xed\xb4\x80\xed\xb3\xb0" }, + { CMA, 0 | F_FORCECONV, "[\xed\xa0\x80-\xef\xbf\xbf]+[\x1-\xed\xb0\x80]+#", "\xed\xa0\x85\xc3\x81\xed\xa0\x85\xef\xbf\xb0\xc2\x85\xed\xa9\x89#" }, + /* Deep recursion. */ { MUA, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " }, { MUA, 0, "(?:((?:(?:(?:\\w*?)+)??|(?>\\w)?|\\w*+)*)+)+?\\s", "aa+ " }, @@ -721,7 +743,7 @@ static int regression_tests(void) int ovector8_2[32]; int return_value8_1, return_value8_2; int utf8 = 0, ucp8 = 0; - int disabled_flags8 = PCRE_BUG; + int disabled_flags8 = 0; #endif #ifdef SUPPORT_PCRE16 pcre *re16; @@ -730,7 +752,7 @@ static int regression_tests(void) int ovector16_2[32]; int return_value16_1, return_value16_2; int utf16 = 0, ucp16 = 0; - int disabled_flags16 = PCRE_BUG; + int disabled_flags16 = 0; int length16; #endif @@ -765,9 +787,11 @@ static int regression_tests(void) error = NULL; #ifdef SUPPORT_PCRE8 - re8 = pcre_compile(current->pattern, - current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags8), - &error, &err_offs, NULL); + re8 = NULL; + if (!(current->start_offset & F_NO8)) + re8 = pcre_compile(current->pattern, + current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags8), + &error, &err_offs, NULL); extra8 = NULL; if (re8) { @@ -784,17 +808,20 @@ static int regression_tests(void) pcre_free(re8); re8 = NULL; } - } else if (utf8 && ucp8) + } else if (utf8 && ucp8 && !(current->start_offset & F_NO8)) printf("\n8 bit: Cannot compile pattern: %s\n", current->pattern); #endif #ifdef SUPPORT_PCRE16 - if (current->flags & PCRE_UTF8) + if ((current->flags & PCRE_UTF8) || (current->start_offset & F_FORCECONV)) convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH); else copy_char8_to_char16(current->pattern, regtest_buf, REGTEST_MAX_LENGTH); - re16 = pcre16_compile(regtest_buf, - current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags16), - &error, &err_offs, NULL); + + re16 = NULL; + if (!(current->start_offset & F_NO16)) + re16 = pcre16_compile(regtest_buf, + current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags16), + &error, &err_offs, NULL); extra16 = NULL; if (re16) { @@ -811,7 +838,7 @@ static int regression_tests(void) pcre_free(re16); re16 = NULL; } - } else if (utf16 && ucp16) + } else if (utf16 && ucp16 && !(current->start_offset & F_NO16)) printf("\n16 bit: Cannot compile pattern: %s\n", current->pattern); #endif @@ -822,16 +849,15 @@ static int regression_tests(void) #ifdef SUPPORT_PCRE8 return_value8_1 = -1000; return_value8_2 = -1000; + for (i = 0; i < 32; ++i) + ovector8_1[i] = -2; + for (i = 0; i < 32; ++i) + ovector8_2[i] = -2; if (re8) { setstack(extra8, 0); - for (i = 0; i < 32; ++i) - ovector8_1[i] = -2; - return_value8_1 = pcre_exec(re8, extra8, current->input, strlen(current->input), current->start_offset, + return_value8_1 = pcre_exec(re8, extra8, current->input, strlen(current->input), current->start_offset & OFFSET_MASK, current->flags & (PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART), ovector8_1, 32); - - for (i = 0; i < 32; ++i) - ovector8_2[i] = -2; - return_value8_2 = pcre_exec(re8, NULL, current->input, strlen(current->input), current->start_offset, + return_value8_2 = pcre_exec(re8, NULL, current->input, strlen(current->input), current->start_offset & OFFSET_MASK, current->flags & (PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART), ovector8_2, 32); } #endif @@ -839,32 +865,30 @@ static int regression_tests(void) #ifdef SUPPORT_PCRE16 return_value16_1 = -1000; return_value16_2 = -1000; + for (i = 0; i < 32; ++i) + ovector16_1[i] = -2; + for (i = 0; i < 32; ++i) + ovector16_2[i] = -2; if (re16) { setstack(extra16, 0); - if (current->flags & PCRE_UTF8) + if ((current->flags & PCRE_UTF8) || (current->start_offset & F_FORCECONV)) length16 = convert_utf8_to_utf16(current->input, regtest_buf, regtest_offsetmap, REGTEST_MAX_LENGTH); else length16 = copy_char8_to_char16(current->input, regtest_buf, REGTEST_MAX_LENGTH); - - for (i = 0; i < 32; ++i) - ovector16_1[i] = -2; - return_value16_1 = pcre16_exec(re16, extra16, regtest_buf, length16, current->start_offset, + return_value16_1 = pcre16_exec(re16, extra16, regtest_buf, length16, current->start_offset & OFFSET_MASK, current->flags & (PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART), ovector16_1, 32); - - for (i = 0; i < 32; ++i) - ovector16_2[i] = -2; - return_value16_2 = pcre16_exec(re16, NULL, regtest_buf, length16, current->start_offset, + return_value16_2 = pcre16_exec(re16, NULL, regtest_buf, length16, current->start_offset & OFFSET_MASK, current->flags & (PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART), ovector16_2, 32); } #endif - /* If PCRE_BUG is set, just run the test, but do not compare the results. + /* If F_DIFF is set, just run the test, but do not compare the results. Segfaults can still be captured. */ is_succesful = 1; - if (!(current->flags & PCRE_BUG)) { + if (!(current->start_offset & F_DIFF)) { #if defined SUPPORT_PCRE8 && defined SUPPORT_PCRE16 - if (utf8 == utf16) { + if (utf8 == utf16 && !(current->start_offset & F_FORCECONV)) { /* All results must be the same. */ if (return_value8_1 != return_value8_2 || return_value8_1 != return_value16_1 || return_value8_1 != return_value16_2) { printf("\n8 and 16 bit: Return value differs(%d:%d:%d:%d): [%d] '%s' @ '%s'\n", @@ -947,7 +971,7 @@ static int regression_tests(void) } #endif - /* printf("[%d-%d]%s", ovector1[0], ovector1[1], (current->flags & PCRE_CASELESS) ? "C" : ""); */ + /* printf("[%d-%d|%d-%d]%s", ovector8_1[0], ovector8_1[1], ovector16_1[0], ovector16_1[1], (current->flags & PCRE_CASELESS) ? "C" : ""); */ printf("."); fflush(stdout); current++; @@ -962,5 +986,4 @@ static int regression_tests(void) } } - /* End of pcre_jit_test.c */ diff --git a/pcre_newline.c b/pcre_newline.c index d618b80..2343f73 100644 --- a/pcre_newline.c +++ b/pcre_newline.c @@ -84,7 +84,7 @@ if (utf) GETCHAR(c, ptr); } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) @@ -150,7 +150,7 @@ if (utf) GETCHAR(c, ptr); } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) diff --git a/pcre_ord2utf8.c b/pcre_ord2utf8.c index b374987..67cf529 100644 --- a/pcre_ord2utf8.c +++ b/pcre_ord2utf8.c @@ -65,7 +65,7 @@ Returns: number of characters placed in the buffer int PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF register int i, j; @@ -88,7 +88,7 @@ return i + 1; #else (void)(cvalue); /* Keep compiler happy; this function won't ever be */ -(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */ +(void)(buffer); /* called when SUPPORT_UTF is not defined. */ return 0; #endif diff --git a/pcre_study.c b/pcre_study.c index 3f25c3a..493108e 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -323,7 +323,7 @@ for (;;) /* Check a class for variable quantification */ -#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS]; /* Fall through */ @@ -824,7 +824,7 @@ do case OP_SOM: case OP_THEN: case OP_THEN_ARG: -#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: #endif return SSB_FAIL; @@ -1325,6 +1325,16 @@ if (re == NULL || re->magic_number != MAGIC_NUMBER) return NULL; } +if ((re->flags & PCRE_MODE) == 0) + { +#ifdef COMPILE_PCRE8 + *errorptr = "argument is compiled in 16 bit mode"; +#else + *errorptr = "argument is compiled in 8 bit mode"; +#endif + return NULL; + } + if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) { *errorptr = "unknown or incorrect option bit(s) set"; @@ -1346,9 +1356,16 @@ if ((re->options & PCRE_ANCHORED) == 0 && /* Set the character tables in the block that is passed around */ tables = re->tables; + +#ifdef COMPILE_PCRE8 if (tables == NULL) (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, (void *)(&tables)); +#else + if (tables == NULL) + (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, + (void *)(&tables)); +#endif compile_block.lcc = tables + lcc_offset; compile_block.fcc = tables + fcc_offset; diff --git a/pcre_valid_utf8.c b/pcre_valid_utf8.c index bbab87f..05d82f9 100644 --- a/pcre_valid_utf8.c +++ b/pcre_valid_utf8.c @@ -105,7 +105,7 @@ Returns: = 0 if the string is a valid UTF-8 string int PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF register PCRE_PUCHAR p; if (length < 0) @@ -288,7 +288,7 @@ for (p = string; length-- > 0; p++) } } -#else /* SUPPORT_UTF8 */ +#else /* SUPPORT_UTF */ (void)(string); /* Keep picky compilers happy */ (void)(length); #endif diff --git a/pcre_xclass.c b/pcre_xclass.c index cdb9d07..1c2b65a 100644 --- a/pcre_xclass.c +++ b/pcre_xclass.c @@ -64,11 +64,17 @@ Returns: TRUE if character matches, else FALSE */ BOOL -PRIV(xclass)(int c, const pcre_uchar *data) +PRIV(xclass)(int c, const pcre_uchar *data, BOOL utf) { int t; BOOL negated = (*data & XCL_NOT) != 0; +(void)utf; +#ifdef COMPILE_PCRE8 +/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */ +utf = TRUE; +#endif + /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ @@ -91,13 +97,30 @@ while ((t = *data++) != XCL_END) int x, y; if (t == XCL_SINGLE) { - GETCHARINC(x, data); +#ifdef SUPPORT_UTF + if (utf) + { + GETCHARINC(x, data); /* macro generates multiple statements */ + } + else +#endif + x = *data++; if (c == x) return !negated; } else if (t == XCL_RANGE) { - GETCHARINC(x, data); - GETCHARINC(y, data); +#ifdef SUPPORT_UTF + if (utf) + { + GETCHARINC(x, data); /* macro generates multiple statements */ + GETCHARINC(y, data); /* macro generates multiple statements */ + } + else +#endif + { + x = *data++; + y = *data++; + } if (c >= x && c <= y) return !negated; } diff --git a/pcreposix.c b/pcreposix.c index 2dc1561..0426e2e 100644 --- a/pcreposix.c +++ b/pcreposix.c @@ -401,6 +401,7 @@ switch(rc) case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE; case PCRE_ERROR_BADUTF8: return REG_INVARG; case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG; + case PCRE_ERROR_BADMODE: return REG_INVARG; default: return REG_ASSERT; } } |