summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-05 20:12:24 +0000
committerzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-05 20:12:24 +0000
commita9839b968cee5828bf35dbcb05a31859a49ab7a2 (patch)
tree836125e6c0ea7958e295ccda9f7d060b05102430
parent216818740b54b629e7bd59cd49f783c72e244e23 (diff)
downloadpcre-a9839b968cee5828bf35dbcb05a31859a49ab7a2.tar.gz
Improving UTF-16 support by fixing a lot of issues.
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@785 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--Makefile.am3
-rw-r--r--pcre.h.in10
-rw-r--r--pcre16_fullinfo.c45
-rw-r--r--pcre16_info.c45
-rw-r--r--pcre16_version.c45
-rw-r--r--pcre_compile.c98
-rw-r--r--pcre_dfa_exec.c29
-rw-r--r--pcre_exec.c49
-rw-r--r--pcre_fullinfo.c6
-rw-r--r--pcre_info.c11
-rw-r--r--pcre_internal.h8
-rw-r--r--pcre_jit_compile.c206
-rw-r--r--pcre_newline.c30
-rw-r--r--pcre_printint.src4
-rw-r--r--pcre_study.c113
-rw-r--r--pcre_version.c5
16 files changed, 543 insertions, 164 deletions
diff --git a/Makefile.am b/Makefile.am
index c939f9f..817b01a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -212,6 +212,8 @@ libpcre16_la_SOURCES = \
pcre16_chartables.c \
pcre16_compile.c \
pcre16_exec.c \
+ pcre16_fullinfo.c \
+ pcre16_info.c \
pcre16_jit_compile.c \
pcre16_newline.c \
pcre16_ord2utf16.c \
@@ -222,6 +224,7 @@ libpcre16_la_SOURCES = \
pcre16_ucd.c \
pcre16_utf16_utils.c \
pcre16_valid_utf16.c \
+ pcre16_version.c \
pcre16_xclass.c
## This file is generated as part of the building process, so don't distribute.
diff --git a/pcre.h.in b/pcre.h.in
index 7b2bca5..b9ec777 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -367,6 +367,8 @@ PCRE_EXP_DECL void pcre_free_substring(const char *);
PCRE_EXP_DECL void pcre_free_substring_list(const char **);
PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int,
void *);
+PCRE_EXP_DECL int pcre16_fullinfo(const pcre *, const pcre_extra *, int,
+ void *);
PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *,
int *, int, const char *, const char **);
PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *);
@@ -377,15 +379,19 @@ PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int,
PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
const char ***);
PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
+PCRE_EXP_DECL int pcre16_info(const pcre *, int *, int *);
PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
PCRE_EXP_DECL int pcre_refcount(pcre *, int);
-PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
- PCRE_SPTR16, int, int);
PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
PCRE_EXP_DECL pcre_extra *pcre16_study(const pcre *, int, const char **);
PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
PCRE_EXP_DECL void pcre16_free_study(pcre_extra *);
PCRE_EXP_DECL const char *pcre_version(void);
+PCRE_EXP_DECL const char *pcre16_version(void);
+
+/* Utility functions. */
+PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
+ PCRE_SPTR16, int, int);
/* JIT compiler related functions. */
diff --git a/pcre16_fullinfo.c b/pcre16_fullinfo.c
new file mode 100644
index 0000000..0e67deb
--- /dev/null
+++ b/pcre16_fullinfo.c
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_fullinfo.c"
+
+/* End of pcre16_fullinfo.c */
diff --git a/pcre16_info.c b/pcre16_info.c
new file mode 100644
index 0000000..b4b221a
--- /dev/null
+++ b/pcre16_info.c
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_info.c"
+
+/* End of pcre16_info.c */
diff --git a/pcre16_version.c b/pcre16_version.c
new file mode 100644
index 0000000..d4a3329
--- /dev/null
+++ b/pcre16_version.c
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_version.c"
+
+/* End of pcre16_version.c */
diff --git a/pcre_compile.c b/pcre_compile.c
index 3461dbd..da22f59 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -102,6 +102,10 @@ overrun before it actually does run off the end of the data block. */
#define REQ_CASELESS 0x10000000l /* Indicates caselessness */
#define REQ_VARY 0x20000000l /* Reqchar followed non-literal item */
+/* Repeated character flags. */
+
+#define UTF_LENGTH 0x10000000l /* The char contains its length. */
+
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
@@ -2896,7 +2900,7 @@ static BOOL
check_auto_possessive(const pcre_uchar *previous, BOOL utf,
const pcre_uchar *ptr, int options, compile_data *cd)
{
-int c, next;
+pcre_int32 c, next;
int op_code = *previous++;
/* Skip whitespace and comments in extended mode */
@@ -2932,15 +2936,13 @@ if (*ptr == CHAR_BACKSLASH)
if (temperrorcode != 0) return FALSE;
ptr++; /* Point after the escape sequence */
}
-
-else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
+else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf) { GETCHARINC(next, ptr); } else
#endif
next = *ptr++;
}
-
else return FALSE;
/* Skip whitespace and comments in extended mode */
@@ -4603,20 +4605,25 @@ for (;; ptr++)
/* Deal with UTF characters that take up more than one character. It's
easier to write this out separately than try to macrify it. Use c to
- hold the length of the character in bytes, plus 0x80 to flag that it's a
- length rather than a small character. */
+ hold the length of the character in bytes, plus UTF_LENGTH to flag that
+ it's a length rather than a small character. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
+#ifdef COMPILE_PCRE8
if (utf && (code[-1] & 0x80) != 0)
+#endif /* COMPILE_PCRE8 */
+#ifdef COMPILE_PCRE16
+ if (utf && (code[-1] & 0xfc00) == 0xdc00)
+#endif /* COMPILE_PCRE16 */
{
pcre_uchar *lastchar = code - 1;
BACKCHAR(lastchar);
c = code - lastchar; /* Length of UTF-8 character */
memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
- c |= 0x80; /* Flag c as a length */
+ c |= UTF_LENGTH; /* Flag c as a length */
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Handle the case of a single charater - either with no UTF support, or
with UTF disabled, or for a single character UTF character. */
@@ -4758,14 +4765,14 @@ for (;; ptr++)
we have to insert the character for the previous code. For a repeated
Unicode property match, there are two extra bytes that define the
required property. In UTF-8 mode, long characters have their length in
- c, with the 0x80 bit as a flag. */
+ c, with the UTF_LENGTH bit as a flag. */
if (repeat_max < 0)
{
-#ifdef SUPPORT_UTF8
- if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && (c & UTF_LENGTH) != 0)
{
- memcpy(code, utf_chars, c & 7);
+ memcpy(code, utf_chars, IN_UCHARS(c & 7));
code += c & 7;
}
else
@@ -4787,10 +4794,10 @@ for (;; ptr++)
else if (repeat_max != repeat_min)
{
-#ifdef SUPPORT_UTF8
- if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && (c & UTF_LENGTH) != 0)
{
- memcpy(code, utf_chars, c & 7);
+ memcpy(code, utf_chars, IN_UCHARS(c & 7));
code += c & 7;
}
else
@@ -4817,10 +4824,10 @@ for (;; ptr++)
/* The character or character type itself comes last in all cases. */
-#ifdef SUPPORT_UTF8
- if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && (c & UTF_LENGTH) != 0)
{
- memcpy(code, utf_chars, c & 7);
+ memcpy(code, utf_chars, IN_UCHARS(c & 7));
code += c & 7;
}
else
@@ -6661,9 +6668,7 @@ for (;; ptr++)
#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(c))
- {
- INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
- }
+ ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
#endif
/* At this point we have the character's bytes in mcbuffer, and the length
@@ -7789,9 +7794,27 @@ if ((re->options & PCRE_ANCHORED) == 0)
re->first_char = firstchar & 0xffff;
#endif
#endif
- if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char)
- && cd->fcc[re->first_char] != re->first_char)
- re->flags |= PCRE_FCH_CASELESS;
+ if ((firstchar & REQ_CASELESS) != 0)
+ {
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ /* We ignore non-ASCII first chars in 8 bit mode. */
+ if (utf)
+ {
+ if (re->first_char < 128)
+ {
+ if (cd->fcc[re->first_char] != re->first_char)
+ re->flags |= PCRE_FCH_CASELESS;
+ }
+ else if ((options & PCRE_UCP) != 0
+ && UCD_OTHERCASE(re->first_char) != re->first_char)
+ re->flags |= PCRE_FCH_CASELESS;
+ }
+ else
+#endif
+ if (MAX_255(re->first_char)
+ && cd->fcc[re->first_char] != re->first_char)
+ re->flags |= PCRE_FCH_CASELESS;
+ }
re->flags |= PCRE_FIRSTSET;
}
@@ -7814,9 +7837,26 @@ if (reqchar >= 0 &&
re->req_char = reqchar & 0xffff;
#endif
#endif
- if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char)
- && cd->fcc[re->req_char] != re->req_char)
- re->flags |= PCRE_RCH_CASELESS;
+ if ((reqchar & REQ_CASELESS) != 0)
+ {
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ /* We ignore non-ASCII required chars in 8 bit mode; test req_char itself (not first_char) for an other case. */
+ if (utf)
+ {
+ if (re->req_char < 128)
+ {
+ if (cd->fcc[re->req_char] != re->req_char)
+ re->flags |= PCRE_RCH_CASELESS;
+ }
+ else if ((options & PCRE_UCP) != 0
+ && UCD_OTHERCASE(re->req_char) != re->req_char)
+ re->flags |= PCRE_RCH_CASELESS;
+ }
+ else
+#endif
+ if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
+ re->flags |= PCRE_RCH_CASELESS;
+ }
re->flags |= PCRE_REQCHSET;
}
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index d7b292d..1bc96c1 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -480,7 +480,7 @@ if (*first_op == OP_REVERSE)
{
if (current_subject <= start_subject) break;
current_subject--;
- INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--);
+ ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
}
}
else
@@ -3199,7 +3199,13 @@ if (!anchored)
has_first_char = TRUE;
first_char = first_char2 = re->first_char;
if ((re->flags & PCRE_FCH_CASELESS) != 0)
+ {
first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (first_char > 127 && utf && md->use_ucp)
+ first_char2 = UCD_OTHERCASE(first_char);
+#endif
+ }
}
else
{
@@ -3217,7 +3223,13 @@ if ((re->flags & PCRE_REQCHSET) != 0)
has_req_char = TRUE;
req_char = req_char2 = re->req_char;
if ((re->flags & PCRE_RCH_CASELESS) != 0)
+ {
req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (req_char > 127 && utf && md->use_ucp)
+ req_char2 = UCD_OTHERCASE(req_char);
+#endif
+ }
}
/* Call the main matching function, looping for a non-anchored regex after a
@@ -3246,7 +3258,7 @@ for (;;)
while (t < md->end_subject && !IS_NEWLINE(t))
{
t++;
- INTERNALCHAR(t < end_subject, *t, t++);
+ ACROSSCHAR(t < end_subject, *t, t++);
}
}
else
@@ -3290,7 +3302,7 @@ for (;;)
!WAS_NEWLINE(current_subject))
{
current_subject++;
- INTERNALCHAR(current_subject < end_subject, *current_subject,
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
current_subject++);
}
}
@@ -3318,12 +3330,17 @@ for (;;)
while (current_subject < end_subject)
{
register unsigned int c = *current_subject;
+#ifndef COMPILE_PCRE8
+ if (c > 255) c = 255;
+#endif
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{
current_subject++;
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ /* In non 8-bit mode every code unit > 255 is tested as 255, so the scan
+ stops at the first unit of such a character or at none of its units. */
if (utf)
- INTERNALCHAR(current_subject < end_subject, *current_subject,
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
current_subject++);
#endif
}
@@ -3434,7 +3451,7 @@ for (;;)
#ifdef SUPPORT_UTF
if (utf)
{
- INTERNALCHAR(current_subject < end_subject, *current_subject,
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
current_subject++);
}
#endif
diff --git a/pcre_exec.c b/pcre_exec.c
index 6761598..bb1b60a 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -2069,7 +2069,7 @@ for (;;)
be "non-word" characters. Remember the earliest consulted character for
partial matching. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
/* Get status of previous character */
@@ -2190,7 +2190,7 @@ for (;;)
}
eptr++;
#ifdef SUPPORT_UTF
- if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
#endif
ecode++;
break;
@@ -3066,7 +3066,7 @@ for (;;)
/* Match a single character, caselessly */
case OP_CHARI:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
length = 1;
@@ -4089,7 +4089,7 @@ for (;;)
}
if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4102,7 +4102,7 @@ for (;;)
MRRETURN(MATCH_NOMATCH);
}
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4301,7 +4301,7 @@ for (;;)
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
MRRETURN(MATCH_NOMATCH);
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4330,7 +4330,7 @@ for (;;)
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
MRRETURN(MATCH_NOMATCH);
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -5330,7 +5330,7 @@ for (;;)
}
if (IS_NEWLINE(eptr)) break;
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
@@ -5347,7 +5347,7 @@ for (;;)
}
if (IS_NEWLINE(eptr)) break;
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
break;
@@ -5363,7 +5363,7 @@ for (;;)
break;
}
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
else
@@ -6264,7 +6264,13 @@ if (!anchored)
has_first_char = TRUE;
first_char = first_char2 = re->first_char;
if ((re->flags & PCRE_FCH_CASELESS) != 0)
+ {
first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (first_char > 127 && utf && md->use_ucp)
+ first_char2 = UCD_OTHERCASE(first_char);
+#endif
+ }
}
else
if (!startline && study != NULL &&
@@ -6280,7 +6286,13 @@ if ((re->flags & PCRE_REQCHSET) != 0)
has_req_char = TRUE;
req_char = req_char2 = re->req_char;
if ((re->flags & PCRE_RCH_CASELESS) != 0)
+ {
req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (req_char > 127 && utf && md->use_ucp)
+ req_char2 = UCD_OTHERCASE(req_char);
+#endif
+ }
}
@@ -6309,7 +6321,7 @@ for(;;)
while (t < md->end_subject && !IS_NEWLINE(t))
{
t++;
- INTERNALCHAR(t < end_subject, *t, t++);
+ ACROSSCHAR(t < end_subject, *t, t++);
}
}
else
@@ -6351,7 +6363,7 @@ for(;;)
while (start_match < end_subject && !WAS_NEWLINE(start_match))
{
start_match++;
- INTERNALCHAR(start_match < end_subject, *start_match,
+ ACROSSCHAR(start_match < end_subject, *start_match,
start_match++);
}
}
@@ -6378,17 +6390,18 @@ for(;;)
{
while (start_match < end_subject)
{
-#ifdef COMPILE_PCRE
register unsigned int c = *start_match;
-#else
- register unsigned int c = *start_match & 0xff;
+#ifndef COMPILE_PCRE8
+ if (c > 255) c = 255;
#endif
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{
start_match++;
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ /* In non 8-bit mode every code unit > 255 is tested as 255, so the scan
+ stops at the first unit of such a character or at none of its units. */
if (utf)
- INTERNALCHAR(start_match < end_subject, *start_match,
+ ACROSSCHAR(start_match < end_subject, *start_match,
start_match++);
#endif
}
@@ -6520,7 +6533,7 @@ for(;;)
new_start_match = start_match + 1;
#ifdef SUPPORT_UTF
if (utf)
- INTERNALCHAR(new_start_match < end_subject, *new_start_match,
+ ACROSSCHAR(new_start_match < end_subject, *new_start_match,
new_start_match++);
#endif
break;
diff --git a/pcre_fullinfo.c b/pcre_fullinfo.c
index 6c89121..2bdf24b 100644
--- a/pcre_fullinfo.c
+++ b/pcre_fullinfo.c
@@ -65,9 +65,15 @@ Arguments:
Returns: 0 if data returned, negative on error
*/
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
void *where)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
+ void *where)
+#endif
{
real_pcre internal_re;
pcre_study_data internal_study;
diff --git a/pcre_info.c b/pcre_info.c
index 9211df4..e7b3730 100644
--- a/pcre_info.c
+++ b/pcre_info.c
@@ -72,8 +72,13 @@ Returns: number of capturing subpatterns
or negative values on error
*/
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
+pcre_info(const pcre *argument_re, int *optptr, int *first_char)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_info(const pcre *argument_re, int *optptr, int *first_char)
+#endif
{
real_pcre internal_re;
const real_pcre *re = (const real_pcre *)argument_re;
@@ -84,8 +89,8 @@ if (re->magic_number != MAGIC_NUMBER)
if (re == NULL) return PCRE_ERROR_BADMAGIC;
}
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
-if (first_byte != NULL)
- *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
+if (first_char != NULL)
+ *first_char = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
return re->top_bracket;
}
diff --git a/pcre_internal.h b/pcre_internal.h
index 7642b91..4046e41 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -542,7 +542,7 @@ UTF-8 support is omitted, we don't even define them. */
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
-/* #define INTERNALCHAR(condition, eptr, action) */
+/* #define ACROSSCHAR(condition, eptr, action) */
#else /* SUPPORT_UTF */
@@ -708,7 +708,7 @@ because almost all calls are already within a block of UTF-8 only code. */
#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++
/* Same as above, but it allows a fully customizable form. */
-#define INTERNALCHAR(condition, eptr, action) \
+#define ACROSSCHAR(condition, eptr, action) \
while((condition) && ((eptr) & 0xc0) == 0x80) action
#else /* COMPILE_PCRE8 */
@@ -748,7 +748,7 @@ pointer. */
the pointer. */
#define GETUTF16INC(c, eptr) \
- { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; eptr++; }
+ { c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; }
/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */
@@ -797,7 +797,7 @@ code. */
#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++
/* Same as above, but it allows a fully customizable form. */
-#define INTERNALCHAR(condition, eptr, action) \
+#define ACROSSCHAR(condition, eptr, action) \
if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action
#endif
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 03c7b2c..df158be 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -300,7 +300,7 @@ typedef struct compiler_common {
#ifdef SUPPORT_UTF8
BOOL utf;
#ifdef SUPPORT_UCP
- BOOL useucp;
+ BOOL use_ucp;
#endif
jump_list *utfreadchar;
#ifdef COMPILE_PCRE8
@@ -390,10 +390,12 @@ the start pointers when the end of the capturing group has not yet reached. */
#define PRIV_DATA(cc) (common->localptrs[(cc) - common->start])
#ifdef COMPILE_PCRE8
-#define MOV_UCHAR SLJIT_MOV_UB
+#define MOV_UCHAR SLJIT_MOV_UB
+#define MOVU_UCHAR SLJIT_MOVU_UB
#else
#ifdef COMPILE_PCRE16
-#define MOV_UCHAR SLJIT_MOV_UH
+#define MOV_UCHAR SLJIT_MOV_UH
+#define MOVU_UCHAR SLJIT_MOVU_UH
#else
#error Unsupported compiling mode
#endif
@@ -1369,10 +1371,10 @@ if (common->utf && c > 65535)
if (bit >= (1 << 10))
bit >>= 10;
else
- return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
+ return (bit < 256) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
}
#endif /* SUPPORT_UTF16 */
-return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
+return (bit < 256) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
#endif /* COMPILE_PCRE16 */
#endif /* COMPILE_PCRE8 */
@@ -1420,7 +1422,7 @@ DEFINE_COMPILER;
struct sljit_jump *jump;
#endif
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF
if (common->utf)
{
@@ -1461,7 +1463,7 @@ if (common->utf)
#else
#ifdef COMPILE_PCRE16
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
- jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+ jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
JUMPHERE(jump);
/* Skip low surrogate if necessary. */
@@ -1478,9 +1480,9 @@ if (common->utf)
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef COMPILE_PCRE16
-/* The ctypes array contains only 255 values. */
+/* The ctypes array contains only 256 values. */
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
-jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
#endif
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
#ifdef COMPILE_PCRE16
@@ -1542,7 +1544,7 @@ else if (nltype == NLTYPE_ANYCRLF)
}
else
{
- SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline <= 255);
+ SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256);
add_jump(compiler, fallbacks, CMP(jumpiftrue ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
}
}
@@ -1660,7 +1662,7 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
/* Combine two 16 bit characters. */
-OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10);
@@ -1818,7 +1820,7 @@ if (newlinecheck)
return mainloop;
}
-static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar firstchar, BOOL caseless, BOOL firstline)
+static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar first_char, BOOL caseless, BOOL firstline)
{
DEFINE_COMPILER;
struct sljit_label *start;
@@ -1836,22 +1838,28 @@ start = LABEL();
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-oc = firstchar;
+oc = first_char;
if (caseless)
- oc = TABLE_GET(firstchar, common->fcc, firstchar);
-if (firstchar == oc)
- found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, firstchar);
+ {
+ oc = TABLE_GET(first_char, common->fcc, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (first_char > 127 && common->utf && common->use_ucp)
+ oc = UCD_OTHERCASE(first_char);
+#endif
+ }
+if (first_char == oc)
+ found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, first_char);
else
{
- bit = firstchar ^ oc;
+ bit = first_char ^ oc;
if (ispowerof2(bit))
{
OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, bit);
- found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, firstchar | bit);
+ found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, first_char | bit);
}
else
{
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, firstchar);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, first_char);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc);
COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
@@ -1912,16 +1920,19 @@ if (common->nltype == NLTYPE_FIXED && common->newline > 255)
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
firstchar = CMP(SLJIT_C_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
- OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_GREATER_EQUAL);
+#ifdef COMPILE_PCRE16
+ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+#endif
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
loop = LABEL();
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, loop);
CMPTO(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, loop);
@@ -1952,9 +1963,12 @@ if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
leave = JUMP(SLJIT_JUMP);
JUMPHERE(foundcr);
notfoundnl = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL);
COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+#ifdef COMPILE_PCRE16
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
JUMPHERE(notfoundnl);
JUMPHERE(leave);
@@ -1972,6 +1986,9 @@ DEFINE_COMPILER;
struct sljit_label *start;
struct sljit_jump *leave;
struct sljit_jump *found;
+#ifndef COMPILE_PCRE8
+struct sljit_jump *jump;
+#endif
if (firstline)
{
@@ -1987,7 +2004,9 @@ if (common->utf)
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
#ifndef COMPILE_PCRE8
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xff);
+jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 255);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 255);
+JUMPHERE(jump);
#endif
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -2028,7 +2047,7 @@ if (firstline)
OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE0);
}
-static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar reqchar, BOOL caseless, BOOL has_firstchar)
+static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar req_char, BOOL caseless, BOOL has_firstchar)
{
DEFINE_COMPILER;
struct sljit_label *loop;
@@ -2045,34 +2064,40 @@ toolong = CMP(SLJIT_C_LESS, TMP1, 0, STR_END, 0);
alreadyfound = CMP(SLJIT_C_LESS, STR_PTR, 0, TMP2, 0);
if (has_firstchar)
- OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
else
OP1(SLJIT_MOV, TMP1, 0, STR_PTR, 0);
loop = LABEL();
notfound = CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, STR_END, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), 0);
-oc = reqchar;
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(TMP1), 0);
+oc = req_char;
if (caseless)
- oc = TABLE_GET(reqchar, common->fcc, reqchar);
-if (reqchar == oc)
- found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar);
+ {
+ oc = TABLE_GET(req_char, common->fcc, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (req_char > 127 && common->utf && common->use_ucp)
+ oc = UCD_OTHERCASE(req_char);
+#endif
+ }
+if (req_char == oc)
+ found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
else
{
- bit = reqchar ^ oc;
+ bit = req_char ^ oc;
if (ispowerof2(bit))
{
OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, bit);
- found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar | bit);
+ found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char | bit);
}
else
{
- found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar);
+ found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
foundoc = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, oc);
}
}
-OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_JUMP, loop);
JUMPHERE(found);
@@ -2126,7 +2151,7 @@ static void check_wordboundary(compiler_common *common)
{
DEFINE_COMPILER;
struct sljit_jump *beginend;
-#ifdef SUPPORT_UTF8
+#if !(defined COMPILE_PCRE8) || defined SUPPORT_UTF
struct sljit_jump *jump;
#endif
@@ -2143,7 +2168,7 @@ read_char(common);
/* Testing char type. */
#ifdef SUPPORT_UCP
-if (common->useucp)
+if (common->use_ucp)
{
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
@@ -2160,20 +2185,24 @@ if (common->useucp)
else
#endif
{
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+ jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF
/* Here LOCALS1 has already been zeroed. */
jump = NULL;
if (common->utf)
jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
-#endif
+#endif /* COMPILE_PCRE8 */
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */);
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, TMP1, 0);
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+ JUMPHERE(jump);
+#elif defined SUPPORT_UTF
if (jump != NULL)
JUMPHERE(jump);
-#endif
+#endif /* COMPILE_PCRE8 */
}
JUMPHERE(beginend);
@@ -2183,7 +2212,7 @@ peek_char(common);
/* Testing char type. This is a code duplication. */
#ifdef SUPPORT_UCP
-if (common->useucp)
+if (common->use_ucp)
{
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
@@ -2199,7 +2228,11 @@ if (common->useucp)
else
#endif
{
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+ /* TMP2 may be destroyed by peek_char. */
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
+ jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
jump = NULL;
if (common->utf)
@@ -2208,10 +2241,12 @@ else
OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes);
OP2(SLJIT_LSHR, TMP2, 0, TMP2, 0, SLJIT_IMM, 4 /* ctype_word */);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+ JUMPHERE(jump);
+#elif defined SUPPORT_UTF
if (jump != NULL)
JUMPHERE(jump);
-#endif
+#endif /* COMPILE_PCRE8 */
}
JUMPHERE(beginend);
@@ -2314,18 +2349,18 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP1(SLJIT_MOV, TMP3, 0, CHAR1, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR2, 0);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
label = LABEL();
-OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1);
-OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_C_NOT_ZERO, label);
JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP1(SLJIT_MOV, CHAR1, 0, TMP3, 0);
OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -2346,20 +2381,30 @@ OP1(SLJIT_MOV, TMP3, 0, LCC_TABLE, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR1, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, CHAR2, 0);
OP1(SLJIT_MOV, LCC_TABLE, 0, SLJIT_IMM, common->lcc);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
label = LABEL();
-OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1);
-OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+#ifndef COMPILE_PCRE8
+jump = CMP(SLJIT_C_GREATER, CHAR1, 0, SLJIT_IMM, 255);
+#endif
OP1(SLJIT_MOV_UB, CHAR1, 0, SLJIT_MEM2(LCC_TABLE, CHAR1), 0);
+#ifndef COMPILE_PCRE8
+JUMPHERE(jump);
+jump = CMP(SLJIT_C_GREATER, CHAR2, 0, SLJIT_IMM, 255);
+#endif
OP1(SLJIT_MOV_UB, CHAR2, 0, SLJIT_MEM2(LCC_TABLE, CHAR2), 0);
+#ifndef COMPILE_PCRE8
+JUMPHERE(jump);
+#endif
jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_C_NOT_ZERO, label);
JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP1(SLJIT_MOV, LCC_TABLE, 0, TMP3, 0);
OP1(SLJIT_MOV, CHAR1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1);
@@ -2378,7 +2423,7 @@ static const pcre_uchar *SLJIT_CALL do_utf_caselesscmp(pcre_uchar *src1, jit_arg
/* This function would be ineffective to do in JIT level. */
int c1, c2;
const pcre_uchar *src2 = args->ptr;
-const pcre_uchar *end2 = (pcre_uchar *)args->end;
+const pcre_uchar *end2 = args->end;
while (src1 < end1)
{
@@ -2976,7 +3021,7 @@ switch(type)
{
jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff));
JUMPHERE(jump[1]);
JUMPHERE(jump[0]);
@@ -3037,9 +3082,9 @@ switch(type)
read_char(common);
jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
jump[2] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
jump[3] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[0]);
check_newlinechar(common, common->bsr_nltype, fallbacks, FALSE);
@@ -3089,36 +3134,37 @@ switch(type)
jump[0] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0));
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
}
else if (common->nltype == NLTYPE_FIXED)
{
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
}
else
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
jump[1] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP2, 0, STR_END, 0);
jump[2] = JUMP(SLJIT_C_GREATER);
add_jump(compiler, fallbacks, JUMP(SLJIT_C_LESS));
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 1);
+ /* Equal. */
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
jump[3] = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL);
add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP));
JUMPHERE(jump[1]);
if (common->nltype == NLTYPE_ANYCRLF)
{
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, STR_END, 0));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
}
@@ -3158,15 +3204,13 @@ switch(type)
jump[0] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[1]);
- OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, end));
- add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, STR_PTR, 0));
-
+ add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, STR_PTR, 0, STR_END, 0));
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
- OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, TMP1, 0));
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
}
@@ -3200,10 +3244,10 @@ switch(type)
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
add_jump(compiler, fallbacks, CMP(SLJIT_C_GREATER, TMP2, 0, STR_END, 0));
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
}
@@ -6382,7 +6426,7 @@ common->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
common->utf = (re->options & PCRE_UTF8) != 0;
#ifdef SUPPORT_UCP
-common->useucp = (re->options & PCRE_UCP) != 0;
+common->use_ucp = (re->options & PCRE_UCP) != 0;
#endif
common->utfreadchar = NULL;
#ifdef COMPILE_PCRE8
diff --git a/pcre_newline.c b/pcre_newline.c
index 0c2ddcd..d618b80 100644
--- a/pcre_newline.c
+++ b/pcre_newline.c
@@ -77,7 +77,15 @@ PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr,
BOOL utf)
{
int c;
-if (utf) { GETCHAR(c, ptr); } else c = *ptr;
+(void)utf; /* Keeps utf referenced even when the SUPPORT_UTF block below is compiled out. */
+#ifdef SUPPORT_UTF
+if (utf)
+ {
+ GETCHAR(c, ptr);
+ }
+else
+#endif /* SUPPORT_UTF */
+ c = *ptr;
if (type == NLTYPE_ANYCRLF) switch(c)
{
@@ -96,9 +104,15 @@ else switch(c)
case 0x000c: *lenptr = 1; return TRUE; /* FF */
case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
return TRUE; /* CR */
+#ifdef COMPILE_PCRE8
case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 3; return TRUE; /* PS */
+#else
+ case 0x0085: /* NEL */
+ case 0x2028: /* LS */
+ case 0x2029: *lenptr = 1; return TRUE; /* PS */
+#endif /* COMPILE_PCRE8 */
default: return FALSE;
}
}
@@ -127,17 +141,17 @@ PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr,
BOOL utf)
{
int c;
+(void)utf;
ptr--;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
BACKCHAR(ptr);
GETCHAR(c, ptr);
}
-else c = *ptr;
-#else /* no UTF-8 support */
-c = *ptr;
+else
#endif /* SUPPORT_UTF8 */
+ c = *ptr;
if (type == NLTYPE_ANYCRLF) switch(c)
{
@@ -154,9 +168,15 @@ else switch(c)
case 0x000b: /* VT */
case 0x000c: /* FF */
case 0x000d: *lenptr = 1; return TRUE; /* CR */
+#ifdef COMPILE_PCRE8
case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 3; return TRUE; /* PS */
+#else
+ case 0x0085: /* NEL */
+ case 0x2028: /* LS */
+ case 0x2029: *lenptr = 1; return TRUE; /* PS */
+#endif /* COMPILE_PCRE8 */
default: return FALSE;
}
}
diff --git a/pcre_printint.src b/pcre_printint.src
index 2922e54..d30619e 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -123,7 +123,9 @@ else
if (!utf || (c & 0xfc00) != 0xd800)
{
- if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+ if (PRINTABLE(c)) fprintf(f, "%c", c);
+ else if (c <= 0xff) fprintf(f, "\\x%02x", c);
+ else fprintf(f, "\\x{%x}", c);
return 0;
}
else
diff --git a/pcre_study.c b/pcre_study.c
index 1e10397..3f25c3a 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -224,7 +224,7 @@ for (;;)
case OP_NOTPOSPLUSI:
branchlength++;
cc += 2;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -245,7 +245,7 @@ for (;;)
case OP_NOTEXACTI:
branchlength += GET2(cc,1);
cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -293,7 +293,7 @@ for (;;)
appear, but leave the code, just in case.) */
case OP_ANYBYTE:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf) return -1;
#endif
branchlength++;
@@ -486,7 +486,7 @@ for (;;)
case OP_NOTPOSQUERYI:
cc += PRIV(OP_lengths)[op];
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -549,9 +549,10 @@ set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
{
unsigned int c = *p;
+#ifdef COMPILE_PCRE8
SET_BIT(c);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && c > 127)
{
GETCHARINC(c, p);
@@ -572,6 +573,33 @@ if (utf && c > 127)
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
return p + 1;
+#endif
+
+#ifdef COMPILE_PCRE16
+if (c > 0xff)
+ c = 0xff;
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF
+if (utf && c > 127)
+ {
+ GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+ if (caseless)
+ {
+ c = UCD_OTHERCASE(c);
+ if (c > 0xff)
+ c = 0xff;
+ SET_BIT(c);
+ }
+#endif
+ return p;
+ }
+#endif
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+#endif
}
@@ -602,7 +630,7 @@ set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit == 32) return;
for (c = 128; c < 256; c++)
{
@@ -644,7 +672,9 @@ set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
+#endif
}
@@ -679,7 +709,11 @@ set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
{
register int c;
int yield = SSB_DONE;
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
int table_limit = utf? 16:32;
+#else
+int table_limit = 32;
+#endif
#if 0
/* ========================================================================= */
@@ -951,14 +985,23 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
+#ifdef SUPPORT_UTF
if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xA0);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0xA0);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0xA0);
try_next = FALSE;
break;
@@ -968,12 +1011,21 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
+#ifdef SUPPORT_UTF
if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0x85);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0x85);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0x85);
try_next = FALSE;
break;
@@ -1058,14 +1110,23 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
+#ifdef SUPPORT_UTF
if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xA0); /* For U+00A0 */
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0xA0);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0xA0);
break;
case OP_ANYNL:
@@ -1074,12 +1135,21 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
+#ifdef SUPPORT_UTF
if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0x85); /* For U+0085 */
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0x85);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0x85);
break;
case OP_NOT_DIGIT:
@@ -1126,13 +1196,16 @@ do
character with a value > 255. */
case OP_NCLASS:
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (utf)
{
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
}
#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
/* Fall through */
case OP_CLASS:
@@ -1147,7 +1220,7 @@ do
value is > 127. In fact, there are only two possible starting bytes for
characters in the range 128 - 255. */
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (utf)
{
for (c = 0; c < 16; c++) start_bits[c] |= map[c];
@@ -1161,12 +1234,10 @@ do
}
}
}
-
- /* In non-UTF-8 mode, the two bit maps are completely compatible. */
-
else
#endif
{
+ /* In non-UTF-8 mode, the two bit maps are completely compatible. */
for (c = 0; c < 32; c++) start_bits[c] |= map[c];
}
@@ -1342,6 +1413,18 @@ if (bits_set || min > 0
memcpy(study->start_bits, start_bits, sizeof(start_bits));
}
+#ifdef PCRE_DEBUG
+ if (bits_set)
+ {
+ pcre_uint8 *ptr = (pcre_uint8 *)start_bits;
+ int i;
+ /* Dump 32 bitmap bytes, eight per row; each label is the byte's first bit index. */
+ printf("Start bits:\n");
+ for (i = 0; i < 32; i++)
+ printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
+ }
+#endif
+
/* Always set the minlength value in the block, because the JIT compiler
makes use of it. However, don't set the bit unless the length is greater than
zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
diff --git a/pcre_version.c b/pcre_version.c
index 7067cd4..2269d4f 100644
--- a/pcre_version.c
+++ b/pcre_version.c
@@ -79,8 +79,13 @@ I could find no way of detecting that a macro is defined as an empty string at
pre-processor time. This hack uses a standard trick for avoiding calling
the STRING macro with an empty argument when doing the test. */
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
pcre_version(void)
+#else
+PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
+pcre16_version(void)
+#endif
{
return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :