summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-03 23:58:37 +0000
committerzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-03 23:58:37 +0000
commit216818740b54b629e7bd59cd49f783c72e244e23 (patch)
tree35603a12be962c35a4e39e879a1a8e021f53d765
parentad1a6e3a96050e61e6e2127d3a00ded77a1eb80c (diff)
downloadpcre-216818740b54b629e7bd59cd49f783c72e244e23.tar.gz
Start working on UTF-16. Updating macros and adding new ones.
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@782 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--Makefile.am1
-rw-r--r--pcre16_ord2utf16.c4
-rw-r--r--pcre16_ucd.c45
-rw-r--r--pcre16_utf16_utils.c3
-rw-r--r--pcre16_valid_utf16.c3
-rw-r--r--pcre_compile.c63
-rw-r--r--pcre_dfa_exec.c35
-rw-r--r--pcre_exec.c60
-rw-r--r--pcre_internal.h126
-rw-r--r--pcre_jit_compile.c373
-rw-r--r--pcre_printint.src63
-rw-r--r--pcre_study.c6
-rw-r--r--pcre_tables.c8
13 files changed, 574 insertions, 216 deletions
diff --git a/Makefile.am b/Makefile.am
index 39cf574..c939f9f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -219,6 +219,7 @@ libpcre16_la_SOURCES = \
pcre16_study.c \
pcre16_tables.c \
pcre16_try_flipped.c \
+ pcre16_ucd.c \
pcre16_utf16_utils.c \
pcre16_valid_utf16.c \
pcre16_xclass.c
diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c
index 421c3a3..b02ccc2 100644
--- a/pcre16_ord2utf16.c
+++ b/pcre16_ord2utf16.c
@@ -45,8 +45,10 @@ character value into a UTF16 string. */
#include "config.h"
#endif
-#include "pcre_internal.h"
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+#include "pcre_internal.h"
/*************************************************
* Convert character value to UTF-16 *
diff --git a/pcre16_ucd.c b/pcre16_ucd.c
new file mode 100644
index 0000000..962ed46
--- /dev/null
+++ b/pcre16_ucd.c
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_ucd.c"
+
+/* End of pcre16_ucd.c */
diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c
index 5ff3953..ddd96b9 100644
--- a/pcre16_utf16_utils.c
+++ b/pcre16_utf16_utils.c
@@ -46,6 +46,9 @@ strings to host byte order. */
#include "config.h"
#endif
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
#include "pcre_internal.h"
int
diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c
index c7c7507..cc3e50e 100644
--- a/pcre16_valid_utf16.c
+++ b/pcre16_valid_utf16.c
@@ -46,6 +46,9 @@ strings. */
#include "config.h"
#endif
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
#include "pcre_internal.h"
diff --git a/pcre_compile.c b/pcre_compile.c
index da4ce22..3461dbd 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1466,8 +1466,8 @@ for (; ptr < cd->end_pattern; ptr++)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
ptr++;
-#ifdef SUPPORT_UTF8
- if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+ if (utf) FORWARDCHAR(ptr);
#endif
}
if (*ptr == 0) goto FAIL_EXIT;
@@ -1759,8 +1759,8 @@ for (;;)
case OP_NOTI:
branchlength++;
cc += 2;
-#ifdef SUPPORT_UTF8
- if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -1773,8 +1773,8 @@ for (;;)
case OP_NOTEXACTI:
branchlength += GET2(cc,1);
cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
- if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -2041,7 +2041,7 @@ for (;;)
a multi-byte character. The length in the table is a minimum, so we have to
arrange to skip the extra bytes. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf) switch(c)
{
case OP_CHAR:
@@ -2072,7 +2072,7 @@ for (;;)
case OP_MINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
- if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
+ if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
break;
}
#else
@@ -2161,7 +2161,7 @@ for (;;)
by a multi-byte character. The length in the table is a minimum, so we have
to arrange to skip the extra bytes. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf) switch(c)
{
case OP_CHAR:
@@ -2192,7 +2192,7 @@ for (;;)
case OP_MINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
- if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
+ if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
break;
}
#else
@@ -2452,7 +2452,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
/* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
MINUPTO, and POSUPTO may be followed by a multibyte character */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
case OP_STAR:
case OP_STARI:
case OP_MINSTAR:
@@ -2465,7 +2465,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
case OP_MINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
- if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
+ if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
break;
case OP_UPTO:
@@ -2474,7 +2474,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
case OP_MINUPTOI:
case OP_POSUPTO:
case OP_POSUPTOI:
- if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
+ if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
break;
#endif
@@ -2913,8 +2913,8 @@ if ((options & PCRE_EXTENDED) != 0)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
ptr++;
-#ifdef SUPPORT_UTF8
- if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+ if (utf) FORWARDCHAR(ptr);
#endif
}
}
@@ -2957,8 +2957,8 @@ if ((options & PCRE_EXTENDED) != 0)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
ptr++;
-#ifdef SUPPORT_UTF8
- if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+ if (utf) FORWARDCHAR(ptr);
#endif
}
}
@@ -3424,7 +3424,7 @@ for (;; ptr++)
int tempbracount;
pcre_uchar mcbuffer[8];
- /* Get next byte in the pattern */
+ /* Get next character in the pattern */
c = *ptr;
@@ -3556,8 +3556,8 @@ for (;; ptr++)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
ptr++;
-#ifdef SUPPORT_UTF8
- if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+ if (utf) FORWARDCHAR(ptr);
#endif
}
if (*ptr != 0) continue;
@@ -4601,7 +4601,7 @@ for (;; ptr++)
{
op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
- /* Deal with UTF-8 characters that take up more than one byte. It's
+ /* Deal with UTF characters that take up more than one code unit. It's
easier to write this out separately than try to macrify it. Use c to
hold the length of the character in bytes, plus 0x80 to flag that it's a
length rather than a small character. */
@@ -4610,16 +4610,16 @@ for (;; ptr++)
if (utf && (code[-1] & 0x80) != 0)
{
pcre_uchar *lastchar = code - 1;
- while((*lastchar & 0xc0) == 0x80) lastchar--;
+ BACKCHAR(lastchar);
c = code - lastchar; /* Length of UTF-8 character */
- memcpy(utf_chars, lastchar, c); /* Save the char */
+ memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
c |= 0x80; /* Flag c as a length */
}
else
#endif
- /* Handle the case of a single byte - either with no UTF8 support, or
- with UTF-8 disabled, or for a UTF-8 character < 128. */
+ /* Handle the case of a single character - either with no UTF support, or
+ with UTF disabled, or for a UTF character that fits in one code unit. */
{
c = code[-1];
@@ -5273,9 +5273,9 @@ for (;; ptr++)
else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
{
tempcode += PRIV(OP_lengths)[*tempcode];
-#ifdef SUPPORT_UTF8
- if (utf && tempcode[-1] >= 0xc0)
- tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(tempcode[-1]))
+ tempcode += GET_EXTRALEN(tempcode[-1]);
#endif
}
@@ -6659,11 +6659,10 @@ for (;; ptr++)
mclength = 1;
mcbuffer[0] = c;
-#ifdef SUPPORT_UTF8
- if (utf && c >= 0xc0)
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(c))
{
- while ((ptr[1] & 0xc0) == 0x80)
- mcbuffer[mclength++] = *(++ptr);
+ INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
}
#endif
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 8247f46..d7b292d 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -480,9 +480,7 @@ if (*first_op == OP_REVERSE)
{
if (current_subject <= start_subject) break;
current_subject--;
- while (current_subject > start_subject &&
- (*current_subject & 0xc0) == 0x80)
- current_subject--;
+ INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--);
}
}
else
@@ -3161,9 +3159,17 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
}
+#ifdef COMPILE_PCRE8
if (start_offset > 0 && start_offset < length &&
(((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
return PCRE_ERROR_BADUTF8_OFFSET;
+#else
+#ifdef COMPILE_PCRE16
+ if (start_offset > 0 && start_offset < length &&
+ (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00)
+ return PCRE_ERROR_BADUTF8_OFFSET;
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
}
#endif
@@ -3234,13 +3240,13 @@ for (;;)
if (firstline)
{
PCRE_PUCHAR t = current_subject;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
while (t < md->end_subject && !IS_NEWLINE(t))
{
t++;
- while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+ INTERNALCHAR(t < end_subject, *t, t++);
}
}
else
@@ -3277,16 +3283,15 @@ for (;;)
{
if (current_subject > md->start_subject + start_offset)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
while (current_subject < end_subject &&
!WAS_NEWLINE(current_subject))
{
current_subject++;
- while(current_subject < end_subject &&
- (*current_subject & 0xc0) == 0x80)
- current_subject++;
+ INTERNALCHAR(current_subject < end_subject, *current_subject,
+ current_subject++);
}
}
else
@@ -3316,10 +3321,10 @@ for (;;)
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{
current_subject++;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
- while(current_subject < end_subject &&
- (*current_subject & 0xc0) == 0x80) current_subject++;
+ INTERNALCHAR(current_subject < end_subject, *current_subject,
+ current_subject++);
#endif
}
else break;
@@ -3426,11 +3431,13 @@ for (;;)
if (firstline && IS_NEWLINE(current_subject)) break;
current_subject++;
+#ifdef SUPPORT_UTF
if (utf)
{
- while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
- current_subject++;
+ INTERNALCHAR(current_subject < end_subject, *current_subject,
+ current_subject++);
}
+#endif
if (current_subject > end_subject) break;
/* If we have just passed a CR and we are now at a LF, and the pattern does
diff --git a/pcre_exec.c b/pcre_exec.c
index db013e6..6761598 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -2077,7 +2077,7 @@ for (;;)
if (eptr == md->start_subject) prev_is_word = FALSE; else
{
PCRE_PUCHAR lastptr = eptr - 1;
- while((*lastptr & 0xc0) == 0x80) lastptr--;
+ BACKCHAR(lastptr);
if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
GETCHAR(c, lastptr);
#ifdef SUPPORT_UCP
@@ -2189,7 +2189,9 @@ for (;;)
MRRETURN(MATCH_NOMATCH);
}
eptr++;
- if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+#ifdef SUPPORT_UTF
+ if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+#endif
ecode++;
break;
@@ -4074,7 +4076,7 @@ for (;;)
/* Handle all other cases when the coding is UTF-8 */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf) switch(ctype)
{
case OP_ANY:
@@ -4087,7 +4089,7 @@ for (;;)
}
if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4100,7 +4102,7 @@ for (;;)
MRRETURN(MATCH_NOMATCH);
}
eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4298,7 +4300,8 @@ for (;;)
}
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
MRRETURN(MATCH_NOMATCH);
- while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
+ eptr++;
+ INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4326,7 +4329,8 @@ for (;;)
}
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
MRRETURN(MATCH_NOMATCH);
- while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
+ eptr++;
+ INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -5309,7 +5313,7 @@ for (;;)
else
#endif /* SUPPORT_UCP */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
switch(ctype)
@@ -5326,7 +5330,7 @@ for (;;)
}
if (IS_NEWLINE(eptr)) break;
eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
@@ -5343,7 +5347,7 @@ for (;;)
}
if (IS_NEWLINE(eptr)) break;
eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
break;
@@ -5359,7 +5363,7 @@ for (;;)
break;
}
eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
else
@@ -6014,10 +6018,18 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
}
- /* Check that a start_offset points to the start of a UTF-8 character. */
+ /* Check that a start_offset points to the start of a UTF character. */
+#ifdef COMPILE_PCRE8
if (start_offset > 0 && start_offset < length &&
(((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
return PCRE_ERROR_BADUTF8_OFFSET;
+#else
+#ifdef COMPILE_PCRE16
+ if (start_offset > 0 && start_offset < length &&
+ (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00)
+ return PCRE_ERROR_BADUTF8_OFFSET;
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
}
#endif
@@ -6291,13 +6303,13 @@ for(;;)
if (firstline)
{
PCRE_PUCHAR t = start_match;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
while (t < md->end_subject && !IS_NEWLINE(t))
{
t++;
- while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+ INTERNALCHAR(t < end_subject, *t, t++);
}
}
else
@@ -6333,14 +6345,14 @@ for(;;)
{
if (start_match > md->start_subject + start_offset)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
while (start_match < end_subject && !WAS_NEWLINE(start_match))
{
start_match++;
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
+ INTERNALCHAR(start_match < end_subject, *start_match,
+ start_match++);
}
}
else
@@ -6366,7 +6378,7 @@ for(;;)
{
while (start_match < end_subject)
{
-#ifdef COMPILE_PCRE8
+#ifdef COMPILE_PCRE8
register unsigned int c = *start_match;
#else
register unsigned int c = *start_match & 0xff;
@@ -6374,10 +6386,10 @@ for(;;)
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{
start_match++;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
+ INTERNALCHAR(start_match < end_subject, *start_match,
+ start_match++);
#endif
}
else break;
@@ -6506,10 +6518,10 @@ for(;;)
case MATCH_PRUNE:
case MATCH_THEN:
new_start_match = start_match + 1;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
- while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
- new_start_match++;
+ INTERNALCHAR(new_start_match < end_subject, *new_start_match,
+ new_start_match++);
#endif
break;
diff --git a/pcre_internal.h b/pcre_internal.h
index 637565b..7642b91 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -531,7 +531,9 @@ not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
never be called in byte mode. To make sure they can never even appear when
UTF-8 support is omitted, we don't even define them. */
-#ifndef SUPPORT_UTF8
+/* #define HAS_EXTRALEN(c) */
+/* #define GET_EXTRALEN(c) */
+#ifndef SUPPORT_UTF
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
@@ -539,14 +541,27 @@ UTF-8 support is omitted, we don't even define them. */
#define GETCHARLEN(c, eptr, len) c = *eptr;
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
+/* #define FORWARDCHAR(eptr) */
+/* #define INTERNALCHAR(condition, eptr, action) */
+
+#else /* SUPPORT_UTF */
-#else /* SUPPORT_UTF8 */
+#ifdef COMPILE_PCRE8
/* These macros were originally written in the form of loops that used data
from the tables whose names start with PRIV(utf8_table). They were rewritten by
a user so as not to use loops, because in some environments this gives a
significant performance advantage, and it seems never to do any harm. */
+/* Tests whether the code point needs extra characters to decode. */
+
+#define HAS_EXTRALEN(c) ((c) >= 0xc0)
+
+/* Returns the number of additional characters if HAS_EXTRALEN(c) is TRUE.
+Otherwise its behaviour is undefined. */
+
+#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
+
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer. */
@@ -689,7 +704,107 @@ because almost all calls are already within a block of UTF-8 only code. */
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
-#endif /* SUPPORT_UTF8 */
+/* Same as above, just in the other direction. */
+#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++
+
+/* Same as above, but customizable; eptr is a character value, not a pointer. */
+#define INTERNALCHAR(condition, eptr, action) \
+ while((condition) && ((eptr) & 0xc0) == 0x80) action
+
+#else /* COMPILE_PCRE8 */
+
+#ifdef COMPILE_PCRE16
+
+/* Tests whether the code point needs extra characters to decode. */
+
+#define HAS_EXTRALEN(c) (((c) & 0xfc00) == 0xd800)
+
+/* Returns the number of additional characters if HAS_EXTRALEN(c) is TRUE.
+Otherwise its behaviour is undefined. */
+
+#define GET_EXTRALEN(c) 1
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, not
+advancing the pointer. */
+
+#define GETUTF16(c, eptr) \
+ { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; }
+
+/* Get the next UTF-16 character, not advancing the pointer. This is called when
+we know we are in UTF-16 mode. */
+
+#define GETCHAR(c, eptr) \
+ c = *eptr; \
+ if ((c & 0xfc00) == 0xd800) GETUTF16(c, eptr);
+
+/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
+pointer. */
+
+#define GETCHARTEST(c, eptr) \
+ c = *eptr; \
+ if (utf && (c & 0xfc00) == 0xd800) GETUTF16(c, eptr);
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, advancing the
+pointer. On entry c is the high surrogate and eptr already points past it. */
+
+#define GETUTF16INC(c, eptr) \
+ { c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; }
+
+/* Get the next UTF-16 character, advancing the pointer. This is called when we
+know we are in UTF-16 mode. */
+
+#define GETCHARINC(c, eptr) \
+ c = *eptr++; \
+ if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr);
+
+/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
+This is called when we don't know if we are in UTF-16 mode. */
+
+#define GETCHARINCTEST(c, eptr) \
+ c = *eptr++; \
+ if (utf && (c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr);
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, not
+advancing the pointer, incrementing the length. */
+
+#define GETUTF16LEN(c, eptr, len) \
+ { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; len++; }
+
+/* Get the next UTF-16 character, not advancing the pointer, incrementing
+length if there is a low surrogate. This is called when we know we are in
+UTF-16 mode. */
+
+#define GETCHARLEN(c, eptr, len) \
+ c = *eptr; \
+ if ((c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len);
+
+/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
+pointer, incrementing length if there is a low surrogate. This is called when
+we do not know if we are in UTF-16 mode. */
+
+#define GETCHARLENTEST(c, eptr, len) \
+ c = *eptr; \
+ if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len);
+
+/* If the pointer is not at the start of a character, move it back until
+it is. This is called only in UTF-16 mode - we don't put a test within the
+macro because almost all calls are already within a block of UTF-16 only
+code. */
+
+#define BACKCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr--
+
+/* Same as above, just in the other direction. */
+#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++
+
+/* Same as above, but customizable; eptr is a character value, not a pointer. */
+#define INTERNALCHAR(condition, eptr, action) \
+ if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action
+
+#endif
+
+#endif /* COMPILE_PCRE8 */
+
+#endif /* SUPPORT_UTF */
/* In case there is no definition of offsetof() provided - though any proper
@@ -2043,12 +2158,15 @@ of the exported public functions. They have to be "external" in the C sense,
but are not part of the PCRE public API. The data for these tables is in the
pcre_tables.c module. */
+#ifdef COMPILE_PCRE8
+
extern const int PRIV(utf8_table1)[];
+extern const int PRIV(utf8_table1_size);
extern const int PRIV(utf8_table2)[];
extern const int PRIV(utf8_table3)[];
extern const pcre_uint8 PRIV(utf8_table4)[];
-extern const int PRIV(utf8_table1_size);
+#endif /* COMPILE_PCRE8 */
extern const char PRIV(utt_names)[];
extern const ucp_type_table PRIV(utt)[];
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 16611f1..03c7b2c 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -302,9 +302,11 @@ typedef struct compiler_common {
#ifdef SUPPORT_UCP
BOOL useucp;
#endif
- jump_list *utf8readchar;
- jump_list *utf8readtype8;
+ jump_list *utfreadchar;
+#ifdef COMPILE_PCRE8
+ jump_list *utfreadtype8;
#endif
+#endif /* SUPPORT_UTF8 */
#ifdef SUPPORT_UCP
jump_list *getucd;
#endif
@@ -543,8 +545,8 @@ switch(*cc)
case OP_NOTPOSPLUSI:
case OP_NOTPOSQUERYI:
cc += 2;
-#ifdef SUPPORT_UTF8
- if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
return cc;
@@ -565,8 +567,8 @@ switch(*cc)
case OP_NOTEXACTI:
case OP_NOTPOSUPTOI:
cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
- if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
return cc;
@@ -1285,7 +1287,7 @@ return MAX_255(c) ? common->fcc[c] != c : FALSE;
static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigned int c)
{
/* Returns with the othercase. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf && c > 127)
{
#ifdef SUPPORT_UCP
@@ -1302,11 +1304,11 @@ static unsigned int char_get_othercase_bit(compiler_common *common, pcre_uchar*
{
/* Detects if the character and its othercase has only 1 bit difference. */
unsigned int c, oc, bit;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF8 && defined COMPILE_PCRE8
int n;
#endif
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
GETCHAR(c, cc);
@@ -1324,11 +1326,11 @@ if (common->utf)
else
{
c = *cc;
- oc = common->fcc[c];
+ oc = TABLE_GET(c, common->fcc, c);
}
#else
c = *cc;
-oc = common->fcc[c];
+oc = TABLE_GET(c, common->fcc, c);
#endif
SLJIT_ASSERT(c != oc);
@@ -1342,10 +1344,12 @@ if (c <= 127 && bit == 0x20)
if (!ispowerof2(bit))
return 0;
+#ifdef COMPILE_PCRE8
+
#ifdef SUPPORT_UTF8
if (common->utf && c > 127)
{
- n = PRIV(utf8_table4)[*cc & 0x3f];
+ n = GET_EXTRALEN(*cc);
while ((bit & 0x3f) == 0)
{
n--;
@@ -1353,8 +1357,25 @@ if (common->utf && c > 127)
}
return (n << 8) | bit;
}
-#endif
+#endif /* SUPPORT_UTF8 */
return (0 << 8) | bit;
+
+#else /* COMPILE_PCRE8 */
+
+#ifdef COMPILE_PCRE16
+#ifdef SUPPORT_UTF16
+if (common->utf && c > 65535)
+ {
+ if (bit >= (1 << 10))
+ bit >>= 10;
+ else
+ return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
+ }
+#endif /* SUPPORT_UTF16 */
+return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
}
static SLJIT_INLINE void check_input_end(compiler_common *common, jump_list **fallbacks)
@@ -1368,16 +1389,22 @@ static void read_char(compiler_common *common)
/* Reads the character into TMP1, updates STR_PTR.
Does not check STR_END. TMP2 Destroyed. */
DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
struct sljit_jump *jump;
#endif
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
+#ifdef COMPILE_PCRE8
jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
- add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
+#else
+#ifdef COMPILE_PCRE16
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+#endif
+#endif /* COMPILE_PCRE8 */
+ add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
}
#endif
@@ -1389,16 +1416,22 @@ static void peek_char(compiler_common *common)
/* Reads the character into TMP1, keeps STR_PTR.
Does not check STR_END. TMP2 Destroyed. */
DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
struct sljit_jump *jump;
#endif
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
+#ifdef COMPILE_PCRE8
jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
- add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
+#else
+#ifdef COMPILE_PCRE16
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+#endif
+#endif /* COMPILE_PCRE8 */
+ add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
JUMPHERE(jump);
}
@@ -1409,46 +1442,83 @@ static void read_char8_type(compiler_common *common)
{
/* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. */
DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
struct sljit_jump *jump;
#endif
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+#ifdef COMPILE_PCRE8
/* This can be an extra read in some situations, but hopefully
- it is a clever early read in most cases. */
+ it is needed in most cases. */
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0);
- add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL));
+ add_jump(compiler, &common->utfreadtype8, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
+#else
+#ifdef COMPILE_PCRE16
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+ jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+ JUMPHERE(jump);
+ /* Skip low surrogate if necessary. */
+ OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+#endif
+#endif /* COMPILE_PCRE8 */
return;
}
#endif
-OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
+#ifdef COMPILE_PCRE16
+/* The ctypes array contains only 256 values. */
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+#endif
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+#ifdef COMPILE_PCRE16
+JUMPHERE(jump);
+#endif
}
static void skip_char_back(compiler_common *common)
{
/* Goes one character back. Only affects STR_PTR. Does not check begin. */
DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
struct sljit_label *label;
if (common->utf)
{
label = LABEL();
- OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, label);
return;
}
#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+ {
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ /* Skip low surrogate if necessary. */
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ return;
+ }
+#endif
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
}
@@ -1477,10 +1547,12 @@ else
}
}
-#ifdef SUPPORT_UTF8
-static void do_utf8readchar(compiler_common *common)
+#ifdef SUPPORT_UTF
+
+#ifdef COMPILE_PCRE8
+static void do_utfreadchar(compiler_common *common)
{
-/* Fast decoding an utf8 character. TMP1 contains the first byte
+/* Fast decoding a UTF-8 character. TMP1 contains the first byte
of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. */
DEFINE_COMPILER;
struct sljit_jump *jump;
@@ -1489,82 +1561,57 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
/* Searching for the first zero. */
OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20);
jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 2 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+/* Two byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1f);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10);
jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 3 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+/* Three byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0f);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 12);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 2);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 2);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(2));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
-OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x08);
-jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 4 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+/* Four byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x07);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 18);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 3);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(3));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 3);
-sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
-JUMPHERE(jump);
-
-/* 5 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x03);
-OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 24);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 4);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 4);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 4);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(3));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}
-static void do_utf8readtype8(compiler_common *common)
+static void do_utfreadtype8(compiler_common *common)
{
-/* Fast decoding an utf8 character type. TMP2 contains the first byte
-of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */
+/* Fast decoding a UTF-8 character type. TMP2 contains the first byte
+of the character (>= 0xc0). Return value in TMP1. */
DEFINE_COMPILER;
struct sljit_jump *jump;
struct sljit_jump *compare;
@@ -1573,9 +1620,9 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x20);
jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 2 byte sequence */
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+/* Two byte sequence. */
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x1f);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
@@ -1596,7 +1643,38 @@ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}
-#endif
+#else /* COMPILE_PCRE8 */
+
+#ifdef COMPILE_PCRE16
+static void do_utfreadchar(compiler_common *common)
+{
+/* Fast decoding a UTF-16 character. TMP1 contains the first 16 bit char
+of the character (>= 0xd800). Return char value in TMP1, length - 1 in TMP2. */
+DEFINE_COMPILER;
+struct sljit_jump *jump;
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
+jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xdc00); /* Below 0xdc00 here means a high surrogate: go and combine. */
+/* Do nothing, only return (TMP1 >= 0xdc00 is not the start of a surrogate pair). */
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+/* Combine two 16 bit characters. The second unit must be addressed with IN_UCHARS(1): a raw byte offset of 1 would straddle two units. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff); /* Low ten bits of the high surrogate ... */
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10); /* ... become bits 10-19 of the result. */
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3ff); /* Low ten bits of the low surrogate. */
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); /* One extra unit was consumed. */
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); /* Surrogate pairs encode code points from 0x10000 upwards. */
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
+
+#endif /* SUPPORT_UTF */
#ifdef SUPPORT_UCP
@@ -1634,8 +1712,8 @@ struct sljit_label *newlinelabel = NULL;
struct sljit_jump *start;
struct sljit_jump *end = NULL;
struct sljit_jump *nl = NULL;
-#ifdef SUPPORT_UTF8
-struct sljit_jump *singlebyte;
+#ifdef SUPPORT_UTF
+struct sljit_jump *singlechar;
#endif
jump_list *newline = NULL;
BOOL newlinecheck = FALSE;
@@ -1708,13 +1786,25 @@ if (newlinecheck)
CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (common->utf)
{
- singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
- JUMPHERE(singlebyte);
+ JUMPHERE(singlechar);
+ }
+#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+ {
+ singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(singlechar);
}
#endif
JUMPHERE(start);
@@ -1770,7 +1860,7 @@ else
}
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (common->utf)
{
CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
@@ -1778,6 +1868,17 @@ if (common->utf)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
}
#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+ {
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start);
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ }
+#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
JUMPHERE(leave);
@@ -1900,7 +2001,7 @@ if (common->utf)
OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (common->utf)
{
CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
@@ -1908,6 +2009,17 @@ if (common->utf)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
}
#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+ {
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start);
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ }
+#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
JUMPHERE(leave);
@@ -2335,10 +2447,10 @@ if (context->sourcereg == -1)
context->sourcereg = TMP2;
}
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
utflength = 1;
-if (common->utf && *cc >= 0xc0)
- utflength += PRIV(utf8_table4)[*cc & 0x3f];
+if (common->utf && HAS_EXTRALEN(*cc))
+ utflength += GET_EXTRALEN(*cc);
do
{
@@ -2523,8 +2635,8 @@ while (*cc != XCL_END)
if (*cc == XCL_SINGLE)
{
cc += 2;
-#ifdef SUPPORT_UTF8
- if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
#ifdef SUPPORT_UCP
needschar = TRUE;
@@ -2533,12 +2645,12 @@ while (*cc != XCL_END)
else if (*cc == XCL_RANGE)
{
cc += 2;
-#ifdef SUPPORT_UTF8
- if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
cc++;
-#ifdef SUPPORT_UTF8
- if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
#ifdef SUPPORT_UCP
needschar = TRUE;
@@ -2875,24 +2987,35 @@ switch(type)
case OP_ALLANY:
check_input_end(common, fallbacks);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+#ifdef COMPILE_PCRE8
jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+#else /* COMPILE_PCRE8 */
+#ifdef COMPILE_PCRE16
+ jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
JUMPHERE(jump[0]);
return cc;
}
#endif
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
return cc;
case OP_ANYBYTE:
check_input_end(common, fallbacks);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
return cc;
#ifdef SUPPORT_UTF8
@@ -3095,8 +3218,8 @@ switch(type)
case OP_CHAR:
case OP_CHARI:
length = 1;
-#ifdef SUPPORT_UTF8
- if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
#endif
if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
{
@@ -3129,11 +3252,11 @@ switch(type)
case OP_NOT:
case OP_NOTI:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
length = 1;
- if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+ if (HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
check_input_end(common, fallbacks);
GETCHAR(c, cc);
@@ -3152,7 +3275,9 @@ switch(type)
/* Skip the variable-length character. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+#ifdef COMPILE_PCRE8
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
+#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
JUMPHERE(jump[0]);
return cc + length;
@@ -3268,21 +3393,21 @@ do
if (*cc == OP_CHAR)
{
size = 1;
-#ifdef SUPPORT_UTF8
- if (common->utf && cc[1] >= 0xc0)
- size += PRIV(utf8_table4)[cc[1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(cc[1]))
+ size += GET_EXTRALEN(cc[1]);
#endif
}
else if (*cc == OP_CHARI)
{
size = 1;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)
size = 0;
- else if (cc[1] >= 0xc0)
- size += PRIV(utf8_table4)[cc[1] & 0x3f];
+ else if (HAS_EXTRALEN(cc[1]))
+ size += GET_EXTRALEN(cc[1]);
}
else
#endif
@@ -4786,8 +4911,8 @@ if (*type == 0)
if (end != NULL)
{
*end = cc + 1;
-#ifdef SUPPORT_UTF8
- if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(*cc)) *end += GET_EXTRALEN(*cc);
#endif
}
return cc;
@@ -6259,9 +6384,11 @@ common->utf = (re->options & PCRE_UTF8) != 0;
#ifdef SUPPORT_UCP
common->useucp = (re->options & PCRE_UCP) != 0;
#endif
-common->utf8readchar = NULL;
-common->utf8readtype8 = NULL;
+common->utfreadchar = NULL;
+#ifdef COMPILE_PCRE8
+common->utfreadtype8 = NULL;
#endif
+#endif /* SUPPORT_UTF8 */
#ifdef SUPPORT_UCP
common->getucd = NULL;
#endif
@@ -6487,18 +6614,20 @@ if (common->caselesscmp != NULL)
set_jumps(common->caselesscmp, LABEL());
do_caselesscmp(common);
}
-#ifdef SUPPORT_UTF8
-if (common->utf8readchar != NULL)
+#ifdef SUPPORT_UTF
+if (common->utfreadchar != NULL)
{
- set_jumps(common->utf8readchar, LABEL());
- do_utf8readchar(common);
+ set_jumps(common->utfreadchar, LABEL());
+ do_utfreadchar(common);
}
-if (common->utf8readtype8 != NULL)
+#ifdef COMPILE_PCRE8
+if (common->utfreadtype8 != NULL)
{
- set_jumps(common->utf8readtype8, LABEL());
- do_utf8readtype8(common);
+ set_jumps(common->utfreadtype8, LABEL());
+ do_utfreadtype8(common);
}
#endif
+#endif /* SUPPORT_UTF */
#ifdef SUPPORT_UCP
if (common->getucd != NULL)
{
diff --git a/pcre_printint.src b/pcre_printint.src
index 5a9f15d..2922e54 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -72,17 +72,20 @@ static const char *OP_names[] = { OP_NAME_LIST };
*************************************************/
static int
-print_char(FILE *f, pcre_uchar *ptr, BOOL utf8)
+print_char(FILE *f, pcre_uchar *ptr, BOOL utf)
{
int c = *ptr;
-#ifndef SUPPORT_UTF8
-(void)utf8; /* Avoid compiler warning */
+#ifndef SUPPORT_UTF
+(void)utf; /* Avoid compiler warning */
if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
return 0;
#else
-if (!utf8 || (c & 0xc0) != 0xc0)
+
+#ifdef COMPILE_PCRE8
+
+if (!utf || (c & 0xc0) != 0xc0)
{
if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
return 0;
@@ -110,14 +113,45 @@ else
s -= 6;
c |= (ptr[i] & 0x3f) << s;
}
- if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
+ fprintf(f, "\\x{%x}", c);
return a;
}
-#endif
+
+#else
+
+#ifdef COMPILE_PCRE16
+
+if (!utf || (c & 0xfc00) != 0xd800)
+ {
+ if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+ return 0;
+ }
+else
+ {
+ /* This is a check for malformed UTF-16; it should only occur if the sanity
+ check has been turned off. Rather than swallow a low surrogate, just stop if
+ we hit a bad one. Print it with \X instead of \x as an indication. */
+
+ if ((ptr[1] & 0xfc00) != 0xdc00)
+ {
+ fprintf(f, "\\X{%x}", c);
+ return 0;
+ }
+
+ c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
+ fprintf(f, "\\x{%x}", c);
+ return 1;
+ }
+
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
+
+#endif /* SUPPORT_UTF */
}
/*************************************************
-* Print uchar string (regardless of utf8) *
+* Print uchar string (regardless of utf) *
*************************************************/
static void
@@ -168,7 +202,7 @@ pcre_printint(pcre *external_re, FILE *f, BOOL print_lengths)
{
real_pcre *re = (real_pcre *)external_re;
pcre_uchar *codestart, *code;
-BOOL utf8;
+BOOL utf;
unsigned int options = re->options;
int offset = re->name_table_offset;
@@ -187,7 +221,8 @@ if (re->magic_number != MAGIC_NUMBER)
}
code = codestart = (pcre_uchar *)re + offset + count * size;
-utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (options & PCRE_UTF8) != 0;
for(;;)
{
@@ -232,7 +267,7 @@ for(;;)
do
{
code++;
- code += 1 + print_char(f, code, utf8);
+ code += 1 + print_char(f, code, utf);
}
while (*code == OP_CHAR);
fprintf(f, "\n");
@@ -243,7 +278,7 @@ for(;;)
do
{
code++;
- code += 1 + print_char(f, code, utf8);
+ code += 1 + print_char(f, code, utf);
}
while (*code == OP_CHARI);
fprintf(f, "\n");
@@ -349,7 +384,7 @@ for(;;)
extra = 2;
}
}
- else extra = print_char(f, code+1, utf8);
+ else extra = print_char(f, code+1, utf);
fprintf(f, "%s", OP_names[*code]);
break;
@@ -364,7 +399,7 @@ for(;;)
case OP_MINUPTO:
case OP_POSUPTO:
fprintf(f, " %s ", flag);
- extra = print_char(f, code + 1 + IMM2_SIZE, utf8);
+ extra = print_char(f, code + 1 + IMM2_SIZE, utf);
fprintf(f, "{");
if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
@@ -557,7 +592,7 @@ for(;;)
}
}
- /* Indicate a non-UTF8 class which was created by negation */
+ /* Indicate a non-UTF class which was created by negation */
fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
diff --git a/pcre_study.c b/pcre_study.c
index 098980d..1e10397 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -225,7 +225,7 @@ for (;;)
branchlength++;
cc += 2;
#ifdef SUPPORT_UTF8
- if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -246,7 +246,7 @@ for (;;)
branchlength += GET2(cc,1);
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UTF8
- if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -487,7 +487,7 @@ for (;;)
cc += PRIV(OP_lengths)[op];
#ifdef SUPPORT_UTF8
- if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
diff --git a/pcre_tables.c b/pcre_tables.c
index 7c52961..b8cabf3 100644
--- a/pcre_tables.c
+++ b/pcre_tables.c
@@ -65,7 +65,9 @@ const pcre_uint8 PRIV(OP_lengths)[] = { OP_LENGTHS };
/* These are the breakpoints for different numbers of bytes in a UTF-8
character. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
+
+#ifdef COMPILE_PCRE8
const int PRIV(utf8_table1)[] =
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
@@ -87,6 +89,8 @@ const pcre_uint8 PRIV(utf8_table4)[] = {
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
+#endif /* COMPILE_PCRE8 */
+
/* Table to translate from particular type value to the general value. */
const int PRIV(ucp_gentype)[] = {
@@ -554,6 +558,6 @@ const ucp_type_table PRIV(utt)[] = {
const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
/* End of pcre_tables.c */