summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-07 14:36:26 +0000
committerzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-07 14:36:26 +0000
commit4b661f8c6abbe9be96af67b9d5547bb96359cc99 (patch)
tree120a0978274ca69871e78091aeab44486501d482
parentb4a0233a732c67c98886725229df86fc150b0e82 (diff)
downloadpcre-4b661f8c6abbe9be96af67b9d5547bb96359cc99.tar.gz
UTF16 fixes: iterated character parsing, named references
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@789 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--pcre16_ord2utf16.c2
-rw-r--r--pcre16_utf16_utils.c39
-rw-r--r--pcre_compile.c21
-rw-r--r--pcre_exec.c120
-rw-r--r--pcre_internal.h4
-rw-r--r--pcre_jit_test.c77
-rw-r--r--testdata/testoutput106
7 files changed, 166 insertions, 103 deletions
diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c
index b02ccc2..67c4c5c 100644
--- a/pcre16_ord2utf16.c
+++ b/pcre16_ord2utf16.c
@@ -86,11 +86,9 @@ cvalue -= 0x10000;
return 2;
#else
-
(void)(cvalue); /* Keep compiler happy; this function won't ever be */
(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
return 0;
-
#endif
}
diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c
index ddd96b9..8f970bb 100644
--- a/pcre16_utf16_utils.c
+++ b/pcre16_utf16_utils.c
@@ -51,6 +51,29 @@ strings to host byte order. */
#include "pcre_internal.h"
+/*************************************************
+* Convert any UTF-16 string to host byte order *
+*************************************************/
+
+/* This function takes a UTF-16 string and converts
+it to host byte order. The length can be explicitly set,
+or automatically detected for zero-terminated strings.
+BOMs can be kept or discarded during the conversion.
+Conversion can be done in place (output == input).
+
+Arguments:
+ output the output buffer; its size must be greater than
+ or equal to that of the input string
+ input any UTF-16 string
+ length the number of characters in the input string;
+ can be less than zero for zero-terminated strings
+ keep_boms for a non-zero value, the BOM (0xfeff) characters
+ are copied as well
+
+Returns: the number of characters placed into the output buffer,
+ including the zero-terminator
+*/
+
int
pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int length, int keep_boms)
{
@@ -58,25 +81,31 @@ pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int len
/* This function converts any UTF-16 string to host byte order and optionally removes
any Byte Order Marks (BOMS). Returns with the remainig length. */
BOOL same_bo = TRUE;
-PCRE_SPTR16 end = input + length;
+pcre_uchar *optr = (pcre_uchar *)output;
+const pcre_uchar *iptr = (const pcre_uchar *)input;
+const pcre_uchar *end;
/* The c variable must be unsigned. */
register pcre_uchar c;
-while (input < end)
+if (length < 0)
+ length = STRLEN_UC(iptr) + 1;
+end = iptr + length;
+
+while (iptr < end)
{
- c = *input++;
+ c = *iptr++;
if (c == 0xfeff || c == 0xfffe)
{
/* Detecting the byte order of the machine is unnecessary, it is
enough to know that the UTF-16 string has the same byte order or not. */
same_bo = c == 0xfeff;
if (keep_boms != 0)
- *output++ = 0xfeff;
+ *optr++ = 0xfeff;
else
length--;
}
else
- *output++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
+ *optr++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
}
#else
diff --git a/pcre_compile.c b/pcre_compile.c
index bdfac5b..223e475 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -4202,11 +4202,10 @@ for (;; ptr++)
#ifdef SUPPORT_UTF
if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
-#endif
-#ifndef COMPILE_PCRE8
+#elif !(defined COMPILE_PCRE8)
if (d > 255)
#endif
-#if defined SUPPORT_UTF || defined COMPILE_PCRE16
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
{
xclass = TRUE;
@@ -5817,9 +5816,9 @@ for (;; ptr++)
*errorcodeptr = ERR49;
goto FAILED;
}
- if (namelen + 3 > cd->name_entry_size)
+ if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
{
- cd->name_entry_size = namelen + 3;
+ cd->name_entry_size = namelen + IMM2_SIZE + 1;
if (namelen > MAX_NAME_SIZE)
{
*errorcodeptr = ERR48;
@@ -5848,10 +5847,10 @@ for (;; ptr++)
for (i = 0; i < cd->names_found; i++)
{
- int crc = memcmp(name, slot+2, IN_UCHARS(namelen));
+ int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
if (crc == 0)
{
- if (slot[2+namelen] == 0)
+ if (slot[IMM2_SIZE+namelen] == 0)
{
if (GET2(slot, 0) != cd->bracount + 1 &&
(options & PCRE_DUPNAMES) == 0)
@@ -5903,8 +5902,8 @@ for (;; ptr++)
}
PUT2(slot, 0, cd->bracount + 1);
- memcpy(slot + 2, name, IN_UCHARS(namelen));
- slot[2 + namelen] = 0;
+ memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
+ slot[IMM2_SIZE + namelen] = 0;
}
}
@@ -5988,7 +5987,7 @@ for (;; ptr++)
for (i = 0; i < cd->names_found; i++)
{
if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
- slot[2+namelen] == 0)
+ slot[IMM2_SIZE+namelen] == 0)
break;
slot += cd->name_entry_size;
}
@@ -7614,7 +7613,7 @@ externally provided function. Integer overflow should no longer be possible
because nowadays we limit the maximum value of cd->names_found and
cd->name_entry_size. */
-size = sizeof(real_pcre) + (length + cd->names_found * (cd->name_entry_size + 3)) * sizeof(pcre_uchar);
+size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
re = (real_pcre *)(pcre_malloc)(size);
if (re == NULL)
diff --git a/pcre_exec.c b/pcre_exec.c
index 5f0a156..676f4b8 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -181,7 +181,7 @@ ASCII characters. */
if (caseless)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
#ifdef SUPPORT_UCP
if (md->utf)
{
@@ -365,7 +365,7 @@ typedef struct heapframe {
/* Function local variables */
PCRE_PUCHAR Xcallpat;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
PCRE_PUCHAR Xcharptr;
#endif
PCRE_PUCHAR Xdata;
@@ -527,7 +527,7 @@ HEAP_RECURSE:
/* Ditto for the local variables */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
#define charptr frame->Xcharptr
#endif
#define callpat frame->Xcallpat
@@ -585,7 +585,7 @@ declarations can be cut out in a block. The only declarations within blocks
below are for variables that do not have to be preserved over a recursive call
to RMATCH(). */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
const pcre_uchar *charptr;
#endif
const pcre_uchar *callpat;
@@ -634,6 +634,7 @@ the alternative names that are used. */
#define code_offset codelink
#define condassert condition
#define matched_once prev_is_word
+#define foc number
/* These statements are here to stop the compiler complaining about unitialized
variables. */
@@ -659,7 +660,7 @@ defined). However, RMATCH isn't like a function call because it's quite a
complicated macro. It has to be used in one particular way. This shouldn't,
however, impact performance when true recursion is being used. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
utf = md->utf; /* Local copy of the flag */
#else
utf = FALSE;
@@ -1596,7 +1597,7 @@ for (;;)
back a number of characters, not bytes. */
case OP_REVERSE:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
i = GET(ecode, 1);
@@ -2216,7 +2217,7 @@ for (;;)
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
c < 256 &&
#endif
(md->ctypes[c] & ctype_digit) != 0
@@ -2233,8 +2234,8 @@ for (;;)
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+ c > 255 ||
#endif
(md->ctypes[c] & ctype_digit) == 0
)
@@ -2250,7 +2251,7 @@ for (;;)
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
c < 256 &&
#endif
(md->ctypes[c] & ctype_space) != 0
@@ -2267,8 +2268,8 @@ for (;;)
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+ c > 255 ||
#endif
(md->ctypes[c] & ctype_space) == 0
)
@@ -2284,7 +2285,7 @@ for (;;)
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
c < 256 &&
#endif
(md->ctypes[c] & ctype_word) != 0
@@ -2301,8 +2302,8 @@ for (;;)
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+ c > 255 ||
#endif
(md->ctypes[c] & ctype_word) == 0
)
@@ -3036,7 +3037,7 @@ for (;;)
/* Match a single character, casefully */
case OP_CHAR:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
length = 1;
@@ -3108,7 +3109,7 @@ for (;;)
}
}
else
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
@@ -3117,7 +3118,9 @@ for (;;)
SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
MRRETURN(MATCH_NOMATCH);
}
- if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+ if (TABLE_GET(ecode[1], md->lcc, ecode[1])
+ != TABLE_GET(*eptr, md->lcc, *eptr)) MRRETURN(MATCH_NOMATCH);
+ eptr++;
ecode += 2;
}
break;
@@ -3190,7 +3193,7 @@ for (;;)
/* Common code for all repeated single-character matches. */
REPEATCHAR:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
length = 1;
@@ -3214,7 +3217,7 @@ for (;;)
for (i = 1; i <= min; i++)
{
if (eptr <= md->end_subject - length &&
- memcmp(eptr, charptr, length) == 0) eptr += length;
+ memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
@@ -3237,7 +3240,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr <= md->end_subject - length &&
- memcmp(eptr, charptr, length) == 0) eptr += length;
+ memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
@@ -3258,7 +3261,7 @@ for (;;)
for (i = min; i < max; i++)
{
if (eptr <= md->end_subject - length &&
- memcmp(eptr, charptr, length) == 0) eptr += length;
+ memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
@@ -3294,14 +3297,12 @@ for (;;)
value of fc will always be < 128. */
}
else
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
+ /* When not in UTF mode, load a single code unit. */
+ fc = *ecode++;
- /* When not in UTF-8 mode, load a single-byte character. */
-
- fc = *ecode++;
-
- /* The value of fc at this point is always less than 256, though we may or
- may not be in UTF-8 mode. The code is duplicated for the caseless and
+ /* The value of fc at this point is always one character, though we may
+ or may not be in UTF mode. The code is duplicated for the caseless and
caseful cases, for speed, since matching characters is likely to be quite
common. First, ensure the minimum number of matches are present. If min =
max, continue at the same level without recursing. Otherwise, if
@@ -3314,7 +3315,23 @@ for (;;)
if (op >= OP_STARI) /* Caseless */
{
- fc = md->lcc[fc];
+#ifdef COMPILE_PCRE8
+ /* fc must be < 128 */
+ foc = md->fcc[fc];
+#else
+#ifdef SUPPORT_UTF
+#ifdef SUPPORT_UCP
+ if (utf && fc > 127)
+ foc = UCD_OTHERCASE(fc);
+#else
+ if (utf && fc > 127)
+ foc = fc;
+#endif /* SUPPORT_UCP */
+ else
+#endif /* SUPPORT_UTF */
+ foc = TABLE_GET(fc, md->fcc, fc);
+#endif /* COMPILE_PCRE8 */
+
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject)
@@ -3322,7 +3339,8 @@ for (;;)
SCHECK_PARTIAL();
MRRETURN(MATCH_NOMATCH);
}
- if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+ if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
+ eptr++;
}
if (min == max) continue;
if (minimize)
@@ -3337,7 +3355,8 @@ for (;;)
SCHECK_PARTIAL();
MRRETURN(MATCH_NOMATCH);
}
- if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+ if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
+ eptr++;
}
/* Control never gets here */
}
@@ -3351,7 +3370,7 @@ for (;;)
SCHECK_PARTIAL();
break;
}
- if (fc != md->lcc[*eptr]) break;
+ if (fc != *eptr && foc != *eptr) break;
eptr++;
}
@@ -3440,10 +3459,10 @@ for (;;)
GETCHARINCTEST(c, eptr);
if (op == OP_NOTI) /* The caseless case */
{
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
if (c < 256)
#endif
- c = md->lcc[c];
+ c = md->lcc[c];
if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
}
else /* Caseful */
@@ -3543,9 +3562,9 @@ for (;;)
if (op >= OP_NOTSTARI) /* Caseless */
{
- fc = md->lcc[fc];
+ fc = TABLE_GET(fc, md->lcc, fc);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3580,7 +3599,7 @@ for (;;)
if (minimize)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3625,7 +3644,7 @@ for (;;)
{
pp = eptr;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3683,7 +3702,7 @@ for (;;)
else
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3717,7 +3736,7 @@ for (;;)
if (minimize)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3761,7 +3780,7 @@ for (;;)
{
pp = eptr;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -4353,7 +4372,7 @@ for (;;)
} /* End switch(ctype) */
else
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
/* Code for the non-UTF-8 case for minimum matching of operators other
than OP_PROP and OP_NOTPROP. */
@@ -4796,7 +4815,7 @@ for (;;)
else
#endif /* SUPPORT_UCP */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
for (fi = min;; fi++)
@@ -5596,7 +5615,7 @@ for (;;)
}
}
else
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
switch(ctype)
@@ -5844,14 +5863,14 @@ switch (frame->Xwhere)
LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
LBL(65) LBL(66)
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
#ifdef SUPPORT_UCP
LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
LBL(59) LBL(60) LBL(61) LBL(62)
#endif /* SUPPORT_UCP */
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
default:
DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
return PCRE_ERROR_INTERNAL;
@@ -6002,7 +6021,7 @@ md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
/* Check a UTF-8 string if required. Pass back the character offset and error
code for an invalid string if a results vector is available. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
{
int erroroffset;
@@ -6138,6 +6157,7 @@ md->recursive = NULL; /* No recursion at top level */
md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
md->lcc = tables + lcc_offset;
+md->fcc = tables + fcc_offset;
md->ctypes = tables + ctypes_offset;
/* Handle different \R options. */
@@ -6265,7 +6285,7 @@ if (!anchored)
first_char = first_char2 = re->first_char;
if ((re->flags & PCRE_FCH_CASELESS) != 0)
{
- first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
+ first_char2 = TABLE_GET(first_char, md->fcc, first_char);
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
if (utf && first_char > 127)
first_char2 = UCD_OTHERCASE(first_char);
@@ -6287,7 +6307,7 @@ if ((re->flags & PCRE_REQCHSET) != 0)
req_char = req_char2 = re->req_char;
if ((re->flags & PCRE_RCH_CASELESS) != 0)
{
- req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
+ req_char2 = TABLE_GET(req_char, md->fcc, req_char);
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
if (utf && req_char > 127)
req_char2 = UCD_OTHERCASE(req_char);
diff --git a/pcre_internal.h b/pcre_internal.h
index b93101f..624e07c 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -2055,6 +2055,7 @@ typedef struct match_data {
pcre_uchar *name_table; /* Table of names */
pcre_uchar nl[4]; /* Newline string when fixed */
const pcre_uint8 *lcc; /* Points to lower casing table */
+ const pcre_uint8 *fcc; /* Points to case-flipping table */
const pcre_uint8 *ctypes; /* Points to table of type maps */
BOOL offset_overflow; /* Set if too many extractions */
BOOL notbol; /* NOTBOL flag */
@@ -2262,6 +2263,7 @@ extern const int PRIV(ucp_gentype)[];
extern const int PRIV(ucp_typerange)[];
#endif
+#ifdef SUPPORT_UCP
/* UCD access macros */
#define UCD_BLOCK_SIZE 128
@@ -2274,6 +2276,8 @@ extern const int PRIV(ucp_typerange)[];
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)
+#endif /* SUPPORT_UCP */
+
#endif
/* End of pcre_internal.h */
diff --git a/pcre_jit_test.c b/pcre_jit_test.c
index e4d2432..8aee260 100644
--- a/pcre_jit_test.c
+++ b/pcre_jit_test.c
@@ -621,11 +621,11 @@ pcre_jit_stack* callback(void *arg)
return (pcre_jit_stack *)arg;
}
-static void setstack(pcre_extra *extra, int realloc)
+static void setstack(pcre_extra *extra, int alloc_again)
{
static pcre_jit_stack *stack;
- if (realloc) {
+ if (alloc_again) {
if (stack)
pcre_jit_stack_free(stack);
stack = pcre_jit_stack_alloc(1, 1024 * 1024);
@@ -638,29 +638,29 @@ static void setstack(pcre_extra *extra, int realloc)
static int convert_utf8_to_utf16(const char *input, PCRE_SCHAR16 *output, int *offsetmap, int max_length)
{
- unsigned char *ptr = (unsigned char*)input;
- PCRE_SCHAR16 *optr = output;
+ unsigned char *iptr = (unsigned char*)input;
+ unsigned short *optr = (unsigned short *)output;
unsigned int c;
if (max_length == 0)
return 0;
- while (*ptr && max_length > 1) {
+ while (*iptr && max_length > 1) {
c = 0;
if (offsetmap)
- *offsetmap++ = (int)(ptr - (unsigned char*)input);
-
- if (!(*ptr & 0x80))
- c = *ptr++;
- else if (!(*ptr & 0x20)) {
- c = ((ptr[0] & 0x1f) << 6) | (ptr[1] & 0x3f);
- ptr += 2;
- } else if (!(*ptr & 0x10)) {
- c = ((ptr[0] & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f);
- ptr += 3;
- } else if (!(*ptr & 0x08)) {
- c = ((ptr[0] & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f);
- ptr += 4;
+ *offsetmap++ = (int)(iptr - (unsigned char*)input);
+
+ if (!(*iptr & 0x80))
+ c = *iptr++;
+ else if (!(*iptr & 0x20)) {
+ c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
+ iptr += 2;
+ } else if (!(*iptr & 0x10)) {
+ c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
+ iptr += 3;
+ } else if (!(*iptr & 0x08)) {
+ c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
+ iptr += 4;
}
if (c < 65536) {
@@ -668,7 +668,7 @@ static int convert_utf8_to_utf16(const char *input, PCRE_SCHAR16 *output, int *o
max_length--;
} else if (max_length <= 2) {
*optr = '\0';
- return optr - output;
+ return (int)(optr - (unsigned short *)output);
} else {
c -= 0x10000;
*optr++ = 0xd800 | ((c >> 10) & 0x3ff);
@@ -679,24 +679,25 @@ static int convert_utf8_to_utf16(const char *input, PCRE_SCHAR16 *output, int *o
}
}
if (offsetmap)
- *offsetmap = (int)(ptr - (unsigned char*)input);
+ *offsetmap = (int)(iptr - (unsigned char*)input);
*optr = '\0';
- return optr - output;
+ return (int)(optr - (unsigned short *)output);
}
static int copy_char8_to_char16(const char *input, PCRE_SCHAR16 *output, int max_length)
{
- PCRE_SCHAR16 *optr = output;
+ unsigned char *iptr = (unsigned char*)input;
+ unsigned short *optr = (unsigned short *)output;
if (max_length == 0)
return 0;
- while (*input && max_length > 1) {
- *optr++ = *input++;
+ while (*iptr && max_length > 1) {
+ *optr++ = *iptr++;
max_length--;
}
*optr = '\0';
- return optr - output;
+ return (int)(optr - (unsigned short *)output);
}
#define REGTEST_MAX_LENGTH 4096
@@ -768,6 +769,7 @@ static int regression_tests(void)
current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags8),
&error, &err_offs, NULL);
+ extra8 = NULL;
if (re8) {
error = NULL;
extra8 = pcre_study(re8, PCRE_STUDY_JIT_COMPILE, &error);
@@ -786,10 +788,15 @@ static int regression_tests(void)
printf("\n8 bit: Cannot compile pattern: %s\n", current->pattern);
#endif
#ifdef SUPPORT_PCRE16
- convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH);
+ if (current->flags & PCRE_UTF8)
+ convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH);
+ else
+ copy_char8_to_char16(current->pattern, regtest_buf, REGTEST_MAX_LENGTH);
re16 = pcre16_compile(regtest_buf,
current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags16),
&error, &err_offs, NULL);
+
+ extra16 = NULL;
if (re16) {
error = NULL;
extra16 = pcre16_study(re16, PCRE_STUDY_JIT_COMPILE, &error);
@@ -813,6 +820,8 @@ static int regression_tests(void)
setstack(NULL, 1);
#ifdef SUPPORT_PCRE8
+ return_value8_1 = -1000;
+ return_value8_2 = -1000;
if (re8) {
setstack(extra8, 0);
for (i = 0; i < 32; ++i)
@@ -828,6 +837,8 @@ static int regression_tests(void)
#endif
#ifdef SUPPORT_PCRE16
+ return_value16_1 = -1000;
+ return_value16_2 = -1000;
if (re16) {
setstack(extra16, 0);
if (current->flags & PCRE_UTF8)
@@ -853,7 +864,7 @@ static int regression_tests(void)
is_succesful = 1;
if (!(current->flags & PCRE_BUG)) {
#if defined SUPPORT_PCRE8 && defined SUPPORT_PCRE16
- if ((current->flags & PCRE_UTF8) && utf8 && utf16) {
+ if (utf8 == utf16) {
/* All results must be the same. */
if (return_value8_1 != return_value8_2 || return_value8_1 != return_value16_1 || return_value8_1 != return_value16_2) {
printf("\n8 and 16 bit: Return value differs(%d:%d:%d:%d): [%d] '%s' @ '%s'\n",
@@ -863,11 +874,13 @@ static int regression_tests(void)
} else if (return_value8_1 >= 0) {
return_value8_1 *= 2;
/* Transform back the results. */
- for (i = 0; i < return_value8_1; ++i) {
- if (ovector16_1[i] >= 0)
- ovector16_1[i] = regtest_offsetmap[ovector16_1[i]];
- if (ovector16_2[i] >= 0)
- ovector16_2[i] = regtest_offsetmap[ovector16_2[i]];
+ if (current->flags & PCRE_UTF8) {
+ for (i = 0; i < return_value8_1; ++i) {
+ if (ovector16_1[i] >= 0)
+ ovector16_1[i] = regtest_offsetmap[ovector16_1[i]];
+ if (ovector16_2[i] >= 0)
+ ovector16_2[i] = regtest_offsetmap[ovector16_2[i]];
+ }
}
for (i = 0; i < return_value8_1; ++i)
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index 47a2a97..ef9b82c 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -194,7 +194,7 @@ Memory allocation (code space): 28
------------------------------------------------------------------
/a(?P<name1>b|c)d(?P<longername2>e)/BM
-Memory allocation (code space): 42
+Memory allocation (code space): 36
------------------------------------------------------------------
0 32 Bra
3 a
@@ -212,7 +212,7 @@ Memory allocation (code space): 42
------------------------------------------------------------------
/(?:a(?P<c>c(?P<d>d)))(?P<a>a)/BM
-Memory allocation (code space): 54
+Memory allocation (code space): 45
------------------------------------------------------------------
0 41 Bra
3 25 Bra
@@ -232,7 +232,7 @@ Memory allocation (code space): 54
------------------------------------------------------------------
/(?P<a>a)...(?P=a)bbb(?P>a)d/BM
-Memory allocation (code space): 37
+Memory allocation (code space): 34
------------------------------------------------------------------
0 30 Bra
3 7 CBra 1