UTF16 fixes: iterated character parsing, named references

git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@789 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-07 14:36:26 +0000
committer: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-07 14:36:26 +0000
commit: 4b661f8c6abbe9be96af67b9d5547bb96359cc99 (patch)
tree: 120a0978274ca69871e78091aeab44486501d482
parent: b4a0233a732c67c98886725229df86fc150b0e82 (diff)
download: pcre-4b661f8c6abbe9be96af67b9d5547bb96359cc99.tar.gz
7 files changed, 166 insertions, 103 deletions
diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c
index b02ccc2..67c4c5c 100644
--- a/pcre16_ord2utf16.c
+++ b/pcre16_ord2utf16.c
@@ -86,11 +86,9 @@ cvalue -= 0x10000;
 return 2;
 
 #else
-
 (void)(cvalue);  /* Keep compiler happy; this function won't ever be */
 (void)(buffer);  /* called when SUPPORT_UTF8 is not defined. */
 return 0;
-
 #endif
 }
 
diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c
index ddd96b9..8f970bb 100644
--- a/pcre16_utf16_utils.c
+++ b/pcre16_utf16_utils.c
@@ -51,6 +51,29 @@ strings to host byte order. */
 
 #include "pcre_internal.h"
 
+/*************************************************
+*  Convert any UTF-16 string to host byte order  *
+*************************************************/
+
+/* This function takes a UTF-16 string and converts
+it to host byte order. The length can be explicitly set,
+or automatically detected for zero-terminated strings.
+BOMs can be kept or discarded during the conversion.
+Conversion can be done in place (output == input).
+
+Arguments:
+  output     the output buffer, its size must be greater than
+             or equal to that of the input string
+  input      any UTF-16 string
+  length     the number of characters in the input string;
+             can be less than zero for zero-terminated strings
+  keep_boms  for a non-zero value, the BOM (0xfeff) characters
+             are copied as well
+
+Returns:     the number of characters placed into the output buffer,
+             including the zero-terminator
+*/
+
 int
 pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int length, int keep_boms)
 {
@@ -58,25 +81,31 @@ pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int len
 /* This function converts any UTF-16 string to host byte order and optionally removes
 any Byte Order Marks (BOMS). Returns with the remainig length. */
 BOOL same_bo = TRUE;
-PCRE_SPTR16 end = input + length;
+pcre_uchar *optr = (pcre_uchar *)output;
+const pcre_uchar *iptr = (const pcre_uchar *)input;
+const pcre_uchar *end;
 /* The c variable must be unsigned. */
 register pcre_uchar c;
 
-while (input < end)
+if (length < 0)
+  length = STRLEN_UC(iptr) + 1;
+end = iptr + length;
+
+while (iptr < end)
   {
-  c = *input++;
+  c = *iptr++;
   if (c == 0xfeff || c == 0xfffe)
     {
     /* Detecting the byte order of the machine is unnecessary, it is
     enough to know that the UTF-16 string has the same byte order or not. */
     same_bo = c == 0xfeff;
     if (keep_boms != 0)
-      *output++ = 0xfeff;
+      *optr++ = 0xfeff;
     else
       length--;
     }
   else
-    *output++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
+    *optr++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
   }
 
 #else
diff --git a/pcre_compile.c b/pcre_compile.c
index bdfac5b..223e475 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -4202,11 +4202,10 @@ for (;; ptr++)
 
 #ifdef SUPPORT_UTF
         if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
-#endif
-#ifndef COMPILE_PCRE8
+#elif !(defined COMPILE_PCRE8)
         if (d > 255)
 #endif
-#if defined SUPPORT_UTF || defined COMPILE_PCRE16
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
           {
           xclass = TRUE;
 
@@ -5817,9 +5816,9 @@ for (;; ptr++)
               *errorcodeptr = ERR49;
               goto FAILED;
               }
-            if (namelen + 3 > cd->name_entry_size)
+            if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
               {
-              cd->name_entry_size = namelen + 3;
+              cd->name_entry_size = namelen + IMM2_SIZE + 1;
               if (namelen > MAX_NAME_SIZE)
                 {
                 *errorcodeptr = ERR48;
@@ -5848,10 +5847,10 @@ for (;; ptr++)
 
             for (i = 0; i < cd->names_found; i++)
               {
-              int crc = memcmp(name, slot+2, IN_UCHARS(namelen));
+              int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
               if (crc == 0)
                 {
-                if (slot[2+namelen] == 0)
+                if (slot[IMM2_SIZE+namelen] == 0)
                   {
                   if (GET2(slot, 0) != cd->bracount + 1 &&
                       (options & PCRE_DUPNAMES) == 0)
@@ -5903,8 +5902,8 @@ for (;; ptr++)
               }
 
             PUT2(slot, 0, cd->bracount + 1);
-            memcpy(slot + 2, name, IN_UCHARS(namelen));
-            slot[2 + namelen] = 0;
+            memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
+            slot[IMM2_SIZE + namelen] = 0;
             }
           }
 
@@ -5988,7 +5987,7 @@ for (;; ptr++)
           for (i = 0; i < cd->names_found; i++)
             {
             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
-                slot[2+namelen] == 0)
+                slot[IMM2_SIZE+namelen] == 0)
               break;
             slot += cd->name_entry_size;
             }
@@ -7614,7 +7613,7 @@ externally provided function. Integer overflow should no longer be possible
 because nowadays we limit the maximum value of cd->names_found and
 cd->name_entry_size. */
 
-size = sizeof(real_pcre) + (length + cd->names_found * (cd->name_entry_size + 3)) * sizeof(pcre_uchar);
+size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
 re = (real_pcre *)(pcre_malloc)(size);
 
 if (re == NULL)
diff --git a/pcre_exec.c b/pcre_exec.c
index 5f0a156..676f4b8 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -181,7 +181,7 @@ ASCII characters. */
 
 if (caseless)
   {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 #ifdef SUPPORT_UCP
   if (md->utf)
     {
@@ -365,7 +365,7 @@ typedef struct heapframe {
   /* Function local variables */
 
   PCRE_PUCHAR Xcallpat;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   PCRE_PUCHAR Xcharptr;
 #endif
   PCRE_PUCHAR Xdata;
@@ -527,7 +527,7 @@ HEAP_RECURSE:
 
 /* Ditto for the local variables */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 #define charptr            frame->Xcharptr
 #endif
 #define callpat            frame->Xcallpat
@@ -585,7 +585,7 @@ declarations can be cut out in a block. The only declarations within blocks
 below are for variables that do not have to be preserved over a recursive call
 to RMATCH(). */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 const pcre_uchar *charptr;
 #endif
 const pcre_uchar *callpat;
@@ -634,6 +634,7 @@ the alternative names that are used. */
 #define code_offset   codelink
 #define condassert    condition
 #define matched_once  prev_is_word
+#define foc           number
 
 /* These statements are here to stop the compiler complaining about unitialized
 variables. */
@@ -659,7 +660,7 @@ defined). However, RMATCH isn't like a function call because it's quite a
 complicated macro. It has to be used in one particular way. This shouldn't,
 however, impact performance when true recursion is being used. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 utf = md->utf;       /* Local copy of the flag */
 #else
 utf = FALSE;
@@ -1596,7 +1597,7 @@ for (;;)
     back a number of characters, not bytes. */
 
     case OP_REVERSE:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       i = GET(ecode, 1);
@@ -2216,7 +2217,7 @@ for (;;)
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
        c < 256 &&
 #endif
        (md->ctypes[c] & ctype_digit) != 0
@@ -2233,8 +2234,8 @@ for (;;)
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
-       c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+       c > 255 ||
 #endif
        (md->ctypes[c] & ctype_digit) == 0
        )
@@ -2250,7 +2251,7 @@ for (;;)
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
        c < 256 &&
 #endif
        (md->ctypes[c] & ctype_space) != 0
@@ -2267,8 +2268,8 @@ for (;;)
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
-       c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+       c > 255 ||
 #endif
        (md->ctypes[c] & ctype_space) == 0
        )
@@ -2284,7 +2285,7 @@ for (;;)
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
        c < 256 &&
 #endif
        (md->ctypes[c] & ctype_word) != 0
@@ -2301,8 +2302,8 @@ for (;;)
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
-       c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+       c > 255 ||
 #endif
        (md->ctypes[c] & ctype_word) == 0
        )
@@ -3036,7 +3037,7 @@ for (;;)
     /* Match a single character, casefully */
 
     case OP_CHAR:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       length = 1;
@@ -3108,7 +3109,7 @@ for (;;)
         }
       }
     else
-#endif   /* SUPPORT_UTF8 */
+#endif   /* SUPPORT_UTF */
 
     /* Not UTF mode */
       {
@@ -3117,7 +3118,9 @@ for (;;)
         SCHECK_PARTIAL();            /* This one can use SCHECK_PARTIAL() */
         MRRETURN(MATCH_NOMATCH);
         }
-      if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+      if (TABLE_GET(ecode[1], md->lcc, ecode[1])
+          != TABLE_GET(*eptr, md->lcc, *eptr)) MRRETURN(MATCH_NOMATCH);
+      eptr++;
       ecode += 2;
       }
     break;
@@ -3190,7 +3193,7 @@ for (;;)
     /* Common code for all repeated single-character matches. */
 
     REPEATCHAR:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       length = 1;
@@ -3214,7 +3217,7 @@ for (;;)
         for (i = 1; i <= min; i++)
           {
           if (eptr <= md->end_subject - length &&
-            memcmp(eptr, charptr, length) == 0) eptr += length;
+            memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
 #ifdef SUPPORT_UCP
           else if (oclength > 0 &&
                    eptr <= md->end_subject - oclength &&
@@ -3237,7 +3240,7 @@ for (;;)
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
             if (fi >= max) MRRETURN(MATCH_NOMATCH);
             if (eptr <= md->end_subject - length &&
-              memcmp(eptr, charptr, length) == 0) eptr += length;
+              memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
 #ifdef SUPPORT_UCP
             else if (oclength > 0 &&
                      eptr <= md->end_subject - oclength &&
@@ -3258,7 +3261,7 @@ for (;;)
           for (i = min; i < max; i++)
             {
             if (eptr <= md->end_subject - length &&
-                memcmp(eptr, charptr, length) == 0) eptr += length;
+                memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
 #ifdef SUPPORT_UCP
             else if (oclength > 0 &&
                      eptr <= md->end_subject - oclength &&
@@ -3294,14 +3297,12 @@ for (;;)
       value of fc will always be < 128. */
       }
     else
-#endif  /* SUPPORT_UTF8 */
+#endif  /* SUPPORT_UTF */
+      /* When not in UTF mode, load a single character. */
+      fc = *ecode++;
 
-    /* When not in UTF-8 mode, load a single-byte character. */
-
-    fc = *ecode++;
-
-    /* The value of fc at this point is always less than 256, though we may or
-    may not be in UTF-8 mode. The code is duplicated for the caseless and
+    /* The value of fc at this point is always one character, though we may
+    or may not be in UTF mode. The code is duplicated for the caseless and
     caseful cases, for speed, since matching characters is likely to be quite
     common. First, ensure the minimum number of matches are present. If min =
     max, continue at the same level without recursing. Otherwise, if
@@ -3314,7 +3315,23 @@ for (;;)
 
     if (op >= OP_STARI)  /* Caseless */
       {
-      fc = md->lcc[fc];
+#ifdef COMPILE_PCRE8
+      /* fc must be < 128 */
+      foc = md->fcc[fc];
+#else
+#ifdef SUPPORT_UTF
+#ifdef SUPPORT_UCP
+      if (utf && fc > 127)
+        foc = UCD_OTHERCASE(fc);
+#else
+      if (utf && fc > 127)
+        foc = fc;
+#endif /* SUPPORT_UCP */
+      else
+#endif /* SUPPORT_UTF */
+        foc = TABLE_GET(fc, md->fcc, fc);
+#endif /* COMPILE_PCRE8 */
+
       for (i = 1; i <= min; i++)
         {
         if (eptr >= md->end_subject)
@@ -3322,7 +3339,8 @@ for (;;)
           SCHECK_PARTIAL();
           MRRETURN(MATCH_NOMATCH);
           }
-        if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+        if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
+        eptr++;
         }
       if (min == max) continue;
       if (minimize)
@@ -3337,7 +3355,8 @@ for (;;)
             SCHECK_PARTIAL();
             MRRETURN(MATCH_NOMATCH);
             }
-          if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+          if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
+          eptr++;
           }
         /* Control never gets here */
         }
@@ -3351,7 +3370,7 @@ for (;;)
             SCHECK_PARTIAL();
             break;
             }
-          if (fc != md->lcc[*eptr]) break;
+          if (fc != *eptr && foc != *eptr) break;
           eptr++;
           }
 
@@ -3440,10 +3459,10 @@ for (;;)
     GETCHARINCTEST(c, eptr);
     if (op == OP_NOTI)         /* The caseless case */
       {
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
       if (c < 256)
 #endif
-      c = md->lcc[c];
+        c = md->lcc[c];
       if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
       }
     else    /* Caseful */
@@ -3543,9 +3562,9 @@ for (;;)
 
     if (op >= OP_NOTSTARI)     /* Caseless */
       {
-      fc = md->lcc[fc];
+      fc = TABLE_GET(fc, md->lcc, fc);
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         register unsigned int d;
@@ -3580,7 +3599,7 @@ for (;;)
 
       if (minimize)
         {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           register unsigned int d;
@@ -3625,7 +3644,7 @@ for (;;)
         {
         pp = eptr;
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           register unsigned int d;
@@ -3683,7 +3702,7 @@ for (;;)
 
     else
       {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         register unsigned int d;
@@ -3717,7 +3736,7 @@ for (;;)
 
       if (minimize)
         {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           register unsigned int d;
@@ -3761,7 +3780,7 @@ for (;;)
         {
         pp = eptr;
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           register unsigned int d;
@@ -4353,7 +4372,7 @@ for (;;)
         }  /* End switch(ctype) */
 
       else
-#endif     /* SUPPORT_UTF8 */
+#endif     /* SUPPORT_UTF */
 
       /* Code for the non-UTF-8 case for minimum matching of operators other
       than OP_PROP and OP_NOTPROP. */
@@ -4796,7 +4815,7 @@ for (;;)
       else
 #endif     /* SUPPORT_UCP */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         for (fi = min;; fi++)
@@ -5596,7 +5615,7 @@ for (;;)
           }
         }
       else
-#endif  /* SUPPORT_UTF8 */
+#endif  /* SUPPORT_UTF */
       /* Not UTF mode */
         {
         switch(ctype)
@@ -5844,14 +5863,14 @@ switch (frame->Xwhere)
   LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
   LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
   LBL(65) LBL(66)
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
   LBL(32) LBL(34) LBL(42) LBL(46)
 #ifdef SUPPORT_UCP
   LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
   LBL(59) LBL(60) LBL(61) LBL(62)
 #endif  /* SUPPORT_UCP */
-#endif  /* SUPPORT_UTF8 */
+#endif  /* SUPPORT_UTF */
   default:
   DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
   return PCRE_ERROR_INTERNAL;
@@ -6002,7 +6021,7 @@ md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
 /* Check a UTF-8 string if required. Pass back the character offset and error
 code for an invalid string if a results vector is available. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
   int erroroffset;
@@ -6138,6 +6157,7 @@ md->recursive = NULL;                   /* No recursion at top level */
 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
 
 md->lcc = tables + lcc_offset;
+md->fcc = tables + fcc_offset;
 md->ctypes = tables + ctypes_offset;
 
 /* Handle different \R options. */
@@ -6265,7 +6285,7 @@ if (!anchored)
     first_char = first_char2 = re->first_char;
     if ((re->flags & PCRE_FCH_CASELESS) != 0)
       {
-      first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
+      first_char2 = TABLE_GET(first_char, md->fcc, first_char);
 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
       if (utf && first_char > 127)
         first_char2 = UCD_OTHERCASE(first_char);
@@ -6287,7 +6307,7 @@ if ((re->flags & PCRE_REQCHSET) != 0)
   req_char = req_char2 = re->req_char;
   if ((re->flags & PCRE_RCH_CASELESS) != 0)
     {
-    req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
+    req_char2 = TABLE_GET(req_char, md->fcc, req_char);
 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
     if (utf && req_char > 127)
       req_char2 = UCD_OTHERCASE(req_char);
diff --git a/pcre_internal.h b/pcre_internal.h
index b93101f..624e07c 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -2055,6 +2055,7 @@ typedef struct match_data {
   pcre_uchar *name_table;         /* Table of names */
   pcre_uchar nl[4];               /* Newline string when fixed */
   const  pcre_uint8 *lcc;         /* Points to lower casing table */
+  const  pcre_uint8 *fcc;         /* Points to case-flipping table */
   const  pcre_uint8 *ctypes;      /* Points to table of type maps */
   BOOL   offset_overflow;         /* Set if too many extractions */
   BOOL   notbol;                  /* NOTBOL flag */
@@ -2262,6 +2263,7 @@ extern const int         PRIV(ucp_gentype)[];
 extern const int         PRIV(ucp_typerange)[];
 #endif
 
+#ifdef SUPPORT_UCP
 /* UCD access macros */
 
 #define UCD_BLOCK_SIZE 128
@@ -2274,6 +2276,8 @@ extern const int         PRIV(ucp_typerange)[];
 #define UCD_CATEGORY(ch)  PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
 #define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)
 
+#endif /* SUPPORT_UCP */
+
 #endif
 
 /* End of pcre_internal.h */
diff --git a/pcre_jit_test.c b/pcre_jit_test.c
index e4d2432..8aee260 100644
--- a/pcre_jit_test.c
+++ b/pcre_jit_test.c
@@ -621,11 +621,11 @@ pcre_jit_stack* callback(void *arg)
 	return (pcre_jit_stack *)arg;
 }
 
-static void setstack(pcre_extra *extra, int realloc)
+static void setstack(pcre_extra *extra, int alloc_again)
 {
 	static pcre_jit_stack *stack;
 
-	if (realloc) {
+	if (alloc_again) {
 		if (stack)
 			pcre_jit_stack_free(stack);
 		stack = pcre_jit_stack_alloc(1, 1024 * 1024);
@@ -638,29 +638,29 @@ static void setstack(pcre_extra *extra, int realloc)
 
 static int convert_utf8_to_utf16(const char *input, PCRE_SCHAR16 *output, int *offsetmap, int max_length)
 {
-	unsigned char *ptr = (unsigned char*)input;
-	PCRE_SCHAR16 *optr = output;
+	unsigned char *iptr = (unsigned char*)input;
+	unsigned short *optr = (unsigned short *)output;
 	unsigned int c;
 
 	if (max_length == 0)
 		return 0;
 
-	while (*ptr && max_length > 1) {
+	while (*iptr && max_length > 1) {
 		c = 0;
 		if (offsetmap)
-			*offsetmap++ = (int)(ptr - (unsigned char*)input);
-
-		if (!(*ptr & 0x80))
-			c = *ptr++;
-		else if (!(*ptr & 0x20)) {
-			c = ((ptr[0] & 0x1f) << 6) | (ptr[1] & 0x3f);
-			ptr += 2;
-		} else if (!(*ptr & 0x10)) {
-			c = ((ptr[0] & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f);
-			ptr += 3;
-		} else if (!(*ptr & 0x08)) {
-			c = ((ptr[0] & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f);
-			ptr += 4;
+			*offsetmap++ = (int)(iptr - (unsigned char*)input);
+
+		if (!(*iptr & 0x80))
+			c = *iptr++;
+		else if (!(*iptr & 0x20)) {
+			c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
+			iptr += 2;
+		} else if (!(*iptr & 0x10)) {
+			c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
+			iptr += 3;
+		} else if (!(*iptr & 0x08)) {
+			c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
+			iptr += 4;
 		}
 
 		if (c < 65536) {
@@ -668,7 +668,7 @@ static int convert_utf8_to_utf16(const char *input, PCRE_SCHAR16 *output, int *o
 			max_length--;
 		} else if (max_length <= 2) {
 			*optr = '\0';
-			return optr - output;
+			return (int)(optr - (unsigned short *)output);
 		} else {
 			c -= 0x10000;
 			*optr++ = 0xd800 | ((c >> 10) & 0x3ff);
@@ -679,24 +679,25 @@ static int convert_utf8_to_utf16(const char *input, PCRE_SCHAR16 *output, int *o
 		}
 	}
 	if (offsetmap)
-		*offsetmap = (int)(ptr - (unsigned char*)input);
+		*offsetmap = (int)(iptr - (unsigned char*)input);
 	*optr = '\0';
-	return optr - output;
+	return (int)(optr - (unsigned short *)output);
 }
 
 static int copy_char8_to_char16(const char *input, PCRE_SCHAR16 *output, int max_length)
 {
-	PCRE_SCHAR16 *optr = output;
+	unsigned char *iptr = (unsigned char*)input;
+	unsigned short *optr = (unsigned short *)output;
 
 	if (max_length == 0)
 		return 0;
 
-	while (*input && max_length > 1) {
-		*optr++ = *input++;
+	while (*iptr && max_length > 1) {
+		*optr++ = *iptr++;
 		max_length--;
 	}
 	*optr = '\0';
-	return optr - output;
+	return (int)(optr - (unsigned short *)output);
 }
 
 #define REGTEST_MAX_LENGTH 4096
@@ -768,6 +769,7 @@ static int regression_tests(void)
 			current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags8),
 			&error, &err_offs, NULL);
 
+		extra8 = NULL;
 		if (re8) {
 			error = NULL;
 			extra8 = pcre_study(re8, PCRE_STUDY_JIT_COMPILE, &error);
@@ -786,10 +788,15 @@ static int regression_tests(void)
 			printf("\n8 bit: Cannot compile pattern: %s\n", current->pattern);
 #endif
 #ifdef SUPPORT_PCRE16
-		convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH);
+		if (current->flags & PCRE_UTF8)
+			convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH);
+		else
+			copy_char8_to_char16(current->pattern, regtest_buf, REGTEST_MAX_LENGTH);
 		re16 = pcre16_compile(regtest_buf,
 			current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags16),
 			&error, &err_offs, NULL);
+
+		extra16 = NULL;
 		if (re16) {
 			error = NULL;
 			extra16 = pcre16_study(re16, PCRE_STUDY_JIT_COMPILE, &error);
@@ -813,6 +820,8 @@ static int regression_tests(void)
 			setstack(NULL, 1);
 
 #ifdef SUPPORT_PCRE8
+		return_value8_1 = -1000;
+		return_value8_2 = -1000;
 		if (re8) {
 			setstack(extra8, 0);
 			for (i = 0; i < 32; ++i)
@@ -828,6 +837,8 @@ static int regression_tests(void)
 #endif
 
 #ifdef SUPPORT_PCRE16
+		return_value16_1 = -1000;
+		return_value16_2 = -1000;
 		if (re16) {
 			setstack(extra16, 0);
 			if (current->flags & PCRE_UTF8)
@@ -853,7 +864,7 @@ static int regression_tests(void)
 		is_succesful = 1;
 		if (!(current->flags & PCRE_BUG)) {
 #if defined SUPPORT_PCRE8 && defined SUPPORT_PCRE16
-			if ((current->flags & PCRE_UTF8) && utf8 && utf16) {
+			if (utf8 == utf16) {
 				/* All results must be the same. */
 				if (return_value8_1 != return_value8_2 || return_value8_1 != return_value16_1 || return_value8_1 != return_value16_2) {
 					printf("\n8 and 16 bit: Return value differs(%d:%d:%d:%d): [%d] '%s' @ '%s'\n",
@@ -863,11 +874,13 @@ static int regression_tests(void)
 				} else if (return_value8_1 >= 0) {
 					return_value8_1 *= 2;
 					/* Transform back the results. */
-					for (i = 0; i < return_value8_1; ++i) {
-						if (ovector16_1[i] >= 0)
-							ovector16_1[i] = regtest_offsetmap[ovector16_1[i]];
-						if (ovector16_2[i] >= 0)
-							ovector16_2[i] = regtest_offsetmap[ovector16_2[i]];
+					if (current->flags & PCRE_UTF8) {
+						for (i = 0; i < return_value8_1; ++i) {
+							if (ovector16_1[i] >= 0)
+								ovector16_1[i] = regtest_offsetmap[ovector16_1[i]];
+							if (ovector16_2[i] >= 0)
+								ovector16_2[i] = regtest_offsetmap[ovector16_2[i]];
+						}
 					}
 
 					for (i = 0; i < return_value8_1; ++i)
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index 47a2a97..ef9b82c 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -194,7 +194,7 @@ Memory allocation (code space): 28
 ------------------------------------------------------------------
 
 /a(?P<name1>b|c)d(?P<longername2>e)/BM
-Memory allocation (code space): 42
+Memory allocation (code space): 36
 ------------------------------------------------------------------
   0  32 Bra
   3     a
@@ -212,7 +212,7 @@ Memory allocation (code space): 42
 ------------------------------------------------------------------
 
 /(?:a(?P<c>c(?P<d>d)))(?P<a>a)/BM
-Memory allocation (code space): 54
+Memory allocation (code space): 45
 ------------------------------------------------------------------
   0  41 Bra
   3  25 Bra
@@ -232,7 +232,7 @@ Memory allocation (code space): 54
 ------------------------------------------------------------------
 
 /(?P<a>a)...(?P=a)bbb(?P>a)d/BM
-Memory allocation (code space): 37
+Memory allocation (code space): 34
 ------------------------------------------------------------------
   0  30 Bra
   3   7 CBra 1
author	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-07 14:36:26 +0000
committer	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-07 14:36:26 +0000
commit	4b661f8c6abbe9be96af67b9d5547bb96359cc99 (patch)
tree	120a0978274ca69871e78091aeab44486501d482
parent	b4a0233a732c67c98886725229df86fc150b0e82 (diff)
download	pcre-4b661f8c6abbe9be96af67b9d5547bb96359cc99.tar.gz