renaming utf8 to utf, JIT compiler update, disallowing invalid utf chars

git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@781 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-03 07:58:30 +0000
committer: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-03 07:58:30 +0000
commit: ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c (patch)
tree: 4987dde0d6b3aee6401d3e89ce6ddc3acef49df3
parent: c9fa02b130f1a9da7b17b915e75248f19afb6d7a (diff)
download: pcre-ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c.tar.gz
26 files changed, 512 insertions, 418 deletions
diff --git a/Makefile.am b/Makefile.am
index 7d5de86..39cf574 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -214,6 +214,7 @@ libpcre16_la_SOURCES = \
   pcre16_exec.c \
   pcre16_jit_compile.c \
   pcre16_newline.c \
+  pcre16_ord2utf16.c \
   pcre16_string_utils.c \
   pcre16_study.c \
   pcre16_tables.c \
diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c
new file mode 100644
index 0000000..421c3a3
--- /dev/null
+++ b/pcre16_ord2utf16.c
@@ -0,0 +1,95 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This file contains a private PCRE function that converts an ordinal
+character value into a UTF16 string. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+*       Convert character value to UTF-16         *
+*************************************************/
+
+/* This function encodes a value in the range 0 - 0x10ffff as UTF-16 in 1 or 2
+pcre_uchars; surrogates and values above 0x10ffff are stored as 0xfffe.
+
+Arguments:
+  cvalue     the character value
+  buffer     pointer to buffer for result - at least 2 pcre_uchars long
+
+Returns:     number of pcre_uchars placed in the buffer
+*/
+
+int
+PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer)
+{
+#ifdef SUPPORT_UTF16
+
+/* Invalid cvalue (surrogate or > 0x10ffff) is encoded as 0xfffe. Test the full
+value: masking with 0xf800 only would also reject e.g. 0x1d800. */
+if ((cvalue >= 0xd800 && cvalue <= 0xdfff) || cvalue >= 0x110000)
+  cvalue = 0xfffe;
+
+if (cvalue <= 0xffff)
+  {
+  *buffer = (pcre_uchar)cvalue;
+  return 1;
+  }
+
+cvalue -= 0x10000;
+*buffer++ = 0xd800 | (cvalue >> 10);
+*buffer = 0xdc00 | (cvalue & 0x3ff);
+return 2;
+
+#else
+
+(void)(cvalue);  /* Keep compiler happy; this function won't ever be */
+(void)(buffer);  /* called when SUPPORT_UTF16 is not defined. */
+return 0;
+
+#endif
+}
+
+/* End of pcre16_ord2utf16.c */
diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c
index cd82e26..5ff3953 100644
--- a/pcre16_utf16_utils.c
+++ b/pcre16_utf16_utils.c
@@ -57,7 +57,7 @@ any Byte Order Marks (BOMS). Returns with the remainig length. */
 BOOL same_bo = TRUE;
 PCRE_SPTR16 end = input + length;
 /* The c variable must be unsigned. */
-register uschar c;
+register pcre_uchar c;
 
 while (input < end)
   {
diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c
index b64519e..c7c7507 100644
--- a/pcre16_valid_utf16.c
+++ b/pcre16_valid_utf16.c
@@ -78,11 +78,11 @@ Returns:       = 0    if the string is a valid UTF-16 string
 */
 
 int
-PRIV(valid_utf16)(PCRE_PUCHAR string, int length, int *erroroffset)
+PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
 {
 #ifdef SUPPORT_UTF16
 register PCRE_PUCHAR p;
-register uschar c;
+register pcre_uchar c;
 
 if (length < 0)
   {
diff --git a/pcre_compile.c b/pcre_compile.c
index 0bdd0fd..da4ce22 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -470,6 +470,7 @@ static const char error_texts[] =
   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
   /* 70 */
   "internal error: unknown opcode in find_fixedlength()\0"
+  "Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0"
   ;
 
 /* Table to identify digits and hex digits. This is used when compiling
@@ -538,7 +539,7 @@ static const pcre_uint8 digitab[] =
 
 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
 
-static const pcre_unit8 digitab[] =
+static const pcre_uint8 digitab[] =
   {
   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
@@ -706,9 +707,11 @@ static int
 check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
   int options, BOOL isclass)
 {
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
 const pcre_uchar *ptr = *ptrptr + 1;
-int c, i;
+pcre_int32 c;
+int i;
 
 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 ptr--;                            /* Set pointer back to the last byte */
@@ -940,12 +943,12 @@ else
     c -= CHAR_0;
     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
         c = c * 8 + *(++ptr) - CHAR_0;
-    if (!utf8 && c > 0xff) *errorcodeptr = ERR51;
+    if (!utf && c > 0xff) *errorcodeptr = ERR51;
     break;
 
     /* \x is complicated. \x{ddd} is a character number which can be greater
-    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
-    treated as a data character. */
+    than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
+    If not, { is treated as a data character. */
 
     case CHAR_x:
     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
@@ -974,14 +977,12 @@ else
     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
       {
       const pcre_uchar *pt = ptr + 2;
-      int count = 0;
 
       c = 0;
       while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
         {
         register int cc = *pt++;
         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
-        count++;
 
 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
@@ -990,17 +991,25 @@ else
         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 #endif
-        }
 
-      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
-        {
 #ifdef COMPILE_PCRE8
-        if (c < 0 || count > (utf8? 8:2)) *errorcodeptr = ERR34;
+        if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
 #else
 #ifdef COMPILE_PCRE16
-        if (c < 0 || count > (utf8? 8:4)) *errorcodeptr = ERR34;
+        if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
 #endif
 #endif
+        }
+
+      if (c < 0)
+        {
+        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
+        *errorcodeptr = ERR34;
+        }
+
+      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
+        {
+        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71;
         ptr = pt;
         break;
         }
@@ -1281,7 +1290,7 @@ Arguments:
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
-  utf8         TRUE if we are in UTF-8 mode
+  utf          TRUE if we are in UTF-8 / UTF-16 mode
   count        pointer to the current capturing subpattern number (updated)
 
 Returns:       the number of the named subpattern, or -1 if not found
@@ -1289,7 +1298,7 @@ Returns:       the number of the named subpattern, or -1 if not found
 
 static int
 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
-  BOOL xmode, BOOL utf8, int *count)
+  BOOL xmode, BOOL utf, int *count)
 {
 pcre_uchar *ptr = *ptrptr;
 int start_count = *count;
@@ -1458,7 +1467,7 @@ for (; ptr < cd->end_pattern; ptr++)
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
       ptr++;
 #ifdef SUPPORT_UTF8
-      if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+      if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
 #endif
       }
     if (*ptr == 0) goto FAIL_EXIT;
@@ -1469,7 +1478,7 @@ for (; ptr < cd->end_pattern; ptr++)
 
   if (*ptr == CHAR_LEFT_PARENTHESIS)
     {
-    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
+    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
     if (rc > 0) return rc;
     if (*ptr == 0) goto FAIL_EXIT;
     }
@@ -1515,14 +1524,14 @@ Arguments:
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
-  utf8         TRUE if we are in UTF-8 mode
+  utf          TRUE if we are in UTF-8 / UTF-16 mode
 
 Returns:       the number of the found subpattern, or -1 if not found
 */
 
 static int
 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
-  BOOL utf8)
+  BOOL utf)
 {
 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
 int count = 0;
@@ -1535,7 +1544,7 @@ matching closing parens. That is why we have to have a loop. */
 
 for (;;)
   {
-  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
+  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
   if (rc > 0 || *ptr++ == 0) break;
   }
 
@@ -1618,7 +1627,7 @@ and doing the check at the end; a flag specifies which mode we are running in.
 
 Arguments:
   code     points to the start of the pattern (the bracket)
-  utf8     TRUE in UTF-8 mode
+  utf      TRUE in UTF-8 / UTF-16 mode
   atend    TRUE if called when the pattern is complete
   cd       the "compile data" structure
 
@@ -1630,7 +1639,7 @@ Returns:   the fixed length,
 */
 
 static int
-find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd)
+find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
 {
 int length = -1;
 
@@ -1657,7 +1666,7 @@ for (;;)
     case OP_ONCE:
     case OP_ONCE_NC:
     case OP_COND:
-    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf8, atend, cd);
+    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
     if (d < 0) return d;
     branchlength += d;
     do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -1691,7 +1700,7 @@ for (;;)
     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
     if (cc > cs && cc < ce) return -1;                    /* Recursion */
-    d = find_fixedlength(cs + 2, utf8, atend, cd);
+    d = find_fixedlength(cs + 2, utf, atend, cd);
     if (d < 0) return d;
     branchlength += d;
     cc += 1 + LINK_SIZE;
@@ -1751,7 +1760,7 @@ for (;;)
     branchlength++;
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;
 
@@ -1765,7 +1774,7 @@ for (;;)
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;
 
@@ -1945,14 +1954,14 @@ length.
 
 Arguments:
   code        points to start of expression
-  utf8        TRUE in UTF-8 mode
+  utf         TRUE in UTF-8 / UTF-16 mode
   number      the required bracket number or negative to find a lookbehind
 
 Returns:      pointer to the opcode for the bracket, or NULL if not found
 */
 
 const pcre_uchar *
-PRIV(find_bracket)(const pcre_uchar *code, BOOL utf8, int number)
+PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
 {
 for (;;)
   {
@@ -2033,7 +2042,7 @@ for (;;)
   arrange to skip the extra bytes. */
 
 #ifdef SUPPORT_UTF8
-    if (utf8) switch(c)
+    if (utf) switch(c)
       {
       case OP_CHAR:
       case OP_CHARI:
@@ -2067,7 +2076,7 @@ for (;;)
       break;
       }
 #else
-    (void)(utf8);  /* Keep compiler happy by referencing function argument */
+    (void)(utf);  /* Keep compiler happy by referencing function argument */
 #endif
     }
   }
@@ -2084,13 +2093,13 @@ instance of OP_RECURSE.
 
 Arguments:
   code        points to start of expression
-  utf8        TRUE in UTF-8 mode
+  utf         TRUE in UTF-8 / UTF-16 mode
 
 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
 */
 
 static const pcre_uchar *
-find_recurse(const pcre_uchar *code, BOOL utf8)
+find_recurse(const pcre_uchar *code, BOOL utf)
 {
 for (;;)
   {
@@ -2153,7 +2162,7 @@ for (;;)
     to arrange to skip the extra bytes. */
 
 #ifdef SUPPORT_UTF8
-    if (utf8) switch(c)
+    if (utf) switch(c)
       {
       case OP_CHAR:
       case OP_CHARI:
@@ -2187,7 +2196,7 @@ for (;;)
       break;
       }
 #else
-    (void)(utf8);  /* Keep compiler happy by referencing function argument */
+    (void)(utf);  /* Keep compiler happy by referencing function argument */
 #endif
     }
   }
@@ -2210,7 +2219,7 @@ bracket whose current branch will already have been scanned.
 Arguments:
   code        points to start of search
   endcode     points to where to stop
-  utf8        TRUE if in UTF8 mode
+  utf         TRUE if in UTF-8 / UTF-16 mode
   cd          contains pointers to tables etc.
 
 Returns:      TRUE if what is matched could be empty
@@ -2218,7 +2227,7 @@ Returns:      TRUE if what is matched could be empty
 
 static BOOL
 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
-  BOOL utf8, compile_data *cd)
+  BOOL utf, compile_data *cd)
 {
 register int c;
 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
@@ -2266,7 +2275,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
 
     do
       {
-      if (could_be_empty_branch(scode, endcode, utf8, cd))
+      if (could_be_empty_branch(scode, endcode, utf, cd))
         {
         empty_branch = TRUE;
         break;
@@ -2322,7 +2331,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
       empty_branch = FALSE;
       do
         {
-        if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
+        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
           empty_branch = TRUE;
         code += GET(code, 1);
         }
@@ -2456,7 +2465,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
     case OP_MINQUERYI:
     case OP_POSQUERY:
     case OP_POSQUERYI:
-    if (utf8 && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
+    if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
     break;
 
     case OP_UPTO:
@@ -2465,7 +2474,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
     case OP_MINUPTOI:
     case OP_POSUPTO:
     case OP_POSUPTOI:
-    if (utf8 && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
+    if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
     break;
 #endif
 
@@ -2509,7 +2518,7 @@ Arguments:
   code        points to start of the recursion
   endcode     points to where to stop (current RECURSE item)
   bcptr       points to the chain of current (unclosed) branch starts
-  utf8        TRUE if in UTF-8 mode
+  utf         TRUE if in UTF-8 / UTF-16 mode
   cd          pointers to tables etc
 
 Returns:      TRUE if what is matched could be empty
@@ -2517,11 +2526,11 @@ Returns:      TRUE if what is matched could be empty
 
 static BOOL
 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
-  branch_chain *bcptr, BOOL utf8, compile_data *cd)
+  branch_chain *bcptr, BOOL utf, compile_data *cd)
 {
 while (bcptr != NULL && bcptr->current_branch >= code)
   {
-  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
+  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
     return FALSE;
   bcptr = bcptr->outer;
   }
@@ -2656,7 +2665,7 @@ value in the reference (which is a group number).
 Arguments:
   group      points to the start of the group
   adjust     the amount by which the group is to be moved
-  utf8       TRUE in UTF-8 mode
+  utf        TRUE in UTF-8 / UTF-16 mode
   cd         contains pointers to tables etc.
   save_hwm   the hwm forward reference pointer at the start of the group
 
@@ -2664,12 +2673,12 @@ Returns:     nothing
 */
 
 static void
-adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd,
+adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
   pcre_uchar *save_hwm)
 {
 pcre_uchar *ptr = group;
 
-while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL)
+while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
   {
   int offset;
   pcre_uchar *hc;
@@ -2875,7 +2884,7 @@ sense to automatically possessify the repeated item.
 
 Arguments:
   previous      pointer to the repeated opcode
-  utf8          TRUE in UTF-8 mode
+  utf           TRUE in UTF-8 / UTF-16 mode
   ptr           next character in pattern
   options       options bits
   cd            contains pointers to tables etc.
@@ -2884,7 +2893,7 @@ Returns:        TRUE if possessifying is wanted
 */
 
 static BOOL
-check_auto_possessive(const pcre_uchar *previous, BOOL utf8,
+check_auto_possessive(const pcre_uchar *previous, BOOL utf,
   const pcre_uchar *ptr, int options, compile_data *cd)
 {
 int c, next;
@@ -2905,7 +2914,7 @@ if ((options & PCRE_EXTENDED) != 0)
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
         ptr++;
 #ifdef SUPPORT_UTF8
-        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
 #endif
         }
       }
@@ -2927,7 +2936,7 @@ if (*ptr == CHAR_BACKSLASH)
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
   {
 #ifdef SUPPORT_UTF8
-  if (utf8) { GETCHARINC(next, ptr); } else
+  if (utf) { GETCHARINC(next, ptr); } else
 #endif
   next = *ptr++;
   }
@@ -2949,7 +2958,7 @@ if ((options & PCRE_EXTENDED) != 0)
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
         ptr++;
 #ifdef SUPPORT_UTF8
-        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
 #endif
         }
       }
@@ -2988,7 +2997,7 @@ if (next >= 0) switch(op_code)
 #endif
   if (c == next) return FALSE;
 #ifdef SUPPORT_UTF8
-  if (utf8)
+  if (utf)
     {
     unsigned int othercase;
     if (next < 128) othercase = cd->fcc[next]; else
@@ -3013,7 +3022,7 @@ if (next >= 0) switch(op_code)
   case OP_NOTI:
   if ((c = *previous) == next) return TRUE;
 #ifdef SUPPORT_UTF8
-  if (utf8)
+  if (utf)
     {
     unsigned int othercase;
     if (next < 128) othercase = cd->fcc[next]; else
@@ -3348,10 +3357,11 @@ must not do this for other options (e.g. PCRE_EXTENDED) because they may change
 dynamically as we process the pattern. */
 
 #ifdef SUPPORT_UTF8
-BOOL utf8 = (options & PCRE_UTF8) != 0;
-pcre_uint8 utf8_char[6];
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
+pcre_uchar utf_chars[6];
 #else
-BOOL utf8 = FALSE;
+BOOL utf = FALSE;
 #endif
 
 /* Helper variables for OP_XCLASS opcode (for characters > 255). */
@@ -3459,8 +3469,8 @@ for (;; ptr++)
       }
 
     *lengthptr += (int)(code - last_code);
-    DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
-      c));
+    DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
+      (int)(code - last_code), c, c));
 
     /* If "previous" is set and it is not at the start of the work space, move
     it back to there, in order to avoid filling up the work space. Otherwise,
@@ -3547,7 +3557,7 @@ for (;; ptr++)
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         ptr++;
 #ifdef SUPPORT_UTF8
-        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
 #endif
         }
       if (*ptr != 0) continue;
@@ -3727,7 +3737,7 @@ for (;; ptr++)
       const pcre_uchar *oldptr;
 
 #ifdef SUPPORT_UTF8
-      if (utf8 && c > 127)
+      if (utf && c > 127)
         {                           /* Braces are required because the */
         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
         }
@@ -3945,22 +3955,22 @@ for (;; ptr++)
             SETBIT(classbits, 0x20); /* SPACE */
             SETBIT(classbits, 0xa0); /* NSBP */
 #ifdef SUPPORT_UTF
-            if (utf8)
+            if (utf)
               {
               xclass = TRUE;
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x200A, class_uchardata);
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
               }
 #endif
             continue;
@@ -3980,30 +3990,30 @@ for (;; ptr++)
               }
 
 #ifdef SUPPORT_UTF
-            if (utf8)
+            if (utf)
               {
               xclass = TRUE;
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x167f, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x200B, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
               }
 #endif
             continue;
@@ -4015,12 +4025,12 @@ for (;; ptr++)
             SETBIT(classbits, 0x0d); /* CR */
             SETBIT(classbits, 0x85); /* NEL */
 #ifdef SUPPORT_UTF
-            if (utf8)
+            if (utf)
               {
               xclass = TRUE;
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
               }
 #endif
             continue;
@@ -4043,15 +4053,15 @@ for (;; ptr++)
               }
 
 #ifdef SUPPORT_UTF
-            if (utf8)
+            if (utf)
               {
               xclass = TRUE;
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
               }
 #endif
             continue;
@@ -4139,7 +4149,7 @@ for (;; ptr++)
           }
 
 #ifdef SUPPORT_UTF8
-        if (utf8)
+        if (utf)
           {                           /* Braces are required because the */
           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
           }
@@ -4189,7 +4199,7 @@ for (;; ptr++)
         available. */
 
 #ifdef SUPPORT_UTF
-        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+        if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
 #endif
 #ifndef COMPILE_PCRE8
         if (d > 255)
@@ -4234,9 +4244,9 @@ for (;; ptr++)
               else
                 {
                 *class_uchardata++ = XCL_RANGE;
-                class_uchardata += PRIV(ord2utf8)(occ, class_uchardata);
+                class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
                 }
-              class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
               }
             }
 #endif  /* SUPPORT_UCP */
@@ -4246,8 +4256,8 @@ for (;; ptr++)
 
           *class_uchardata++ = XCL_RANGE;
 #ifdef SUPPORT_UTF
-          class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
-          class_uchardata += PRIV(ord2utf8)(d, class_uchardata);
+          class_uchardata += PRIV(ord2utf)(c, class_uchardata);
+          class_uchardata += PRIV(ord2utf)(d, class_uchardata);
 #else
           *class_uchardata++ = c;
           *class_uchardata++ = d;
@@ -4304,7 +4314,7 @@ for (;; ptr++)
       /* Handle a character that cannot go in the bit map */
 
 #ifdef SUPPORT_UTF
-      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+      if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
 #endif
 #ifndef COMPILE_PCRE8
       if (c > 255)
@@ -4314,7 +4324,7 @@ for (;; ptr++)
         xclass = TRUE;
         *class_uchardata++ = XCL_SINGLE;
 #ifdef SUPPORT_UTF
-        class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
+        class_uchardata += PRIV(ord2utf)(c, class_uchardata);
 #else
         *class_uchardata++ = c;
 #endif
@@ -4326,7 +4336,7 @@ for (;; ptr++)
           if ((othercase = UCD_OTHERCASE(c)) != c)
             {
             *class_uchardata++ = XCL_SINGLE;
-            class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata);
+            class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
             }
           }
 #endif  /* SUPPORT_UCP */
@@ -4384,11 +4394,9 @@ for (;; ptr++)
 
 #ifdef SUPPORT_UTF
     if (class_charcount == 1 && !xclass &&
-      (!utf8 || !negate_class || class_lastchar < 128))
-#elif defined COMPILE_PCRE8
-    if (class_charcount == 1)
+      (!utf || !negate_class || class_lastchar < 128))
 #else
-    if (class_charcount == 1 && !xclass)
+    if (class_charcount == 1)
 #endif
       {
       zeroreqchar = reqchar;
@@ -4408,8 +4416,8 @@ for (;; ptr++)
       then we can handle this with the normal one-character code. */
 
 #ifdef SUPPORT_UTF8
-      if (utf8 && class_lastchar > 127)
-        mclength = PRIV(ord2utf8)(class_lastchar, mcbuffer);
+      if (utf && class_lastchar > 127)
+        mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);
       else
 #endif
         {
@@ -4599,12 +4607,12 @@ for (;; ptr++)
       length rather than a small character. */
 
 #ifdef SUPPORT_UTF8
-      if (utf8 && (code[-1] & 0x80) != 0)
+      if (utf && (code[-1] & 0x80) != 0)
         {
         pcre_uchar *lastchar = code - 1;
         while((*lastchar & 0xc0) == 0x80) lastchar--;
         c = code - lastchar;            /* Length of UTF-8 character */
-        memcpy(utf8_char, lastchar, c); /* Save the char */
+        memcpy(utf_chars, lastchar, c); /* Save the char */
         c |= 0x80;                      /* Flag c as a length */
         }
       else
@@ -4625,7 +4633,7 @@ for (;; ptr++)
 
       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
@@ -4646,7 +4654,7 @@ for (;; ptr++)
       c = previous[1];
       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
@@ -4670,7 +4678,7 @@ for (;; ptr++)
 
       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
@@ -4755,9 +4763,9 @@ for (;; ptr++)
         if (repeat_max < 0)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && c >= 128)
+          if (utf && c >= 128)
             {
-            memcpy(code, utf8_char, c & 7);
+            memcpy(code, utf_chars, c & 7);
             code += c & 7;
             }
           else
@@ -4780,9 +4788,9 @@ for (;; ptr++)
         else if (repeat_max != repeat_min)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && c >= 128)
+          if (utf && c >= 128)
             {
-            memcpy(code, utf8_char, c & 7);
+            memcpy(code, utf_chars, c & 7);
             code += c & 7;
             }
           else
@@ -4810,9 +4818,9 @@ for (;; ptr++)
       /* The character or character type itself comes last in all cases. */
 
 #ifdef SUPPORT_UTF8
-      if (utf8 && c >= 128)
+      if (utf && c >= 128)
         {
-        memcpy(code, utf8_char, c & 7);
+        memcpy(code, utf_chars, c & 7);
         code += c & 7;
         }
       else
@@ -4939,7 +4947,7 @@ for (;; ptr++)
         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
           {
           *code = OP_END;
-          adjust_recurse(previous, 1, utf8, cd, save_hwm);
+          adjust_recurse(previous, 1, utf, cd, save_hwm);
           memmove(previous + 1, previous, IN_UCHARS(len));
           code++;
           if (repeat_max == 0)
@@ -4963,7 +4971,7 @@ for (;; ptr++)
           {
           int offset;
           *code = OP_END;
-          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
+          adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
           code += 2 + LINK_SIZE;
           *previous++ = OP_BRAZERO + repeat_type;
@@ -5165,7 +5173,7 @@ for (;; ptr++)
             pcre_uchar *scode = bracode;
             do
               {
-              if (could_be_empty_branch(scode, ketcode, utf8, cd))
+              if (could_be_empty_branch(scode, ketcode, utf, cd))
                 {
                 *bracode += OP_SBRA - OP_BRA;
                 break;
@@ -5188,7 +5196,7 @@ for (;; ptr++)
               {
               int nlen = (int)(code - bracode);
               *code = OP_END;
-              adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);
+              adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
               code += 1 + LINK_SIZE;
               nlen += 1 + LINK_SIZE;
@@ -5266,7 +5274,7 @@ for (;; ptr++)
         {
         tempcode += PRIV(OP_lengths)[*tempcode];
 #ifdef SUPPORT_UTF8
-        if (utf8 && tempcode[-1] >= 0xc0)
+        if (utf && tempcode[-1] >= 0xc0)
           tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f];
 #endif
         }
@@ -5304,7 +5312,7 @@ for (;; ptr++)
 
         default:
         *code = OP_END;
-        adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
+        adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
         code += 1 + LINK_SIZE;
         len += 1 + LINK_SIZE;
@@ -5613,7 +5621,7 @@ for (;; ptr++)
         /* Search the pattern for a forward reference */
 
         else if ((i = find_parens(cd, name, namelen,
-                        (options & PCRE_EXTENDED) != 0, utf8)) > 0)
+                        (options & PCRE_EXTENDED) != 0, utf)) > 0)
           {
           PUT2(code, 2+LINK_SIZE, i);
           code[1+LINK_SIZE]++;
@@ -5958,7 +5966,7 @@ for (;; ptr++)
           temp = cd->end_pattern;
           cd->end_pattern = ptr;
           recno = find_parens(cd, name, namelen,
-            (options & PCRE_EXTENDED) != 0, utf8);
+            (options & PCRE_EXTENDED) != 0, utf);
           cd->end_pattern = temp;
           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
           }
@@ -5985,7 +5993,7 @@ for (;; ptr++)
             }
           else if ((recno =                /* Forward back reference */
                     find_parens(cd, name, namelen,
-                      (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
+                      (options & PCRE_EXTENDED) != 0, utf)) <= 0)
             {
             *errorcodeptr = ERR15;
             goto FAILED;
@@ -6089,14 +6097,14 @@ for (;; ptr++)
             {
             *code = OP_END;
             if (recno != 0)
-              called = PRIV(find_bracket)(cd->start_code, utf8, recno);
+              called = PRIV(find_bracket)(cd->start_code, utf, recno);
 
             /* Forward reference */
 
             if (called == NULL)
               {
               if (find_parens(cd, NULL, recno,
-                    (options & PCRE_EXTENDED) != 0, utf8) < 0)
+                    (options & PCRE_EXTENDED) != 0, utf) < 0)
                 {
                 *errorcodeptr = ERR15;
                 goto FAILED;
@@ -6120,7 +6128,7 @@ for (;; ptr++)
             conditional subpatterns will be picked up then. */
 
             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
-                     could_be_empty(called, code, bcptr, utf8, cd))
+                     could_be_empty(called, code, bcptr, utf, cd))
               {
               *errorcodeptr = ERR40;
               goto FAILED;
@@ -6618,7 +6626,7 @@ for (;; ptr++)
          
           {  
           previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
-          *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
+          *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
           }
         }
       continue;
@@ -6629,8 +6637,8 @@ for (;; ptr++)
     handle it as a data character. */
 
 #ifdef SUPPORT_UTF8
-    if (utf8 && c > 127)
-      mclength = PRIV(ord2utf8)(c, mcbuffer);
+    if (utf && c > 127)
+      mclength = PRIV(ord2utf)(c, mcbuffer);
     else
 #endif
 
@@ -6652,7 +6660,7 @@ for (;; ptr++)
     mcbuffer[0] = c;
 
 #ifdef SUPPORT_UTF8
-    if (utf8 && c >= 0xc0)
+    if (utf && c >= 0xc0)
       {
       while ((ptr[1] & 0xc0) == 0x80)
         mcbuffer[mclength++] = *(++ptr);
@@ -7360,7 +7368,7 @@ pcre_int32 firstchar, reqchar;
 int newline;
 int errorcode = 0;
 int skipatstart = 0;
-BOOL utf8;
+BOOL utf;
 size_t size;
 pcre_uchar *code;
 const pcre_uchar *codestart;
@@ -7458,22 +7466,23 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
   else break;
   }
 
-utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (options & PCRE_UTF8) != 0;
 
 /* Can't support UTF8 unless PCRE has been compiled to include the code. The
-return of an error code from PRIV(valid_utf8)() is a new feature, introduced in
+return of an error code from PRIV(valid_utf)() is a new feature, introduced in
 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
 not used here. */
 
 #ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
-     (errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
+     (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
   {
   errorcode = ERR44;
   goto PCRE_EARLY_ERROR_RETURN2;
   }
 #else
-if (utf8)
+if (utf)
   {
   errorcode = ERR32;
   goto PCRE_EARLY_ERROR_RETURN;
@@ -7688,7 +7697,7 @@ while (errorcode == 0 && cd->hwm > cworkspace)
   cd->hwm -= LINK_SIZE;
   offset = GET(cd->hwm, 0);
   recno = GET(codestart, offset);
-  groupptr = PRIV(find_bracket)(codestart, utf8, recno);
+  groupptr = PRIV(find_bracket)(codestart, utf, recno);
   if (groupptr == NULL) errorcode = ERR53;
     else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
   }
@@ -7715,9 +7724,9 @@ if (cd->check_lookbehind)
   of zero, but that is a pathological case, and it does no harm.) When we find
   one, we temporarily terminate the branch it is in while we scan it. */
 
-  for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf8, -1);
+  for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
        cc != NULL;
-       cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf8, -1))
+       cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
     {
     if (GET(cc, 1) == 0)
       {
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 8fed9b3..8247f46 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -414,9 +414,9 @@ const pcre_uchar *end_subject = md->end_subject;
 const pcre_uchar *start_code = md->start_code;
 
 #ifdef SUPPORT_UTF8
-BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
+BOOL utf = (md->poptions & PCRE_UTF8) != 0;
 #else
-BOOL utf8 = FALSE;
+BOOL utf = FALSE;
 #endif
 
 rlevel++;
@@ -474,7 +474,7 @@ if (*first_op == OP_REVERSE)
 #ifdef SUPPORT_UTF8
   /* In character mode we have to step back character by character */
 
-  if (utf8)
+  if (utf)
     {
     for (gone_back = 0; gone_back < max_back; gone_back++)
       {
@@ -606,7 +606,7 @@ for (;;)
     {
     clen = 1;        /* Number of bytes in the character */
 #ifdef SUPPORT_UTF8
-    if (utf8) { GETCHARLEN(c, ptr, clen); } else
+    if (utf) { GETCHARLEN(c, ptr, clen); } else
 #endif  /* SUPPORT_UTF8 */
     c = *ptr;
     }
@@ -695,7 +695,7 @@ for (;;)
       {
       dlen = 1;
 #ifdef SUPPORT_UTF8
-      if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
+      if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 #endif  /* SUPPORT_UTF8 */
       d = code[coptable[codevalue]];
       if (codevalue >= OP_TYPESTAR)
@@ -960,7 +960,7 @@ for (;;)
           const pcre_uchar *temp = ptr - 1;
           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
 #ifdef SUPPORT_UTF8
-          if (utf8) BACKCHAR(temp);
+          if (utf) BACKCHAR(temp);
 #endif
           GETCHARTEST(d, temp);
 #ifdef SUPPORT_UCP
@@ -1986,7 +1986,7 @@ for (;;)
       if (clen == 0) break;
 
 #ifdef SUPPORT_UTF8
-      if (utf8)
+      if (utf)
         {
         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
           {
@@ -2007,8 +2007,7 @@ for (;;)
         }
       else
 #endif  /* SUPPORT_UTF8 */
-
-      /* Non-UTF-8 mode */
+      /* Not UTF mode */
         {
         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
         }
@@ -2211,7 +2210,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -2258,7 +2257,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -2303,7 +2302,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -2340,7 +2339,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -2384,7 +2383,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -3005,7 +3004,7 @@ pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
 real_pcre *re = (real_pcre *)argument_re;
 dfa_match_data match_block;
 dfa_match_data *md = &match_block;
-BOOL utf8, anchored, startline, firstline;
+BOOL utf, anchored, startline, firstline;
 const pcre_uchar *current_subject, *end_subject;
 const pcre_uint8 *lcc;
 
@@ -3073,9 +3072,10 @@ end_subject = (const unsigned char *)subject + length;
 req_char_ptr = current_subject - 1;
 
 #ifdef SUPPORT_UTF8
-utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (re->options & PCRE_UTF8) != 0;
 #else
-utf8 = FALSE;
+utf = FALSE;
 #endif
 
 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
@@ -3147,10 +3147,10 @@ else
 back the character offset. */
 
 #ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
   int erroroffset;
-  int errorcode = PRIV(valid_utf8)((pcre_uchar *)subject, length, &erroroffset);
+  int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
   if (errorcode != 0)
     {
     if (offsetcount >= 2)
@@ -3235,7 +3235,7 @@ for (;;)
       {
       PCRE_PUCHAR t = current_subject;
 #ifdef SUPPORT_UTF8
-      if (utf8)
+      if (utf)
         {
         while (t < md->end_subject && !IS_NEWLINE(t))
           {
@@ -3278,7 +3278,7 @@ for (;;)
         if (current_subject > md->start_subject + start_offset)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8)
+          if (utf)
             {
             while (current_subject < end_subject &&
                    !WAS_NEWLINE(current_subject))
@@ -3317,7 +3317,7 @@ for (;;)
             {
             current_subject++;
 #ifdef SUPPORT_UTF8
-            if (utf8)
+            if (utf)
               while(current_subject < end_subject &&
                     (*current_subject & 0xc0) == 0x80) current_subject++;
 #endif
@@ -3426,7 +3426,7 @@ for (;;)
 
   if (firstline && IS_NEWLINE(current_subject)) break;
   current_subject++;
-  if (utf8)
+  if (utf)
     {
     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
       current_subject++;
diff --git a/pcre_exec.c b/pcre_exec.c
index 778a301..db013e6 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -183,7 +183,7 @@ if (caseless)
   {
 #ifdef SUPPORT_UTF8
 #ifdef SUPPORT_UCP
-  if (md->utf8)
+  if (md->utf)
     {
     /* Match characters up to the end of the reference. NOTE: the number of
     bytes matched may differ, because there are some characters whose upper and
@@ -385,7 +385,7 @@ typedef struct heapframe {
   int Xprop_value;
   int Xprop_fail_result;
   int Xoclength;
-  pcre_uint8 Xocchars[8];
+  pcre_uchar Xocchars[6];
 #endif
 
   int Xcodelink;
@@ -450,7 +450,7 @@ the subject. */
 
 
 /* Performance note: It might be tempting to extract commonly used fields from
-the md structure (e.g. utf8, end_subject) into individual variables to improve
+the md structure (e.g. utf, end_subject) into individual variables to improve
 performance. Tests using gcc on a SPARC disproved this; in the first case, it
 made performance worse.
 
@@ -485,7 +485,7 @@ so they can be ordinary variables in all cases. Mark some of them with
 register int  rrc;         /* Returns from recursive calls */
 register int  i;           /* Used for loops not involving calls to RMATCH() */
 register unsigned int c;   /* Character values not kept over RMATCH() calls */
-register BOOL utf8;        /* Local copy of UTF-8 flag for speed */
+register BOOL utf;         /* Local copy of UTF flag for speed */
 
 BOOL minimize, possessive; /* Quantifier options */
 BOOL caseless;
@@ -606,7 +606,7 @@ int prop_type;
 int prop_value;
 int prop_fail_result;
 int oclength;
-pcre_uint8 occhars[8];
+pcre_uchar occhars[6];
 #endif
 
 int codelink;
@@ -660,9 +660,9 @@ complicated macro. It has to be used in one particular way. This shouldn't,
 however, impact performance when true recursion is being used. */
 
 #ifdef SUPPORT_UTF8
-utf8 = md->utf8;       /* Local copy of the flag */
+utf = md->utf;       /* Local copy of the flag */
 #else
-utf8 = FALSE;
+utf = FALSE;
 #endif
 
 /* First check that we haven't called match() too many times, or that we
@@ -1597,7 +1597,7 @@ for (;;)
 
     case OP_REVERSE:
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       i = GET(ecode, 1);
       while (i-- > 0)
@@ -2070,7 +2070,7 @@ for (;;)
       partial matching. */
 
 #ifdef SUPPORT_UTF8
-      if (utf8)
+      if (utf)
         {
         /* Get status of previous character */
 
@@ -2189,7 +2189,7 @@ for (;;)
       MRRETURN(MATCH_NOMATCH);
       }
     eptr++;
-    if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+    if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
     ecode++;
     break;
 
@@ -2546,7 +2546,7 @@ for (;;)
     while (eptr < md->end_subject)
       {
       int len = 1;
-      if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+      if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
       if (UCD_CATEGORY(c) != ucp_M) break;
       eptr += len;
       }
@@ -2744,8 +2744,7 @@ for (;;)
       /* First, ensure the minimum number of matches are present. */
 
 #ifdef SUPPORT_UTF
-      /* UTF-8 mode */
-      if (utf8)
+      if (utf)
         {
         for (i = 1; i <= min; i++)
           {
@@ -2765,7 +2764,7 @@ for (;;)
         }
       else
 #endif
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         for (i = 1; i <= min; i++)
           {
@@ -2797,8 +2796,7 @@ for (;;)
       if (minimize)
         {
 #ifdef SUPPORT_UTF
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           for (fi = min;; fi++)
             {
@@ -2821,7 +2819,7 @@ for (;;)
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (fi = min;; fi++)
             {
@@ -2854,8 +2852,7 @@ for (;;)
         pp = eptr;
 
 #ifdef SUPPORT_UTF
-        /* UTF mode */
-        if (utf8)
+        if (utf)
           {
           for (i = min; i < max; i++)
             {
@@ -3024,7 +3021,7 @@ for (;;)
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (eptr-- == pp) break;        /* Stop if tried at original pos */
 #ifdef SUPPORT_UTF
-          if (utf8) BACKCHAR(eptr);
+          if (utf) BACKCHAR(eptr);
 #endif
           }
         MRRETURN(MATCH_NOMATCH);
@@ -3038,7 +3035,7 @@ for (;;)
 
     case OP_CHAR:
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       length = 1;
       ecode++;
@@ -3052,8 +3049,7 @@ for (;;)
       }
     else
 #endif
-
-    /* Non-UTF-8 mode */
+    /* Not UTF mode */
       {
       if (md->end_subject - eptr < 1)
         {
@@ -3069,7 +3065,7 @@ for (;;)
 
     case OP_CHARI:
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       length = 1;
       ecode++;
@@ -3112,7 +3108,7 @@ for (;;)
     else
 #endif   /* SUPPORT_UTF8 */
 
-    /* Non-UTF-8 mode */
+    /* Not UTF mode */
       {
       if (md->end_subject - eptr < 1)
         {
@@ -3193,7 +3189,7 @@ for (;;)
 
     REPEATCHAR:
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       length = 1;
       charptr = ecode;
@@ -3209,7 +3205,7 @@ for (;;)
         unsigned int othercase;
         if (op >= OP_STARI &&     /* Caseless */
             (othercase = UCD_OTHERCASE(fc)) != fc)
-          oclength = PRIV(ord2utf8)(othercase, occhars);
+          oclength = PRIV(ord2utf)(othercase, occhars);
         else oclength = 0;
 #endif  /* SUPPORT_UCP */
 
@@ -3220,7 +3216,7 @@ for (;;)
 #ifdef SUPPORT_UCP
           else if (oclength > 0 &&
                    eptr <= md->end_subject - oclength &&
-                   memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+                   memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
 #endif  /* SUPPORT_UCP */
           else
             {
@@ -3243,7 +3239,7 @@ for (;;)
 #ifdef SUPPORT_UCP
             else if (oclength > 0 &&
                      eptr <= md->end_subject - oclength &&
-                     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+                     memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
 #endif  /* SUPPORT_UCP */
             else
               {
@@ -3264,7 +3260,7 @@ for (;;)
 #ifdef SUPPORT_UCP
             else if (oclength > 0 &&
                      eptr <= md->end_subject - oclength &&
-                     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+                     memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
 #endif  /* SUPPORT_UCP */
             else
               {
@@ -3548,8 +3544,7 @@ for (;;)
       fc = md->lcc[fc];
 
 #ifdef SUPPORT_UTF8
-      /* UTF-8 mode */
-      if (utf8)
+      if (utf)
         {
         register unsigned int d;
         for (i = 1; i <= min; i++)
@@ -3566,8 +3561,7 @@ for (;;)
         }
       else
 #endif
-
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         for (i = 1; i <= min; i++)
           {
@@ -3585,8 +3579,7 @@ for (;;)
       if (minimize)
         {
 #ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           register unsigned int d;
           for (fi = min;; fi++)
@@ -3606,7 +3599,7 @@ for (;;)
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (fi = min;; fi++)
             {
@@ -3631,8 +3624,7 @@ for (;;)
         pp = eptr;
 
 #ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           register unsigned int d;
           for (i = min; i < max; i++)
@@ -3659,7 +3651,7 @@ for (;;)
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (i = min; i < max; i++)
             {
@@ -3690,8 +3682,7 @@ for (;;)
     else
       {
 #ifdef SUPPORT_UTF8
-      /* UTF-8 mode */
-      if (utf8)
+      if (utf)
         {
         register unsigned int d;
         for (i = 1; i <= min; i++)
@@ -3707,7 +3698,7 @@ for (;;)
         }
       else
 #endif
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         for (i = 1; i <= min; i++)
           {
@@ -3725,8 +3716,7 @@ for (;;)
       if (minimize)
         {
 #ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           register unsigned int d;
           for (fi = min;; fi++)
@@ -3745,7 +3735,7 @@ for (;;)
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (fi = min;; fi++)
             {
@@ -3770,8 +3760,7 @@ for (;;)
         pp = eptr;
 
 #ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           register unsigned int d;
           for (i = min; i < max; i++)
@@ -3797,7 +3786,7 @@ for (;;)
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (i = min; i < max; i++)
             {
@@ -4073,7 +4062,7 @@ for (;;)
           while (eptr < md->end_subject)
             {
             int len = 1;
-            if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+            if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
@@ -4086,7 +4075,7 @@ for (;;)
 /* Handle all other cases when the coding is UTF-8 */
 
 #ifdef SUPPORT_UTF8
-      if (utf8) switch(ctype)
+      if (utf) switch(ctype)
         {
         case OP_ANY:
         for (i = 1; i <= min; i++)
@@ -4794,7 +4783,7 @@ for (;;)
           while (eptr < md->end_subject)
             {
             int len = 1;
-            if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+            if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
@@ -4804,8 +4793,7 @@ for (;;)
 #endif     /* SUPPORT_UCP */
 
 #ifdef SUPPORT_UTF8
-      /* UTF-8 mode */
-      if (utf8)
+      if (utf)
         {
         for (fi = min;; fi++)
           {
@@ -4968,7 +4956,7 @@ for (;;)
         }
       else
 #endif
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         for (fi = min;; fi++)
           {
@@ -5267,7 +5255,7 @@ for (;;)
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (eptr-- == pp) break;        /* Stop if tried at original pos */
-          if (utf8) BACKCHAR(eptr);
+          if (utf) BACKCHAR(eptr);
           }
         }
 
@@ -5284,13 +5272,13 @@ for (;;)
             SCHECK_PARTIAL();
             break;
             }
-          if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+          if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
           if (UCD_CATEGORY(c) == ucp_M) break;
           eptr += len;
           while (eptr < md->end_subject)
             {
             len = 1;
-            if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+            if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
@@ -5307,7 +5295,7 @@ for (;;)
           if (eptr-- == pp) break;        /* Stop if tried at original pos */
           for (;;)                        /* Move back over one extended */
             {
-            if (!utf8) c = *eptr; else
+            if (!utf) c = *eptr; else
               {
               BACKCHAR(eptr);
               GETCHAR(c, eptr);
@@ -5322,9 +5310,7 @@ for (;;)
 #endif   /* SUPPORT_UCP */
 
 #ifdef SUPPORT_UTF8
-      /* UTF-8 mode */
-
-      if (utf8)
+      if (utf)
         {
         switch(ctype)
           {
@@ -5607,8 +5593,7 @@ for (;;)
         }
       else
 #endif  /* SUPPORT_UTF8 */
-
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         switch(ctype)
           {
@@ -5969,7 +5954,7 @@ BOOL using_temporary_offsets = FALSE;
 BOOL anchored;
 BOOL startline;
 BOOL firstline;
-BOOL utf8;
+BOOL utf;
 BOOL has_first_char = FALSE;
 BOOL has_req_char = FALSE;
 pcre_uchar first_char = 0;
@@ -6005,7 +5990,8 @@ follows immediately afterwards. Other values in the md block are used only
 during "normal" pcre_exec() processing, not when the JIT support is in use,
 so they are set up later. */
 
-utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = md->utf = (re->options & PCRE_UTF8) != 0;
 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
               ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
 
@@ -6013,10 +5999,10 @@ md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
 code for an invalid string if a results vector is available. */
 
 #ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
   int erroroffset;
-  int errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)subject, length, &erroroffset);
+  int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
   if (errorcode != 0)
     {
     if (offsetcount >= 2)
@@ -6306,7 +6292,7 @@ for(;;)
     {
     PCRE_PUCHAR t = start_match;
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       while (t < md->end_subject && !IS_NEWLINE(t))
         {
@@ -6348,7 +6334,7 @@ for(;;)
       if (start_match > md->start_subject + start_offset)
         {
 #ifdef SUPPORT_UTF8
-        if (utf8)
+        if (utf)
           {
           while (start_match < end_subject && !WAS_NEWLINE(start_match))
             {
@@ -6389,7 +6375,7 @@ for(;;)
           {
           start_match++;
 #ifdef SUPPORT_UTF8
-          if (utf8)
+          if (utf)
             while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
               start_match++;
 #endif
@@ -6521,7 +6507,7 @@ for(;;)
     case MATCH_THEN:
     new_start_match = start_match + 1;
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
         new_start_match++;
 #endif
diff --git a/pcre_internal.h b/pcre_internal.h
index 9dbaf05..637565b 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -292,8 +292,8 @@ start/end of string field names are. */
 #define IS_NEWLINE(p) \
   ((NLBLOCK->nltype != NLTYPE_FIXED)? \
     ((p) < NLBLOCK->PSEND && \
-     PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\
-       utf8)) \
+     PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
+       &(NLBLOCK->nllen), utf)) \
     : \
     ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
      (p)[0] == NLBLOCK->nl[0] && \
@@ -307,7 +307,7 @@ start/end of string field names are. */
   ((NLBLOCK->nltype != NLTYPE_FIXED)? \
     ((p) > NLBLOCK->PSSTART && \
      PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
-       &(NLBLOCK->nllen), utf8)) \
+       &(NLBLOCK->nllen), utf)) \
     : \
     ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
      (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
@@ -581,7 +581,7 @@ pointer. */
 
 #define GETCHARTEST(c, eptr) \
   c = *eptr; \
-  if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
+  if (utf && c >= 0xc0) GETUTF8(c, eptr);
 
 /* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
 the pointer. */
@@ -629,7 +629,7 @@ This is called when we don't know if we are in UTF-8 mode. */
 
 #define GETCHARINCTEST(c, eptr) \
   c = *eptr++; \
-  if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
+  if (utf && c >= 0xc0) GETUTF8INC(c, eptr);
 
 /* Base macro to pick up the remaining bytes of a UTF-8 character, not
 advancing the pointer, incrementing the length. */
@@ -681,7 +681,7 @@ do not know if we are in UTF-8 mode. */
 
 #define GETCHARLENTEST(c, eptr, len) \
   c = *eptr; \
-  if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
+  if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len);
 
 /* If the pointer is not at the start of a character, move it back until
 it is. This is called only in UTF-8 mode - we don't put a test within the macro
@@ -1366,7 +1366,7 @@ value such as \n. They must have non-zero values, as check_escape() returns
 their negation. Also, they must appear in the same order as in the opcode
 definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
 corresponds to "." in DOTALL mode rather than an escape sequence. It is also
-used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In
+used for [^] in JavaScript compatibility mode, and for \C in non-UTF mode. In
 non-DOTALL mode, "." behaves like \N.
 
 The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
@@ -1784,7 +1784,7 @@ enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
        ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
        ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
        ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
-       ERR70, ERRCOUNT };
+       ERR70, ERR71, ERRCOUNT };
 
 /* The real format of the start of the pcre block; the index of names and the
 code vector run on as long as necessary after the end. We store an explicit
@@ -1934,7 +1934,7 @@ typedef struct match_data {
   BOOL   offset_overflow;         /* Set if too many extractions */
   BOOL   notbol;                  /* NOTBOL flag */
   BOOL   noteol;                  /* NOTEOL flag */
-  BOOL   utf8;                    /* UTF8 flag */
+  BOOL   utf;                     /* UTF-8 / UTF-16 flag */
   BOOL   jscript_compat;          /* JAVASCRIPT_COMPAT flag */
   BOOL   use_ucp;                 /* PCRE_UCP flag */
   BOOL   endonly;                 /* Dollar not before final \n */
@@ -2103,14 +2103,10 @@ extern unsigned int      PRIV(strlen_uc)(const pcre_uchar *str);
 extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int);
 extern BOOL              PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR,
                            int *, BOOL);
-extern int               PRIV(ord2utf8)(int, pcre_uint8 *);
+extern int               PRIV(ord2utf)(pcre_uint32, pcre_uchar *);
 extern real_pcre        *PRIV(try_flipped)(const real_pcre *, real_pcre *,
                            const pcre_study_data *, pcre_study_data *);
-#ifndef COMPILE_PCRE16
-extern int               PRIV(valid_utf8)(PCRE_PUCHAR, int, int *);
-#else
-extern int               PRIV(valid_utf16)(PCRE_PUCHAR, int, int *);
-#endif
+extern int               PRIV(valid_utf)(PCRE_PUCHAR, int, int *);
 extern BOOL              PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR,
                            int *, BOOL);
 extern BOOL              PRIV(xclass)(int, const pcre_uchar *);
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 03833e0..16611f1 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -298,7 +298,7 @@ typedef struct compiler_common {
   jump_list *caselesscmp;
   BOOL jscript_compat;
 #ifdef SUPPORT_UTF8
-  BOOL utf8;
+  BOOL utf;
 #ifdef SUPPORT_UCP
   BOOL useucp;
 #endif
@@ -497,7 +497,7 @@ switch(*cc)
 
   case OP_ANYBYTE:
 #ifdef SUPPORT_UTF8
-  if (common->utf8) return NULL;
+  if (common->utf) return NULL;
 #endif
   return cc + 1;
 
@@ -544,7 +544,7 @@ switch(*cc)
   case OP_NOTPOSQUERYI:
   cc += 2;
 #ifdef SUPPORT_UTF8
-  if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+  if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
   return cc;
 
@@ -566,7 +566,7 @@ switch(*cc)
   case OP_NOTPOSUPTOI:
   cc += 2 + IMM2_SIZE;
 #ifdef SUPPORT_UTF8
-  if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+  if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
   return cc;
 
@@ -1264,7 +1264,7 @@ static SLJIT_INLINE BOOL char_has_othercase(compiler_common *common, pcre_uchar*
 unsigned int c;
 
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   GETCHAR(c, cc);
   if (c > 127)
@@ -1286,7 +1286,7 @@ static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigne
 {
 /* Returns with the othercase. */
 #ifdef SUPPORT_UTF8
-if (common->utf8 && c > 127)
+if (common->utf && c > 127)
   {
 #ifdef SUPPORT_UCP
   return UCD_OTHERCASE(c);
@@ -1307,7 +1307,7 @@ int n;
 #endif
 
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   GETCHAR(c, cc);
   if (c <= 127)
@@ -1343,7 +1343,7 @@ if (!ispowerof2(bit))
   return 0;
 
 #ifdef SUPPORT_UTF8
-if (common->utf8 && c > 127)
+if (common->utf && c > 127)
   {
   n = PRIV(utf8_table4)[*cc & 0x3f];
   while ((bit & 0x3f) == 0)
@@ -1374,7 +1374,7 @@ struct sljit_jump *jump;
 
 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
   add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
@@ -1395,7 +1395,7 @@ struct sljit_jump *jump;
 
 OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
   add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
@@ -1414,7 +1414,7 @@ struct sljit_jump *jump;
 #endif
 
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -1439,7 +1439,7 @@ DEFINE_COMPILER;
 #ifdef SUPPORT_UTF8
 struct sljit_label *label;
 
-if (common->utf8)
+if (common->utf)
   {
   label = LABEL();
   OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -1697,7 +1697,7 @@ mainloop = LABEL();
 
 /* Increasing the STR_PTR here requires one less jump in the most common case. */
 #ifdef SUPPORT_UTF8
-if (common->utf8) readuchar = TRUE;
+if (common->utf) readuchar = TRUE;
 #endif
 if (newlinecheck) readuchar = TRUE;
 
@@ -1709,7 +1709,7 @@ if (newlinecheck)
 
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -1771,7 +1771,7 @@ else
 
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -1882,7 +1882,7 @@ start = LABEL();
 leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
 #ifdef SUPPORT_UTF
-if (common->utf8)
+if (common->utf)
   OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
 #endif
 #ifndef COMPILE_PCRE8
@@ -1896,12 +1896,12 @@ OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
 found = JUMP(SLJIT_C_NOT_ZERO);
 
 #ifdef SUPPORT_UTF
-if (common->utf8)
+if (common->utf)
   OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
 #endif
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -2051,7 +2051,7 @@ else
 #ifdef SUPPORT_UTF8
   /* Here LOCALS1 has already been zeroed. */
   jump = NULL;
-  if (common->utf8)
+  if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
 #endif
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
@@ -2090,7 +2090,7 @@ else
 #ifdef SUPPORT_UTF8
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
   jump = NULL;
-  if (common->utf8)
+  if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
 #endif
   OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes);
@@ -2119,7 +2119,7 @@ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a);
 COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL);
 OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
   OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
@@ -2143,7 +2143,7 @@ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20);
 COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
 OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xa0);
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
   OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x1680);
@@ -2177,7 +2177,7 @@ OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a);
 COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL);
 OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
   OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
@@ -2289,7 +2289,7 @@ DEFINE_COMPILER;
 unsigned int othercasebit = 0;
 pcre_uchar *othercasechar = NULL;
 #ifdef SUPPORT_UTF8
-int utf8length;
+int utflength;
 #endif
 
 if (caseless && char_has_othercase(common, cc))
@@ -2336,9 +2336,9 @@ if (context->sourcereg == -1)
   }
 
 #ifdef SUPPORT_UTF8
-utf8length = 1;
-if (common->utf8 && *cc >= 0xc0)
-  utf8length += PRIV(utf8_table4)[*cc & 0x3f];
+utflength = 1;
+if (common->utf && *cc >= 0xc0)
+  utflength += PRIV(utf8_table4)[*cc & 0x3f];
 
 do
   {
@@ -2432,9 +2432,9 @@ do
 
   cc++;
 #ifdef SUPPORT_UTF8
-  utf8length--;
+  utflength--;
   }
-while (utf8length > 0);
+while (utflength > 0);
 #endif
 
 return cc;
@@ -2480,7 +2480,7 @@ unsigned int typeoffset;
 int invertcmp, numberofcmps;
 unsigned int charoffset;
 
-/* Although SUPPORT_UTF8 must be defined, we are not necessary in utf8 mode. */
+/* Although SUPPORT_UTF must be defined, we are not necessarily in utf mode. */
 check_input_end(common, fallbacks);
 read_char(common);
 
@@ -2490,7 +2490,7 @@ if ((*cc++ & XCL_MAP) != 0)
 #ifndef COMPILE_PCRE8
   jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
 #elif defined SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
 #endif
 
@@ -2504,7 +2504,7 @@ if ((*cc++ & XCL_MAP) != 0)
 #ifndef COMPILE_PCRE8
   JUMPHERE(jump);
 #elif defined SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     JUMPHERE(jump);
 #endif
   OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
@@ -2524,7 +2524,7 @@ while (*cc != XCL_END)
     {
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
 #ifdef SUPPORT_UCP
     needschar = TRUE;
@@ -2534,11 +2534,11 @@ while (*cc != XCL_END)
     {
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     cc++;
 #ifdef SUPPORT_UTF8
-    if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
 #ifdef SUPPORT_UCP
     needschar = TRUE;
@@ -2639,7 +2639,7 @@ while (*cc != XCL_END)
     {
     cc ++;
 #ifdef SUPPORT_UTF8
-    if (common->utf8)
+    if (common->utf)
       {
       GETCHARINC(c, cc);
       }
@@ -2670,7 +2670,7 @@ while (*cc != XCL_END)
     {
     cc ++;
 #ifdef SUPPORT_UTF8
-    if (common->utf8)
+    if (common->utf)
       {
       GETCHARINC(c, cc);
       }
@@ -2679,7 +2679,7 @@ while (*cc != XCL_END)
       c = *cc++;
     SET_CHAR_OFFSET(c);
 #ifdef SUPPORT_UTF8
-    if (common->utf8)
+    if (common->utf)
       {
       GETCHARINC(c, cc);
       }
@@ -2876,7 +2876,7 @@ switch(type)
   case OP_ALLANY:
   check_input_end(common, fallbacks);
 #ifdef SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     {
     OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -3096,7 +3096,7 @@ switch(type)
   case OP_CHARI:
   length = 1;
 #ifdef SUPPORT_UTF8
-  if (common->utf8 && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+  if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
 #endif
   if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
     {
@@ -3113,7 +3113,7 @@ switch(type)
   check_input_end(common, fallbacks);
   read_char(common);
 #ifdef SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     {
     GETCHAR(c, cc);
     }
@@ -3130,7 +3130,7 @@ switch(type)
   case OP_NOT:
   case OP_NOTI:
 #ifdef SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     {
     length = 1;
     if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
@@ -3196,7 +3196,7 @@ switch(type)
   jump[0] = NULL;
 #ifdef SUPPORT_UTF8
   /* This check can only be skipped in pure 8 bit mode. */
-  if (common->utf8)
+  if (common->utf)
 #endif
     {
     jump[0] = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
@@ -3231,7 +3231,7 @@ switch(type)
   OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
 #ifdef SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     {
     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, length);
     label = LABEL();
@@ -3269,7 +3269,7 @@ do
     {
     size = 1;
 #ifdef SUPPORT_UTF8
-    if (common->utf8 && cc[1] >= 0xc0)
+    if (common->utf && cc[1] >= 0xc0)
       size += PRIV(utf8_table4)[cc[1] & 0x3f];
 #endif
     }
@@ -3277,7 +3277,7 @@ do
     {
     size = 1;
 #ifdef SUPPORT_UTF8
-    if (common->utf8)
+    if (common->utf)
       {
       if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)
         size = 0;
@@ -3381,7 +3381,7 @@ if (withchecks && !common->jscript_compat)
 
 #ifdef SUPPORT_UTF8
 #ifdef SUPPORT_UCP
-if (common->utf8 && *cc == OP_REFI)
+if (common->utf && *cc == OP_REFI)
   {
   SLJIT_ASSERT(TMP1 == SLJIT_TEMPORARY_REG1 && STACK_TOP == SLJIT_TEMPORARY_REG2 && TMP2 == SLJIT_TEMPORARY_REG3);
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), OVECTOR(offset + 1));
@@ -4787,7 +4787,7 @@ if (end != NULL)
   {
   *end = cc + 1;
 #ifdef SUPPORT_UTF8
-  if (common->utf8 && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
+  if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
 #endif
   }
 return cc;
@@ -6254,7 +6254,8 @@ common->casefulcmp = NULL;
 common->caselesscmp = NULL;
 common->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
 #ifdef SUPPORT_UTF8
-common->utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+common->utf = (re->options & PCRE_UTF8) != 0;
 #ifdef SUPPORT_UCP
 common->useucp = (re->options & PCRE_UCP) != 0;
 #endif
diff --git a/pcre_newline.c b/pcre_newline.c
index 92b81d1..0c2ddcd 100644
--- a/pcre_newline.c
+++ b/pcre_newline.c
@@ -67,17 +67,17 @@ Arguments:
   type         the newline type
   endptr       pointer to the end of the string
   lenptr       where to return the length
-  utf8         TRUE if in utf8 mode
+  utf          TRUE if in utf mode
 
 Returns:       TRUE or FALSE
 */
 
 BOOL
 PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr,
-  BOOL utf8)
+  BOOL utf)
 {
 int c;
-if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
+if (utf) { GETCHAR(c, ptr); } else c = *ptr;
 
 if (type == NLTYPE_ANYCRLF) switch(c)
   {
@@ -96,7 +96,7 @@ else switch(c)
   case 0x000c: *lenptr = 1; return TRUE;             /* FF */
   case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
                return TRUE;                          /* CR */
-  case 0x0085: *lenptr = utf8? 2 : 1; return TRUE;   /* NEL */
+  case 0x0085: *lenptr = utf? 2 : 1; return TRUE;    /* NEL */
   case 0x2028:                                       /* LS */
   case 0x2029: *lenptr = 3; return TRUE;             /* PS */
   default: return FALSE;
@@ -117,19 +117,19 @@ Arguments:
   type         the newline type
   startptr     pointer to the start of the string
   lenptr       where to return the length
-  utf8         TRUE if in utf8 mode
+  utf          TRUE if in utf mode
 
 Returns:       TRUE or FALSE
 */
 
 BOOL
 PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr,
-  BOOL utf8)
+  BOOL utf)
 {
 int c;
 ptr--;
 #ifdef SUPPORT_UTF8
-if (utf8)
+if (utf)
   {
   BACKCHAR(ptr);
   GETCHAR(c, ptr);
@@ -154,7 +154,7 @@ else switch(c)
   case 0x000b:                                      /* VT */
   case 0x000c:                                      /* FF */
   case 0x000d: *lenptr = 1; return TRUE;            /* CR */
-  case 0x0085: *lenptr = utf8? 2 : 1; return TRUE;  /* NEL */
+  case 0x0085: *lenptr = utf? 2 : 1; return TRUE;   /* NEL */
   case 0x2028:                                      /* LS */
   case 0x2029: *lenptr = 3; return TRUE;            /* PS */
   default: return FALSE;
diff --git a/pcre_ord2utf8.c b/pcre_ord2utf8.c
index 354adc0..b374987 100644
--- a/pcre_ord2utf8.c
+++ b/pcre_ord2utf8.c
@@ -52,21 +52,28 @@ character value into a UTF8 string. */
 *       Convert character value to UTF-8         *
 *************************************************/
 
-/* This function takes an integer value in the range 0 - 0x7fffffff
-and encodes it as a UTF-8 character in 0 to 6 bytes.
+/* This function takes an integer value in the range 0 - 0x10ffff
+and encodes it as a UTF-8 character in 1 to 4 pcre_uchars.
 
 Arguments:
   cvalue     the character value
-  buffer     pointer to buffer for result - at least 6 bytes long
+  buffer     pointer to buffer for result - at least 6 pcre_uchars long
 
 Returns:     number of characters placed in the buffer
 */
 
 int
-PRIV(ord2utf8)(int cvalue, pcre_uint8 *buffer)
+PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer)
 {
 #ifdef SUPPORT_UTF8
+
 register int i, j;
+
+/* An invalid cvalue (surrogate bit pattern, or >= 0x110000) is replaced by
+0xfffe. Should never happen in practice. */
+if ((cvalue & 0xf800) == 0xd800 || cvalue >= 0x110000)
+  cvalue = 0xfffe;
+
 for (i = 0; i < PRIV(utf8_table1_size); i++)
   if (cvalue <= PRIV(utf8_table1)[i]) break;
 buffer += i;
@@ -77,10 +84,13 @@ for (j = i; j > 0; j--)
  }
 *buffer = PRIV(utf8_table2)[i] | cvalue;
 return i + 1;
+
 #else
+
 (void)(cvalue);  /* Keep compiler happy; this function won't ever be */
 (void)(buffer);  /* called when SUPPORT_UTF8 is not defined. */
 return 0;
+
 #endif
 }
 
diff --git a/pcre_study.c b/pcre_study.c
index 661627d..098980d 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -82,7 +82,8 @@ find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
   int recurse_depth)
 {
 int length = -1;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
 BOOL had_recurse = FALSE;
 register int branchlength = 0;
 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
@@ -224,7 +225,7 @@ for (;;)
     branchlength++;
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;
 
@@ -245,7 +246,7 @@ for (;;)
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;
 
@@ -293,7 +294,7 @@ for (;;)
 
     case OP_ANYBYTE:
 #ifdef SUPPORT_UTF8
-    if (utf8) return -1;
+    if (utf) return -1;
 #endif
     branchlength++;
     cc++;
@@ -374,7 +375,7 @@ for (;;)
     case OP_REFI:
     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
       {
-      ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf8, GET2(cc, 1));
+      ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
       if (cs == NULL) return -2;
       do ce += GET(ce, 1); while (*ce == OP_ALT);
       if (cc > cs && cc < ce)
@@ -486,7 +487,7 @@ for (;;)
 
     cc += PRIV(OP_lengths)[op];
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;
 
@@ -537,29 +538,29 @@ Arguments:
   p             points to the character
   caseless      the caseless flag
   cd            the block with char table pointers
-  utf8          TRUE for UTF-8 mode
+  utf           TRUE for UTF-8 / UTF-16 mode
 
 Returns:        pointer after the character
 */
 
 static const pcre_uchar *
 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
-  compile_data *cd, BOOL utf8)
+  compile_data *cd, BOOL utf)
 {
 unsigned int c = *p;
 
 SET_BIT(c);
 
 #ifdef SUPPORT_UTF8
-if (utf8 && c > 127)
+if (utf && c > 127)
   {
   GETCHARINC(c, p);
 #ifdef SUPPORT_UCP
   if (caseless)
     {
-    pcre_uint8 buff[8];
+    pcre_uchar buff[6];
     c = UCD_OTHERCASE(c);
-    (void)PRIV(ord2utf8)(c, buff);
+    (void)PRIV(ord2utf)(c, buff);
     SET_BIT(buff[0]);
     }
 #endif
@@ -607,8 +608,8 @@ for (c = 128; c < 256; c++)
   {
   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
     {
-    pcre_uint8 buff[8];
-    (void)PRIV(ord2utf8)(c, buff);
+    pcre_uchar buff[6];
+    (void)PRIV(ord2utf)(c, buff);
     SET_BIT(buff[0]);
     }
   }
@@ -663,7 +664,7 @@ function fails unless the result is SSB_DONE.
 Arguments:
   code         points to an expression
   start_bits   points to a 32-byte table, initialized to 0
-  utf8         TRUE if in UTF-8 mode
+  utf          TRUE if in UTF-8 / UTF-16 mode
   cd           the block with char table pointers
 
 Returns:       SSB_FAIL     => Failed to find any starting bytes
@@ -673,12 +674,12 @@ Returns:       SSB_FAIL     => Failed to find any starting bytes
 */
 
 static int
-set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf8,
+set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
   compile_data *cd)
 {
 register int c;
 int yield = SSB_DONE;
-int table_limit = utf8? 16:32;
+int table_limit = utf? 16:32;
 
 #if 0
 /* ========================================================================= */
@@ -817,7 +818,7 @@ do
       case OP_ONCE:
       case OP_ONCE_NC:
       case OP_ASSERT:
-      rc = set_start_bits(tcode, start_bits, utf8, cd);
+      rc = set_start_bits(tcode, start_bits, utf, cd);
       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
       if (rc == SSB_DONE) try_next = FALSE; else
         {
@@ -864,7 +865,7 @@ do
       case OP_BRAZERO:
       case OP_BRAMINZERO:
       case OP_BRAPOSZERO:
-      rc = set_start_bits(++tcode, start_bits, utf8, cd);
+      rc = set_start_bits(++tcode, start_bits, utf, cd);
       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 /* =========================================================================
       See the comment at the head of this function concerning the next line,
@@ -891,7 +892,7 @@ do
       case OP_QUERY:
       case OP_MINQUERY:
       case OP_POSQUERY:
-      tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
+      tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
       break;
 
       case OP_STARI:
@@ -900,7 +901,7 @@ do
       case OP_QUERYI:
       case OP_MINQUERYI:
       case OP_POSQUERYI:
-      tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
+      tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
       break;
 
       /* Single-char upto sets the bit and tries the next */
@@ -908,13 +909,13 @@ do
       case OP_UPTO:
       case OP_MINUPTO:
       case OP_POSUPTO:
-      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf8);
+      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
       break;
 
       case OP_UPTOI:
       case OP_MINUPTOI:
       case OP_POSUPTOI:
-      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf8);
+      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
       break;
 
       /* At least one single char sets the bit and stops */
@@ -926,7 +927,7 @@ do
       case OP_PLUS:
       case OP_MINPLUS:
       case OP_POSPLUS:
-      (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
+      (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
       try_next = FALSE;
       break;
 
@@ -937,7 +938,7 @@ do
       case OP_PLUSI:
       case OP_MINPLUSI:
       case OP_POSPLUSI:
-      (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
+      (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
       try_next = FALSE;
       break;
 
@@ -950,7 +951,7 @@ do
       case OP_HSPACE:
       SET_BIT(0x09);
       SET_BIT(0x20);
-      if (utf8)
+      if (utf)
         {
         SET_BIT(0xC2);  /* For U+00A0 */
         SET_BIT(0xE1);  /* For U+1680, U+180E */
@@ -967,7 +968,7 @@ do
       SET_BIT(0x0B);
       SET_BIT(0x0C);
       SET_BIT(0x0D);
-      if (utf8)
+      if (utf)
         {
         SET_BIT(0xC2);  /* For U+0085 */
         SET_BIT(0xE2);  /* For U+2028, U+2029 */
@@ -1057,7 +1058,7 @@ do
         case OP_HSPACE:
         SET_BIT(0x09);
         SET_BIT(0x20);
-        if (utf8)
+        if (utf)
           {
           SET_BIT(0xC2);  /* For U+00A0 */
           SET_BIT(0xE1);  /* For U+1680, U+180E */
@@ -1073,7 +1074,7 @@ do
         SET_BIT(0x0B);
         SET_BIT(0x0C);
         SET_BIT(0x0D);
-        if (utf8)
+        if (utf)
           {
           SET_BIT(0xC2);  /* For U+0085 */
           SET_BIT(0xE2);  /* For U+2028, U+2029 */
@@ -1126,7 +1127,7 @@ do
 
       case OP_NCLASS:
 #ifdef SUPPORT_UTF8
-      if (utf8)
+      if (utf)
         {
         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
@@ -1147,7 +1148,7 @@ do
         characters in the range 128 - 255. */
 
 #ifdef SUPPORT_UTF8
-        if (utf8)
+        if (utf)
           {
           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
           for (c = 128; c < 256; c++)
diff --git a/pcre_valid_utf8.c b/pcre_valid_utf8.c
index 8d13014..bbab87f 100644
--- a/pcre_valid_utf8.c
+++ b/pcre_valid_utf8.c
@@ -103,7 +103,7 @@ Returns:       = 0    if the string is a valid UTF-8 string
 */
 
 int
-PRIV(valid_utf8)(PCRE_PUCHAR string, int length, int *erroroffset)
+PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
 {
 #ifdef SUPPORT_UTF8
 register PCRE_PUCHAR p;
diff --git a/pcreposix.c b/pcreposix.c
index 648254b..2dc1561 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -155,6 +155,7 @@ static const int eint[] = {
   REG_BADPAT,  /* \k is not followed by a braced, angle-bracketed, or quoted name */
   /* 70 */
   REG_BADPAT,  /* internal error: unknown opcode in find_fixedlength() */ 
+  REG_BADPAT,  /* disallowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) */
 };
 
 /* Table of texts corresponding to POSIX error codes */
diff --git a/sljit/sljitConfigInternal.h b/sljit/sljitConfigInternal.h
index 3f771d8..ad0be19 100644
--- a/sljit/sljitConfigInternal.h
+++ b/sljit/sljitConfigInternal.h
@@ -354,8 +354,8 @@ typedef long int sljit_w;
 #endif /* !SLJIT_UNALIGNED */
 
 #if (defined SLJIT_EXECUTABLE_ALLOCATOR && SLJIT_EXECUTABLE_ALLOCATOR)
-static void* sljit_malloc_exec(sljit_uw size);
-static void sljit_free_exec(void* ptr);
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size);
+SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr);
 #define SLJIT_MALLOC_EXEC(size) sljit_malloc_exec(size)
 #define SLJIT_FREE_EXEC(ptr) sljit_free_exec(ptr)
 #endif
diff --git a/sljit/sljitExecAllocator.c b/sljit/sljitExecAllocator.c
index bfe8eb1..cdea346 100644
--- a/sljit/sljitExecAllocator.c
+++ b/sljit/sljitExecAllocator.c
@@ -163,7 +163,7 @@ static SLJIT_INLINE void sljit_remove_free_block(struct free_block *free_block)
 	}
 }
 
-static void* sljit_malloc_exec(sljit_uw size)
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 {
 	struct block_header *header;
 	struct block_header *next_header;
@@ -231,7 +231,7 @@ static void* sljit_malloc_exec(sljit_uw size)
 	return MEM_START(header);
 }
 
-static void sljit_free_exec(void* ptr)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
 {
 	struct block_header *header;
 	struct free_block* free_block;
diff --git a/sljit/sljitLir.h b/sljit/sljitLir.h
index 2a82968..54906bc 100644
--- a/sljit/sljitLir.h
+++ b/sljit/sljitLir.h
@@ -195,6 +195,8 @@ struct sljit_compiler {
 	int local_size;
 	/* Code size. */
 	sljit_uw size;
+	/* For statistical purposes. */
+	sljit_uw executable_size;
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	int args;
@@ -291,6 +293,15 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_compiler_verbose(struct sljit_compiler *comp
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler);
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_code(void* code);
 
+/*
+   After the code generation we can retrieve the allocated executable memory size,
+   although this area may not be fully filled with instructions depending on some
+   optimizations. This function is useful only for statistical purposes.
+
+   Before a successful code generation, this function returns with 0.
+*/
+static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler *compiler) { return compiler->executable_size; }
+
 /* Instruction generation. Returns with error code. */
 
 /*
diff --git a/sljit/sljitNativeARM_Thumb2.c b/sljit/sljitNativeARM_Thumb2.c
index c476711..3764aeb 100644
--- a/sljit/sljitNativeARM_Thumb2.c
+++ b/sljit/sljitNativeARM_Thumb2.c
@@ -416,6 +416,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 
 	SLJIT_CACHE_FLUSH(code, code_ptr);
 	compiler->error = SLJIT_ERR_COMPILED;
+	compiler->executable_size = compiler->size * sizeof(sljit_uh);
 	/* Set thumb mode flag. */
 	return (void*)((sljit_uw)code | 0x1);
 }
diff --git a/sljit/sljitNativeARM_v5.c b/sljit/sljitNativeARM_v5.c
index 1b40afa..99584cf 100644
--- a/sljit/sljitNativeARM_v5.c
+++ b/sljit/sljitNativeARM_v5.c
@@ -788,6 +788,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 
 	SLJIT_CACHE_FLUSH(code, code_ptr);
 	compiler->error = SLJIT_ERR_COMPILED;
+	compiler->executable_size = size * sizeof(sljit_uw);
 	return code;
 }
 
diff --git a/sljit/sljitNativeMIPS_common.c b/sljit/sljitNativeMIPS_common.c
index c4fe152..7fcb6d6 100644
--- a/sljit/sljitNativeMIPS_common.c
+++ b/sljit/sljitNativeMIPS_common.c
@@ -397,6 +397,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 	}
 
 	compiler->error = SLJIT_ERR_COMPILED;
+	compiler->executable_size = compiler->size * sizeof(sljit_ins);
 #ifndef __GNUC__
 	SLJIT_CACHE_FLUSH(code, code_ptr);
 #else
diff --git a/sljit/sljitNativePPC_common.c b/sljit/sljitNativePPC_common.c
index af14b75..28afd9e 100644
--- a/sljit/sljitNativePPC_common.c
+++ b/sljit/sljitNativePPC_common.c
@@ -354,6 +354,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 
 	SLJIT_CACHE_FLUSH(code, code_ptr);
 	compiler->error = SLJIT_ERR_COMPILED;
+	compiler->executable_size = compiler->size * sizeof(sljit_ins);
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	if (((sljit_w)code_ptr) & 0x4)
diff --git a/sljit/sljitNativeX86_common.c b/sljit/sljitNativeX86_common.c
index c6661bc..cc215a2 100644
--- a/sljit/sljitNativeX86_common.c
+++ b/sljit/sljitNativeX86_common.c
@@ -357,22 +357,22 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 	while (jump) {
 		if (jump->flags & PATCH_MB) {
 			SLJIT_ASSERT((sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))) >= -128 && (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))) <= 127);
-			*(sljit_ub*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_b));
+			*(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_b)));
 		} else if (jump->flags & PATCH_MW) {
 			if (jump->flags & JUMP_LABEL) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-				*(sljit_w*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_w));
+				*(sljit_w*)jump->addr = (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_w)));
 #else
 				SLJIT_ASSERT((sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))) >= -0x80000000ll && (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))) <= 0x7fffffffll);
-				*(sljit_hw*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_hw));
+				*(sljit_hw*)jump->addr = (sljit_hw)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw)));
 #endif
 			}
 			else {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-				*(sljit_w*)jump->addr = jump->u.target - (jump->addr + sizeof(sljit_w));
+				*(sljit_w*)jump->addr = (sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_w)));
 #else
 				SLJIT_ASSERT((sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_hw))) >= -0x80000000ll && (sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_hw))) <= 0x7fffffffll);
-				*(sljit_hw*)jump->addr = jump->u.target - (jump->addr + sizeof(sljit_hw));
+				*(sljit_hw*)jump->addr = (sljit_hw)(jump->u.target - (jump->addr + sizeof(sljit_hw)));
 #endif
 			}
 		}
@@ -387,6 +387,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 	/* Maybe we waste some space because of short jumps. */
 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
 	compiler->error = SLJIT_ERR_COMPILED;
+	compiler->executable_size = compiler->size;
 	return (void*)code;
 }
 
@@ -1360,7 +1361,7 @@ static int emit_mul(struct sljit_compiler *compiler,
 			code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
 			FAIL_IF(!code);
 			INC_CSIZE(4);
-			*(sljit_hw*)code = src1w;
+			*(sljit_hw*)code = (sljit_hw)src1w;
 		}
 		else {
 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
@@ -1403,7 +1404,7 @@ static int emit_mul(struct sljit_compiler *compiler,
 			code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
 			FAIL_IF(!code);
 			INC_CSIZE(4);
-			*(sljit_hw*)code = src2w;
+			*(sljit_hw*)code = (sljit_hw)src2w;
 		}
 		else {
 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
diff --git a/testdata/testinput10 b/testdata/testinput10
index 7210cc5..8e70c70 100644
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -52,11 +52,9 @@ is required for these tests. --/
 
 /\x{100000}/8BM
 
-/\x{1000000}/8BM
+/\x{10ffff}/8BM
 
-/\x{4000000}/8BM
-
-/\x{7fffFFFF}/8BM
+/\x{110000}/8BM
 
 /[\x{ff}]/8BM
 
diff --git a/testdata/testinput5 b/testdata/testinput5
index ca7eb54..9ba5b4b 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -9,11 +9,9 @@
 
 /\x{100000}/8DZ
 
-/\x{1000000}/8DZ
+/\x{10ffff}/8DZ
 
-/\x{4000000}/8DZ
-
-/\x{7fffFFFF}/8DZ
+/\x{110000}/8DZ
 
 /[\x{ff}]/8DZ
 
@@ -23,6 +21,14 @@
 
 /\x{100000000}/8
 
+/\x{d800}/8
+
+/\x{dfff}/8
+
+/\x{d7ff}/8
+
+/\x{e000}/8
+
 /^\x{100}a\x{1234}/8
     \x{100}a\x{1234}bcd
 
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index f66a12a..47a2a97 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -317,32 +317,17 @@ Memory allocation (code space): 12
  11     End
 ------------------------------------------------------------------
 
-/\x{1000000}/8BM
-Memory allocation (code space): 13
-------------------------------------------------------------------
-  0   9 Bra
-  3     \x{1000000}
-  9   9 Ket
- 12     End
-------------------------------------------------------------------
-
-/\x{4000000}/8BM
-Memory allocation (code space): 14
+/\x{10ffff}/8BM
+Memory allocation (code space): 12
 ------------------------------------------------------------------
-  0  10 Bra
-  3     \x{4000000}
- 10  10 Ket
- 13     End
+  0   8 Bra
+  3     \x{10ffff}
+  8   8 Ket
+ 11     End
 ------------------------------------------------------------------
 
-/\x{7fffFFFF}/8BM
-Memory allocation (code space): 14
-------------------------------------------------------------------
-  0  10 Bra
-  3     \x{7fffffff}
- 10  10 Ket
- 13     End
-------------------------------------------------------------------
+/\x{110000}/8BM
+Failed: character value in \x{...} sequence is too large at offset 9
 
 /[\x{ff}]/8BM
 Memory allocation (code space): 10
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 1eaab47..b63934d 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -49,42 +49,21 @@ Options: utf8
 First char = 244
 Need char = 128
 
-/\x{1000000}/8DZ
+/\x{10ffff}/8DZ
 ------------------------------------------------------------------
         Bra
-        \x{1000000}
+        \x{10ffff}
         Ket
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
 Options: utf8
-First char = 249
-Need char = 128
-
-/\x{4000000}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{4000000}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 252
-Need char = 128
-
-/\x{7fffFFFF}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{7fffffff}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 253
+First char = 244
 Need char = 191
 
+/\x{110000}/8DZ
+Failed: character value in \x{...} sequence is too large at offset 9
+
 /[\x{ff}]/8DZ
 ------------------------------------------------------------------
         Bra
@@ -115,6 +94,16 @@ Failed: character value in \x{...} sequence is too large at offset 11
 /\x{100000000}/8
 Failed: character value in \x{...} sequence is too large at offset 12
 
+/\x{d800}/8
+Failed: Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
+/\x{dfff}/8
+Failed: Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
+/\x{d7ff}/8
+
+/\x{e000}/8
+
 /^\x{100}a\x{1234}/8
     \x{100}a\x{1234}bcd
  0: \x{100}a\x{1234}
@@ -1436,7 +1425,7 @@ No match
 /[\H]/8BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{7fffffff}]
+        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -1444,7 +1433,7 @@ No match
 /[\V]/8BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{2029}-\x{7fffffff}]
+        [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{2029}-\x{10ffff}]
         Ket
         End
 ------------------------------------------------------------------
author	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-03 07:58:30 +0000
committer	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-03 07:58:30 +0000
commit	ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c (patch)
tree	4987dde0d6b3aee6401d3e89ce6ddc3acef49df3
parent	c9fa02b130f1a9da7b17b915e75248f19afb6d7a (diff)
download	pcre-ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c.tar.gz