Start working on UTF-16. Updating macros and adding new ones.

git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@782 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-03 23:58:37 +0000
committer: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-03 23:58:37 +0000
commit: 216818740b54b629e7bd59cd49f783c72e244e23 (patch)
tree: 35603a12be962c35a4e39e879a1a8e021f53d765
parent: ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c (diff)
download: pcre-216818740b54b629e7bd59cd49f783c72e244e23.tar.gz
13 files changed, 574 insertions, 216 deletions
diff --git a/Makefile.am b/Makefile.am
index 39cf574..c939f9f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -219,6 +219,7 @@ libpcre16_la_SOURCES = \
   pcre16_study.c \
   pcre16_tables.c \
   pcre16_try_flipped.c \
+  pcre16_ucd.c \
   pcre16_utf16_utils.c \
   pcre16_valid_utf16.c \
   pcre16_xclass.c
diff --git a/pcre16_ord2utf16.c b/pcre16_ord2utf16.c
index 421c3a3..b02ccc2 100644
--- a/pcre16_ord2utf16.c
+++ b/pcre16_ord2utf16.c
@@ -45,8 +45,10 @@ character value into a UTF16 string. */
 #include "config.h"
 #endif
 
-#include "pcre_internal.h"
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
 
+#include "pcre_internal.h"
 
 /*************************************************
 *       Convert character value to UTF-16         *
diff --git a/pcre16_ucd.c b/pcre16_ucd.c
new file mode 100644
index 0000000..962ed46
--- /dev/null
+++ b/pcre16_ucd.c
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_ucd.c"
+
+/* End of pcre16_ucd.c */
diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c
index 5ff3953..ddd96b9 100644
--- a/pcre16_utf16_utils.c
+++ b/pcre16_utf16_utils.c
@@ -46,6 +46,9 @@ strings to host byte order. */
 #include "config.h"
 #endif
 
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
 #include "pcre_internal.h"
 
 int
diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c
index c7c7507..cc3e50e 100644
--- a/pcre16_valid_utf16.c
+++ b/pcre16_valid_utf16.c
@@ -46,6 +46,9 @@ strings. */
 #include "config.h"
 #endif
 
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
 #include "pcre_internal.h"
 
 
diff --git a/pcre_compile.c b/pcre_compile.c
index da4ce22..3461dbd 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1466,8 +1466,8 @@ for (; ptr < cd->end_pattern; ptr++)
       {
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
       ptr++;
-#ifdef SUPPORT_UTF8
-      if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+      if (utf) FORWARDCHAR(ptr);
 #endif
       }
     if (*ptr == 0) goto FAIL_EXIT;
@@ -1759,8 +1759,8 @@ for (;;)
     case OP_NOTI:
     branchlength++;
     cc += 2;
-#ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
 
@@ -1773,8 +1773,8 @@ for (;;)
     case OP_NOTEXACTI:
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
 
@@ -2041,7 +2041,7 @@ for (;;)
   a multi-byte character. The length in the table is a minimum, so we have to
   arrange to skip the extra bytes. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf) switch(c)
       {
       case OP_CHAR:
@@ -2072,7 +2072,7 @@ for (;;)
       case OP_MINQUERYI:
       case OP_POSQUERY:
       case OP_POSQUERYI:
-      if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
+      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
       break;
       }
 #else
@@ -2161,7 +2161,7 @@ for (;;)
     by a multi-byte character. The length in the table is a minimum, so we have
     to arrange to skip the extra bytes. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf) switch(c)
       {
       case OP_CHAR:
@@ -2192,7 +2192,7 @@ for (;;)
       case OP_MINQUERYI:
       case OP_POSQUERY:
       case OP_POSQUERYI:
-      if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
+      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
       break;
       }
 #else
@@ -2452,7 +2452,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
     MINUPTO, and POSUPTO may be followed by a multibyte character */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     case OP_STAR:
     case OP_STARI:
     case OP_MINSTAR:
@@ -2465,7 +2465,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
     case OP_MINQUERYI:
     case OP_POSQUERY:
     case OP_POSQUERYI:
-    if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
+    if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
     break;
 
     case OP_UPTO:
@@ -2474,7 +2474,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
     case OP_MINUPTOI:
     case OP_POSUPTO:
     case OP_POSUPTOI:
-    if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
+    if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
     break;
 #endif
 
@@ -2913,8 +2913,8 @@ if ((options & PCRE_EXTENDED) != 0)
         {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
         ptr++;
-#ifdef SUPPORT_UTF8
-        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+        if (utf) FORWARDCHAR(ptr);
 #endif
         }
       }
@@ -2957,8 +2957,8 @@ if ((options & PCRE_EXTENDED) != 0)
         {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
         ptr++;
-#ifdef SUPPORT_UTF8
-        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+        if (utf) FORWARDCHAR(ptr);
 #endif
         }
       }
@@ -3424,7 +3424,7 @@ for (;; ptr++)
   int tempbracount;
   pcre_uchar mcbuffer[8];
 
-  /* Get next byte in the pattern */
+  /* Get next character in the pattern */
 
   c = *ptr;
 
@@ -3556,8 +3556,8 @@ for (;; ptr++)
         {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         ptr++;
-#ifdef SUPPORT_UTF8
-        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+        if (utf) FORWARDCHAR(ptr);
 #endif
         }
       if (*ptr != 0) continue;
@@ -4601,7 +4601,7 @@ for (;; ptr++)
       {
       op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
 
-      /* Deal with UTF-8 characters that take up more than one byte. It's
+      /* Deal with UTF characters that take up more than one character. It's
       easier to write this out separately than try to macrify it. Use c to
       hold the length of the character in bytes, plus 0x80 to flag that it's a
       length rather than a small character. */
@@ -4610,16 +4610,16 @@ for (;; ptr++)
       if (utf && (code[-1] & 0x80) != 0)
         {
         pcre_uchar *lastchar = code - 1;
-        while((*lastchar & 0xc0) == 0x80) lastchar--;
+        BACKCHAR(lastchar);
         c = code - lastchar;            /* Length of UTF-8 character */
-        memcpy(utf_chars, lastchar, c); /* Save the char */
+        memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
         c |= 0x80;                      /* Flag c as a length */
         }
       else
 #endif
 
-      /* Handle the case of a single byte - either with no UTF8 support, or
-      with UTF-8 disabled, or for a UTF-8 character < 128. */
+      /* Handle the case of a single character - either with no UTF support, or
+      with UTF disabled, or for a UTF character that is a single code unit. */
 
         {
         c = code[-1];
@@ -5273,9 +5273,9 @@ for (;; ptr++)
       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
         {
         tempcode += PRIV(OP_lengths)[*tempcode];
-#ifdef SUPPORT_UTF8
-        if (utf && tempcode[-1] >= 0xc0)
-          tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+        if (utf && HAS_EXTRALEN(tempcode[-1]))
+          tempcode += GET_EXTRALEN(tempcode[-1]);
 #endif
         }
 
@@ -6659,11 +6659,10 @@ for (;; ptr++)
     mclength = 1;
     mcbuffer[0] = c;
 
-#ifdef SUPPORT_UTF8
-    if (utf && c >= 0xc0)
+#ifdef SUPPORT_UTF
+    if (utf && HAS_EXTRALEN(c))
       {
-      while ((ptr[1] & 0xc0) == 0x80)
-        mcbuffer[mclength++] = *(++ptr);
+      INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
       }
 #endif
 
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 8247f46..d7b292d 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -480,9 +480,7 @@ if (*first_op == OP_REVERSE)
       {
       if (current_subject <= start_subject) break;
       current_subject--;
-      while (current_subject > start_subject &&
-             (*current_subject & 0xc0) == 0x80)
-        current_subject--;
+      INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--);
       }
     }
   else
@@ -3161,9 +3159,17 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
     }
+#ifdef COMPILE_PCRE8
   if (start_offset > 0 && start_offset < length &&
         (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
     return PCRE_ERROR_BADUTF8_OFFSET;
+#else
+#ifdef COMPILE_PCRE16
+  if (start_offset > 0 && start_offset < length &&
+        (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00)
+    return PCRE_ERROR_BADUTF8_OFFSET;
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
   }
 #endif
 
@@ -3234,13 +3240,13 @@ for (;;)
     if (firstline)
       {
       PCRE_PUCHAR t = current_subject;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         while (t < md->end_subject && !IS_NEWLINE(t))
           {
           t++;
-          while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+          INTERNALCHAR(t < end_subject, *t, t++);
           }
         }
       else
@@ -3277,16 +3283,15 @@ for (;;)
         {
         if (current_subject > md->start_subject + start_offset)
           {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
           if (utf)
             {
             while (current_subject < end_subject &&
                    !WAS_NEWLINE(current_subject))
               {
               current_subject++;
-              while(current_subject < end_subject &&
-                    (*current_subject & 0xc0) == 0x80)
-                current_subject++;
+              INTERNALCHAR(current_subject < end_subject, *current_subject,
+                current_subject++);
               }
             }
           else
@@ -3316,10 +3321,10 @@ for (;;)
           if ((start_bits[c/8] & (1 << (c&7))) == 0)
             {
             current_subject++;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
             if (utf)
-              while(current_subject < end_subject &&
-                    (*current_subject & 0xc0) == 0x80) current_subject++;
+              INTERNALCHAR(current_subject < end_subject, *current_subject,
+                current_subject++);
 #endif
             }
           else break;
@@ -3426,11 +3431,13 @@ for (;;)
 
   if (firstline && IS_NEWLINE(current_subject)) break;
   current_subject++;
+#ifdef SUPPORT_UTF
   if (utf)
     {
-    while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
-      current_subject++;
+    INTERNALCHAR(current_subject < end_subject, *current_subject,
+      current_subject++);
     }
+#endif
   if (current_subject > end_subject) break;
 
   /* If we have just passed a CR and we are now at a LF, and the pattern does
diff --git a/pcre_exec.c b/pcre_exec.c
index db013e6..6761598 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -2077,7 +2077,7 @@ for (;;)
         if (eptr == md->start_subject) prev_is_word = FALSE; else
           {
           PCRE_PUCHAR lastptr = eptr - 1;
-          while((*lastptr & 0xc0) == 0x80) lastptr--;
+          BACKCHAR(lastptr);
           if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
           GETCHAR(c, lastptr);
 #ifdef SUPPORT_UCP
@@ -2189,7 +2189,9 @@ for (;;)
       MRRETURN(MATCH_NOMATCH);
       }
     eptr++;
-    if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+#ifdef SUPPORT_UTF
+    if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+#endif
     ecode++;
     break;
 
@@ -4074,7 +4076,7 @@ for (;;)
 
 /* Handle all other cases when the coding is UTF-8 */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf) switch(ctype)
         {
         case OP_ANY:
@@ -4087,7 +4089,7 @@ for (;;)
             }
           if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
           eptr++;
-          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;
 
@@ -4100,7 +4102,7 @@ for (;;)
             MRRETURN(MATCH_NOMATCH);
             }
           eptr++;
-          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;
 
@@ -4298,7 +4300,8 @@ for (;;)
             }
           if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
             MRRETURN(MATCH_NOMATCH);
-          while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
+          eptr++;
+          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;
 
@@ -4326,7 +4329,8 @@ for (;;)
             }
           if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
             MRRETURN(MATCH_NOMATCH);
-          while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
+          eptr++;
+          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;
 
@@ -5309,7 +5313,7 @@ for (;;)
       else
 #endif   /* SUPPORT_UCP */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         switch(ctype)
@@ -5326,7 +5330,7 @@ for (;;)
                 }
               if (IS_NEWLINE(eptr)) break;
               eptr++;
-              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
 
@@ -5343,7 +5347,7 @@ for (;;)
                 }
               if (IS_NEWLINE(eptr)) break;
               eptr++;
-              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
           break;
@@ -5359,7 +5363,7 @@ for (;;)
                 break;
                 }
               eptr++;
-              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
           else
@@ -6014,10 +6018,18 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
     }
 
-  /* Check that a start_offset points to the start of a UTF-8 character. */
+  /* Check that a start_offset points to the start of a UTF character. */
+#ifdef COMPILE_PCRE8
   if (start_offset > 0 && start_offset < length &&
       (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
     return PCRE_ERROR_BADUTF8_OFFSET;
+#else
+#ifdef COMPILE_PCRE16
+  if (start_offset > 0 && start_offset < length &&
+      (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00)
+    return PCRE_ERROR_BADUTF8_OFFSET;
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
   }
 #endif
 
@@ -6291,13 +6303,13 @@ for(;;)
   if (firstline)
     {
     PCRE_PUCHAR t = start_match;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       while (t < md->end_subject && !IS_NEWLINE(t))
         {
         t++;
-        while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+        INTERNALCHAR(t < end_subject, *t, t++);
         }
       }
     else
@@ -6333,14 +6345,14 @@ for(;;)
       {
       if (start_match > md->start_subject + start_offset)
         {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           while (start_match < end_subject && !WAS_NEWLINE(start_match))
             {
             start_match++;
-            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
-              start_match++;
+            INTERNALCHAR(start_match < end_subject, *start_match,
+              start_match++);
             }
           }
         else
@@ -6366,7 +6378,7 @@ for(;;)
       {
       while (start_match < end_subject)
         {
-#ifdef COMPILE_PCRE8
+#ifdef COMPILE_PCRE
         register unsigned int c = *start_match;
 #else
         register unsigned int c = *start_match & 0xff;
@@ -6374,10 +6386,10 @@ for(;;)
         if ((start_bits[c/8] & (1 << (c&7))) == 0)
           {
           start_match++;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
           if (utf)
-            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
-              start_match++;
+            INTERNALCHAR(start_match < end_subject, *start_match,
+              start_match++);
 #endif
           }
         else break;
@@ -6506,10 +6518,10 @@ for(;;)
     case MATCH_PRUNE:
     case MATCH_THEN:
     new_start_match = start_match + 1;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
-      while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
-        new_start_match++;
+      INTERNALCHAR(new_start_match < end_subject, *new_start_match,
+        new_start_match++);
 #endif
     break;
 
diff --git a/pcre_internal.h b/pcre_internal.h
index 637565b..7642b91 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -531,7 +531,9 @@ not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
 never be called in byte mode. To make sure they can never even appear when
 UTF-8 support is omitted, we don't even define them. */
 
-#ifndef SUPPORT_UTF8
+/* #define HAS_EXTRALEN(c) */
+/* #define GET_EXTRALEN(c) */
+#ifndef SUPPORT_UTF
 #define GETCHAR(c, eptr) c = *eptr;
 #define GETCHARTEST(c, eptr) c = *eptr;
 #define GETCHARINC(c, eptr) c = *eptr++;
@@ -539,14 +541,27 @@ UTF-8 support is omitted, we don't even define them. */
 #define GETCHARLEN(c, eptr, len) c = *eptr;
 /* #define GETCHARLENTEST(c, eptr, len) */
 /* #define BACKCHAR(eptr) */
+/* #define FORWARDCHAR(eptr) */
+/* #define INTERNALCHAR(condition, eptr, action) */
+
+#else   /* SUPPORT_UTF */
 
-#else   /* SUPPORT_UTF8 */
+#ifdef COMPILE_PCRE8
 
 /* These macros were originally written in the form of loops that used data
 from the tables whose names start with PRIV(utf8_table). They were rewritten by
 a user so as not to use loops, because in some environments this gives a
 significant performance advantage, and it seems never to do any harm. */
 
+/* Tests whether the code point needs extra characters to decode. */
+
+#define HAS_EXTRALEN(c) ((c) >= 0xc0)
+
+/* Returns the number of additional characters if HAS_EXTRALEN(c) is TRUE.
+Otherwise the result is undefined. */
+
+#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
+
 /* Base macro to pick up the remaining bytes of a UTF-8 character, not
 advancing the pointer. */
 
@@ -689,7 +704,107 @@ because almost all calls are already within a block of UTF-8 only code. */
 
 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
 
-#endif  /* SUPPORT_UTF8 */
+/* Same as above, just in the other direction. */
+#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++
+
+/* Same as above, but fully customizable; eptr is the character value (*p). */
+#define INTERNALCHAR(condition, eptr, action) \
+  while((condition) && ((eptr) & 0xc0) == 0x80) action
+
+#else /* COMPILE_PCRE8 */
+
+#ifdef COMPILE_PCRE16
+
+/* Tests whether the code point needs extra characters to decode. */
+
+#define HAS_EXTRALEN(c) (((c) & 0xfc00) == 0xd800)
+
+/* Returns the number of additional characters if HAS_EXTRALEN(c) is TRUE.
+Otherwise the result is undefined. */
+
+#define GET_EXTRALEN(c) 1
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, not
+advancing the pointer. */
+
+#define GETUTF16(c, eptr) \
+   { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; }
+
+/* Get the next UTF-16 character, not advancing the pointer. This is called when
+we know we are in UTF-16 mode. */
+
+#define GETCHAR(c, eptr) \
+  c = *eptr; \
+  if ((c & 0xfc00) == 0xd800) GETUTF16(c, eptr);
+
+/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
+pointer. */
+
+#define GETCHARTEST(c, eptr) \
+  c = *eptr; \
+  if (utf && (c & 0xfc00) == 0xd800) GETUTF16(c, eptr);
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
+the pointer. */
+
+#define GETUTF16INC(c, eptr) \
+   { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; eptr++; }
+
+/* Get the next UTF-16 character, advancing the pointer. This is called when we
+know we are in UTF-16 mode. */
+
+#define GETCHARINC(c, eptr) \
+  c = *eptr++; \
+  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr);
+
+/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
+This is called when we don't know if we are in UTF-16 mode. */
+
+#define GETCHARINCTEST(c, eptr) \
+  c = *eptr++; \
+  if (utf && (c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr);
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, not
+advancing the pointer, incrementing the length. */
+
+#define GETUTF16LEN(c, eptr, len) \
+   { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; len++; }
+
+/* Get the next UTF-16 character, not advancing the pointer, incrementing
+length if there is a low surrogate. This is called when we know we are in
+UTF-16 mode. */
+
+#define GETCHARLEN(c, eptr, len) \
+  c = *eptr; \
+  if ((c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len);
+
+/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
+pointer, incrementing length if there is a low surrogate. This is called when
+we do not know if we are in UTF-16 mode. */
+
+#define GETCHARLENTEST(c, eptr, len) \
+  c = *eptr; \
+  if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len);
+
+/* If the pointer is not at the start of a character, move it back until
+it is. This is called only in UTF-16 mode - we don't put a test within the
+macro because almost all calls are already within a block of UTF-16 only
+code. */
+
+#define BACKCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr--
+
+/* Same as above, just in the other direction. */
+#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++
+
+/* Same as above, but fully customizable; eptr is the character value (*p). */
+#define INTERNALCHAR(condition, eptr, action) \
+  if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action
+
+#endif
+
+#endif /* COMPILE_PCRE8 */
+
+#endif  /* SUPPORT_UTF */
 
 
 /* In case there is no definition of offsetof() provided - though any proper
@@ -2043,12 +2158,15 @@ of the exported public functions. They have to be "external" in the C sense,
 but are not part of the PCRE public API. The data for these tables is in the
 pcre_tables.c module. */
 
+#ifdef COMPILE_PCRE8
+
 extern const int            PRIV(utf8_table1)[];
+extern const int            PRIV(utf8_table1_size);
 extern const int            PRIV(utf8_table2)[];
 extern const int            PRIV(utf8_table3)[];
 extern const pcre_uint8     PRIV(utf8_table4)[];
 
-extern const int            PRIV(utf8_table1_size);
+#endif /* COMPILE_PCRE8 */
 
 extern const char           PRIV(utt_names)[];
 extern const ucp_type_table PRIV(utt)[];
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 16611f1..03c7b2c 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -302,9 +302,11 @@ typedef struct compiler_common {
 #ifdef SUPPORT_UCP
   BOOL useucp;
 #endif
-  jump_list *utf8readchar;
-  jump_list *utf8readtype8;
+  jump_list *utfreadchar;
+#ifdef COMPILE_PCRE8
+  jump_list *utfreadtype8;
 #endif
+#endif /* SUPPORT_UTF8 */
 #ifdef SUPPORT_UCP
   jump_list *getucd;
 #endif
@@ -543,8 +545,8 @@ switch(*cc)
   case OP_NOTPOSPLUSI:
   case OP_NOTPOSQUERYI:
   cc += 2;
-#ifdef SUPPORT_UTF8
-  if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+  if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
   return cc;
 
@@ -565,8 +567,8 @@ switch(*cc)
   case OP_NOTEXACTI:
   case OP_NOTPOSUPTOI:
   cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
-  if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+  if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
   return cc;
 
@@ -1285,7 +1287,7 @@ return MAX_255(c) ? common->fcc[c] != c : FALSE;
 static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigned int c)
 {
 /* Returns with the othercase. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (common->utf && c > 127)
   {
 #ifdef SUPPORT_UCP
@@ -1302,11 +1304,11 @@ static unsigned int char_get_othercase_bit(compiler_common *common, pcre_uchar*
 {
 /* Detects if the character and its othercase has only 1 bit difference. */
 unsigned int c, oc, bit;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF8 && defined COMPILE_PCRE8
 int n;
 #endif
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (common->utf)
   {
   GETCHAR(c, cc);
@@ -1324,11 +1326,11 @@ if (common->utf)
 else
   {
   c = *cc;
-  oc = common->fcc[c];
+  oc = TABLE_GET(c, common->fcc, c);
   }
 #else
 c = *cc;
-oc = common->fcc[c];
+oc = TABLE_GET(c, common->fcc, c);
 #endif
 
 SLJIT_ASSERT(c != oc);
@@ -1342,10 +1344,12 @@ if (c <= 127 && bit == 0x20)
 if (!ispowerof2(bit))
   return 0;
 
+#ifdef COMPILE_PCRE8
+
 #ifdef SUPPORT_UTF8
 if (common->utf && c > 127)
   {
-  n = PRIV(utf8_table4)[*cc & 0x3f];
+  n = GET_EXTRALEN(*cc);
   while ((bit & 0x3f) == 0)
     {
     n--;
@@ -1353,8 +1357,25 @@ if (common->utf && c > 127)
     }
   return (n << 8) | bit;
   }
-#endif
+#endif /* SUPPORT_UTF8 */
 return (0 << 8) | bit;
+
+#else /* COMPILE_PCRE8 */
+
+#ifdef COMPILE_PCRE16
+#ifdef SUPPORT_UTF16
+if (common->utf && c > 65535)
+  {
+  if (bit >= (1 << 10))
+    bit >>= 10;
+  else
+    return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
+  }
+#endif /* SUPPORT_UTF16 */
+return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
 }
 
 static SLJIT_INLINE void check_input_end(compiler_common *common, jump_list **fallbacks)
@@ -1368,16 +1389,22 @@ static void read_char(compiler_common *common)
 /* Reads the character into TMP1, updates STR_PTR.
 Does not check STR_END. TMP2 Destroyed. */
 DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 struct sljit_jump *jump;
 #endif
 
 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (common->utf)
   {
+#ifdef COMPILE_PCRE8
   jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
-  add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
+#else
+#ifdef COMPILE_PCRE16
+  jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+#endif
+#endif /* COMPILE_PCRE8 */
+  add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
   JUMPHERE(jump);
   }
 #endif
@@ -1389,16 +1416,22 @@ static void peek_char(compiler_common *common)
 /* Reads the character into TMP1, keeps STR_PTR.
 Does not check STR_END. TMP2 Destroyed. */
 DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 struct sljit_jump *jump;
 #endif
 
 OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (common->utf)
   {
+#ifdef COMPILE_PCRE8
   jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
-  add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
+#else
+#ifdef COMPILE_PCRE16
+  jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+#endif
+#endif /* COMPILE_PCRE8 */
+  add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
   OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
   JUMPHERE(jump);
   }
@@ -1409,46 +1442,83 @@ static void read_char8_type(compiler_common *common)
 {
 /* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. */
 DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
 struct sljit_jump *jump;
 #endif
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (common->utf)
   {
-  OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+#ifdef COMPILE_PCRE8
   /* This can be an extra read in some situations, but hopefully
-  it is a clever early read in most cases. */
+  it is needed in most cases. */
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
   jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0);
-  add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL));
+  add_jump(compiler, &common->utfreadtype8, JUMP(SLJIT_FAST_CALL));
   JUMPHERE(jump);
+#else
+#ifdef COMPILE_PCRE16
+  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+  jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+  OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+  JUMPHERE(jump);
+  /* Skip low surrogate if necessary. */
+  OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xfc00);
+  OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xd800);
+  COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
+  OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+#endif
+#endif /* COMPILE_PCRE8 */
   return;
   }
 #endif
-OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
+#ifdef COMPILE_PCRE16
+/* The ctypes array contains only 256 values. */
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+#endif
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+#ifdef COMPILE_PCRE16
+JUMPHERE(jump);
+#endif
 }
 
 static void skip_char_back(compiler_common *common)
 {
 /* Goes one character back. Only affects STR_PTR. Does not check begin. */
 DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
 struct sljit_label *label;
 
 if (common->utf)
   {
   label = LABEL();
-  OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-  OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
+  OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
   CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, label);
   return;
   }
 #endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+  {
+  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
+  OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+  /* Skip low surrogate if necessary. */
+  OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+  OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
+  COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+  OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+  return;
+  }
+#endif
 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 }
 
@@ -1477,10 +1547,12 @@ else
   }
 }
 
-#ifdef SUPPORT_UTF8
-static void do_utf8readchar(compiler_common *common)
+#ifdef SUPPORT_UTF
+
+#ifdef COMPILE_PCRE8
+static void do_utfreadchar(compiler_common *common)
 {
-/* Fast decoding an utf8 character. TMP1 contains the first byte
+/* Fast decoding a UTF-8 character. TMP1 contains the first byte
 of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. */
 DEFINE_COMPILER;
 struct sljit_jump *jump;
@@ -1489,82 +1561,57 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
 /* Searching for the first zero. */
 OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20);
 jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 2 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+/* Two byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1f);
 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 JUMPHERE(jump);
 
 OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10);
 jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 3 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+/* Three byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0f);
 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 12);
 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 2);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 2);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(2));
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 JUMPHERE(jump);
 
-OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x08);
-jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 4 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+/* Four byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x07);
 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 18);
 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 3);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(3));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
 OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 3);
-sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
-JUMPHERE(jump);
-
-/* 5 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x03);
-OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 24);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 4);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 4);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 4);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(3));
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 }
 
-static void do_utf8readtype8(compiler_common *common)
+static void do_utfreadtype8(compiler_common *common)
 {
-/* Fast decoding an utf8 character type. TMP2 contains the first byte
-of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */
+/* Fast decoding a UTF-8 character type. TMP2 contains the first byte
+of the character (>= 0xc0). Return value in TMP1. */
 DEFINE_COMPILER;
 struct sljit_jump *jump;
 struct sljit_jump *compare;
@@ -1573,9 +1620,9 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
 
 OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x20);
 jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 2 byte sequence */
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+/* Two byte sequence. */
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x1f);
 OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
@@ -1596,7 +1643,38 @@ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 }
 
-#endif
+#else /* COMPILE_PCRE8 */
+
+#ifdef COMPILE_PCRE16
+static void do_utfreadchar(compiler_common *common)
+{
+/* Fast decoding a UTF-16 character. TMP1 holds the first 16 bit unit (>= 0xd800).
+Returns the char value in TMP1, and the extra length (0 or IN_UCHARS(1)) in TMP2. */
+DEFINE_COMPILER;
+struct sljit_jump *jump;
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
+jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xdc00);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); /* Not a high surrogate: no extra length. */
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+/* High surrogate: combine it with the 16 bit unit that follows it. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff);
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10);
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3ff);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); /* peek_char subtracts this. */
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); /* Surrogate pair bias. */
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
+
+#endif /* SUPPORT_UTF */
 
 #ifdef SUPPORT_UCP
 
@@ -1634,8 +1712,8 @@ struct sljit_label *newlinelabel = NULL;
 struct sljit_jump *start;
 struct sljit_jump *end = NULL;
 struct sljit_jump *nl = NULL;
-#ifdef SUPPORT_UTF8
-struct sljit_jump *singlebyte;
+#ifdef SUPPORT_UTF
+struct sljit_jump *singlechar;
 #endif
 jump_list *newline = NULL;
 BOOL newlinecheck = FALSE;
@@ -1708,13 +1786,25 @@ if (newlinecheck)
   CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel);
 
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
 if (common->utf)
   {
-  singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+  singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
-  JUMPHERE(singlebyte);
+  JUMPHERE(singlechar);
+  }
+#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+  {
+  singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+  OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+  OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+  COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+  OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+  JUMPHERE(singlechar);
   }
 #endif
 JUMPHERE(start);
@@ -1770,7 +1860,7 @@ else
   }
 
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
 if (common->utf)
   {
   CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
@@ -1778,6 +1868,17 @@ if (common->utf)
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
   }
 #endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+  {
+  CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start);
+  OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+  OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+  COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+  OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+  }
+#endif
 JUMPTO(SLJIT_JUMP, start);
 JUMPHERE(found);
 JUMPHERE(leave);
@@ -1900,7 +2001,7 @@ if (common->utf)
   OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
 #endif
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
 if (common->utf)
   {
   CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
@@ -1908,6 +2009,17 @@ if (common->utf)
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
   }
 #endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+  {
+  CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start);
+  OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+  OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+  COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+  OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+  }
+#endif
 JUMPTO(SLJIT_JUMP, start);
 JUMPHERE(found);
 JUMPHERE(leave);
@@ -2335,10 +2447,10 @@ if (context->sourcereg == -1)
   context->sourcereg = TMP2;
   }
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 utflength = 1;
-if (common->utf && *cc >= 0xc0)
-  utflength += PRIV(utf8_table4)[*cc & 0x3f];
+if (common->utf && HAS_EXTRALEN(*cc))
+  utflength += GET_EXTRALEN(*cc);
 
 do
   {
@@ -2523,8 +2635,8 @@ while (*cc != XCL_END)
   if (*cc == XCL_SINGLE)
     {
     cc += 2;
-#ifdef SUPPORT_UTF8
-    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
 #ifdef SUPPORT_UCP
     needschar = TRUE;
@@ -2533,12 +2645,12 @@ while (*cc != XCL_END)
   else if (*cc == XCL_RANGE)
     {
     cc += 2;
-#ifdef SUPPORT_UTF8
-    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     cc++;
-#ifdef SUPPORT_UTF8
-    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
 #ifdef SUPPORT_UCP
     needschar = TRUE;
@@ -2875,24 +2987,35 @@ switch(type)
 
   case OP_ALLANY:
   check_input_end(common, fallbacks);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   if (common->utf)
     {
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+#ifdef COMPILE_PCRE8
     jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
     OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+#else /* COMPILE_PCRE8 */
+#ifdef COMPILE_PCRE16
+    jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+    OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+    COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+    OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
     JUMPHERE(jump[0]);
     return cc;
     }
 #endif
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   return cc;
 
   case OP_ANYBYTE:
   check_input_end(common, fallbacks);
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   return cc;
 
 #ifdef SUPPORT_UTF8
@@ -3095,8 +3218,8 @@ switch(type)
   case OP_CHAR:
   case OP_CHARI:
   length = 1;
-#ifdef SUPPORT_UTF8
-  if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+#ifdef SUPPORT_UTF
+  if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
 #endif
   if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
     {
@@ -3129,11 +3252,11 @@ switch(type)
 
   case OP_NOT:
   case OP_NOTI:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   if (common->utf)
     {
     length = 1;
-    if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+    if (HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
 
     check_input_end(common, fallbacks);
     GETCHAR(c, cc);
@@ -3152,7 +3275,9 @@ switch(type)
       /* Skip the variable-length character. */
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
       jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+#ifdef COMPILE_PCRE8
       OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
+#endif
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
       JUMPHERE(jump[0]);
       return cc + length;
@@ -3268,21 +3393,21 @@ do
   if (*cc == OP_CHAR)
     {
     size = 1;
-#ifdef SUPPORT_UTF8
-    if (common->utf && cc[1] >= 0xc0)
-      size += PRIV(utf8_table4)[cc[1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (common->utf && HAS_EXTRALEN(cc[1]))
+      size += GET_EXTRALEN(cc[1]);
 #endif
     }
   else if (*cc == OP_CHARI)
     {
     size = 1;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (common->utf)
       {
       if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)
         size = 0;
-      else if (cc[1] >= 0xc0)
-        size += PRIV(utf8_table4)[cc[1] & 0x3f];
+      else if (HAS_EXTRALEN(cc[1]))
+        size += GET_EXTRALEN(cc[1]);
       }
     else
 #endif
@@ -4786,8 +4911,8 @@ if (*type == 0)
 if (end != NULL)
   {
   *end = cc + 1;
-#ifdef SUPPORT_UTF8
-  if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
+#ifdef SUPPORT_UTF
+  if (common->utf && HAS_EXTRALEN(*cc)) *end += GET_EXTRALEN(*cc);
 #endif
   }
 return cc;
@@ -6259,9 +6384,11 @@ common->utf = (re->options & PCRE_UTF8) != 0;
 #ifdef SUPPORT_UCP
 common->useucp = (re->options & PCRE_UCP) != 0;
 #endif
-common->utf8readchar = NULL;
-common->utf8readtype8 = NULL;
+common->utfreadchar = NULL;
+#ifdef COMPILE_PCRE8
+common->utfreadtype8 = NULL;
 #endif
+#endif /* SUPPORT_UTF8 */
 #ifdef SUPPORT_UCP
 common->getucd = NULL;
 #endif
@@ -6487,18 +6614,20 @@ if (common->caselesscmp != NULL)
   set_jumps(common->caselesscmp, LABEL());
   do_caselesscmp(common);
   }
-#ifdef SUPPORT_UTF8
-if (common->utf8readchar != NULL)
+#ifdef SUPPORT_UTF
+if (common->utfreadchar != NULL)
   {
-  set_jumps(common->utf8readchar, LABEL());
-  do_utf8readchar(common);
+  set_jumps(common->utfreadchar, LABEL());
+  do_utfreadchar(common);
   }
-if (common->utf8readtype8 != NULL)
+#ifdef COMPILE_PCRE8
+if (common->utfreadtype8 != NULL)
   {
-  set_jumps(common->utf8readtype8, LABEL());
-  do_utf8readtype8(common);
+  set_jumps(common->utfreadtype8, LABEL());
+  do_utfreadtype8(common);
   }
 #endif
+#endif /* SUPPORT_UTF */
 #ifdef SUPPORT_UCP
 if (common->getucd != NULL)
   {
diff --git a/pcre_printint.src b/pcre_printint.src
index 5a9f15d..2922e54 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -72,17 +72,20 @@ static const char *OP_names[] = { OP_NAME_LIST };
 *************************************************/
 
 static int
-print_char(FILE *f, pcre_uchar *ptr, BOOL utf8)
+print_char(FILE *f, pcre_uchar *ptr, BOOL utf)
 {
 int c = *ptr;
 
-#ifndef SUPPORT_UTF8
-(void)utf8;  /* Avoid compiler warning */
+#ifndef SUPPORT_UTF
+(void)utf;  /* Avoid compiler warning */
 if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
 return 0;
 
 #else
-if (!utf8 || (c & 0xc0) != 0xc0)
+
+#ifdef COMPILE_PCRE8
+
+if (!utf || (c & 0xc0) != 0xc0)
   {
   if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
   return 0;
@@ -110,14 +113,45 @@ else
     s -= 6;
     c |= (ptr[i] & 0x3f) << s;
     }
-  if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
+  fprintf(f, "\\x{%x}", c);
   return a;
   }
-#endif
+
+#else
+
+#ifdef COMPILE_PCRE16
+
+if (!utf || (c & 0xfc00) != 0xd800)
+  {
+  if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+  return 0;
+  }
+else
+  {
+  /* This is a check for malformed UTF-16; it should only occur if the sanity
+  check has been turned off. Rather than swallow a low surrogate, just stop if
+  we hit a bad one. Print it with \X instead of \x as an indication. */
+
+  if ((ptr[1] & 0xfc00) != 0xdc00)
+    {
+    fprintf(f, "\\X{%x}", c);
+    return 0;
+    }
+
+  c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
+  fprintf(f, "\\x{%x}", c);
+  return 1;
+  }
+
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
+
+#endif /* SUPPORT_UTF */
 }
 
 /*************************************************
-*  Print uchar string (regardless of utf8)       *
+*  Print uchar string (regardless of utf)        *
 *************************************************/
 
 static void
@@ -168,7 +202,7 @@ pcre_printint(pcre *external_re, FILE *f, BOOL print_lengths)
 {
 real_pcre *re = (real_pcre *)external_re;
 pcre_uchar *codestart, *code;
-BOOL utf8;
+BOOL utf;
 
 unsigned int options = re->options;
 int offset = re->name_table_offset;
@@ -187,7 +221,8 @@ if (re->magic_number != MAGIC_NUMBER)
   }
 
 code = codestart = (pcre_uchar *)re + offset + count * size;
-utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (options & PCRE_UTF8) != 0;
 
 for(;;)
   {
@@ -232,7 +267,7 @@ for(;;)
     do
       {
       code++;
-      code += 1 + print_char(f, code, utf8);
+      code += 1 + print_char(f, code, utf);
       }
     while (*code == OP_CHAR);
     fprintf(f, "\n");
@@ -243,7 +278,7 @@ for(;;)
     do
       {
       code++;
-      code += 1 + print_char(f, code, utf8);
+      code += 1 + print_char(f, code, utf);
       }
     while (*code == OP_CHARI);
     fprintf(f, "\n");
@@ -349,7 +384,7 @@ for(;;)
         extra = 2;
         }
       }
-    else extra = print_char(f, code+1, utf8);
+    else extra = print_char(f, code+1, utf);
     fprintf(f, "%s", OP_names[*code]);
     break;
 
@@ -364,7 +399,7 @@ for(;;)
     case OP_MINUPTO:
     case OP_POSUPTO:
     fprintf(f, " %s ", flag);
-    extra = print_char(f, code + 1 + IMM2_SIZE, utf8);
+    extra = print_char(f, code + 1 + IMM2_SIZE, utf);
     fprintf(f, "{");
     if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
     fprintf(f, "%d}", GET2(code,1));
@@ -557,7 +592,7 @@ for(;;)
           }
         }
 
-      /* Indicate a non-UTF8 class which was created by negation */
+      /* Indicate a non-UTF class which was created by negation */
 
       fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
 
diff --git a/pcre_study.c b/pcre_study.c
index 098980d..1e10397 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -225,7 +225,7 @@ for (;;)
     branchlength++;
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
 
@@ -246,7 +246,7 @@ for (;;)
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
 #ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
 
@@ -487,7 +487,7 @@ for (;;)
 
     cc += PRIV(OP_lengths)[op];
 #ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
 
diff --git a/pcre_tables.c b/pcre_tables.c
index 7c52961..b8cabf3 100644
--- a/pcre_tables.c
+++ b/pcre_tables.c
@@ -65,7 +65,9 @@ const pcre_uint8 PRIV(OP_lengths)[] = { OP_LENGTHS };
 /* These are the breakpoints for different numbers of bytes in a UTF-8
 character. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
+
+#ifdef COMPILE_PCRE8
 
 const int PRIV(utf8_table1)[] =
   { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
@@ -87,6 +89,8 @@ const pcre_uint8 PRIV(utf8_table4)[] = {
   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 
+#endif /* COMPILE_PCRE8 */
+
 /* Table to translate from particular type value to the general value. */
 
 const int PRIV(ucp_gentype)[] = {
@@ -554,6 +558,6 @@ const ucp_type_table PRIV(utt)[] = {
 
 const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
 
-#endif  /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
 
 /* End of pcre_tables.c */
author	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-03 23:58:37 +0000
committer	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-03 23:58:37 +0000
commit	216818740b54b629e7bd59cd49f783c72e244e23 (patch)
tree	35603a12be962c35a4e39e879a1a8e021f53d765
parent	ad1a6e3a96050e61e6e2127d3a00ded77a1eb80c (diff)
download	pcre-216818740b54b629e7bd59cd49f783c72e244e23.tar.gz