Improving UTF-16 support by fixing a lot of issues.

git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@785 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-05 20:12:24 +0000
committer: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-05 20:12:24 +0000
commit: a9839b968cee5828bf35dbcb05a31859a49ab7a2 (patch)
tree: 836125e6c0ea7958e295ccda9f7d060b05102430
parent: 216818740b54b629e7bd59cd49f783c72e244e23 (diff)
download: pcre-a9839b968cee5828bf35dbcb05a31859a49ab7a2.tar.gz
16 files changed, 543 insertions, 164 deletions
diff --git a/Makefile.am b/Makefile.am
index c939f9f..817b01a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -212,6 +212,8 @@ libpcre16_la_SOURCES = \
   pcre16_chartables.c \
   pcre16_compile.c \
   pcre16_exec.c \
+  pcre16_fullinfo.c \
+  pcre16_info.c \
   pcre16_jit_compile.c \
   pcre16_newline.c \
   pcre16_ord2utf16.c \
@@ -222,6 +224,7 @@ libpcre16_la_SOURCES = \
   pcre16_ucd.c \
   pcre16_utf16_utils.c \
   pcre16_valid_utf16.c \
+  pcre16_version.c \
   pcre16_xclass.c
 
 ## This file is generated as part of the building process, so don't distribute.
diff --git a/pcre.h.in b/pcre.h.in
index 7b2bca5..b9ec777 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -367,6 +367,8 @@ PCRE_EXP_DECL void pcre_free_substring(const char *);
 PCRE_EXP_DECL void pcre_free_substring_list(const char **);
 PCRE_EXP_DECL int  pcre_fullinfo(const pcre *, const pcre_extra *, int,
                   void *);
+PCRE_EXP_DECL int  pcre16_fullinfo(const pcre *, const pcre_extra *, int,
+                  void *);
 PCRE_EXP_DECL int  pcre_get_named_substring(const pcre *, const char *,
                   int *, int, const char *, const char **);
 PCRE_EXP_DECL int  pcre_get_stringnumber(const pcre *, const char *);
@@ -377,15 +379,19 @@ PCRE_EXP_DECL int  pcre_get_substring(const char *, int *, int, int,
 PCRE_EXP_DECL int  pcre_get_substring_list(const char *, int *, int,
                   const char ***);
 PCRE_EXP_DECL int  pcre_info(const pcre *, int *, int *);
+PCRE_EXP_DECL int  pcre16_info(const pcre *, int *, int *);
 PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
 PCRE_EXP_DECL int  pcre_refcount(pcre *, int);
-PCRE_EXP_DECL int  pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
-                  PCRE_SPTR16, int, int);
 PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
 PCRE_EXP_DECL pcre_extra *pcre16_study(const pcre *, int, const char **);
 PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
 PCRE_EXP_DECL void pcre16_free_study(pcre_extra *);
 PCRE_EXP_DECL const char *pcre_version(void);
+PCRE_EXP_DECL const char *pcre16_version(void);
+
+/* Utility functions. */
+PCRE_EXP_DECL int  pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
+                  PCRE_SPTR16, int, int);
 
 /* JIT compiler related functions. */
 
diff --git a/pcre16_fullinfo.c b/pcre16_fullinfo.c
new file mode 100644
index 0000000..0e67deb
--- /dev/null
+++ b/pcre16_fullinfo.c
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_fullinfo.c"
+
+/* End of pcre16_fullinfo.c */
diff --git a/pcre16_info.c b/pcre16_info.c
new file mode 100644
index 0000000..b4b221a
--- /dev/null
+++ b/pcre16_info.c
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_info.c"
+
+/* End of pcre16_info.c */
diff --git a/pcre16_version.c b/pcre16_version.c
new file mode 100644
index 0000000..d4a3329
--- /dev/null
+++ b/pcre16_version.c
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_version.c"
+
+/* End of pcre16_version.c */
diff --git a/pcre_compile.c b/pcre_compile.c
index 3461dbd..da22f59 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -102,6 +102,10 @@ overrun before it actually does run off the end of the data block. */
 #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
 #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
 
+/* Repeated character flags. */
+
+#define UTF_LENGTH     0x10000000l      /* The char contains its length. */
+
 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 are simple data values; negative values are for special things like \d and so
 on. Zero means further processing is needed (for things like \x), or the escape
@@ -2896,7 +2900,7 @@ static BOOL
 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
   const pcre_uchar *ptr, int options, compile_data *cd)
 {
-int c, next;
+pcre_int32 c, next;
 int op_code = *previous++;
 
 /* Skip whitespace and comments in extended mode */
@@ -2932,15 +2936,13 @@ if (*ptr == CHAR_BACKSLASH)
   if (temperrorcode != 0) return FALSE;
   ptr++;    /* Point after the escape sequence */
   }
-
-else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
+else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
   {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   if (utf) { GETCHARINC(next, ptr); } else
 #endif
   next = *ptr++;
   }
-
 else return FALSE;
 
 /* Skip whitespace and comments in extended mode */
@@ -4603,20 +4605,25 @@ for (;; ptr++)
 
       /* Deal with UTF characters that take up more than one character. It's
       easier to write this out separately than try to macrify it. Use c to
-      hold the length of the character in bytes, plus 0x80 to flag that it's a
-      length rather than a small character. */
+      hold the length of the character in bytes, plus UTF_LENGTH to flag that
+      it's a length rather than a small character. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
+#ifdef COMPILE_PCRE8
       if (utf && (code[-1] & 0x80) != 0)
+#endif /* COMPILE_PCRE8 */
+#ifdef COMPILE_PCRE16
+      if (utf && (code[-1] & 0xfc00) == 0xdc00)
+#endif /* COMPILE_PCRE16 */
         {
         pcre_uchar *lastchar = code - 1;
         BACKCHAR(lastchar);
         c = code - lastchar;            /* Length of UTF-8 character */
         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
-        c |= 0x80;                      /* Flag c as a length */
+        c |= UTF_LENGTH;                /* Flag c as a length */
         }
       else
-#endif
+#endif /* SUPPORT_UTF */
 
       /* Handle the case of a single charater - either with no UTF support, or
       with UTF disabled, or for a single character UTF character. */
@@ -4758,14 +4765,14 @@ for (;; ptr++)
         we have to insert the character for the previous code. For a repeated
         Unicode property match, there are two extra bytes that define the
         required property. In UTF-8 mode, long characters have their length in
-        c, with the 0x80 bit as a flag. */
+        c, with the UTF_LENGTH bit as a flag. */
 
         if (repeat_max < 0)
           {
-#ifdef SUPPORT_UTF8
-          if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+          if (utf && (c & UTF_LENGTH) != 0)
             {
-            memcpy(code, utf_chars, c & 7);
+            memcpy(code, utf_chars, IN_UCHARS(c & 7));
             code += c & 7;
             }
           else
@@ -4787,10 +4794,10 @@ for (;; ptr++)
 
         else if (repeat_max != repeat_min)
           {
-#ifdef SUPPORT_UTF8
-          if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+          if (utf && (c & UTF_LENGTH) != 0)
             {
-            memcpy(code, utf_chars, c & 7);
+            memcpy(code, utf_chars, IN_UCHARS(c & 7));
             code += c & 7;
             }
           else
@@ -4817,10 +4824,10 @@ for (;; ptr++)
 
       /* The character or character type itself comes last in all cases. */
 
-#ifdef SUPPORT_UTF8
-      if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+      if (utf && (c & UTF_LENGTH) != 0)
         {
-        memcpy(code, utf_chars, c & 7);
+        memcpy(code, utf_chars, IN_UCHARS(c & 7));
         code += c & 7;
         }
       else
@@ -6661,9 +6668,7 @@ for (;; ptr++)
 
 #ifdef SUPPORT_UTF
     if (utf && HAS_EXTRALEN(c))
-      {
-      INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
-      }
+      ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
 #endif
 
     /* At this point we have the character's bytes in mcbuffer, and the length
@@ -7789,9 +7794,27 @@ if ((re->options & PCRE_ANCHORED) == 0)
       re->first_char = firstchar & 0xffff;
 #endif
 #endif
-      if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char)
-        && cd->fcc[re->first_char] != re->first_char)
-        re->flags |= PCRE_FCH_CASELESS;
+      if ((firstchar & REQ_CASELESS) != 0)
+        {
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+        /* We ignore non-ASCII first chars in 8 bit mode. */
+        if (utf)
+          {
+          if (re->first_char < 128)
+            {
+            if (cd->fcc[re->first_char] != re->first_char)
+              re->flags |= PCRE_FCH_CASELESS;
+            }
+          else if ((options & PCRE_UCP) != 0
+              && UCD_OTHERCASE(re->first_char) != re->first_char)
+            re->flags |= PCRE_FCH_CASELESS;
+          }
+        else
+#endif
+        if (MAX_255(re->first_char)
+            && cd->fcc[re->first_char] != re->first_char)
+          re->flags |= PCRE_FCH_CASELESS;
+        }
 
       re->flags |= PCRE_FIRSTSET;
       }
@@ -7814,9 +7837,26 @@ if (reqchar >= 0 &&
   re->req_char = reqchar & 0xffff;
 #endif
 #endif
-  if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char)
-    && cd->fcc[re->req_char] != re->req_char)
-    re->flags |= PCRE_RCH_CASELESS;
+  if ((reqchar & REQ_CASELESS) != 0)
+    {
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+    /* We ignore non-ASCII required chars in 8 bit mode. */
+    if (utf)
+      {
+      if (re->first_char < 128)
+        {
+        if (cd->fcc[re->first_char] != re->first_char)
+          re->flags |= PCRE_RCH_CASELESS;
+        }
+      else if ((options & PCRE_UCP) != 0
+          && UCD_OTHERCASE(re->first_char) != re->first_char)
+        re->flags |= PCRE_RCH_CASELESS;
+      }
+    else
+#endif
+    if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
+      re->flags |= PCRE_RCH_CASELESS;
+    }
 
   re->flags |= PCRE_REQCHSET;
   }
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index d7b292d..1bc96c1 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -480,7 +480,7 @@ if (*first_op == OP_REVERSE)
       {
       if (current_subject <= start_subject) break;
       current_subject--;
-      INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--);
+      ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
       }
     }
   else
@@ -3199,7 +3199,13 @@ if (!anchored)
     has_first_char = TRUE;
     first_char = first_char2 = re->first_char;
     if ((re->flags & PCRE_FCH_CASELESS) != 0)
+      {
       first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+      if (first_char > 127 && utf && md->use_ucp)
+        first_char2 = UCD_OTHERCASE(first_char);
+#endif
+      }
     }
   else
     {
@@ -3217,7 +3223,13 @@ if ((re->flags & PCRE_REQCHSET) != 0)
   has_req_char = TRUE;
   req_char = req_char2 = re->req_char;
   if ((re->flags & PCRE_RCH_CASELESS) != 0)
+    {
     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+    if (req_char > 127 && utf && md->use_ucp)
+      req_char2 = UCD_OTHERCASE(req_char);
+#endif
+    }
   }
 
 /* Call the main matching function, looping for a non-anchored regex after a
@@ -3246,7 +3258,7 @@ for (;;)
         while (t < md->end_subject && !IS_NEWLINE(t))
           {
           t++;
-          INTERNALCHAR(t < end_subject, *t, t++);
+          ACROSSCHAR(t < end_subject, *t, t++);
           }
         }
       else
@@ -3290,7 +3302,7 @@ for (;;)
                    !WAS_NEWLINE(current_subject))
               {
               current_subject++;
-              INTERNALCHAR(current_subject < end_subject, *current_subject,
+              ACROSSCHAR(current_subject < end_subject, *current_subject,
                 current_subject++);
               }
             }
@@ -3318,12 +3330,17 @@ for (;;)
         while (current_subject < end_subject)
           {
           register unsigned int c = *current_subject;
+#ifndef COMPILE_PCRE8
+          if (c > 255) c = 255;
+#endif
           if ((start_bits[c/8] & (1 << (c&7))) == 0)
             {
             current_subject++;
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+            /* In non 8-bit mode, the iteration will stop for
+            characters > 255 at the beginning or not stop at all. */
             if (utf)
-              INTERNALCHAR(current_subject < end_subject, *current_subject,
+              ACROSSCHAR(current_subject < end_subject, *current_subject,
                 current_subject++);
 #endif
             }
@@ -3434,7 +3451,7 @@ for (;;)
 #ifdef SUPPORT_UTF
   if (utf)
     {
-    INTERNALCHAR(current_subject < end_subject, *current_subject,
+    ACROSSCHAR(current_subject < end_subject, *current_subject,
       current_subject++);
     }
 #endif
diff --git a/pcre_exec.c b/pcre_exec.c
index 6761598..bb1b60a 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -2069,7 +2069,7 @@ for (;;)
       be "non-word" characters. Remember the earliest consulted character for
       partial matching. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         /* Get status of previous character */
@@ -2190,7 +2190,7 @@ for (;;)
       }
     eptr++;
 #ifdef SUPPORT_UTF
-    if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+    if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
 #endif
     ecode++;
     break;
@@ -3066,7 +3066,7 @@ for (;;)
     /* Match a single character, caselessly */
 
     case OP_CHARI:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       length = 1;
@@ -4089,7 +4089,7 @@ for (;;)
             }
           if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
           eptr++;
-          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+          ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;
 
@@ -4102,7 +4102,7 @@ for (;;)
             MRRETURN(MATCH_NOMATCH);
             }
           eptr++;
-          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+          ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;
 
@@ -4301,7 +4301,7 @@ for (;;)
           if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
             MRRETURN(MATCH_NOMATCH);
           eptr++;
-          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+          ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;
 
@@ -4330,7 +4330,7 @@ for (;;)
           if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
             MRRETURN(MATCH_NOMATCH);
           eptr++;
-          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+          ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;
 
@@ -5330,7 +5330,7 @@ for (;;)
                 }
               if (IS_NEWLINE(eptr)) break;
               eptr++;
-              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+              ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
 
@@ -5347,7 +5347,7 @@ for (;;)
                 }
               if (IS_NEWLINE(eptr)) break;
               eptr++;
-              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+              ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
           break;
@@ -5363,7 +5363,7 @@ for (;;)
                 break;
                 }
               eptr++;
-              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+              ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
           else
@@ -6264,7 +6264,13 @@ if (!anchored)
     has_first_char = TRUE;
     first_char = first_char2 = re->first_char;
     if ((re->flags & PCRE_FCH_CASELESS) != 0)
+      {
       first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+      if (first_char > 127 && utf && md->use_ucp)
+        first_char2 = UCD_OTHERCASE(first_char);
+#endif
+      }
     }
   else
     if (!startline && study != NULL &&
@@ -6280,7 +6286,13 @@ if ((re->flags & PCRE_REQCHSET) != 0)
   has_req_char = TRUE;
   req_char = req_char2 = re->req_char;
   if ((re->flags & PCRE_RCH_CASELESS) != 0)
+    {
     req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+    if (req_char > 127 && utf && md->use_ucp)
+      req_char2 = UCD_OTHERCASE(req_char);
+#endif
+    }
   }
 
 
@@ -6309,7 +6321,7 @@ for(;;)
       while (t < md->end_subject && !IS_NEWLINE(t))
         {
         t++;
-        INTERNALCHAR(t < end_subject, *t, t++);
+        ACROSSCHAR(t < end_subject, *t, t++);
         }
       }
     else
@@ -6351,7 +6363,7 @@ for(;;)
           while (start_match < end_subject && !WAS_NEWLINE(start_match))
             {
             start_match++;
-            INTERNALCHAR(start_match < end_subject, *start_match,
+            ACROSSCHAR(start_match < end_subject, *start_match,
               start_match++);
             }
           }
@@ -6378,17 +6390,18 @@ for(;;)
       {
       while (start_match < end_subject)
         {
-#ifdef COMPILE_PCRE
         register unsigned int c = *start_match;
-#else
-        register unsigned int c = *start_match & 0xff;
+#ifndef COMPILE_PCRE8
+        if (c > 255) c = 255;
 #endif
         if ((start_bits[c/8] & (1 << (c&7))) == 0)
           {
           start_match++;
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+          /* In non 8-bit mode, the iteration will stop for
+          characters > 255 at the beginning or not stop at all. */
           if (utf)
-            INTERNALCHAR(start_match < end_subject, *start_match,
+            ACROSSCHAR(start_match < end_subject, *start_match,
               start_match++);
 #endif
           }
@@ -6520,7 +6533,7 @@ for(;;)
     new_start_match = start_match + 1;
 #ifdef SUPPORT_UTF
     if (utf)
-      INTERNALCHAR(new_start_match < end_subject, *new_start_match,
+      ACROSSCHAR(new_start_match < end_subject, *new_start_match,
         new_start_match++);
 #endif
     break;
diff --git a/pcre_fullinfo.c b/pcre_fullinfo.c
index 6c89121..2bdf24b 100644
--- a/pcre_fullinfo.c
+++ b/pcre_fullinfo.c
@@ -65,9 +65,15 @@ Arguments:
 Returns:           0 if data returned, negative on error
 */
 
+#ifdef COMPILE_PCRE8
 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
   void *where)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
+  void *where)
+#endif
 {
 real_pcre internal_re;
 pcre_study_data internal_study;
diff --git a/pcre_info.c b/pcre_info.c
index 9211df4..e7b3730 100644
--- a/pcre_info.c
+++ b/pcre_info.c
@@ -72,8 +72,13 @@ Returns:        number of capturing subpatterns
                 or negative values on error
 */
 
+#ifdef COMPILE_PCRE8
 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
+pcre_info(const pcre *argument_re, int *optptr, int *first_char)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_info(const pcre *argument_re, int *optptr, int *first_char)
+#endif
 {
 real_pcre internal_re;
 const real_pcre *re = (const real_pcre *)argument_re;
@@ -84,8 +89,8 @@ if (re->magic_number != MAGIC_NUMBER)
   if (re == NULL) return PCRE_ERROR_BADMAGIC;
   }
 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
-if (first_byte != NULL)
-  *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
+if (first_char != NULL)
+  *first_char = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
      ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
 return re->top_bracket;
 }
diff --git a/pcre_internal.h b/pcre_internal.h
index 7642b91..4046e41 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -542,7 +542,7 @@ UTF-8 support is omitted, we don't even define them. */
 /* #define GETCHARLENTEST(c, eptr, len) */
 /* #define BACKCHAR(eptr) */
 /* #define FORWARDCHAR(eptr) */
-/* #define INTERNALCHAR(condition, eptr, action) */
+/* #define ACROSSCHAR(condition, eptr, action) */
 
 #else   /* SUPPORT_UTF */
 
@@ -708,7 +708,7 @@ because almost all calls are already within a block of UTF-8 only code. */
 #define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++
 
 /* Same as above, but it allows a fully customizable form. */
-#define INTERNALCHAR(condition, eptr, action) \
+#define ACROSSCHAR(condition, eptr, action) \
   while((condition) && ((eptr) & 0xc0) == 0x80) action
 
 #else /* COMPILE_PCRE8 */
@@ -748,7 +748,7 @@ pointer. */
 the pointer. */
 
 #define GETUTF16INC(c, eptr) \
-   { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; eptr++; }
+   { c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; }
 
 /* Get the next UTF-16 character, advancing the pointer. This is called when we
 know we are in UTF-16 mode. */
@@ -797,7 +797,7 @@ code. */
 #define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++
 
 /* Same as above, but it allows a fully customizable form. */
-#define INTERNALCHAR(condition, eptr, action) \
+#define ACROSSCHAR(condition, eptr, action) \
   if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action
 
 #endif
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 03c7b2c..df158be 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -300,7 +300,7 @@ typedef struct compiler_common {
 #ifdef SUPPORT_UTF8
   BOOL utf;
 #ifdef SUPPORT_UCP
-  BOOL useucp;
+  BOOL use_ucp;
 #endif
   jump_list *utfreadchar;
 #ifdef COMPILE_PCRE8
@@ -390,10 +390,12 @@ the start pointers when the end of the capturing group has not yet reached. */
 #define PRIV_DATA(cc)    (common->localptrs[(cc) - common->start])
 
 #ifdef COMPILE_PCRE8
-#define MOV_UCHAR SLJIT_MOV_UB
+#define MOV_UCHAR  SLJIT_MOV_UB
+#define MOVU_UCHAR SLJIT_MOVU_UB
 #else
 #ifdef COMPILE_PCRE16
-#define MOV_UCHAR SLJIT_MOV_UH
+#define MOV_UCHAR  SLJIT_MOV_UH
+#define MOVU_UCHAR SLJIT_MOVU_UH
 #else
 #error Unsupported compiling mode
 #endif
@@ -1369,10 +1371,10 @@ if (common->utf && c > 65535)
   if (bit >= (1 << 10))
     bit >>= 10;
   else
-    return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
+    return (bit < 256) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
   }
 #endif /* SUPPORT_UTF16 */
-return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
+return (bit < 256) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
 #endif /* COMPILE_PCRE16 */
 
 #endif /* COMPILE_PCRE8 */
@@ -1420,7 +1422,7 @@ DEFINE_COMPILER;
 struct sljit_jump *jump;
 #endif
 
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
 #ifdef SUPPORT_UTF
 if (common->utf)
   {
@@ -1461,7 +1463,7 @@ if (common->utf)
 #else
 #ifdef COMPILE_PCRE16
   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
-  jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+  jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
   JUMPHERE(jump);
   /* Skip low surrogate if necessary. */
@@ -1478,9 +1480,9 @@ if (common->utf)
 OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 #ifdef COMPILE_PCRE16
-/* The ctypes array contains only 255 values. */
+/* The ctypes array contains only 256 values. */
 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
-jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
 #endif
 OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
 #ifdef COMPILE_PCRE16
@@ -1542,7 +1544,7 @@ else if (nltype == NLTYPE_ANYCRLF)
   }
 else
   {
-  SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline <= 255);
+  SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256);
   add_jump(compiler, fallbacks, CMP(jumpiftrue ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
   }
 }
@@ -1660,7 +1662,7 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 
 JUMPHERE(jump);
 /* Combine two 16 bit characters. */
-OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff);
 OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10);
@@ -1818,7 +1820,7 @@ if (newlinecheck)
 return mainloop;
 }
 
-static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar firstchar, BOOL caseless, BOOL firstline)
+static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar first_char, BOOL caseless, BOOL firstline)
 {
 DEFINE_COMPILER;
 struct sljit_label *start;
@@ -1836,22 +1838,28 @@ start = LABEL();
 leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
 
-oc = firstchar;
+oc = first_char;
 if (caseless)
-  oc = TABLE_GET(firstchar, common->fcc, firstchar);
-if (firstchar == oc)
-  found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, firstchar);
+  {
+  oc = TABLE_GET(first_char, common->fcc, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+  if (first_char > 127 && common->utf && common->use_ucp)
+    oc = UCD_OTHERCASE(first_char);
+#endif
+  }
+if (first_char == oc)
+  found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, first_char);
 else
   {
-  bit = firstchar ^ oc;
+  bit = first_char ^ oc;
   if (ispowerof2(bit))
     {
     OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, bit);
-    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, firstchar | bit);
+    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, first_char | bit);
     }
   else
     {
-    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, firstchar);
+    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, first_char);
     COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
     OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc);
     COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
@@ -1912,16 +1920,19 @@ if (common->nltype == NLTYPE_FIXED && common->newline > 255)
   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
   firstchar = CMP(SLJIT_C_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
 
-  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
+  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(2));
   OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0);
   COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_GREATER_EQUAL);
+#ifdef COMPILE_PCRE16
+  OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+#endif
   OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
 
   loop = LABEL();
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-  OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2);
-  OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1);
+  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+  OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
   CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, loop);
   CMPTO(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, loop);
 
@@ -1952,9 +1963,12 @@ if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
   leave = JUMP(SLJIT_JUMP);
   JUMPHERE(foundcr);
   notfoundnl = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-  OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
   OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL);
   COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+#ifdef COMPILE_PCRE16
+  OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+#endif
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
   JUMPHERE(notfoundnl);
   JUMPHERE(leave);
@@ -1972,6 +1986,9 @@ DEFINE_COMPILER;
 struct sljit_label *start;
 struct sljit_jump *leave;
 struct sljit_jump *found;
+#ifndef COMPILE_PCRE8
+struct sljit_jump *jump;
+#endif
 
 if (firstline)
   {
@@ -1987,7 +2004,9 @@ if (common->utf)
   OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
 #endif
 #ifndef COMPILE_PCRE8
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xff);
+jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 255);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 255);
+JUMPHERE(jump);
 #endif
 OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -2028,7 +2047,7 @@ if (firstline)
   OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE0);
 }
 
-static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar reqchar, BOOL caseless, BOOL has_firstchar)
+static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar req_char, BOOL caseless, BOOL has_firstchar)
 {
 DEFINE_COMPILER;
 struct sljit_label *loop;
@@ -2045,34 +2064,40 @@ toolong = CMP(SLJIT_C_LESS, TMP1, 0, STR_END, 0);
 alreadyfound = CMP(SLJIT_C_LESS, STR_PTR, 0, TMP2, 0);
 
 if (has_firstchar)
-  OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 else
   OP1(SLJIT_MOV, TMP1, 0, STR_PTR, 0);
 
 loop = LABEL();
 notfound = CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, STR_END, 0);
 
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), 0);
-oc = reqchar;
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(TMP1), 0);
+oc = req_char;
 if (caseless)
-  oc = TABLE_GET(reqchar, common->fcc, reqchar);
-if (reqchar == oc)
-  found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar);
+  {
+  oc = TABLE_GET(req_char, common->fcc, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+  if (req_char > 127 && common->utf && common->use_ucp)
+    oc = UCD_OTHERCASE(req_char);
+#endif
+  }
+if (req_char == oc)
+  found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
 else
   {
-  bit = reqchar ^ oc;
+  bit = req_char ^ oc;
   if (ispowerof2(bit))
     {
     OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, bit);
-    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar | bit);
+    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char | bit);
     }
   else
     {
-    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar);
+    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
     foundoc = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, oc);
     }
   }
-OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
 JUMPTO(SLJIT_JUMP, loop);
 
 JUMPHERE(found);
@@ -2126,7 +2151,7 @@ static void check_wordboundary(compiler_common *common)
 {
 DEFINE_COMPILER;
 struct sljit_jump *beginend;
-#ifdef SUPPORT_UTF8
+#if !(defined COMPILE_PCRE8) || defined SUPPORT_UTF
 struct sljit_jump *jump;
 #endif
 
@@ -2143,7 +2168,7 @@ read_char(common);
 
 /* Testing char type. */
 #ifdef SUPPORT_UCP
-if (common->useucp)
+if (common->use_ucp)
   {
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
   jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
@@ -2160,20 +2185,24 @@ if (common->useucp)
 else
 #endif
   {
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+  jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF
   /* Here LOCALS1 has already been zeroed. */
   jump = NULL;
   if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
-#endif
+#endif /* COMPILE_PCRE8 */
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
   OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */);
   OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
   OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, TMP1, 0);
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+  JUMPHERE(jump);
+#elif defined SUPPORT_UTF
   if (jump != NULL)
     JUMPHERE(jump);
-#endif
+#endif /* COMPILE_PCRE8 */
   }
 JUMPHERE(beginend);
 
@@ -2183,7 +2212,7 @@ peek_char(common);
 
 /* Testing char type. This is a code duplication. */
 #ifdef SUPPORT_UCP
-if (common->useucp)
+if (common->use_ucp)
   {
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
   jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
@@ -2199,7 +2228,11 @@ if (common->useucp)
 else
 #endif
   {
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+  /* TMP2 may be destroyed by peek_char. */
+  OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
+  jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
   jump = NULL;
   if (common->utf)
@@ -2208,10 +2241,12 @@ else
   OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes);
   OP2(SLJIT_LSHR, TMP2, 0, TMP2, 0, SLJIT_IMM, 4 /* ctype_word */);
   OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+  JUMPHERE(jump);
+#elif defined SUPPORT_UTF
   if (jump != NULL)
     JUMPHERE(jump);
-#endif
+#endif /* COMPILE_PCRE8 */
   }
 JUMPHERE(beginend);
 
@@ -2314,18 +2349,18 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
 OP1(SLJIT_MOV, TMP3, 0, CHAR1, 0);
 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR2, 0);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 
 label = LABEL();
-OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1);
-OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
 jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
 JUMPTO(SLJIT_C_NOT_ZERO, label);
 
 JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 OP1(SLJIT_MOV, CHAR1, 0, TMP3, 0);
 OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -2346,20 +2381,30 @@ OP1(SLJIT_MOV, TMP3, 0, LCC_TABLE, 0);
 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR1, 0);
 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, CHAR2, 0);
 OP1(SLJIT_MOV, LCC_TABLE, 0, SLJIT_IMM, common->lcc);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 
 label = LABEL();
-OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1);
-OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+#ifndef COMPILE_PCRE8
+jump = CMP(SLJIT_C_GREATER, CHAR1, 0, SLJIT_IMM, 255);
+#endif
 OP1(SLJIT_MOV_UB, CHAR1, 0, SLJIT_MEM2(LCC_TABLE, CHAR1), 0);
+#ifndef COMPILE_PCRE8
+JUMPHERE(jump);
+jump = CMP(SLJIT_C_GREATER, CHAR2, 0, SLJIT_IMM, 255);
+#endif
 OP1(SLJIT_MOV_UB, CHAR2, 0, SLJIT_MEM2(LCC_TABLE, CHAR2), 0);
+#ifndef COMPILE_PCRE8
+JUMPHERE(jump);
+#endif
 jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
 JUMPTO(SLJIT_C_NOT_ZERO, label);
 
 JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 OP1(SLJIT_MOV, LCC_TABLE, 0, TMP3, 0);
 OP1(SLJIT_MOV, CHAR1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
 OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1);
@@ -2378,7 +2423,7 @@ static const pcre_uchar *SLJIT_CALL do_utf_caselesscmp(pcre_uchar *src1, jit_arg
 /* This function would be ineffective to do in JIT level. */
 int c1, c2;
 const pcre_uchar *src2 = args->ptr;
-const pcre_uchar *end2 = (pcre_uchar *)args->end;
+const pcre_uchar *end2 = args->end;
 
 while (src1 < end1)
   {
@@ -2976,7 +3021,7 @@ switch(type)
     {
     jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
     jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
     add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff));
     JUMPHERE(jump[1]);
     JUMPHERE(jump[0]);
@@ -3037,9 +3082,9 @@ switch(type)
   read_char(common);
   jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
   jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-  OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
   jump[2] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL);
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   jump[3] = JUMP(SLJIT_JUMP);
   JUMPHERE(jump[0]);
   check_newlinechar(common, common->bsr_nltype, fallbacks, FALSE);
@@ -3089,36 +3134,37 @@ switch(type)
   jump[0] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
   if (common->nltype == NLTYPE_FIXED && common->newline > 255)
     {
-    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0));
-    OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
     }
   else if (common->nltype == NLTYPE_FIXED)
     {
-    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1);
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
     }
   else
     {
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     jump[1] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
-    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
     OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP2, 0, STR_END, 0);
     jump[2] = JUMP(SLJIT_C_GREATER);
     add_jump(compiler, fallbacks, JUMP(SLJIT_C_LESS));
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 1);
+    /* Equal. */
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
     jump[3] = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL);
     add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP));
 
     JUMPHERE(jump[1]);
     if (common->nltype == NLTYPE_ANYCRLF)
       {
-      OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1);
+      OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
       add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, STR_END, 0));
       add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
       }
@@ -3158,15 +3204,13 @@ switch(type)
   jump[0] = JUMP(SLJIT_JUMP);
   JUMPHERE(jump[1]);
 
-  OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, end));
-  add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, STR_PTR, 0));
-
+  add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, STR_PTR, 0, STR_END, 0));
   if (common->nltype == NLTYPE_FIXED && common->newline > 255)
     {
-    OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+    OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, TMP1, 0));
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2);
-    OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
     }
@@ -3200,10 +3244,10 @@ switch(type)
 
   if (common->nltype == NLTYPE_FIXED && common->newline > 255)
     {
-    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_GREATER, TMP2, 0, STR_END, 0));
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-    OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
     }
@@ -6382,7 +6426,7 @@ common->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
 common->utf = (re->options & PCRE_UTF8) != 0;
 #ifdef SUPPORT_UCP
-common->useucp = (re->options & PCRE_UCP) != 0;
+common->use_ucp = (re->options & PCRE_UCP) != 0;
 #endif
 common->utfreadchar = NULL;
 #ifdef COMPILE_PCRE8
diff --git a/pcre_newline.c b/pcre_newline.c
index 0c2ddcd..d618b80 100644
--- a/pcre_newline.c
+++ b/pcre_newline.c
@@ -77,7 +77,15 @@ PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr,
   BOOL utf)
 {
 int c;
-if (utf) { GETCHAR(c, ptr); } else c = *ptr;
+(void)utf;
+#ifdef SUPPORT_UTF
+if (utf)
+  {
+  GETCHAR(c, ptr);
+  }
+else
+#endif  /* SUPPORT_UTF */
+  c = *ptr;
 
 if (type == NLTYPE_ANYCRLF) switch(c)
   {
@@ -96,9 +104,15 @@ else switch(c)
   case 0x000c: *lenptr = 1; return TRUE;             /* FF */
   case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
                return TRUE;                          /* CR */
+#ifdef COMPILE_PCRE8
   case 0x0085: *lenptr = utf? 2 : 1; return TRUE;    /* NEL */
   case 0x2028:                                       /* LS */
   case 0x2029: *lenptr = 3; return TRUE;             /* PS */
+#else
+  case 0x0085:                                       /* NEL */
+  case 0x2028:                                       /* LS */
+  case 0x2029: *lenptr = 1; return TRUE;             /* PS */
+#endif /* COMPILE_PCRE8 */
   default: return FALSE;
   }
 }
@@ -127,17 +141,17 @@ PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr,
   BOOL utf)
 {
 int c;
+(void)utf;
 ptr--;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (utf)
   {
   BACKCHAR(ptr);
   GETCHAR(c, ptr);
   }
-else c = *ptr;
-#else   /* no UTF-8 support */
-c = *ptr;
+else
 #endif  /* SUPPORT_UTF8 */
+  c = *ptr;
 
 if (type == NLTYPE_ANYCRLF) switch(c)
   {
@@ -154,9 +168,15 @@ else switch(c)
   case 0x000b:                                      /* VT */
   case 0x000c:                                      /* FF */
   case 0x000d: *lenptr = 1; return TRUE;            /* CR */
+#ifdef COMPILE_PCRE8
   case 0x0085: *lenptr = utf? 2 : 1; return TRUE;   /* NEL */
   case 0x2028:                                      /* LS */
   case 0x2029: *lenptr = 3; return TRUE;            /* PS */
+#else
+  case 0x0085:                                       /* NEL */
+  case 0x2028:                                       /* LS */
+  case 0x2029: *lenptr = 1; return TRUE;             /* PS */
+#endif /* COMPILE_PCRE8 */
   default: return FALSE;
   }
 }
diff --git a/pcre_printint.src b/pcre_printint.src
index 2922e54..d30619e 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -123,7 +123,9 @@ else
 
 if (!utf || (c & 0xfc00) != 0xd800)
   {
-  if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+  if (PRINTABLE(c)) fprintf(f, "%c", c);
+  else if (c <= 0xff) fprintf(f, "\\x%02x", c);
+  else fprintf(f, "\\x{%x}", c);
   return 0;
   }
 else
diff --git a/pcre_study.c b/pcre_study.c
index 1e10397..3f25c3a 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -224,7 +224,7 @@ for (;;)
     case OP_NOTPOSPLUSI:
     branchlength++;
     cc += 2;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
@@ -245,7 +245,7 @@ for (;;)
     case OP_NOTEXACTI:
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
@@ -293,7 +293,7 @@ for (;;)
     appear, but leave the code, just in case.) */
 
     case OP_ANYBYTE:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf) return -1;
 #endif
     branchlength++;
@@ -486,7 +486,7 @@ for (;;)
     case OP_NOTPOSQUERYI:
 
     cc += PRIV(OP_lengths)[op];
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
@@ -549,9 +549,10 @@ set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
 {
 unsigned int c = *p;
 
+#ifdef COMPILE_PCRE8
 SET_BIT(c);
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (utf && c > 127)
   {
   GETCHARINC(c, p);
@@ -572,6 +573,33 @@ if (utf && c > 127)
 
 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 return p + 1;
+#endif
+
+#ifdef COMPILE_PCRE16
+if (c > 0xff)
+  c = 0xff;
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF
+if (utf && c > 127)
+  {
+  GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+  if (caseless)
+    {
+    c = UCD_OTHERCASE(c);
+    if (c > 0xff)
+      c = 0xff;
+    SET_BIT(c);
+    }
+#endif
+  return p;
+  }
+#endif
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+#endif
 }
 
 
@@ -602,7 +630,7 @@ set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
 {
 register int c;
 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
 if (table_limit == 32) return;
 for (c = 128; c < 256; c++)
   {
@@ -644,7 +672,9 @@ set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
 {
 register int c;
 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
+#endif
 }
 
 
@@ -679,7 +709,11 @@ set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
 {
 register int c;
 int yield = SSB_DONE;
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
 int table_limit = utf? 16:32;
+#else
+int table_limit = 32;
+#endif
 
 #if 0
 /* ========================================================================= */
@@ -951,14 +985,23 @@ do
       case OP_HSPACE:
       SET_BIT(0x09);
       SET_BIT(0x20);
+#ifdef SUPPORT_UTF
       if (utf)
         {
+#ifdef COMPILE_PCRE8
         SET_BIT(0xC2);  /* For U+00A0 */
         SET_BIT(0xE1);  /* For U+1680, U+180E */
         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
         SET_BIT(0xE3);  /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+        SET_BIT(0xA0);
+        SET_BIT(0xFF);  /* For characters > 255 */
+#endif
         }
-      else SET_BIT(0xA0);
+      else
+#endif /* SUPPORT_UTF */
+        SET_BIT(0xA0);
       try_next = FALSE;
       break;
 
@@ -968,12 +1011,21 @@ do
       SET_BIT(0x0B);
       SET_BIT(0x0C);
       SET_BIT(0x0D);
+#ifdef SUPPORT_UTF
       if (utf)
         {
+#ifdef COMPILE_PCRE8
         SET_BIT(0xC2);  /* For U+0085 */
         SET_BIT(0xE2);  /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+        SET_BIT(0x85);
+        SET_BIT(0xFF);  /* For characters > 255 */
+#endif
         }
-      else SET_BIT(0x85);
+      else
+#endif /* SUPPORT_UTF */
+        SET_BIT(0x85);
       try_next = FALSE;
       break;
 
@@ -1058,14 +1110,23 @@ do
         case OP_HSPACE:
         SET_BIT(0x09);
         SET_BIT(0x20);
+#ifdef SUPPORT_UTF
         if (utf)
           {
+#ifdef COMPILE_PCRE8
           SET_BIT(0xC2);  /* For U+00A0 */
           SET_BIT(0xE1);  /* For U+1680, U+180E */
           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
           SET_BIT(0xE3);  /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+          SET_BIT(0xA0);
+          SET_BIT(0xFF);  /* For characters > 255 */
+#endif
           }
-        else SET_BIT(0xA0);
+        else
+#endif /* SUPPORT_UTF */
+          SET_BIT(0xA0);
         break;
 
         case OP_ANYNL:
@@ -1074,12 +1135,21 @@ do
         SET_BIT(0x0B);
         SET_BIT(0x0C);
         SET_BIT(0x0D);
+#ifdef SUPPORT_UTF
         if (utf)
           {
+#ifdef COMPILE_PCRE8
           SET_BIT(0xC2);  /* For U+0085 */
           SET_BIT(0xE2);  /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+          SET_BIT(0x85);
+          SET_BIT(0xFF);  /* For characters > 255 */
+#endif
           }
-        else SET_BIT(0x85);
+        else
+#endif /* SUPPORT_UTF */
+          SET_BIT(0x85);
         break;
 
         case OP_NOT_DIGIT:
@@ -1126,13 +1196,16 @@ do
       character with a value > 255. */
 
       case OP_NCLASS:
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
       if (utf)
         {
         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
         }
 #endif
+#ifdef COMPILE_PCRE16
+      SET_BIT(0xFF);                         /* For characters > 255 */
+#endif
       /* Fall through */
 
       case OP_CLASS:
@@ -1147,7 +1220,7 @@ do
         value is > 127. In fact, there are only two possible starting bytes for
         characters in the range 128 - 255. */
 
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
         if (utf)
           {
           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
@@ -1161,12 +1234,10 @@ do
               }
             }
           }
-
-        /* In non-UTF-8 mode, the two bit maps are completely compatible. */
-
         else
 #endif
           {
+          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
           }
 
@@ -1342,6 +1413,18 @@ if (bits_set || min > 0
     memcpy(study->start_bits, start_bits, sizeof(start_bits));
     }
 
+#ifdef PCRE_DEBUG
+  if (bits_set)
+    {
+    pcre_uint8 *ptr = (pcre_uint8 *)start_bits;  /* Dump the map byte by byte. */
+    int i;
+
+    printf("Start bits:\n");
+    for (i = 0; i < 32; i++)
+      printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
+    }
+#endif
+
   /* Always set the minlength value in the block, because the JIT compiler
   makes use of it. However, don't set the bit unless the length is greater than
   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
diff --git a/pcre_version.c b/pcre_version.c
index 7067cd4..2269d4f 100644
--- a/pcre_version.c
+++ b/pcre_version.c
@@ -79,8 +79,13 @@ I could find no way of detecting that a macro is defined as an empty string at
 pre-processor time. This hack uses a standard trick for avoiding calling
 the STRING macro with an empty argument when doing the test. */
 
+#ifdef COMPILE_PCRE8
 PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
 pcre_version(void)
+#else
+PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
+pcre16_version(void)
+#endif
 {
 return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
   XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :
author	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-05 20:12:24 +0000
committer	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-12-05 20:12:24 +0000
commit	a9839b968cee5828bf35dbcb05a31859a49ab7a2 (patch)
tree	836125e6c0ea7958e295ccda9f7d060b05102430
parent	216818740b54b629e7bd59cd49f783c72e244e23 (diff)
download	pcre-a9839b968cee5828bf35dbcb05a31859a49ab7a2.tar.gz