Make character ranges 16 bit friendly

git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@770 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-11-28 20:39:30 +0000
committer: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-11-28 20:39:30 +0000
commit: 00cc776fe74e502bc0774ceca2bb3f11283e189a (patch)
tree: 1f19206b0bbf8f9f4d2af9a8a34d0198c8a70808
parent: 4d715f1b6035e095635067d977ad56948ff4e4c2 (diff)
download: pcre-00cc776fe74e502bc0774ceca2bb3f11283e189a.tar.gz
10 files changed, 319 insertions, 174 deletions
diff --git a/Makefile.am b/Makefile.am
index 440d699..7d5de86 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -219,7 +219,8 @@ libpcre16_la_SOURCES = \
   pcre16_tables.c \
   pcre16_try_flipped.c \
   pcre16_utf16_utils.c \
-  pcre16_valid_utf16.c
+  pcre16_valid_utf16.c \
+  pcre16_xclass.c
 
 ## This file is generated as part of the building process, so don't distribute.
 nodist_libpcre16_la_SOURCES = \
diff --git a/pcre16_xclass.c b/pcre16_xclass.c
new file mode 100644
index 0000000..acb5631
--- /dev/null
+++ b/pcre16_xclass.c
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_xclass.c"
+
+/* End of pcre16_xclass.c */
diff --git a/pcre_compile.c b/pcre_compile.c
index 1664506..46d881d 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1764,15 +1764,15 @@ for (;;)
 
     /* Check a class for variable quantification */
 
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
     case OP_XCLASS:
-    cc += GET(cc, 1) - 33;
+    cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
     /* Fall through */
 #endif
 
     case OP_CLASS:
     case OP_NCLASS:
-    cc += 33;
+    cc += PRIV(OP_lengths)[OP_CLASS];
 
     switch (*cc)
       {
@@ -2310,7 +2310,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
 
     case OP_CLASS:
     case OP_NCLASS:
-    ccode = code + 33;
+    ccode = code + PRIV(OP_lengths)[OP_CLASS];
 
 #ifdef SUPPORT_UTF8
     CHECK_CLASS_REPEAT:
@@ -3299,22 +3299,27 @@ const pcre_uchar *nestptr = NULL;
 pcre_uchar *previous = NULL;
 pcre_uchar *previous_callout = NULL;
 pcre_uchar *save_hwm = NULL;
-pcre_uchar classbits[32];
+pcre_uint8 classbits[32];
 
 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
 dynamically as we process the pattern. */
 
 #ifdef SUPPORT_UTF8
-BOOL class_utf8;
 BOOL utf8 = (options & PCRE_UTF8) != 0;
-pcre_uint8 *class_utf8data;
-pcre_uint8 *class_utf8data_base;
 pcre_uint8 utf8_char[6];
 #else
 BOOL utf8 = FALSE;
 #endif
 
+/* Helper variables for OP_XCLASS opcode (for characters > 255). */
+
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+BOOL xclass;
+pcre_uchar *class_uchardata;
+pcre_uchar *class_uchardata_base;
+#endif
+
 #ifdef PCRE_DEBUG
 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
 #endif
@@ -3620,8 +3625,7 @@ for (;; ptr++)
         {
         if (ptr[1] == CHAR_E)
           ptr++;
-        else if (STRNCMP_UC_C8(ptr + 1,
-                          STR_Q STR_BACKSLASH STR_E, 3) == 0)
+        else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
           ptr += 3;
         else
           break;
@@ -3665,10 +3669,10 @@ for (;; ptr++)
 
     memset(classbits, 0, 32 * sizeof(pcre_uint8));
 
-#ifdef SUPPORT_UTF8
-    class_utf8 = FALSE;                       /* No chars >= 256 */
-    class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
-    class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+    xclass = FALSE;                           /* No chars >= 256 */
+    class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
+    class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
 #endif
 
     /* Process characters until ] is reached. By writing this as a "do" it
@@ -3684,18 +3688,19 @@ for (;; ptr++)
         {                           /* Braces are required because the */
         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
         }
+#endif
 
-      /* In the pre-compile phase, accumulate the length of any UTF-8 extra
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+      /* In the pre-compile phase, accumulate the length of any extra
       data and reset the pointer. This is so that very large classes that
-      contain a zillion UTF-8 characters no longer overwrite the work space
+      contain a zillion characters > 255 no longer overwrite the work space
       (which is on the stack). */
 
       if (lengthptr != NULL)
         {
-        *lengthptr += class_utf8data - class_utf8data_base;
-        class_utf8data = class_utf8data_base;
+        *lengthptr += class_uchardata - class_uchardata_base;
+        class_uchardata = class_uchardata_base;
         }
-
 #endif
 
       /* Inside \Q...\E everything is literal except \E */
@@ -3896,23 +3901,23 @@ for (;; ptr++)
             SETBIT(classbits, 0x09); /* VT */
             SETBIT(classbits, 0x20); /* SPACE */
             SETBIT(classbits, 0xa0); /* NSBP */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
             if (utf8)
               {
-              class_utf8 = TRUE;
-              *class_utf8data++ = XCL_SINGLE;
-              class_utf8data += PRIV(ord2utf8)(0x1680, class_utf8data);
-              *class_utf8data++ = XCL_SINGLE;
-              class_utf8data += PRIV(ord2utf8)(0x180e, class_utf8data);
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x2000, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x200A, class_utf8data);
-              *class_utf8data++ = XCL_SINGLE;
-              class_utf8data += PRIV(ord2utf8)(0x202f, class_utf8data);
-              *class_utf8data++ = XCL_SINGLE;
-              class_utf8data += PRIV(ord2utf8)(0x205f, class_utf8data);
-              *class_utf8data++ = XCL_SINGLE;
-              class_utf8data += PRIV(ord2utf8)(0x3000, class_utf8data);
+              xclass = TRUE;
+              *class_uchardata++ = XCL_SINGLE;
+              class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata);
+              *class_uchardata++ = XCL_SINGLE;
+              class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata);
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata);
+              *class_uchardata++ = XCL_SINGLE;
+              class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata);
+              *class_uchardata++ = XCL_SINGLE;
+              class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata);
+              *class_uchardata++ = XCL_SINGLE;
+              class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata);
               }
 #endif
             continue;
@@ -3931,31 +3936,31 @@ for (;; ptr++)
               classbits[c] |= x;
               }
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
             if (utf8)
               {
-              class_utf8 = TRUE;
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x0100, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x167f, class_utf8data);
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x1681, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x180d, class_utf8data);
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x180f, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x1fff, class_utf8data);
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x200B, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x202e, class_utf8data);
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x2030, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x205e, class_utf8data);
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x2060, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x2fff, class_utf8data);
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x3001, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x7fffffff, class_utf8data);
+              xclass = TRUE;
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x167f, class_uchardata);
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata);
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata);
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata);
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata);
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata);
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
               }
 #endif
             continue;
@@ -3966,13 +3971,13 @@ for (;; ptr++)
             SETBIT(classbits, 0x0c); /* FF */
             SETBIT(classbits, 0x0d); /* CR */
             SETBIT(classbits, 0x85); /* NEL */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
             if (utf8)
               {
-              class_utf8 = TRUE;
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x2028, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x2029, class_utf8data);
+              xclass = TRUE;
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
               }
 #endif
             continue;
@@ -3994,16 +3999,16 @@ for (;; ptr++)
               classbits[c] |= x;
               }
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
             if (utf8)
               {
-              class_utf8 = TRUE;
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x0100, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x2027, class_utf8data);
-              *class_utf8data++ = XCL_RANGE;
-              class_utf8data += PRIV(ord2utf8)(0x2029, class_utf8data);
-              class_utf8data += PRIV(ord2utf8)(0x7fffffff, class_utf8data);
+              xclass = TRUE;
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata);
+              *class_uchardata++ = XCL_RANGE;
+              class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
+              class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
               }
 #endif
             continue;
@@ -4016,11 +4021,11 @@ for (;; ptr++)
               int pdata;
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
               if (ptype < 0) goto FAILED;
-              class_utf8 = TRUE;
-              *class_utf8data++ = ((-c == ESC_p) != negated)?
+              xclass = TRUE;
+              *class_uchardata++ = ((-c == ESC_p) != negated)?
                 XCL_PROP : XCL_NOTPROP;
-              *class_utf8data++ = ptype;
-              *class_utf8data++ = pdata;
+              *class_uchardata++ = ptype;
+              *class_uchardata++ = pdata;
               class_charcount -= 2;   /* Not a < 256 character */
               continue;
               }
@@ -4042,7 +4047,7 @@ for (;; ptr++)
           }
 
         /* Fall through if we have a single character (c >= 0). This may be
-        greater than 256 in UTF-8 mode. */
+        greater than 255 in UTF or 16-bit mode. */
 
         }   /* End of backslash handling */
 
@@ -4140,10 +4145,15 @@ for (;; ptr++)
         matching for characters > 127 is available only if UCP support is
         available. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+#endif
+#ifndef COMPILE_PCRE8
+        if (d > 255)
+#endif
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
           {
-          class_utf8 = TRUE;
+          xclass = TRUE;
 
           /* With UCP support, we can find the other case equivalents of
           the relevant characters. There may be several ranges. Optimize how
@@ -4176,14 +4186,14 @@ for (;; ptr++)
 
               if (occ == ocd)
                 {
-                *class_utf8data++ = XCL_SINGLE;
+                *class_uchardata++ = XCL_SINGLE;
                 }
               else
                 {
-                *class_utf8data++ = XCL_RANGE;
-                class_utf8data += PRIV(ord2utf8)(occ, class_utf8data);
+                *class_uchardata++ = XCL_RANGE;
+                class_uchardata += PRIV(ord2utf8)(occ, class_uchardata);
                 }
-              class_utf8data += PRIV(ord2utf8)(ocd, class_utf8data);
+              class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata);
               }
             }
 #endif  /* SUPPORT_UCP */
@@ -4191,30 +4201,38 @@ for (;; ptr++)
           /* Now record the original range, possibly modified for UCP caseless
           overlapping ranges. */
 
-          *class_utf8data++ = XCL_RANGE;
-          class_utf8data += PRIV(ord2utf8)(c, class_utf8data);
-          class_utf8data += PRIV(ord2utf8)(d, class_utf8data);
+          *class_uchardata++ = XCL_RANGE;
+#ifdef SUPPORT_UTF
+          class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
+          class_uchardata += PRIV(ord2utf8)(d, class_uchardata);
+#else
+          *class_uchardata++ = c;
+          *class_uchardata++ = d;
+#endif
 
           /* With UCP support, we are done. Without UCP support, there is no
-          caseless matching for UTF-8 characters > 127; we can use the bit map
-          for the smaller ones. */
+          caseless matching for UTF characters > 127; we can use the bit map
+          for the smaller ones. As for 16 bit characters without UTF, we
+          can still use the bit map for the part of the range up to 255. */
 
 #ifdef SUPPORT_UCP
           continue;    /* With next character in the class */
 #else
+#ifdef SUPPORT_UTF
           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
-
           /* Adjust upper limit and fall through to set up the map */
-
           d = 127;
-
+#else
+          if (c > 255) continue;
+          /* Adjust upper limit and fall through to set up the map */
+          d = 255;
+#endif  /* SUPPORT_UTF */
 #endif  /* SUPPORT_UCP */
           }
-#endif  /* SUPPORT_UTF8 */
+#endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
 
-        /* We use the bit map for all cases when not in UTF-8 mode; else
-        ranges that lie entirely within 0-127 when there is UCP support; else
-        for partial ranges without UCP support. */
+        /* We use the bit map for 8 bit mode, or when the characters fall
+        partially or entirely within the [0-255] ([0-127] for UCP) range. */
 
         class_charcount += d - c + 1;
         class_lastchar = d;
@@ -4242,12 +4260,21 @@ for (;; ptr++)
 
       /* Handle a character that cannot go in the bit map */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+#endif
+#ifndef COMPILE_PCRE8
+      if (c > 255)
+#endif
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
         {
-        class_utf8 = TRUE;
-        *class_utf8data++ = XCL_SINGLE;
-        class_utf8data += PRIV(ord2utf8)(c, class_utf8data);
+        xclass = TRUE;
+        *class_uchardata++ = XCL_SINGLE;
+#ifdef SUPPORT_UTF
+        class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
+#else
+        *class_uchardata++ = c;
+#endif
 
 #ifdef SUPPORT_UCP
         if ((options & PCRE_CASELESS) != 0)
@@ -4255,8 +4282,8 @@ for (;; ptr++)
           unsigned int othercase;
           if ((othercase = UCD_OTHERCASE(c)) != c)
             {
-            *class_utf8data++ = XCL_SINGLE;
-            class_utf8data += PRIV(ord2utf8)(othercase, class_utf8data);
+            *class_uchardata++ = XCL_SINGLE;
+            class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata);
             }
           }
 #endif  /* SUPPORT_UCP */
@@ -4312,11 +4339,13 @@ for (;; ptr++)
     char if this item is first, whatever repeat count may follow. In the case
     of reqbyte, save the previous value for reinstating. */
 
-#ifdef SUPPORT_UTF8
-    if (class_charcount == 1 && !class_utf8 &&
+#ifdef SUPPORT_UTF
+    if (class_charcount == 1 && !xclass &&
       (!utf8 || !negate_class || class_lastchar < 128))
-#else
+#elif defined COMPILE_PCRE8
     if (class_charcount == 1)
+#else
+    if (class_charcount == 1 && !xclass)
 #endif
       {
       zeroreqbyte = reqbyte;
@@ -4364,13 +4393,18 @@ for (;; ptr++)
     be listed) there are no characters < 256, we can omit the bitmap in the
     actual compiled code. */
 
-#ifdef SUPPORT_UTF8
-    if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
+#ifdef SUPPORT_UTF
+    if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
+#endif
+#ifndef COMPILE_PCRE8
+    if (xclass && !should_flip_negation)
+#endif
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
       {
-      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
+      *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
       *code++ = OP_XCLASS;
       code += LINK_SIZE;
-      *code = negate_class? XCL_NOT : 0;
+      *code = negate_class? XCL_NOT:0;
 
       /* If the map is required, move up the extra data to make room for it;
       otherwise just move the code pointer to the end of the extra data. */
@@ -4378,11 +4412,12 @@ for (;; ptr++)
       if (class_charcount > 0)
         {
         *code++ |= XCL_MAP;
-        memmove(code + 32, code, class_utf8data - code);
+        memmove(code + (32 / sizeof(pcre_uchar)), code,
+          IN_UCHARS(class_uchardata - code));
         memcpy(code, classbits, 32);
-        code = class_utf8data + 32;
+        code = class_uchardata + (32 / sizeof(pcre_uchar));
         }
-      else code = class_utf8data;
+      else code = class_uchardata;
 
       /* Now fill in the complete length of the item */
 
@@ -4398,16 +4433,13 @@ for (;; ptr++)
     negating it if necessary. */
 
     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
-    if (negate_class)
-      {
-      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
-        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
-      }
-    else
+    if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       {
+      if (negate_class)
+        for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
       memcpy(code, classbits, 32);
       }
-    code += 32;
+    code += 32 / sizeof(pcre_uchar);
     break;
 
 
@@ -4761,7 +4793,7 @@ for (;; ptr++)
 
     else if (*previous == OP_CLASS ||
              *previous == OP_NCLASS ||
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8
              *previous == OP_XCLASS ||
 #endif
              *previous == OP_REF ||
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index ea5b00c..0793897 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -2426,7 +2426,7 @@ for (;;)
 
         if (codevalue != OP_XCLASS)
           {
-          ecode = code + 33;
+          ecode = code + 1 + (32 / sizeof(pcre_uchar));
           if (clen > 0)
             {
             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
diff --git a/pcre_exec.c b/pcre_exec.c
index 41a2482..e532513 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -2706,8 +2706,11 @@ for (;;)
     case OP_NCLASS:
     case OP_CLASS:
       {
+      /* The data variable is saved across frames, so the byte map needs to
+      be stored there. */
+#define BYTE_MAP ((pcre_uint8 *)data)
       data = ecode + 1;                /* Save for matching */
-      ecode += 33;                     /* Advance past the item */
+      ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
 
       switch (*ecode)
         {
@@ -2740,7 +2743,7 @@ for (;;)
 
       /* First, ensure the minimum number of matches are present. */
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       /* UTF-8 mode */
       if (utf8)
         {
@@ -2757,9 +2760,7 @@ for (;;)
             if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
             }
           else
-            {
-            if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
-            }
+            if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
           }
         }
       else
@@ -2774,7 +2775,14 @@ for (;;)
             MRRETURN(MATCH_NOMATCH);
             }
           c = *eptr++;
-          if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
+#ifndef COMPILE_PCRE8
+          if (c > 255)
+            {
+            if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
+            }
+          else
+#endif
+            if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
           }
         }
 
@@ -2788,7 +2796,7 @@ for (;;)
 
       if (minimize)
         {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         /* UTF-8 mode */
         if (utf8)
           {
@@ -2808,9 +2816,7 @@ for (;;)
               if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
               }
             else
-              {
-              if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
-              }
+              if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
             }
           }
         else
@@ -2828,7 +2834,14 @@ for (;;)
               MRRETURN(MATCH_NOMATCH);
               }
             c = *eptr++;
-            if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
+#ifndef COMPILE_PCRE8
+            if (c > 255)
+              {
+              if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
+              }
+            else
+#endif
+              if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
             }
           }
         /* Control never gets here */
@@ -2840,8 +2853,8 @@ for (;;)
         {
         pp = eptr;
 
-#ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
+#ifdef SUPPORT_UTF
+        /* UTF mode */
         if (utf8)
           {
           for (i = min; i < max; i++)
@@ -2858,9 +2871,7 @@ for (;;)
               if (op == OP_CLASS) break;
               }
             else
-              {
-              if ((data[c/8] & (1 << (c&7))) == 0) break;
-              }
+              if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
             eptr += len;
             }
           for (;;)
@@ -2873,7 +2884,7 @@ for (;;)
           }
         else
 #endif
-          /* Not UTF-8 mode */
+          /* Not UTF mode */
           {
           for (i = min; i < max; i++)
             {
@@ -2883,7 +2894,14 @@ for (;;)
               break;
               }
             c = *eptr;
-            if ((data[c/8] & (1 << (c&7))) == 0) break;
+#ifndef COMPILE_PCRE8
+            if (c > 255)
+              {
+              if (op == OP_CLASS) break;
+              }
+            else
+#endif
+              if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
             eptr++;
             }
           while (eptr >= pp)
@@ -2896,6 +2914,7 @@ for (;;)
 
         MRRETURN(MATCH_NOMATCH);
         }
+#undef BYTE_MAP
       }
     /* Control never gets here */
 
@@ -2904,7 +2923,7 @@ for (;;)
     when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8
     mode, because Unicode properties are supported in non-UTF-8 mode. */
 
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
     case OP_XCLASS:
       {
       data = ecode + 1 + LINK_SIZE;                /* Save for matching */
@@ -2991,7 +3010,11 @@ for (;;)
             SCHECK_PARTIAL();
             break;
             }
+#ifdef SUPPORT_UTF
           GETCHARLENTEST(c, eptr, len);
+#else
+          c = *eptr;
+#endif
           if (!PRIV(xclass)(c, data)) break;
           eptr += len;
           }
@@ -3000,7 +3023,9 @@ for (;;)
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (eptr-- == pp) break;        /* Stop if tried at original pos */
+#ifdef SUPPORT_UTF
           if (utf8) BACKCHAR(eptr);
+#endif
           }
         MRRETURN(MATCH_NOMATCH);
         }
@@ -6353,7 +6378,11 @@ for(;;)
       {
       while (start_match < end_subject)
         {
+#ifdef COMPILE_PCRE8
         register unsigned int c = *start_match;
+#else
+        register unsigned int c = *start_match & 0xff;
+#endif
         if ((start_bits[c/8] & (1 << (c&7))) == 0)
           {
           start_match++;
diff --git a/pcre_internal.h b/pcre_internal.h
index 0228207..b9f8dd4 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -71,6 +71,21 @@ script prevents both being selected, but not everybody uses "configure". */
 #define SUPPORT_UTF8 1
 #endif
 
+/* If SUPPORT_UCP is defined, SUPPORT_UTF16 must also be defined. The
+"configure" script ensures this, but not everybody uses "configure". */
+
+#if defined SUPPORT_UCP && defined COMPILE_PCRE16 && !defined SUPPORT_UTF16
+#define SUPPORT_UTF16 1
+#endif
+
+/* This macro is defined if either UTF-8 or UTF-16 support or both are
+enabled. */
+
+#if defined SUPPORT_UTF8 || defined SUPPORT_UTF16
+/* Unicode Transformation Format is enabled. */
+#define SUPPORT_UTF 1
+#endif
+
 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
 inline, and there are *still* stupid compilers about that don't like indented
 pre-processor statements, or at least there were when I first wrote this. After
@@ -1325,7 +1340,7 @@ only. */
 #define PT_WORD       8    /* Word - L plus N plus underscore */
 
 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
-contain UTF-8 characters with values greater than 255. */
+contain characters with values greater than 255. */
 
 #define XCL_NOT    0x01    /* Flag: this is a negative class */
 #define XCL_MAP    0x02    /* Flag: a 32-byte map is present */
@@ -1522,8 +1537,8 @@ enum {
   OP_CLASS,          /* 106 Match a character class, chars < 256 only */
   OP_NCLASS,         /* 107 Same, but the bitmap was created from a negative
                               class - the difference is relevant only when a
-                              UTF-8 character > 255 is encountered. */
-  OP_XCLASS,         /* 108 Extended class for handling UTF-8 chars within the
+                              character > 255 is encountered. */
+  OP_XCLASS,         /* 108 Extended class for handling > 255 chars within the
                               class. This does both positive and negative. */
   OP_REF,            /* 109 Match a back reference, casefully */
   OP_REFI,           /* 110 Match a back reference, caselessly */
@@ -1704,8 +1719,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
   /* Character class & ref repeats                                         */ \
   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */ \
   1+2*IMM2_SIZE, 1+2*IMM2_SIZE,  /* CRRANGE, CRMINRANGE                    */ \
- 33,                             /* CLASS                                  */ \
- 33,                             /* NCLASS                                 */ \
+  1+(32/sizeof(pcre_uchar)),     /* CLASS                                  */ \
+  1+(32/sizeof(pcre_uchar)),     /* NCLASS                                 */ \
   0,                             /* XCLASS - variable length               */ \
   1+IMM2_SIZE,                   /* REF                                    */ \
   1+IMM2_SIZE,                   /* REFI                                   */ \
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 5fed4a1..7a2c41d 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -592,9 +592,9 @@ switch(*cc)
 
   case OP_CLASS:
   case OP_NCLASS:
-  return cc + 33;
+  return cc + 1 + 32 / sizeof(pcre_uchar);
 
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   case OP_XCLASS:
   return cc + GET(cc, 1);
 #endif
@@ -1879,11 +1879,14 @@ if (firstline)
 
 start = LABEL();
 leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#ifdef SUPPORT_UTF8
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+#ifdef SUPPORT_UTF
 if (common->utf8)
   OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
 #endif
+#ifndef COMPILE_PCRE8
+OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xff);
+#endif
 OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
 OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), start_bits);
@@ -1891,11 +1894,11 @@ OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
 OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
 found = JUMP(SLJIT_C_NOT_ZERO);
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (common->utf8)
   OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
 #endif
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 #ifdef SUPPORT_UTF8
 if (common->utf8)
   {
@@ -2435,7 +2438,7 @@ while (utf8length > 0);
 return cc;
 }
 
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 
 #define SET_TYPE_OFFSET(value) \
   if ((value) != typeoffset) \
@@ -2482,8 +2485,12 @@ read_char(common);
 if ((*cc++ & XCL_MAP) != 0)
   {
   OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
+#ifndef COMPILE_PCRE8
+  jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF8
   if (common->utf8)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#endif
 
   OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
   OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -2492,13 +2499,17 @@ if ((*cc++ & XCL_MAP) != 0)
   OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
   add_jump(compiler, list, JUMP(SLJIT_C_NOT_ZERO));
 
+#ifndef COMPILE_PCRE8
+  JUMPHERE(jump);
+#elif defined SUPPORT_UTF8
   if (common->utf8)
     JUMPHERE(jump);
+#endif
   OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
 #ifdef SUPPORT_UCP
   charsaved = TRUE;
 #endif
-  cc += 32;
+  cc += 32 / sizeof(pcre_uchar);
   }
 
 /* Scanning the necessary info. */
@@ -3179,9 +3190,12 @@ switch(type)
   case OP_NCLASS:
   check_input_end(common, fallbacks);
   read_char(common);
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   jump[0] = NULL;
+#ifdef SUPPORT_UTF8
+  /* This check can only be skipped in pure 8 bit mode. */
   if (common->utf8)
+#endif
     {
     jump[0] = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
     if (type == OP_CLASS)
@@ -3197,13 +3211,13 @@ switch(type)
   OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
   OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0);
   add_jump(compiler, fallbacks, JUMP(SLJIT_C_ZERO));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   if (jump[0] != NULL)
     JUMPHERE(jump[0]);
 #endif
-  return cc + 32;
+  return cc + 32 / sizeof(pcre_uchar);
 
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   case OP_XCLASS:
   compile_xclass_hotpath(common, cc + LINK_SIZE, fallbacks);
   return cc + GET(cc, 0) - 1;
@@ -4725,7 +4739,7 @@ else
   SLJIT_ASSERT(*opcode >= OP_CLASS || *opcode <= OP_XCLASS);
   *type = *opcode;
   cc++;
-  class_len = (*type < OP_XCLASS) ? 33 : GET(cc, 0);
+  class_len = (*type < OP_XCLASS) ? (1 + (32 / sizeof(pcre_uchar))) : GET(cc, 0);
   *opcode = cc[class_len - 1];
   if (*opcode >= OP_CRSTAR && *opcode <= OP_CRMINQUERY)
     {
@@ -5133,13 +5147,13 @@ while (cc < ccend)
 
     case OP_CLASS:
     case OP_NCLASS:
-    if (cc[33] >= OP_CRSTAR && cc[33] <= OP_CRMINRANGE)
+    if (cc[1 + (32 / sizeof(pcre_uchar))] >= OP_CRSTAR && cc[1 + (32 / sizeof(pcre_uchar))] <= OP_CRMINRANGE)
       cc = compile_iterator_hotpath(common, cc, parent);
     else
       cc = compile_char1_hotpath(common, *cc, cc + 1, parent->top != NULL ? &parent->top->nextfallbacks : &parent->topfallbacks);
     break;
 
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
     case OP_XCLASS:
     if (*(cc + GET(cc, 1)) >= OP_CRSTAR && *(cc + GET(cc, 1)) <= OP_CRMINRANGE)
       cc = compile_iterator_hotpath(common, cc, parent);
@@ -5994,7 +6008,9 @@ while (current)
     case OP_TYPEPOSUPTO:
     case OP_CLASS:
     case OP_NCLASS:
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
     case OP_XCLASS:
+#endif
     compile_iterator_fallbackpath(common, current);
     break;
 
diff --git a/pcre_printint.src b/pcre_printint.src
index a5670e5..5a9f15d 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -471,9 +471,9 @@ for(;;)
     fprintf(f, "    %s %s", OP_names[*code], get_ucpname(code[1], code[2]));
     break;
 
-    /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
-    having this code always here, and it makes it less messy without all those
-    #ifdefs. */
+    /* OP_XCLASS can only occur in UTF or PCRE16 modes. However, there's no
+    harm in having this code always here, and it makes it less messy without
+    all those #ifdefs. */
 
     case OP_CLASS:
     case OP_NCLASS:
@@ -481,6 +481,7 @@ for(;;)
       {
       int i, min, max;
       BOOL printmap;
+      pcre_uint8 *map;
 
       fprintf(f, "    [");
 
@@ -501,13 +502,14 @@ for(;;)
 
       if (printmap)
         {
+        map = (pcre_uint8 *)ccode;
         for (i = 0; i < 256; i++)
           {
-          if ((ccode[i/8] & (1 << (i&7))) != 0)
+          if ((map[i/8] & (1 << (i&7))) != 0)
             {
             int j;
             for (j = i+1; j < 256; j++)
-              if ((ccode[j/8] & (1 << (j&7))) == 0) break;
+              if ((map[j/8] & (1 << (j&7))) == 0) break;
             if (i == '-' || i == ']') fprintf(f, "\\");
             if (PRINTABLE(i)) fprintf(f, "%c", i);
               else fprintf(f, "\\x%02x", i);
@@ -521,7 +523,7 @@ for(;;)
             i = j;
             }
           }
-        ccode += 32;
+        ccode += 32 / sizeof(pcre_uchar);
         }
 
       /* For an XCLASS there is always some additional data */
diff --git a/pcre_study.c b/pcre_study.c
index 5253c49..661627d 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -322,15 +322,15 @@ for (;;)
 
     /* Check a class for variable quantification */
 
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8
     case OP_XCLASS:
-    cc += GET(cc, 1) - 33;
+    cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
     /* Fall through */
 #endif
 
     case OP_CLASS:
     case OP_NCLASS:
-    cc += 33;
+    cc += PRIV(OP_lengths)[OP_CLASS];
 
     switch (*cc)
       {
@@ -789,7 +789,9 @@ do
       case OP_SOM:
       case OP_THEN:
       case OP_THEN_ARG:
+#if defined SUPPORT_UTF8 || !defined COMPILE_PCRE8
       case OP_XCLASS:
+#endif
       return SSB_FAIL;
 
       /* We can ignore word boundary tests. */
@@ -1134,7 +1136,9 @@ do
 
       case OP_CLASS:
         {
+        pcre_uint8 *map;
         tcode++;
+        map = (pcre_uint8 *)tcode;
 
         /* In UTF-8 mode, the bits in a bit map correspond to character
         values, not to byte values. However, the bit map we are constructing is
@@ -1145,10 +1149,10 @@ do
 #ifdef SUPPORT_UTF8
         if (utf8)
           {
-          for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
+          for (c = 0; c < 16; c++) start_bits[c] |= map[c];
           for (c = 128; c < 256; c++)
             {
-            if ((tcode[c/8] && (1 << (c&7))) != 0)
+            if ((map[c/8] & (1 << (c&7))) != 0)
               {
               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
@@ -1162,13 +1166,13 @@ do
         else
 #endif
           {
-          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+          for (c = 0; c < 32; c++) start_bits[c] |= map[c];
           }
 
         /* Advance past the bit map, and act on what follows. For a zero
         minimum repeat, continue; otherwise stop processing. */
 
-        tcode += 32;
+        tcode += 32 / sizeof(pcre_uchar);
         switch (*tcode)
           {
           case OP_CRSTAR:
diff --git a/pcre_xclass.c b/pcre_xclass.c
index 024d71d..cdb9d07 100644
--- a/pcre_xclass.c
+++ b/pcre_xclass.c
@@ -75,15 +75,16 @@ additional data. */
 
 if (c < 256)
   {
-  if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
-    return !negated;   /* char found */
+  if ((*data & XCL_MAP) != 0 &&
+    (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
+    return !negated; /* char found */
   }
 
 /* First skip the bit map if present. Then match against the list of Unicode
 properties or large chars or ranges that end with a large char. We won't ever
 encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
 
-if ((*data++ & XCL_MAP) != 0) data += 32;
+if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);
 
 while ((t = *data++) != XCL_END)
   {
author	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-11-28 20:39:30 +0000
committer	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-11-28 20:39:30 +0000
commit	00cc776fe74e502bc0774ceca2bb3f11283e189a (patch)
tree	1f19206b0bbf8f9f4d2af9a8a34d0198c8a70808
parent	4d715f1b6035e095635067d977ad56948ff4e4c2 (diff)
download	pcre-00cc776fe74e502bc0774ceca2bb3f11283e189a.tar.gz