Revert RAWUCHAR macros, renaming them as UCHAR21 and adding an explanatory

comment. git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1431 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2014-01-02 17:41:28 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2014-01-02 17:41:28 +0000
commit: 31a692c6bfca25feffa7cc96dab542080b0a9d0c (patch)
tree: ce87e75e8ed049d97d5f667631fcaf7b30e86f70
parent: 62671ac7455a5eb508bc3f99e6f01585efd08c83 (diff)
download: pcre-31a692c6bfca25feffa7cc96dab542080b0a9d0c.tar.gz
5 files changed, 85 insertions, 67 deletions
diff --git a/ChangeLog b/ChangeLog
index a610c40..568e779 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -31,10 +31,11 @@ Version 8.35-RC1 xx-xxxx-201x
     must be bigger than the treshold as well. This function is useful, when
     the characters above the treshold are handled in the same way.
     
-7.  The macros RAWUCHAR and RAWUCHARTEST were identical (and the latter was not
-    testing anything, contrary to its name and comment) and were not really 
-    fulfilling any useful function, so I have replaced their use by plain code. 
-    Similarly for RAWUCHARINC and RAWUCHARINCTEST.  
+7.  The macros whose names start with RAWUCHAR are placeholders for a future 
+    mode in which only the bottom 21 bits of 32-bit data items are used. To 
+    make this more memorable for those maintaining the code, the names have 
+    been changed to start with UCHAR21, and an extensive comment has been added 
+    to their definition. 
 
 
 Version 8.34 15-December-2013
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 2b6dd10..fb0c7e8 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language (but see
 below for why this module is different).
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2013 University of Cambridge
+           Copyright (c) 1997-2014 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -1473,7 +1473,7 @@ for (;;)
           goto ANYNL01;
 
           case CHAR_CR:
-          if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
+          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
           /* Fall through */
 
           ANYNL01:
@@ -1742,7 +1742,7 @@ for (;;)
           goto ANYNL02;
 
           case CHAR_CR:
-          if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
+          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
           /* Fall through */
 
           ANYNL02:
@@ -2012,7 +2012,7 @@ for (;;)
           goto ANYNL03;
 
           case CHAR_CR:
-          if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
+          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
           /* Fall through */
 
           ANYNL03:
@@ -2210,7 +2210,7 @@ for (;;)
           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
             reset_could_continue = TRUE;
           }
-        else if (ptr[1] == CHAR_LF)
+        else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
           {
           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
           }
@@ -3474,12 +3474,12 @@ for (;;)
           {
           pcre_uchar csc;
           while (current_subject < end_subject &&
-                 (csc = *current_subject) != first_char && csc != first_char2)
+                 (csc = UCHAR21TEST(current_subject)) != first_char && csc != first_char2)
             current_subject++;
           }
         else
           while (current_subject < end_subject &&
-                 *current_subject != first_char)
+                 UCHAR21TEST(current_subject) != first_char)
             current_subject++;
         }
 
@@ -3509,9 +3509,10 @@ for (;;)
           ANYCRLF, and we are now at a LF, advance the match position by one
           more character. */
 
-          if (current_subject[-1] == CHAR_CR &&
+          if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
-               current_subject < end_subject && *current_subject == CHAR_NL)
+               current_subject < end_subject &&
+               UCHAR21TEST(current_subject) == CHAR_NL)
             current_subject++;
           }
         }
@@ -3522,7 +3523,7 @@ for (;;)
         {
         while (current_subject < end_subject)
           {
-          register pcre_uint32 c = *current_subject;
+          register pcre_uint32 c = UCHAR21TEST(current_subject);
 #ifndef COMPILE_PCRE8
           if (c > 255) c = 255;
 #endif
@@ -3579,7 +3580,7 @@ for (;;)
             {
             while (p < end_subject)
               {
-              register pcre_uint32 pp = *p++;
+              register pcre_uint32 pp = UCHAR21INCTEST(p);
               if (pp == req_char || pp == req_char2) { p--; break; }
               }
             }
@@ -3587,7 +3588,7 @@ for (;;)
             {
             while (p < end_subject)
               {
-              if (*p++ == req_char) { p--; break; }
+              if (UCHAR21INCTEST(p) == req_char) { p--; break; }
               }
             }
 
@@ -3655,9 +3656,9 @@ for (;;)
   not contain any explicit matches for \r or \n, and the newline option is CRLF
   or ANY or ANYCRLF, advance the match position by one more character. */
 
-  if (current_subject[-1] == CHAR_CR &&
+  if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
       current_subject < end_subject &&
-      *current_subject == CHAR_NL &&
+      UCHAR21TEST(current_subject) == CHAR_NL &&
       (re->flags & PCRE_HASCRORLF) == 0 &&
         (md->nltype == NLTYPE_ANY ||
          md->nltype == NLTYPE_ANYCRLF ||
diff --git a/pcre_exec.c b/pcre_exec.c
index beadc9c..7a59dbe 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -6,7 +6,7 @@
 and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2013 University of Cambridge
+           Copyright (c) 1997-2014 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -134,7 +134,7 @@ pcre_uint32 c;
 BOOL utf = md->utf;
 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 while (length-- > 0)
-  if (isprint(c = *p++)) printf("%c", (char)c); else printf("\\x{%02x}", c);
+  if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
 }
 #endif
 
@@ -237,8 +237,8 @@ if (caseless)
       {
       pcre_uint32 cc, cp;
       if (eptr >= md->end_subject) return -2;   /* Partial match */
-      cc = *eptr;
-      cp = *p;
+      cc = UCHAR21TEST(eptr);
+      cp = UCHAR21TEST(p);
       if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
       p++;
       eptr++;
@@ -254,7 +254,7 @@ else
   while (length-- > 0)
     {
     if (eptr >= md->end_subject) return -2;   /* Partial match */
-    if (*p++ != *eptr++) return -1;
+    if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
     }
   }
 
@@ -2103,7 +2103,7 @@ for (;;)
             eptr + 1 >= md->end_subject &&
             NLBLOCK->nltype == NLTYPE_FIXED &&
             NLBLOCK->nllen == 2 &&
-            *eptr == NLBLOCK->nl[0])
+            UCHAR21TEST(eptr) == NLBLOCK->nl[0])
           {
           md->hitend = TRUE;
           if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2147,7 +2147,7 @@ for (;;)
           eptr + 1 >= md->end_subject &&
           NLBLOCK->nltype == NLTYPE_FIXED &&
           NLBLOCK->nllen == 2 &&
-          *eptr == NLBLOCK->nl[0])
+          UCHAR21TEST(eptr) == NLBLOCK->nl[0])
         {
         md->hitend = TRUE;
         if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2290,7 +2290,7 @@ for (;;)
         eptr + 1 >= md->end_subject &&
         NLBLOCK->nltype == NLTYPE_FIXED &&
         NLBLOCK->nllen == 2 &&
-        *eptr == NLBLOCK->nl[0])
+        UCHAR21TEST(eptr) == NLBLOCK->nl[0])
       {
       md->hitend = TRUE;
       if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2444,7 +2444,7 @@ for (;;)
         {
         SCHECK_PARTIAL();
         }
-      else if (*eptr == CHAR_LF) eptr++;
+      else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
       break;
 
       case CHAR_LF:
@@ -3218,7 +3218,7 @@ for (;;)
         CHECK_PARTIAL();             /* Not SCHECK_PARTIAL() */
         RRETURN(MATCH_NOMATCH);
         }
-      while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+      while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
       }
     else
 #endif
@@ -3258,7 +3258,7 @@ for (;;)
 
       if (fc < 128)
         {
-        pcre_uint32 cc = *eptr;
+        pcre_uint32 cc = UCHAR21(eptr);
         if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
         ecode++;
         eptr++;
@@ -3527,7 +3527,7 @@ for (;;)
           SCHECK_PARTIAL();
           RRETURN(MATCH_NOMATCH);
           }
-        cc = *eptr;
+        cc = UCHAR21TEST(eptr);
         if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
         eptr++;
         }
@@ -3545,7 +3545,7 @@ for (;;)
             SCHECK_PARTIAL();
             RRETURN(MATCH_NOMATCH);
             }
-          cc = *eptr;
+          cc = UCHAR21TEST(eptr);
           if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
           eptr++;
           }
@@ -3562,7 +3562,7 @@ for (;;)
             SCHECK_PARTIAL();
             break;
             }
-          cc = *eptr;
+          cc = UCHAR21TEST(eptr);
           if (fc != cc && foc != cc) break;
           eptr++;
           }
@@ -3589,7 +3589,7 @@ for (;;)
           SCHECK_PARTIAL();
           RRETURN(MATCH_NOMATCH);
           }
-        if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+        if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
         }
 
       if (min == max) continue;
@@ -3606,7 +3606,7 @@ for (;;)
             SCHECK_PARTIAL();
             RRETURN(MATCH_NOMATCH);
             }
-          if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+          if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
           }
         /* Control never gets here */
         }
@@ -3620,7 +3620,7 @@ for (;;)
             SCHECK_PARTIAL();
             break;
             }
-          if (fc != *eptr) break;
+          if (fc != UCHAR21TEST(eptr)) break;
           eptr++;
           }
         if (possessive) continue;    /* No backtracking */
@@ -4375,7 +4375,7 @@ for (;;)
               eptr + 1 >= md->end_subject &&
               NLBLOCK->nltype == NLTYPE_FIXED &&
               NLBLOCK->nllen == 2 &&
-              *eptr == NLBLOCK->nl[0])
+              UCHAR21(eptr) == NLBLOCK->nl[0])
             {
             md->hitend = TRUE;
             if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -4417,7 +4417,7 @@ for (;;)
             default: RRETURN(MATCH_NOMATCH);
 
             case CHAR_CR:
-            if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
+            if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
             break;
 
             case CHAR_LF:
@@ -4527,7 +4527,7 @@ for (;;)
             SCHECK_PARTIAL();
             RRETURN(MATCH_NOMATCH);
             }
-          cc = *eptr;
+          cc = UCHAR21(eptr);
           if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
             RRETURN(MATCH_NOMATCH);
           eptr++;
@@ -4544,7 +4544,7 @@ for (;;)
             SCHECK_PARTIAL();
             RRETURN(MATCH_NOMATCH);
             }
-          cc = *eptr;
+          cc = UCHAR21(eptr);
           if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
             RRETURN(MATCH_NOMATCH);
           eptr++;
@@ -4561,7 +4561,7 @@ for (;;)
             SCHECK_PARTIAL();
             RRETURN(MATCH_NOMATCH);
             }
-          cc = *eptr;
+          cc = UCHAR21(eptr);
           if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
             RRETURN(MATCH_NOMATCH);
           eptr++;
@@ -4578,7 +4578,7 @@ for (;;)
             SCHECK_PARTIAL();
             RRETURN(MATCH_NOMATCH);
             }
-          cc = *eptr;
+          cc = UCHAR21(eptr);
           if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
             RRETURN(MATCH_NOMATCH);
           eptr++;
@@ -4595,7 +4595,7 @@ for (;;)
             SCHECK_PARTIAL();
             RRETURN(MATCH_NOMATCH);
             }
-          cc = *eptr;
+          cc = UCHAR21(eptr);
           if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
             RRETURN(MATCH_NOMATCH);
           eptr++;
@@ -5156,7 +5156,7 @@ for (;;)
               {
               default: RRETURN(MATCH_NOMATCH);
               case CHAR_CR:
-              if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
+              if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
               break;
 
               case CHAR_LF:
@@ -5695,7 +5695,7 @@ for (;;)
                   eptr + 1 >= md->end_subject &&
                   NLBLOCK->nltype == NLTYPE_FIXED &&
                   NLBLOCK->nllen == 2 &&
-                  *eptr == NLBLOCK->nl[0])
+                  UCHAR21(eptr) == NLBLOCK->nl[0])
                 {
                 md->hitend = TRUE;
                 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -5721,7 +5721,7 @@ for (;;)
                   eptr + 1 >= md->end_subject &&
                   NLBLOCK->nltype == NLTYPE_FIXED &&
                   NLBLOCK->nllen == 2 &&
-                  *eptr == NLBLOCK->nl[0])
+                  UCHAR21(eptr) == NLBLOCK->nl[0])
                 {
                 md->hitend = TRUE;
                 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -5778,7 +5778,7 @@ for (;;)
             if (c == CHAR_CR)
               {
               if (++eptr >= md->end_subject) break;
-              if (*eptr == CHAR_LF) eptr++;
+              if (UCHAR21(eptr) == CHAR_LF) eptr++;
               }
             else
               {
@@ -5941,8 +5941,8 @@ for (;;)
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           eptr--;
           BACKCHAR(eptr);
-          if (ctype == OP_ANYNL && eptr > pp  && *eptr == CHAR_NL &&
-              eptr[-1] == CHAR_CR) eptr--;
+          if (ctype == OP_ANYNL && eptr > pp  && UCHAR21(eptr) == CHAR_NL &&
+              UCHAR21(eptr - 1) == CHAR_CR) eptr--;
           }
         }
       else
@@ -6789,10 +6789,10 @@ for(;;)
 
       if (first_char != first_char2)
         while (start_match < end_subject &&
-          (smc = *start_match) != first_char && smc != first_char2)
+          (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
           start_match++;
       else
-        while (start_match < end_subject && *start_match != first_char)
+        while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
           start_match++;
       }
 
@@ -6824,7 +6824,7 @@ for(;;)
         if (start_match[-1] == CHAR_CR &&
              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
              start_match < end_subject &&
-             *start_match == CHAR_NL)
+             UCHAR21TEST(start_match) == CHAR_NL)
           start_match++;
         }
       }
@@ -6835,7 +6835,7 @@ for(;;)
       {
       while (start_match < end_subject)
         {
-        register pcre_uint32 c = *start_match;
+        register pcre_uint32 c = UCHAR21TEST(start_match);
 #ifndef COMPILE_PCRE8
         if (c > 255) c = 255;
 #endif
@@ -6893,7 +6893,7 @@ for(;;)
           {
           while (p < end_subject)
             {
-            register pcre_uint32 pp = *p++;
+            register pcre_uint32 pp = UCHAR21INCTEST(p);
             if (pp == req_char || pp == req_char2) { p--; break; }
             }
           }
@@ -6901,7 +6901,7 @@ for(;;)
           {
           while (p < end_subject)
             {
-            if (*p++ == req_char) { p--; break; }
+            if (UCHAR21INCTEST(p) == req_char) { p--; break; }
             }
           }
 
diff --git a/pcre_internal.h b/pcre_internal.h
index dd50095..7e07d63 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -7,7 +7,7 @@
 and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2013 University of Cambridge
+           Copyright (c) 1997-2014 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -316,7 +316,8 @@ start/end of string field names are. */
        &(NLBLOCK->nllen), utf)) \
     : \
     ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
-     *p == NLBLOCK->nl[0] && (NLBLOCK->nllen == 1 || p[1] == NLBLOCK->nl[1]) \
+     UCHAR21TEST(p) == NLBLOCK->nl[0] && \
+     (NLBLOCK->nllen == 1 || UCHAR21TEST(p+1) == NLBLOCK->nl[1])       \
     ) \
   )
 
@@ -329,8 +330,8 @@ start/end of string field names are. */
        &(NLBLOCK->nllen), utf)) \
     : \
     ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
-     *(p - NLBLOCK->nllen) == NLBLOCK->nl[0] &&              \
-     (NLBLOCK->nllen == 1 || *(p - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \
+     UCHAR21TEST(p - NLBLOCK->nllen) == NLBLOCK->nl[0] &&              \
+     (NLBLOCK->nllen == 1 || UCHAR21TEST(p - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \
     ) \
   )
 
@@ -581,12 +582,27 @@ changed in future to be a fixed number of bytes or to depend on LINK_SIZE. */
 #define MAX_MARK ((1u << 8) - 1)
 #endif
 
+/* There is a proposed future special "UTF-21" mode, in which only the lowest 
+21 bits of a 32-bit character are interpreted as UTF, with the remaining 11 
+high-order bits available to the application for other uses. In preparation for 
+the future implementation of this mode, there are macros that load a data item
+and, if in this special mode, mask it to 21 bits. These macros all have names
+starting with UCHAR21. In all other modes, including the normal 32-bit
+library, the macros all have the same simple definitions. When the new mode is
+implemented, it is expected that these definitions will be varied appropriately
+using #ifdef when compiling the library that supports the special mode. */
+
+#define UCHAR21(eptr)        (*(eptr))
+#define UCHAR21TEST(eptr)    (*(eptr))
+#define UCHAR21INC(eptr)     (*(eptr)++)
+#define UCHAR21INCTEST(eptr) (*(eptr)++)
+
 /* When UTF encoding is being used, a character is no longer just a single
-byte. The macros for character handling generate simple sequences when used in
-character-mode, and more complicated ones for UTF characters. GETCHARLENTEST
-and other macros are not used when UTF is not supported, so they are not
-defined. To make sure they can never even appear when UTF support is omitted,
-we don't even define them. */
+byte in 8-bit mode or a single short in 16-bit mode. The macros for character
+handling generate simple sequences when used in the basic mode, and more
+complicated ones for UTF characters. GETCHARLENTEST and other macros are not
+used when UTF is not supported. To make sure they can never even appear when
+UTF support is omitted, we don't even define them. */
 
 #ifndef SUPPORT_UTF
 
diff --git a/pcre_string_utils.c b/pcre_string_utils.c
index 61a9b4b..25eacc8 100644
--- a/pcre_string_utils.c
+++ b/pcre_string_utils.c
@@ -6,7 +6,7 @@
 and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2013 University of Cambridge
+           Copyright (c) 1997-2014 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -91,8 +91,8 @@ pcre_uchar c2;
 
 while (*str1 != '\0' || *str2 != '\0')
   {
-  c1 = *str1++;
-  c2 = *str2++;
+  c1 = UCHAR21INC(str1);
+  c2 = UCHAR21INC(str2);
   if (c1 != c2)
     return ((c1 > c2) << 1) - 1;
   }
@@ -131,7 +131,7 @@ pcre_uchar c2;
 
 while (*str1 != '\0' || *ustr2 != '\0')
   {
-  c1 = *str1++;
+  c1 = UCHAR21INC(str1);
   c2 = (pcre_uchar)*ustr2++;
   if (c1 != c2)
     return ((c1 > c2) << 1) - 1;
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2014-01-02 17:41:28 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2014-01-02 17:41:28 +0000
commit	31a692c6bfca25feffa7cc96dab542080b0a9d0c (patch)
tree	ce87e75e8ed049d97d5f667631fcaf7b30e86f70
parent	62671ac7455a5eb508bc3f99e6f01585efd08c83 (diff)
download	pcre-31a692c6bfca25feffa7cc96dab542080b0a9d0c.tar.gz