Implement PCRE2_EXTRA_ALT_BSUX to support ECMAscript 6's \u{hhh..} syntax.

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1070 6239d852-aaf2-0410-a92c-79f79f948069
author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2019-02-12 17:50:19 +0000
committer: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2019-02-12 17:50:19 +0000
commit: 4e01f37e73ba7afa29fbfbe45a5f923efb0a1c68 (patch)
tree: 2f92e9bdf9f05dbe278c16ae6162b5dd725f2749 /src/pcre2_compile.c
parent: 5a5285b1066d191d22eb858cbc9862b6e044ca9e (diff)
download: pcre2-4e01f37e73ba7afa29fbfbe45a5f923efb0a1c68.tar.gz
1 files changed, 89 insertions, 48 deletions
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index d72d9bd..1edcf1a 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -764,7 +764,7 @@ are allowed. */
 #define PUBLIC_COMPILE_EXTRA_OPTIONS \
    (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
     PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
-    PCRE2_EXTRA_ESCAPED_CR_IS_LF)
+    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX)
 
 /* Compile time error code numbers. They are given names so that they can more
 easily be tracked. When a new number is added, the tables called eint1 and
@@ -1459,7 +1459,8 @@ Returns:         zero => a data character
 
 int
 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
-  int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
+  int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass, 
+  compile_block *cb)
 {
 BOOL utf = (options & PCRE2_UTF) != 0;
 PCRE2_SPTR ptr = *ptrptr;
@@ -1495,8 +1496,7 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
   if (i > 0)
     {
     c = (uint32_t)i;
-    if (cb != NULL && c == CHAR_CR &&
-        (cb->cx->extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
+    if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
       c = CHAR_LF;
     }
   else  /* Negative table entry */
@@ -1551,22 +1551,28 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
 
 /* Escapes that need further processing, including those that are unknown, have
 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
-\o, and \x are recognized (and \u when BSUX is set). */
+\o, and \x are recognized (\u and \U can never appear as they are used for case 
+forcing). */
 
 else
   {
+  int s;
   PCRE2_SPTR oldptr;
   BOOL overflow;
-  int s;
+  BOOL alt_bsux = 
+    ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
 
   /* Filter calls from pcre2_substitute(). */
 
-  if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
-      (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
+  if (cb == NULL)
     {
-    *errorcodeptr = ERR3;
-    return 0;
-    }
+    if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
+      {
+      *errorcodeptr = ERR3;
+      return 0;
+      }
+    alt_bsux = FALSE;   /* Do not modify \x handling */   
+    }   
 
   switch (c)
     {
@@ -1579,40 +1585,74 @@ else
     *errorcodeptr = ERR37;
     break;
 
-    /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
-    specially, \u must be followed by four hex digits. Otherwise it is a
-    lowercase u letter. */
+    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
+    is set. Otherwise, \u must be followed by exactly four hex digits or, if
+    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
+    Otherwise it is a lowercase u letter. This gives some compatibility with
+    ECMAScript (aka JavaScript). */
 
     case CHAR_u:
-    if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
+    if (!alt_bsux) *errorcodeptr = ERR37; else
       {
       uint32_t xc;
-      if (ptrend - ptr < 4) break;              /* Less than 4 chars */
-      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
-      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
-      cc = (cc << 4) | xc;
-      if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
-      cc = (cc << 4) | xc;
-      if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
-      c = (cc << 4) | xc;
-      ptr += 4;
+      
+      if (*ptr == CHAR_LEFT_CURLY_BRACKET && 
+          (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
+        {
+        PCRE2_SPTR hptr = ptr + 1;
+        cc = 0;
+        
+        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
+          { 
+          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
+            {
+            *errorcodeptr = ERR77;
+            ptr = hptr;   /* Show where */
+            break;        /* *hptr != } will cause another break below */  
+            } 
+          cc = (cc << 4) | xc;
+          hptr++; 
+          } 
+          
+        if (hptr == ptr + 1 ||   /* No hex digits */
+            hptr >= ptrend ||    /* Hit end of input */
+            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
+          break;         /* Hex escape not recognized */
+           
+        c = cc;          /* Accept the code point */
+        ptr = hptr + 1; 
+        }
+         
+      else  /* Must be exactly 4 hex digits */
+        {      
+        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
+        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
+        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
+        cc = (cc << 4) | xc;
+        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
+        cc = (cc << 4) | xc;
+        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
+        c = (cc << 4) | xc;
+        ptr += 4;
+        } 
+ 
       if (utf)
         {
         if (c > 0x10ffffU) *errorcodeptr = ERR77;
         else
           if (c >= 0xd800 && c <= 0xdfff &&
-            (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
-              *errorcodeptr = ERR73;
+              (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
+                *errorcodeptr = ERR73;
         }
       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
       }
     break;
 
-    /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
-    upper case letter. */
+    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
+    in which case it is an upper case letter. */
 
     case CHAR_U:
-    if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
+    if (!alt_bsux) *errorcodeptr = ERR37;
     break;
 
     /* In a character class, \g is just a literal "g". Outside a character
@@ -1791,8 +1831,8 @@ else
         }
       else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
         {
-        if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
-            (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
+        if (utf && c >= 0xd800 && c <= 0xdfff &&
+            (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
           {
           ptr--;
           *errorcodeptr = ERR73;
@@ -1806,11 +1846,11 @@ else
       }
     break;
 
-    /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
-    two hexadecimal digits. Otherwise it is a lowercase x letter. */
+    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
+    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
 
     case CHAR_x:
-    if ((options & PCRE2_ALT_BSUX) != 0)
+    if (alt_bsux)
       {
       uint32_t xc;
       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
@@ -1818,9 +1858,9 @@ else
       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
       c = (cc << 4) | xc;
       ptr += 2;
-      }    /* End PCRE2_ALT_BSUX handling */
+      }
 
-    /* Handle \x in Perl's style. \x{ddd} is a character number which can be
+    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
     digits. If not, { used to be treated as a data character. However, Perl
     seems to read hex digits up to the first non-such, and ignore the rest, so
@@ -1864,8 +1904,8 @@ else
           }
         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
           {
-          if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
-              (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
+          if (utf && c >= 0xd800 && c <= 0xdfff &&
+              (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
             {
             ptr--;
             *errorcodeptr = ERR73;
@@ -2438,6 +2478,7 @@ uint32_t *parsed_pattern = cb->parsed_pattern;
 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
 uint32_t meta_quantifier = 0;
 uint32_t add_after_mark = 0;
+uint32_t extra_options = cb->cx->extra_options;
 uint16_t nest_depth = 0;
 int after_manual_callout = 0;
 int expect_cond_assert = 0;
@@ -2461,12 +2502,12 @@ nest_save *top_nest, *end_nests;
 /* Insert leading items for word and line matching (features provided for the
 benefit of pcre2grep). */
 
-if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
+if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
   {
   *parsed_pattern++ = META_CIRCUMFLEX;
   *parsed_pattern++ = META_NOCAPTURE;
   }
-else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
+else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
   {
   *parsed_pattern++ = META_ESCAPE + ESC_b;
   *parsed_pattern++ = META_NOCAPTURE;
@@ -2631,7 +2672,7 @@ while (ptr < ptrend)
       if ((options & PCRE2_ALT_VERBNAMES) != 0)
         {
         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
-          FALSE, cb);
+          cb->cx->extra_options, FALSE, cb);
         if (errorcode != 0) goto FAILED;
         }
       else escape = 0;   /* Treat all as literal */
@@ -2821,11 +2862,11 @@ while (ptr < ptrend)
     case CHAR_BACKSLASH:
     tempptr = ptr;
     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
-      FALSE, cb);
+      cb->cx->extra_options, FALSE, cb);
     if (errorcode != 0)
       {
       ESCAPE_FAILED:
-      if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
+      if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
         goto FAILED;
       ptr = tempptr;
       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
@@ -3382,12 +3423,12 @@ while (ptr < ptrend)
       else
         {
         tempptr = ptr;
-        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode,
-          options, TRUE, cb);
+        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, 
+          cb->cx->extra_options, TRUE, cb);
 
         if (errorcode != 0)
           {
-          if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
+          if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
             goto FAILED;
           ptr = tempptr;
           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
@@ -4545,12 +4586,12 @@ parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
 /* Insert trailing items for word and line matching (features provided for the
 benefit of pcre2grep). */
 
-if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
+if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
   {
   *parsed_pattern++ = META_KET;
   *parsed_pattern++ = META_DOLLAR;
   }
-else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
+else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
   {
   *parsed_pattern++ = META_KET;
   *parsed_pattern++ = META_ESCAPE + ESC_b;
author	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2019-02-12 17:50:19 +0000
committer	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2019-02-12 17:50:19 +0000
commit	4e01f37e73ba7afa29fbfbe45a5f923efb0a1c68 (patch)
tree	2f92e9bdf9f05dbe278c16ae6162b5dd725f2749 /src/pcre2_compile.c
parent	5a5285b1066d191d22eb858cbc9862b6e044ca9e (diff)
download	pcre2-4e01f37e73ba7afa29fbfbe45a5f923efb0a1c68.tar.gz