Added PCRE_JAVASCRIPT_COMPAT option.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@336 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-12 15:59:03 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-12 15:59:03 +0000
commit: 7d260d0a46457dd958b689847e853fc06ee9f704 (patch)
tree: 6e7c512e248be27621b25b357cee15a43f73f0f0
parent: bd39c50b17337e4e0f4f77370c0794046e7d2768 (diff)
download: pcre-7d260d0a46457dd958b689847e853fc06ee9f704.tar.gz
12 files changed, 141 insertions, 54 deletions
diff --git a/ChangeLog b/ChangeLog
index 9c93ffe..8199744 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -57,6 +57,11 @@ Version 7.7 05-Mar-08
     (an internal error was given). Such groups are now left in the compiled 
     pattern, with a new opcode that causes them to be skipped at execution 
     time.
+    
+13. Added the PCRE_JAVASCRIPT_COMPAT option. This currently does two things:
+    (a) A lone ] character is disallowed (Perl treats it as data).
+    (b) A back reference to an unmatched subpattern matches an empty string 
+        (Perl fails the current match path).
 
 
 Version 7.6 28-Jan-08
diff --git a/doc/pcre.3 b/doc/pcre.3
index 2b41f7f..15ed6a9 100644
--- a/doc/pcre.3
+++ b/doc/pcre.3
@@ -6,8 +6,10 @@ PCRE - Perl-compatible regular expressions
 .sp
 The PCRE library is a set of functions that implement regular expression
 pattern matching using the same syntax and semantics as Perl, with just a few
-differences. (Certain features that appeared in Python and PCRE before they
-appeared in Perl are also available using the Python syntax.)
+differences. Certain features that appeared in Python and PCRE before they
+appeared in Perl are also available using the Python syntax. There is also some 
+support for certain .NET and Oniguruma syntax items, and there is an option for 
+requesting some minor changes that give better JavaScript compatibility.
 .P
 The current implementation of PCRE (release 7.x) corresponds approximately with
 Perl 5.10, including support for UTF-8 encoded strings and Unicode general
@@ -287,6 +289,6 @@ two digits 10, at the domain cam.ac.uk.
 .rs
 .sp
 .nf
-Last updated: 09 August 2007
-Copyright (c) 1997-2007 University of Cambridge.
+Last updated: 12 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
 .fi
diff --git a/doc/pcre_compile.3 b/doc/pcre_compile.3
index 4e7e402..f934771 100644
--- a/doc/pcre_compile.3
+++ b/doc/pcre_compile.3
@@ -30,31 +30,33 @@ argument. Its arguments are:
 .sp
 The option bits are:
 .sp
-  PCRE_ANCHORED         Force pattern anchoring
-  PCRE_AUTO_CALLOUT     Compile automatic callouts
-  PCRE_BSR_ANYCRLF      \eR matches only CR, LF, or CRLF
-  PCRE_BSR_UNICODE      \eR matches all Unicode line endings
-  PCRE_CASELESS         Do caseless matching
-  PCRE_DOLLAR_ENDONLY   $ not to match newline at end
-  PCRE_DOTALL           . matches anything including NL
-  PCRE_DUPNAMES         Allow duplicate names for subpatterns
-  PCRE_EXTENDED         Ignore whitespace and # comments
-  PCRE_EXTRA            PCRE extra features
-                          (not much use currently)
-  PCRE_FIRSTLINE        Force matching to be before newline
-  PCRE_MULTILINE        ^ and $ match newlines within data
-  PCRE_NEWLINE_ANY      Recognize any Unicode newline sequence
-  PCRE_NEWLINE_ANYCRLF  Recognize CR, LF, and CRLF as newline sequences
-  PCRE_NEWLINE_CR       Set CR as the newline sequence
-  PCRE_NEWLINE_CRLF     Set CRLF as the newline sequence
-  PCRE_NEWLINE_LF       Set LF as the newline sequence
-  PCRE_NO_AUTO_CAPTURE  Disable numbered capturing paren-
-                          theses (named ones available)
-  PCRE_UNGREEDY         Invert greediness of quantifiers
-  PCRE_UTF8             Run in UTF-8 mode
-  PCRE_NO_UTF8_CHECK    Do not check the pattern for UTF-8
-                          validity (only relevant if
-                          PCRE_UTF8 is set)
+  PCRE_ANCHORED           Force pattern anchoring
+  PCRE_AUTO_CALLOUT       Compile automatic callouts
+  PCRE_BSR_ANYCRLF        \eR matches only CR, LF, or CRLF
+  PCRE_BSR_UNICODE        \eR matches all Unicode line endings
+  PCRE_CASELESS           Do caseless matching
+  PCRE_DOLLAR_ENDONLY     $ not to match newline at end
+  PCRE_DOTALL             . matches anything including NL
+  PCRE_DUPNAMES           Allow duplicate names for subpatterns
+  PCRE_EXTENDED           Ignore whitespace and # comments
+  PCRE_EXTRA              PCRE extra features
+                            (not much use currently)
+  PCRE_FIRSTLINE          Force matching to be before newline
+  PCRE_JAVASCRIPT_COMPAT  JavaScript compatibility
+  PCRE_MULTILINE          ^ and $ match newlines within data
+  PCRE_NEWLINE_ANY        Recognize any Unicode newline sequence
+  PCRE_NEWLINE_ANYCRLF    Recognize CR, LF, and CRLF as newline 
+                            sequences
+  PCRE_NEWLINE_CR         Set CR as the newline sequence
+  PCRE_NEWLINE_CRLF       Set CRLF as the newline sequence
+  PCRE_NEWLINE_LF         Set LF as the newline sequence
+  PCRE_NO_AUTO_CAPTURE    Disable numbered capturing paren-
+                            theses (named ones available)
+  PCRE_UNGREEDY           Invert greediness of quantifiers
+  PCRE_UTF8               Run in UTF-8 mode
+  PCRE_NO_UTF8_CHECK      Do not check the pattern for UTF-8
+                            validity (only relevant if
+                            PCRE_UTF8 is set)
 .sp
 PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
 PCRE_NO_UTF8_CHECK.
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 2320286..0174489 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -549,6 +549,20 @@ If this option is set, an unanchored pattern is required to match before or at
 the first newline in the subject string, though the matched text may continue
 over the newline.
 .sp
+  PCRE_JAVASCRIPT_COMPAT
+.sp
+If this option is set, PCRE's behaviour is changed in some ways so that it is 
+compatible with JavaScript rather than Perl. The changes are as follows:
+.P
+(1) A lone closing square bracket in a pattern causes a compile-time error,
+because this is illegal in JavaScript (by default it is treated as a data
+character). Thus, the pattern AB]CD becomes illegal when this option is set.
+.P
+(2) At run time, a back reference to an unset subpattern group matches an empty
+string (by default this causes the current matching path to fail). A pattern 
+such as (\e1)(a) succeeds when this option is set (assuming it can find an "a" 
+in the subject), whereas it fails by default, for Perl compatibility.
+.sp
   PCRE_MULTILINE
 .sp
 By default, PCRE treats the subject string as consisting of a single line of
@@ -717,14 +731,15 @@ out of use. To avoid confusion, they have not been re-used.
   54  DEFINE group contains more than one branch
   55  repeating a DEFINE group is not allowed
   56  inconsistent NEWLINE options
-  57  \eg is not followed by a braced name or an optionally braced
-        non-zero number
-  58  (?+ or (?- or (?(+ or (?(- must be followed by a non-zero number
+  57  \eg is not followed by a braced, angle-bracketed, or quoted 
+        name/number or by a plain number 
+  58  a numbered reference must not be zero
   59  (*VERB) with an argument is not supported
   60  (*VERB) not recognized
   61  number is too big
   62  subpattern name expected
   63  digit expected after (?+
+  64  ] is an invalid data character in JavaScript compatibility mode
 .sp
 The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
 be used if the limits were changed when PCRE was built.
@@ -1960,6 +1975,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 23 January 2008
+Last updated: 12 April 2008
 Copyright (c) 1997-2008 University of Cambridge.
 .fi
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index 7e7d80f..a74797c 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -171,6 +171,7 @@ not correspond to anything in Perl:
   \fB/N\fP              PCRE_NO_AUTO_CAPTURE
   \fB/U\fP              PCRE_UNGREEDY
   \fB/X\fP              PCRE_EXTRA
+  \fB/<JS>\fP           PCRE_JAVASCRIPT_COMPAT 
   \fB/<cr>\fP           PCRE_NEWLINE_CR
   \fB/<lf>\fP           PCRE_NEWLINE_LF
   \fB/<crlf>\fP         PCRE_NEWLINE_CRLF
@@ -717,6 +718,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 18 December 2007
-Copyright (c) 1997-2007 University of Cambridge.
+Last updated: 12 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
 .fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 492222a..9b10356 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -302,7 +302,8 @@ static const char error_texts[] =
   "(*VERB) not recognized\0"
   "number is too big\0"
   "subpattern name expected\0"
-  "digit expected after (?+";
+  "digit expected after (?+\0"
+  "] is an invalid data character in JavaScript compatibility mode"; 
 
 
 /* Table to identify digits and hex digits. This is used when compiling
@@ -2654,7 +2655,17 @@ for (;; ptr++)
     opcode is compiled. It may optionally have a bit map for characters < 256,
     but those above are are explicitly listed afterwards. A flag byte tells
     whether the bitmap is present, and whether this is a negated class or not.
-    */
+    
+    In JavaScript compatibility mode, an isolated ']' causes an error. In
+    default (Perl) mode, it is treated as a data character. */
+    
+    case ']':
+    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
+      {
+      *errorcodeptr = ERR64;
+      goto FAILED;  
+      }
+    goto NORMAL_CHAR;      
 
     case '[':
     previous = code;
diff --git a/pcre_exec.c b/pcre_exec.c
index 89fe6c2..dceb244 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -1731,16 +1731,25 @@ for (;;)
     case OP_REF:
       {
       offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
-      ecode += 3;                                 /* Advance past item */
-
-      /* If the reference is unset, set the length to be longer than the amount
-      of subject left; this ensures that every attempt at a match fails. We
-      can't just fail here, because of the possibility of quantifiers with zero
-      minima. */
-
-      length = (offset >= offset_top || md->offset_vector[offset] < 0)?
-        md->end_subject - eptr + 1 :
-        md->offset_vector[offset+1] - md->offset_vector[offset];
+      ecode += 3;   
+      
+      /* If the reference is unset, there are two possibilities:
+      
+      (a) In the default, Perl-compatible state, set the length to be longer
+      than the amount of subject left; this ensures that every attempt at a
+      match fails. We can't just fail here, because of the possibility of
+      quantifiers with zero minima.
+      
+      (b) If the JavaScript compatibility flag is set, set the length to zero 
+      so that the back reference matches an empty string. 
+      
+      Otherwise, set the length to the length of what was matched by the 
+      referenced subpattern. */
+      
+      if (offset >= offset_top || md->offset_vector[offset] < 0)
+        length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;  
+      else
+        length = md->offset_vector[offset+1] - md->offset_vector[offset];
 
       /* Set up for repetition, or handle the non-repeated case */
 
@@ -4458,6 +4467,7 @@ end_subject = md->end_subject;
 
 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
 
 md->notbol = (options & PCRE_NOTBOL) != 0;
 md->noteol = (options & PCRE_NOTEOL) != 0;
diff --git a/pcre_internal.h b/pcre_internal.h
index a2a30f4..54d9c01 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -514,7 +514,8 @@ time, run time, or study time, respectively. */
   (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
    PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
    PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
-   PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)
+   PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
+   PCRE_JAVASCRIPT_COMPAT)
 
 #define PUBLIC_EXEC_OPTIONS \
   (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
@@ -884,7 +885,7 @@ enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
        ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
        ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
        ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
-       ERR60, ERR61, ERR62, ERR63 };
+       ERR60, ERR61, ERR62, ERR63, ERR64 };
 
 /* The real format of the start of the pcre block; the index of names and the
 code vector run on as long as necessary after the end. We store an explicit
@@ -1009,6 +1010,7 @@ typedef struct match_data {
   BOOL   notbol;                /* NOTBOL flag */
   BOOL   noteol;                /* NOTEOL flag */
   BOOL   utf8;                  /* UTF8 flag */
+  BOOL   jscript_compat;        /* JAVASCRIPT_COMPAT flag */ 
   BOOL   endonly;               /* Dollar not before final \n */
   BOOL   notempty;              /* Empty string match not wanted */
   BOOL   partial;               /* PARTIAL flag */
diff --git a/pcreposix.c b/pcreposix.c
index d129c02..b09bba9 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -126,7 +126,8 @@ static const int eint[] = {
   REG_BADPAT,  /* (?+ or (?- must be followed by a non-zero number */
   REG_BADPAT,  /* number is too big */
   REG_BADPAT,  /* subpattern name expected */
-  REG_BADPAT   /* digit expected after (?+ */
+  REG_BADPAT,  /* digit expected after (?+ */
+  REG_BADPAT   /* ] is an invalid data character in JavaScript compatibility mode */
 };
 
 /* Table of texts corresponding to POSIX error codes */
diff --git a/pcretest.c b/pcretest.c
index d31bf3c..d195676 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -1247,10 +1247,18 @@ while (!done)
 
       case '<':
         {
-        int x = check_newline(pp, outfile);
-        if (x == 0) goto SKIP_DATA;
-        options |= x;
-        while (*pp++ != '>');
+        if (strncmp((char *)pp, "JS>", 3) == 0)
+          {
+          options |= PCRE_JAVASCRIPT_COMPAT;
+          pp += 3;  
+          }
+        else
+          {      
+          int x = check_newline(pp, outfile);
+          if (x == 0) goto SKIP_DATA;
+          options |= x;
+          while (*pp++ != '>');
+          } 
         }
       break;
 
diff --git a/testdata/testinput2 b/testdata/testinput2
index 16e712a..52d4ef8 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -2655,4 +2655,16 @@ a random value. /Ix
     ** Failers
     xxz  
 
+/(\3)(\1)(a)/
+    cat
+
+/(\3)(\1)(a)/<JS>
+    cat
+    
+/TA]/
+    The ACTA] comes 
+
+/TA]/<JS>
+    The ACTA] comes 
+
 / End of testinput2 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 1987cf8..783e383 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -9527,4 +9527,22 @@ No match
     xxz  
 No match
 
+/(\3)(\1)(a)/
+    cat
+No match
+
+/(\3)(\1)(a)/<JS>
+    cat
+ 0: a
+ 1: 
+ 2: 
+ 3: a
+    
+/TA]/
+    The ACTA] comes 
+ 0: TA]
+
+/TA]/<JS>
+Failed: ] is an invalid data character in JavaScript compatibility mode at offset 2
+
 / End of testinput2 /
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-04-12 15:59:03 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-04-12 15:59:03 +0000
commit	7d260d0a46457dd958b689847e853fc06ee9f704 (patch)
tree	6e7c512e248be27621b25b357cee15a43f73f0f0
parent	bd39c50b17337e4e0f4f77370c0794046e7d2768 (diff)
download	pcre-7d260d0a46457dd958b689847e853fc06ee9f704.tar.gz