summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-12 15:59:03 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-12 15:59:03 +0000
commit 7d260d0a46457dd958b689847e853fc06ee9f704 (patch)
tree 6e7c512e248be27621b25b357cee15a43f73f0f0
parent bd39c50b17337e4e0f4f77370c0794046e7d2768 (diff)
download pcre-7d260d0a46457dd958b689847e853fc06ee9f704.tar.gz
Added PCRE_JAVASCRIPT_COMPAT option.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@336 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 5
-rw-r--r-- doc/pcre.3 10
-rw-r--r-- doc/pcre_compile.3 52
-rw-r--r-- doc/pcreapi.3 23
-rw-r--r-- doc/pcretest.1 5
-rw-r--r-- pcre_compile.c 15
-rw-r--r-- pcre_exec.c 30
-rw-r--r-- pcre_internal.h 6
-rw-r--r-- pcreposix.c 3
-rw-r--r-- pcretest.c 16
-rw-r--r-- testdata/testinput2 12
-rw-r--r-- testdata/testoutput2 18
12 files changed, 141 insertions, 54 deletions
diff --git a/ChangeLog b/ChangeLog
index 9c93ffe..8199744 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -57,6 +57,11 @@ Version 7.7 05-Mar-08
(an internal error was given). Such groups are now left in the compiled
pattern, with a new opcode that causes them to be skipped at execution
time.
+
+13. Added the PCRE_JAVASCRIPT_COMPAT option. This currently does two things:
+ (a) A lone ] character is disallowed (Perl treats it as data).
+ (b) A back reference to an unmatched subpattern matches an empty string
+ (Perl fails the current match path).
Version 7.6 28-Jan-08
diff --git a/doc/pcre.3 b/doc/pcre.3
index 2b41f7f..15ed6a9 100644
--- a/doc/pcre.3
+++ b/doc/pcre.3
@@ -6,8 +6,10 @@ PCRE - Perl-compatible regular expressions
.sp
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl, with just a few
-differences. (Certain features that appeared in Python and PCRE before they
-appeared in Perl are also available using the Python syntax.)
+differences. Certain features that appeared in Python and PCRE before they
+appeared in Perl are also available using the Python syntax. There is also some
+support for certain .NET and Oniguruma syntax items, and there is an option for
+requesting some minor changes that give better JavaScript compatibility.
.P
The current implementation of PCRE (release 7.x) corresponds approximately with
Perl 5.10, including support for UTF-8 encoded strings and Unicode general
@@ -287,6 +289,6 @@ two digits 10, at the domain cam.ac.uk.
.rs
.sp
.nf
-Last updated: 09 August 2007
-Copyright (c) 1997-2007 University of Cambridge.
+Last updated: 12 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
.fi
diff --git a/doc/pcre_compile.3 b/doc/pcre_compile.3
index 4e7e402..f934771 100644
--- a/doc/pcre_compile.3
+++ b/doc/pcre_compile.3
@@ -30,31 +30,33 @@ argument. Its arguments are:
.sp
The option bits are:
.sp
- PCRE_ANCHORED Force pattern anchoring
- PCRE_AUTO_CALLOUT Compile automatic callouts
- PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE \eR matches all Unicode line endings
- PCRE_CASELESS Do caseless matching
- PCRE_DOLLAR_ENDONLY $ not to match newline at end
- PCRE_DOTALL . matches anything including NL
- PCRE_DUPNAMES Allow duplicate names for subpatterns
- PCRE_EXTENDED Ignore whitespace and # comments
- PCRE_EXTRA PCRE extra features
- (not much use currently)
- PCRE_FIRSTLINE Force matching to be before newline
- PCRE_MULTILINE ^ and $ match newlines within data
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
- theses (named ones available)
- PCRE_UNGREEDY Invert greediness of quantifiers
- PCRE_UTF8 Run in UTF-8 mode
- PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
- validity (only relevant if
- PCRE_UTF8 is set)
+ PCRE_ANCHORED Force pattern anchoring
+ PCRE_AUTO_CALLOUT Compile automatic callouts
+ PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
+ PCRE_BSR_UNICODE \eR matches all Unicode line endings
+ PCRE_CASELESS Do caseless matching
+ PCRE_DOLLAR_ENDONLY $ not to match newline at end
+ PCRE_DOTALL . matches anything including NL
+ PCRE_DUPNAMES Allow duplicate names for subpatterns
+ PCRE_EXTENDED Ignore whitespace and # comments
+ PCRE_EXTRA PCRE extra features
+ (not much use currently)
+ PCRE_FIRSTLINE Force matching to be before newline
+ PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
+ PCRE_MULTILINE ^ and $ match newlines within data
+ PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
+ PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
+ sequences
+ PCRE_NEWLINE_CR Set CR as the newline sequence
+ PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
+ PCRE_NEWLINE_LF Set LF as the newline sequence
+ PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
+ theses (named ones available)
+ PCRE_UNGREEDY Invert greediness of quantifiers
+ PCRE_UTF8 Run in UTF-8 mode
+ PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
+ validity (only relevant if
+ PCRE_UTF8 is set)
.sp
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
PCRE_NO_UTF8_CHECK.
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 2320286..0174489 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -549,6 +549,20 @@ If this option is set, an unanchored pattern is required to match before or at
the first newline in the subject string, though the matched text may continue
over the newline.
.sp
+ PCRE_JAVASCRIPT_COMPAT
+.sp
+If this option is set, PCRE's behaviour is changed in some ways so that it is
+compatible with JavaScript rather than Perl. The changes are as follows:
+.P
+(1) A lone closing square bracket in a pattern causes a compile-time error,
+because this is illegal in JavaScript (by default it is treated as a data
+character). Thus, the pattern AB]CD becomes illegal when this option is set.
+.P
+(2) At run time, a back reference to an unset subpattern group matches an empty
+string (by default this causes the current matching path to fail). A pattern
+such as (\e1)(a) succeeds when this option is set (assuming it can find an "a"
+in the subject), whereas it fails by default, for Perl compatibility.
+.sp
PCRE_MULTILINE
.sp
By default, PCRE treats the subject string as consisting of a single line of
@@ -717,14 +731,15 @@ out of use. To avoid confusion, they have not been re-used.
54 DEFINE group contains more than one branch
55 repeating a DEFINE group is not allowed
56 inconsistent NEWLINE options
- 57 \eg is not followed by a braced name or an optionally braced
- non-zero number
- 58 (?+ or (?- or (?(+ or (?(- must be followed by a non-zero number
+ 57 \eg is not followed by a braced, angle-bracketed, or quoted
+ name/number or by a plain number
+ 58 a numbered reference must not be zero
59 (*VERB) with an argument is not supported
60 (*VERB) not recognized
61 number is too big
62 subpattern name expected
63 digit expected after (?+
+ 64 ] is an invalid data character in JavaScript compatibility mode
.sp
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
be used if the limits were changed when PCRE was built.
@@ -1960,6 +1975,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 23 January 2008
+Last updated: 12 April 2008
Copyright (c) 1997-2008 University of Cambridge.
.fi
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index 7e7d80f..a74797c 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -171,6 +171,7 @@ not correspond to anything in Perl:
\fB/N\fP PCRE_NO_AUTO_CAPTURE
\fB/U\fP PCRE_UNGREEDY
\fB/X\fP PCRE_EXTRA
+ \fB/<JS>\fP PCRE_JAVASCRIPT_COMPAT
\fB/<cr>\fP PCRE_NEWLINE_CR
\fB/<lf>\fP PCRE_NEWLINE_LF
\fB/<crlf>\fP PCRE_NEWLINE_CRLF
@@ -717,6 +718,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 18 December 2007
-Copyright (c) 1997-2007 University of Cambridge.
+Last updated: 12 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
.fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 492222a..9b10356 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -302,7 +302,8 @@ static const char error_texts[] =
"(*VERB) not recognized\0"
"number is too big\0"
"subpattern name expected\0"
- "digit expected after (?+";
+ "digit expected after (?+\0"
+ "] is an invalid data character in JavaScript compatibility mode";
/* Table to identify digits and hex digits. This is used when compiling
@@ -2654,7 +2655,17 @@ for (;; ptr++)
opcode is compiled. It may optionally have a bit map for characters < 256,
but those above are explicitly listed afterwards. A flag byte tells
whether the bitmap is present, and whether this is a negated class or not.
- */
+
+ In JavaScript compatibility mode, an isolated ']' causes an error. In
+ default (Perl) mode, it is treated as a data character. */
+
+ case ']':
+ if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
+ {
+ *errorcodeptr = ERR64;
+ goto FAILED;
+ }
+ goto NORMAL_CHAR;
case '[':
previous = code;
diff --git a/pcre_exec.c b/pcre_exec.c
index 89fe6c2..dceb244 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -1731,16 +1731,25 @@ for (;;)
case OP_REF:
{
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3; /* Advance past item */
-
- /* If the reference is unset, set the length to be longer than the amount
- of subject left; this ensures that every attempt at a match fails. We
- can't just fail here, because of the possibility of quantifiers with zero
- minima. */
-
- length = (offset >= offset_top || md->offset_vector[offset] < 0)?
- md->end_subject - eptr + 1 :
- md->offset_vector[offset+1] - md->offset_vector[offset];
+ ecode += 3;
+
+ /* If the reference is unset, there are two possibilities:
+
+ (a) In the default, Perl-compatible state, set the length to be longer
+ than the amount of subject left; this ensures that every attempt at a
+ match fails. We can't just fail here, because of the possibility of
+ quantifiers with zero minima.
+
+ (b) If the JavaScript compatibility flag is set, set the length to zero
+ so that the back reference matches an empty string.
+
+ Otherwise, set the length to the length of what was matched by the
+ referenced subpattern. */
+
+ if (offset >= offset_top || md->offset_vector[offset] < 0)
+ length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
+ else
+ length = md->offset_vector[offset+1] - md->offset_vector[offset];
/* Set up for repetition, or handle the non-repeated case */
@@ -4458,6 +4467,7 @@ end_subject = md->end_subject;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
md->notbol = (options & PCRE_NOTBOL) != 0;
md->noteol = (options & PCRE_NOTEOL) != 0;
diff --git a/pcre_internal.h b/pcre_internal.h
index a2a30f4..54d9c01 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -514,7 +514,8 @@ time, run time, or study time, respectively. */
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
- PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)
+ PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
+ PCRE_JAVASCRIPT_COMPAT)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
@@ -884,7 +885,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
- ERR60, ERR61, ERR62, ERR63 };
+ ERR60, ERR61, ERR62, ERR63, ERR64 };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
@@ -1009,6 +1010,7 @@ typedef struct match_data {
BOOL notbol; /* NOTBOL flag */
BOOL noteol; /* NOTEOL flag */
BOOL utf8; /* UTF8 flag */
+ BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */
BOOL partial; /* PARTIAL flag */
diff --git a/pcreposix.c b/pcreposix.c
index d129c02..b09bba9 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -126,7 +126,8 @@ static const int eint[] = {
REG_BADPAT, /* (?+ or (?- must be followed by a non-zero number */
REG_BADPAT, /* number is too big */
REG_BADPAT, /* subpattern name expected */
- REG_BADPAT /* digit expected after (?+ */
+ REG_BADPAT, /* digit expected after (?+ */
+ REG_BADPAT /* ] is an invalid data character in JavaScript compatibility mode */
};
/* Table of texts corresponding to POSIX error codes */
diff --git a/pcretest.c b/pcretest.c
index d31bf3c..d195676 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -1247,10 +1247,18 @@ while (!done)
case '<':
{
- int x = check_newline(pp, outfile);
- if (x == 0) goto SKIP_DATA;
- options |= x;
- while (*pp++ != '>');
+ if (strncmp((char *)pp, "JS>", 3) == 0)
+ {
+ options |= PCRE_JAVASCRIPT_COMPAT;
+ pp += 3;
+ }
+ else
+ {
+ int x = check_newline(pp, outfile);
+ if (x == 0) goto SKIP_DATA;
+ options |= x;
+ while (*pp++ != '>');
+ }
}
break;
diff --git a/testdata/testinput2 b/testdata/testinput2
index 16e712a..52d4ef8 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -2655,4 +2655,16 @@ a random value. /Ix
** Failers
xxz
+/(\3)(\1)(a)/
+ cat
+
+/(\3)(\1)(a)/<JS>
+ cat
+
+/TA]/
+ The ACTA] comes
+
+/TA]/<JS>
+ The ACTA] comes
+
/ End of testinput2 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 1987cf8..783e383 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -9527,4 +9527,22 @@ No match
xxz
No match
+/(\3)(\1)(a)/
+ cat
+No match
+
+/(\3)(\1)(a)/<JS>
+ cat
+ 0: a
+ 1:
+ 2:
+ 3: a
+
+/TA]/
+ The ACTA] comes
+ 0: TA]
+
+/TA]/<JS>
+Failed: ] is an invalid data character in JavaScript compatibility mode at offset 2
+
/ End of testinput2 /