summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2019-02-13 17:30:24 +0000
committer: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2019-02-13 17:30:24 +0000
commit: 88ef1a460d137e42bad6f3b958998b379ba0fb38 (patch)
tree: 19990e84521ff5e74413f2c232286f1f4bbc656b
parent: c93653c87aa0bf7a24038961afb1189005a256a6 (diff)
download: pcre2-88ef1a460d137e42bad6f3b958998b379ba0fb38.tar.gz
Compile \p{Any} the same as . in DOTALL mode, to benefit from auto-anchoring.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1072 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r-- ChangeLog 3
-rw-r--r-- src/pcre2_compile.c 59
-rw-r--r-- testdata/testinput5 4
-rw-r--r-- testdata/testoutput5 47
4 files changed, 70 insertions, 43 deletions
diff --git a/ChangeLog b/ChangeLog
index 9997ba0..990a6a3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -128,6 +128,9 @@ ClusterFuzz 12950, fixed before release.
31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh}
construct.
+32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits
+from auto-anchoring if \p{Any}* starts a pattern.
+
Version 10.32 10-September-2018
-------------------------------
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 5a6f88c..90d30a5 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -1459,7 +1459,7 @@ Returns: zero => a data character
int
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
- int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
+ int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
compile_block *cb)
{
BOOL utf = (options & PCRE2_UTF) != 0;
@@ -1551,7 +1551,7 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
/* Escapes that need further processing, including those that are unknown, have
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
-\o, and \x are recognized (\u and \U can never appear as they are used for case
+\o, and \x are recognized (\u and \U can never appear as they are used for case
forcing). */
else
@@ -1559,7 +1559,7 @@ else
int s;
PCRE2_SPTR oldptr;
BOOL overflow;
- BOOL alt_bsux =
+ BOOL alt_bsux =
((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
/* Filter calls from pcre2_substitute(). */
@@ -1571,8 +1571,8 @@ else
*errorcodeptr = ERR3;
return 0;
}
- alt_bsux = FALSE; /* Do not modify \x handling */
- }
+ alt_bsux = FALSE; /* Do not modify \x handling */
+ }
switch (c)
{
@@ -1595,37 +1595,37 @@ else
if (!alt_bsux) *errorcodeptr = ERR37; else
{
uint32_t xc;
-
+
if (ptr >= ptrend) break;
- if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
+ if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
(extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
{
PCRE2_SPTR hptr = ptr + 1;
cc = 0;
-
+
while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
- {
+ {
if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
{
*errorcodeptr = ERR77;
ptr = hptr; /* Show where */
- break; /* *hptr != } will cause another break below */
- }
+ break; /* *hptr != } will cause another break below */
+ }
cc = (cc << 4) | xc;
- hptr++;
- }
-
+ hptr++;
+ }
+
if (hptr == ptr + 1 || /* No hex digits */
hptr >= ptrend || /* Hit end of input */
*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
break; /* Hex escape not recognized */
-
+
c = cc; /* Accept the code point */
- ptr = hptr + 1;
+ ptr = hptr + 1;
}
-
+
else /* Must be exactly 4 hex digits */
- {
+ {
if (ptrend - ptr < 4) break; /* Less than 4 chars */
if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
@@ -1635,8 +1635,8 @@ else
if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc;
ptr += 4;
- }
-
+ }
+
if (utf)
{
if (c > 0x10ffffU) *errorcodeptr = ERR77;
@@ -3424,7 +3424,7 @@ while (ptr < ptrend)
else
{
tempptr = ptr;
- escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
+ escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
cb->cx->extra_options, TRUE, cb);
if (errorcode != 0)
@@ -7631,9 +7631,20 @@ for (;; pptr++)
{
uint32_t ptype = *(++pptr) >> 16;
uint32_t pdata = *pptr & 0xffff;
- *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
- *code++ = ptype;
- *code++ = pdata;
+
+ /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
+ from the auto-anchoring code. */
+
+ if (meta_arg == ESC_p && ptype == PT_ANY)
+ {
+ *code++ = OP_ALLANY;
+ }
+ else
+ {
+ *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
+ *code++ = ptype;
+ *code++ = pdata;
+ }
break; /* End META_ESCAPE */
}
#endif
diff --git a/testdata/testinput5 b/testdata/testinput5
index 2c4e847..7c58145 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -2170,4 +2170,8 @@
/(?'X²ABC'...)/utf
+# -------
+
+/\p{Any}*xyz/I
+
# End of testinput5
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 23438dd..5d64d00 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -3294,27 +3294,27 @@ No match
/\p{Any}+\p{Any} \p{Any}+\P{Any} \p{Any}+\p{L&} \p{Any}+\p{L} \p{Any}+\p{Lu} \p{Any}+\p{Han} \p{Any}+\p{Xan} \p{Any}+\p{Xsp} \p{Any}+\p{Xps} \p{Xwd}+\p{Any} \p{Any}+\p{Xuc}/Bx,ucp
------------------------------------------------------------------
Bra
- prop Any +
- prop Any
- prop Any +
+ AllAny+
+ AllAny
+ AllAny+
notprop Any
- prop Any +
+ AllAny+
prop L&
- prop Any +
+ AllAny+
prop L
- prop Any +
+ AllAny+
prop Lu
- prop Any +
+ AllAny+
prop Han
- prop Any +
+ AllAny+
prop Xan
- prop Any +
+ AllAny+
prop Xsp
- prop Any +
+ AllAny+
prop Xps
prop Xwd +
- prop Any
- prop Any +
+ AllAny
+ AllAny+
prop Xuc
Ket
End
@@ -3324,7 +3324,7 @@ No match
------------------------------------------------------------------
Bra
prop L& +
- prop Any
+ AllAny
prop L& +
prop L&
notprop L& ++
@@ -3355,7 +3355,7 @@ No match
------------------------------------------------------------------
Bra
prop N +
- prop Any
+ AllAny
prop N +
prop L&
prop N ++
@@ -3386,7 +3386,7 @@ No match
------------------------------------------------------------------
Bra
prop Lu +
- prop Any
+ AllAny
prop Lu +
prop L&
prop Lu +
@@ -3448,7 +3448,7 @@ No match
------------------------------------------------------------------
Bra
prop Xan +
- prop Any
+ AllAny
prop Xan +
prop L&
notprop Xan ++
@@ -3479,7 +3479,7 @@ No match
------------------------------------------------------------------
Bra
prop Xsp +
- prop Any
+ AllAny
prop Xsp ++
prop L&
prop Xsp ++
@@ -3508,7 +3508,7 @@ No match
------------------------------------------------------------------
Bra
prop Xwd +
- prop Any
+ AllAny
prop Xwd +
prop L&
prop Xwd +
@@ -3537,7 +3537,7 @@ No match
------------------------------------------------------------------
Bra
prop Xuc +
- prop Any
+ AllAny
prop Xuc +
prop L&
prop Xuc +
@@ -4924,4 +4924,13 @@ Failed: error 162 at offset 3: subpattern name expected
/(?'X²ABC'...)/utf
Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?)
+# -------
+
+/\p{Any}*xyz/I
+Capture group count = 0
+Compile options: <none>
+Overall options: anchored
+Last code unit = 'z'
+Subject length lower bound = 3
+
# End of testinput5