summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pod/perldebguts.pod4
-rw-r--r--regcomp.c60
-rw-r--r--regcomp.sym2
-rw-r--r--regexec.c9
-rw-r--r--regnodes.h2
-rw-r--r--t/re/anyof.t2
6 files changed, 56 insertions, 23 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index fee1d6900e..15b716b630 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -613,7 +613,9 @@ will be lost.
posixl
ANYOFH sv 1 Like ANYOF, but only has "High" matches,
- none in the bitmap;
+ none in the bitmap; the flags field
+ contains the lowest matchable UTF-8 start
+ byte
ANYOFHb sv 1 Like ANYOFH, but all matches share the same
UTF-8 start byte, given in the flags field
diff --git a/regcomp.c b/regcomp.c
index e54bc1976b..00460392ce 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1582,7 +1582,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
- const U8 flags = OP(node) == ANYOFHb ? 0 : ANYOF_FLAGS(node);
+ const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHb))
+ ? 0
+ : ANYOF_FLAGS(node);
PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
@@ -1722,7 +1724,9 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
* another SSC or a regular ANYOF class. Can create false positives. */
SV* anded_cp_list;
- U8 and_with_flags = (OP(and_with) == ANYOFHb) ? 0 : ANYOF_FLAGS(and_with);
+ U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHb)
+ ? 0
+ : ANYOF_FLAGS(and_with);
U8 anded_flags;
PERL_ARGS_ASSERT_SSC_AND;
@@ -1906,7 +1910,9 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
SV* ored_cp_list;
U8 ored_flags;
- U8 or_with_flags = (OP(or_with) == ANYOFHb) ? 0 : ANYOF_FLAGS(or_with);
+ U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHb)
+ ? 0
+ : ANYOF_FLAGS(or_with);
PERL_ARGS_ASSERT_SSC_OR;
@@ -19024,25 +19030,37 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
&& ! upper_latin1_only_utf8_matches
&& anyof_flags == 0)
{
+ U8 low_utf8[UTF8_MAXBYTES+1];
UV highest_cp = invlist_highest(cp_list);
- /* If the lowest and highest code point in the class have the same
- * UTF-8 first byte, then all do, and we can store that byte for
- * regexec.c to use so that it can more quickly scan the target
- * string for potential matches for this class. We co-opt the
- * flags field for this, and make the node ANYOFb. We do accept
- * here very large code points (for future use), but don't do
- * this optimization for them, as it would cause other
- * complications */
op = ANYOFH;
+
+ /* Currently the maximum allowed code point by the system is
+ * IV_MAX. Higher ones are reserved for future internal use. This
+ * particular regnode can be used for higher ones, but we can't
+ * calculate the code point of those. IV_MAX suffices though, as
+ * it will be a large first byte */
+ (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+ /* We store the lowest possible first byte of the UTF-8
+ * representation, using the flags field. This allows for quick
+ * ruling out of some inputs without having to convert from UTF-8
+ * to code point. For EBCDIC, this has to be I8. */
+ anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+ /* If the lowest and highest code point in the class have the same
+ * UTF-8 first byte, then all have that byte, and we can get an
+ * exact first byte instead of a minimum. We signal this with a
+ * different regnode */
if (highest_cp <= IV_MAX) {
- U8 low_utf8[UTF8_MAXBYTES+1];
U8 high_utf8[UTF8_MAXBYTES+1];
- (void) uvchr_to_utf8(low_utf8, start[0]);
- (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+ (void) uvchr_to_utf8(high_utf8, highest_cp);
if (low_utf8[0] == high_utf8[0]) {
+
+ /* No need to convert to I8 for EBCDIC as this is an exact
+ * match */
anyof_flags = low_utf8[0];
op = ANYOFHb;
}
@@ -20348,7 +20366,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
- const U8 flags = (OP(o) == ANYOFHb) ? 0 : ANYOF_FLAGS(o);
+ const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHb)
+ ? 0
+ : ANYOF_FLAGS(o);
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
@@ -20502,8 +20522,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
- if (OP(o) == ANYOFHb) {
- Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+ if (inRANGE(OP(o), ANYOFH, ANYOFHb)) {
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x", FLAGS(o));
+ if (OP(o) == ANYOFH) {
+ /* Not strictly true for 32-bit or EBCDIC, but good
+ * enough */
+ Perl_sv_catpvf(aTHX_ sv, "..\\xff");
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
}
diff --git a/regcomp.sym b/regcomp.sym
index 4b8885d0b3..9e2c6d3aea 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -64,7 +64,7 @@ ANYOFL ANYOF, sv charclass S ; Like ANYOF, but /l is in effect
ANYOFPOSIXL ANYOF, sv charclass_posixl S ; Like ANYOFL, but matches [[:posix:]] classes
# Must be sequential
-ANYOFH ANYOF, sv 1 S ; Like ANYOF, but only has "High" matches, none in the bitmap;
+ANYOFH ANYOF, sv 1 S ; Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte
ANYOFHb ANYOF, sv 1 S ; Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field
ANYOFM ANYOFM byte 1 S ; Like ANYOF, but matches an invariant byte as determined by the mask and arg
diff --git a/regexec.c b/regexec.c
index 6eec9c2400..a205696d87 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2179,7 +2179,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
case ANYOFH:
if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
REXEC_FBC_CLASS_SCAN(TRUE,
- reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+ ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+ && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
}
break;
@@ -6805,6 +6806,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
case ANYOFH:
if ( ! utf8_target
|| NEXTCHR_IS_EOS
+ || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput)
|| ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
utf8_target))
{
@@ -9592,6 +9594,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
while ( hardcount < max
&& scan < this_eol
+ && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
&& reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
{
scan += UTF8SKIP(scan);
@@ -9859,7 +9862,9 @@ STATIC bool
S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
{
dVAR;
- const char flags = (OP(n) == ANYOFHb) ? 0 : ANYOF_FLAGS(n);
+ const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHb))
+ ? 0
+ : ANYOF_FLAGS(n);
bool match = FALSE;
UV c = *p;
diff --git a/regnodes.h b/regnodes.h
index 487b6c2dee..5e39b5035b 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -33,7 +33,7 @@
#define ANYOFD 19 /* 0x13 Like ANYOF, but /d is in effect */
#define ANYOFL 20 /* 0x14 Like ANYOF, but /l is in effect */
#define ANYOFPOSIXL 21 /* 0x15 Like ANYOFL, but matches [[:posix:]] classes */
-#define ANYOFH 22 /* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; */
+#define ANYOFH 22 /* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte */
#define ANYOFHb 23 /* 0x17 Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field */
#define ANYOFM 24 /* 0x18 Like ANYOF, but matches an invariant byte as determined by the mask and arg */
#define NANYOFM 25 /* 0x19 complement of ANYOFM */
diff --git a/t/re/anyof.t b/t/re/anyof.t
index aafcdb090c..b7656d68a5 100644
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -874,7 +874,7 @@ while (defined (my $test = shift @tests)) {
my $result = get_compiled($test);
if ($expected =~ / ^ ANYOFH /x) {
like($result, qr/ ^ \Q$expected\E (?:\Q (First UTF-8 byte=\x\E
- [[:xdigit:]]{2}\) )? $ /x, $test_name);
+ [[:xdigit:]]{2} )? /x, $test_name);
}
else {
is($result, $expected,