diff options
-rw-r--r-- | pod/perldebguts.pod | 4 | ||||
-rw-r--r-- | regcomp.c | 60 | ||||
-rw-r--r-- | regcomp.sym | 2 | ||||
-rw-r--r-- | regexec.c | 9 | ||||
-rw-r--r-- | regnodes.h | 2 | ||||
-rw-r--r-- | t/re/anyof.t | 2 |
6 files changed, 56 insertions, 23 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index fee1d6900e..15b716b630 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -613,7 +613,9 @@ will be lost. posixl ANYOFH sv 1 Like ANYOF, but only has "High" matches, - none in the bitmap; + none in the bitmap; the flags field + contains the lowest matchable UTF-8 start + byte ANYOFHb sv 1 Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field @@ -1582,7 +1582,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, unsigned int i; const U32 n = ARG(node); bool new_node_has_latin1 = FALSE; - const U8 flags = OP(node) == ANYOFHb ? 0 : ANYOF_FLAGS(node); + const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHb)) + ? 0 + : ANYOF_FLAGS(node); PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; @@ -1722,7 +1724,9 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * another SSC or a regular ANYOF class. Can create false positives. */ SV* anded_cp_list; - U8 and_with_flags = (OP(and_with) == ANYOFHb) ? 0 : ANYOF_FLAGS(and_with); + U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHb) + ? 0 + : ANYOF_FLAGS(and_with); U8 anded_flags; PERL_ARGS_ASSERT_SSC_AND; @@ -1906,7 +1910,9 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, SV* ored_cp_list; U8 ored_flags; - U8 or_with_flags = (OP(or_with) == ANYOFHb) ? 0 : ANYOF_FLAGS(or_with); + U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHb) + ? 0 + : ANYOF_FLAGS(or_with); PERL_ARGS_ASSERT_SSC_OR; @@ -19024,25 +19030,37 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, && ! upper_latin1_only_utf8_matches && anyof_flags == 0) { + U8 low_utf8[UTF8_MAXBYTES+1]; UV highest_cp = invlist_highest(cp_list); - /* If the lowest and highest code point in the class have the same - * UTF-8 first byte, then all do, and we can store that byte for - * regexec.c to use so that it can more quickly scan the target - * string for potential matches for this class. We co-opt the - * flags field for this, and make the node ANYOFb. We do accept - * here very large code points (for future use), but don't do - * this optimization for them, as it would cause other - * complications */ op = ANYOFH; + + /* Currently the maximum allowed code point by the system is + * IV_MAX. Higher ones are reserved for future internal use. This + * particular regnode can be used for higher ones, but we can't + * calculate the code point of those. IV_MAX suffices though, as + * it will be a large first byte */ + (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX)); + + /* We store the lowest possible first byte of the UTF-8 + * representation, using the flags field. This allows for quick + * ruling out of some inputs without having to convert from UTF-8 + * to code point. For EBCDIC, this has to be I8. */ + anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]); + + /* If the lowest and highest code point in the class have the same + * UTF-8 first byte, then all have that byte, and we can get an + * exact first byte instead of a minimum. We signal this with a + * different regnode */ if (highest_cp <= IV_MAX) { - U8 low_utf8[UTF8_MAXBYTES+1]; U8 high_utf8[UTF8_MAXBYTES+1]; - (void) uvchr_to_utf8(low_utf8, start[0]); - (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list)); + (void) uvchr_to_utf8(high_utf8, highest_cp); if (low_utf8[0] == high_utf8[0]) { + + /* No need to convert to I8 for EBCDIC as this is an exact + * match */ anyof_flags = low_utf8[0]; op = ANYOFHb; } @@ -20348,7 +20366,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* 2: embedded, otherwise 1 */ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { - const U8 flags = (OP(o) == ANYOFHb) ? 0 : ANYOF_FLAGS(o); + const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHb) + ? 0 + : ANYOF_FLAGS(o); bool do_sep = FALSE; /* Do we need to separate various components of the output? */ /* Set if there is still an unresolved user-defined property */ @@ -20502,8 +20522,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* And finally the matching, closing ']' */ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); - if (OP(o) == ANYOFHb) { - Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o)); + if (inRANGE(OP(o), ANYOFH, ANYOFHb)) { + Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x", FLAGS(o)); + if (OP(o) == ANYOFH) { + /* Not strictly true for 32-bit or EBCDIC, but good + * enough */ + Perl_sv_catpvf(aTHX_ sv, "..\\xff"); + } + Perl_sv_catpvf(aTHX_ sv, ")"); } diff --git a/regcomp.sym b/regcomp.sym index 4b8885d0b3..9e2c6d3aea 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -64,7 +64,7 @@ ANYOFL ANYOF, sv charclass S ; Like ANYOF, but /l is in effect ANYOFPOSIXL ANYOF, sv charclass_posixl S ; Like ANYOFL, but matches [[:posix:]] classes # Must be sequential -ANYOFH ANYOF, sv 1 S ; Like ANYOF, but only has "High" matches, none in the bitmap; +ANYOFH ANYOF, sv 1 S ; Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte ANYOFHb ANYOF, sv 1 S ; Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field ANYOFM ANYOFM byte 1 S ; Like ANYOF, but matches an invariant byte as determined by the mask and arg @@ -2179,7 +2179,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ANYOFH: if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ REXEC_FBC_CLASS_SCAN(TRUE, - reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); + ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); } break; @@ -6805,6 +6806,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFH: if ( ! utf8_target || NEXTCHR_IS_EOS + || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput) || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, utf8_target)) { @@ -9592,6 +9594,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ while ( hardcount < max && scan < this_eol + && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p) && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) { scan += UTF8SKIP(scan); @@ -9859,7 +9862,9 @@ STATIC bool S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; - const char flags = (OP(n) == ANYOFHb) ? 0 : ANYOF_FLAGS(n); + const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHb)) + ? 0 + : ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; diff --git a/regnodes.h b/regnodes.h index 487b6c2dee..5e39b5035b 100644 --- a/regnodes.h +++ b/regnodes.h @@ -33,7 +33,7 @@ #define ANYOFD 19 /* 0x13 Like ANYOF, but /d is in effect */ #define ANYOFL 20 /* 0x14 Like ANYOF, but /l is in effect */ #define ANYOFPOSIXL 21 /* 0x15 Like ANYOFL, but matches [[:posix:]] classes */ -#define ANYOFH 22 /* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; */ +#define ANYOFH 22 /* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte */ #define ANYOFHb 23 /* 0x17 Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field */ #define ANYOFM 24 /* 0x18 Like ANYOF, but matches an invariant byte as determined by the mask and arg */ #define NANYOFM 25 /* 0x19 complement of ANYOFM */ diff --git a/t/re/anyof.t b/t/re/anyof.t index aafcdb090c..b7656d68a5 100644 --- a/t/re/anyof.t +++ b/t/re/anyof.t @@ -874,7 +874,7 @@ while (defined (my $test = shift @tests)) { my $result = get_compiled($test); if ($expected =~ / ^ ANYOFH /x) { like($result, qr/ ^ \Q$expected\E (?:\Q (First UTF-8 byte=\x\E - [[:xdigit:]]{2}\) )? $ /x, $test_name); + [[:xdigit:]]{2} )? /x, $test_name); } else { is($result, $expected, |