diff options
author | Karl Williamson <khw@cpan.org> | 2022-06-29 11:09:50 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-07-10 11:56:49 -0600 |
commit | bcdc9e1e7a1c864e7edbd9f18610fc988429fbcc (patch) | |
tree | 7acd316ab25352cc75040e12a01f564a3d79d6e2 | |
parent | 543b9dd5e1ac26dc8f1c485eb2a370d5b37bc7a7 (diff) | |
download | perl-bcdc9e1e7a1c864e7edbd9f18610fc988429fbcc.tar.gz |
regex: Refactor bitmap vs non-bitmap of qr/[]/
A bracketed character class in a pattern is generally represented by
some form of ANYOF node, with matches of characters in the Latin1 range
handled by a bitmap, and an inversion list for higher code point
matches. But some patterns only have low matches, and some only high,
and some match everything that is high.
This commit refactors a little so that the distinction between nothing
high matches vs everything high matches is done through the same
technique. Previously one was indicated by a flag, and the other by a
special value in the node's structure. Now there are two special
values, and the flag is freed up for a potential future use. In the
past the meaning of the flags has had to be overloaded go accommodate
all the needs. freeing of a flag means
This all allows for some slight simplicfications.
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | invlist_inline.h | 36 | ||||
-rw-r--r-- | proto.h | 7 | ||||
-rw-r--r-- | regcomp.c | 81 | ||||
-rw-r--r-- | regcomp.h | 48 | ||||
-rw-r--r-- | regexec.c | 27 |
7 files changed, 123 insertions, 78 deletions
@@ -2031,6 +2031,7 @@ EiRT |bool |invlist_is_iterating|NN const SV* const invlist EiR |SV* |invlist_contents|NN SV* const invlist \ |const bool traditional_style EixRT |UV |invlist_lowest|NN SV* const invlist +EixRT |UV |invlist_highest_range_start|NN SV* const invlist ERS |SV* |make_exactf_invlist |NN RExC_state_t *pRExC_state \ |NN regnode *node ES |regnode_offset|reg_la_NOTHING |NN RExC_state_t *pRExC_state \ @@ -1026,6 +1026,7 @@ #define handle_regex_sets(a,b,c,d) S_handle_regex_sets(aTHX_ a,b,c,d) #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j) #define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b) +#define invlist_highest_range_start S_invlist_highest_range_start #define invlist_is_iterating S_invlist_is_iterating #define invlist_lowest S_invlist_lowest #define is_ssc_worth_it S_is_ssc_worth_it diff --git a/invlist_inline.h b/invlist_inline.h index 9c6ee603af..f3f22382eb 100644 --- a/invlist_inline.h +++ b/invlist_inline.h @@ -161,6 +161,42 @@ S_invlist_highest(SV* const invlist) : array[len - 1] - 1; } +# if defined(PERL_IN_REGCOMP_C) + +PERL_STATIC_INLINE UV +S_invlist_highest_range_start(SV* const invlist) +{ + /* Returns the lowest code point of the highest range in the inversion list + * parameter. This API has an ambiguity, as it returns 0 under either the + * lowest is actually 0, or if the list is empty. If this distinction + * matters to you, check for emptiness before calling this function */ + + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_HIGHEST_RANGE_START; + + if (len == 0) { + return 0; + } + + array = invlist_array(invlist); + + /* The last element in the array in the inversion list always starts a + * range that goes to infinity. That range may be for code points that are + * matched in the inversion list, or it may be for ones that aren't + * matched. In the first case, the lowest code point in the matching range + * is that the one that started the range. If the other case, the final + * matching range begins at the next element down (which may be 0 in the + * edge case). */ + return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) + ? array[len - 1] + : len == 1 + ? 0 + : array[len - 2]; +} + +# endif #endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) @@ -6334,6 +6334,13 @@ PERL_STATIC_INLINE SV* S_invlist_contents(pTHX_ SV* const invlist, const bool tr #endif #ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE UV S_invlist_highest_range_start(SV* const invlist) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_INVLIST_HIGHEST_RANGE_START \ + assert(invlist) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE bool S_invlist_is_iterating(const SV* const invlist) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_INVLIST_IS_ITERATING \ @@ -1703,7 +1703,7 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) Zero(ssc, 1, regnode_ssc); set_ANYOF_SYNTHETIC(ssc); - ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP); + ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); ssc_anything(ssc); /* If any portion of the regex is to operate under locale rules that aren't @@ -1770,7 +1770,6 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, SV* invlist = NULL; SV* only_utf8_locale_invlist = NULL; - const U32 n = ARG(node); bool new_node_has_latin1 = FALSE; const U8 flags = (PL_regkind[OP(node)] == ANYOF) ? ANYOF_FLAGS(node) @@ -1779,8 +1778,12 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; /* Look at the data structure created by S_set_ANYOF_arg() */ - if (n != ANYOF_ONLY_HAS_BITMAP) { - assert(RExC_rxi->data->what[n] == 's'); + if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { + invlist = sv_2mortal(_new_invlist(1)); + invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX); + } + else if (ANYOF_HAS_AUX(node)) { + const U32 n = ARG(node); SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); AV * const av = MUTABLE_AV(SvRV(rv)); SV **const ary = AvARRAY(av); @@ -1855,7 +1858,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, } /* Similarly for these */ - if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); } @@ -1911,8 +1914,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) /* 'AND' a given class with another one. Can create false positives. 'ssc' - * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be - * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */ + * should not be inverted. */ STATIC void S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, @@ -15930,10 +15932,6 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) UV high; int i; - if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) { - ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP; - } - /* Quit if are above what we should change */ if (start >= NUM_ANYOF_CODE_POINTS) { break; @@ -15952,14 +15950,10 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) invlist_iterfinish(*invlist_ptr); /* Done with loop; remove any code points that are in the bitmap from - * *invlist_ptr; similarly for code points above the bitmap if we have - * a flag to match all of them anyways */ + * *invlist_ptr */ if (change_invlist) { _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr); } - if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { - _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr); - } /* If have completely emptied it, remove it completely */ if (_invlist_len(*invlist_ptr) == 0) { @@ -20550,11 +20544,13 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, SV* const only_utf8_locale_list) { /* Sets the arg field of an ANYOF-type node 'node', using information about - * the node passed-in. If there is nothing outside the node's bitmap, the - * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to - * the count returned by add_data(), having allocated and stored an array, - * av, as follows: + * the node passed-in. If only the bitmap is needed to determine what + * matches, the arg is set appropriately to either + * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE + * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE * + * Otherwise, it sets the argument to the count returned by add_data(), + * having allocated and stored an array, av, as follows: * av[0] stores the inversion list defining this class as far as known at * this time, or PL_sv_undef if nothing definite is now known. * av[1] stores the inversion list of code points that match only if the @@ -20569,12 +20565,21 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, if ( PL_regkind[OP(node)] == ANYOF && ! runtime_defns - && ! only_utf8_locale_list - && ! cp_list) + && ! only_utf8_locale_list) { - ARG_SET(node, ANYOF_ONLY_HAS_BITMAP); + if (! cp_list) { + ARG_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE); + return; + } + + if ( invlist_highest(cp_list) == UV_MAX + && invlist_highest_range_start(cp_list) <= NUM_ANYOF_CODE_POINTS) + { + ARG_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); + return; + } } - else { + AV * const av = newAV(); SV *rv; @@ -20599,7 +20604,6 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; ARG_SET(node, n); - } } SV * @@ -21763,7 +21767,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ else if (k == ANYOF || k == ANYOFH || k == ANYOFR) { U8 flags; char * bitmap; - U32 arg; bool do_sep = FALSE; /* Do we need to separate various components of the output? */ /* Set if there is still an unresolved user-defined property */ @@ -21783,12 +21786,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ if (k != ANYOF) { flags = 0; bitmap = NULL; - arg = 0; } else { flags = ANYOF_FLAGS(o); bitmap = ANYOF_BITMAP(o); - arg = ARG(o); } if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) { @@ -21830,17 +21831,22 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ ANYOFRbase(o) + ANYOFRdelta(o)); } } - else if (arg != ANYOF_ONLY_HAS_BITMAP) { + else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) { + nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, + NUM_ANYOF_CODE_POINTS, + UV_MAX); + } + else if (ANYOF_HAS_AUX(o)) { (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE, &unresolved, &only_utf8_locale_invlist, &nonbitmap_invlist); - /* The non-bitmap data may contain stuff that could fit in the - * bitmap. This could come from a user-defined property being - * finally resolved when this call was done; or much more likely - * because there are matches that require UTF-8 to be valid, and so - * aren't in the bitmap (or ANYOFR). This is teased apart later */ + /* The aux data may contain stuff that could fit in the bitmap. + * This could come from a user-defined property being finally + * resolved when this call was done; or much more likely because + * there are matches that require UTF-8 to be valid, and so aren't + * in the bitmap (or ANYOFR). This is teased apart later */ _invlist_intersection(nonbitmap_invlist, PL_InBitmap, &bitmap_range_not_in_bitmap); @@ -21850,13 +21856,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ &nonbitmap_invlist); } - /* Obey this flag to add all above-the-bitmap code points */ - if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { - nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, - NUM_ANYOF_CODE_POINTS, - UV_MAX); - } - /* Ready to start outputting. First, the initial left bracket */ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); @@ -422,7 +422,21 @@ struct regnode_ssc { * specify all possible matches (with nothing outside it matching), no * inversion list is needed nor included, and the argument to the ANYOF node is * set to the following: */ -#define ANYOF_ONLY_HAS_BITMAP ((U32) -1) + +#define ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE U32_MAX +#define ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node) \ + (ARG(node) == ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE) + +#define ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE \ + /* Assumes ALL is odd */ (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE - 1) +#define ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(node) \ + (ARG(node) == ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE) + +#define ANYOF_ONLY_HAS_BITMAP_MASK ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE +#define ANYOF_ONLY_HAS_BITMAP(node) \ + ((ARG(node) & ANYOF_ONLY_HAS_BITMAP_MASK) == ANYOF_ONLY_HAS_BITMAP_MASK) + +#define ANYOF_HAS_AUX(node) (! ANYOF_ONLY_HAS_BITMAP(node)) /* There are also ANYOFM nodes, used when the bit patterns representing the * matched code points happen to be such that they can be checked by ANDing @@ -511,27 +525,13 @@ struct regnode_ssc { * supposed to be raised when matching certain categories of code points in the * target string. Flags are set to indicate this. This adds up to a bunch of * flags required, and we only have 8 available. That is why we share some. - * At the moment, there is one spare flag bit, but this could be increased by + * At the moment, there are two spare flag bits, but this could be increased by * various tricks: * - * If just one more bit is needed, as of this writing it seems to khw that the - * simplest choice would be to remove the ANYOF_MATCHES_ALL_ABOVE_BITMAP flag, - * and just add that range to the inversion list. But that would slow down - * some common cases, so something like this could be created - * - * #define ANYOF_MATCHES_ALL_ABOVE_BITMAP ((U32) -2) - * - * and access it through the ARG like ANYOF_ONLY_HAS_BITMAP already is. The - * reginclass() function call could be avoided through this for appropriate - * inputs. However, some cases where everything matches above the bit map - * still need an AV for some things within the bitmap. For those, this - * wouldn't be used, but the inversion list in AV[0] would be extended to match - * everything above the bitmap. - * - * Another possibility is based on the fact that ANYOF_MATCHES_POSIXL is - * redundant with the node type ANYOFPOSIXL. That flag could be removed, but - * at the expense of having to write extra code, which would take up space, and - * writing this turns out to be not hard, but not trivial. + * ANYOF_MATCHES_POSIXL is redundant with the node type ANYOFPOSIXL. That flag + * could be removed, but at the expense of having to write extra code, which + * would take up space, and writing this turns out to be not hard, but not + * trivial. * * If this is done, an extension would be to make all ANYOFL nodes contain the * extra 32 bits that ANYOFPOSIXL ones do, doubling each instance's size. The @@ -570,9 +570,7 @@ struct regnode_ssc { /* Spare: Be sure to change ANYOF_FLAGS_ALL if this gets used 0x10 */ -/* If set, the node matches every code point NUM_ANYOF_CODE_POINTS and above. - * Can be in an SSC */ -#define ANYOF_MATCHES_ALL_ABOVE_BITMAP 0x20 +/* Spare: Be sure to change ANYOF_FLAGS_ALL if this gets used 0x20 */ /* Shared bit that indicates that there are potential additional matches stored * outside the bitmap, as pointed to by the AV given by the node's argument. @@ -612,9 +610,9 @@ struct regnode_ssc { #define ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared 0x80 #define ANYOF_WARN_SUPER__shared 0x80 -#define ANYOF_FLAGS_ALL ((U8) ~0x10) +#define ANYOF_FLAGS_ALL ((U8) ~(0x10|0x20)) -#define ANYOF_LOCALE_FLAGS ( ANYOFL_FOLD \ +#define ANYOF_LOCALE_FLAGS ( ANYOFL_FOLD \ | ANYOF_MATCHES_POSIXL \ | ANYOFL_UTF8_LOCALE_REQD) @@ -2250,15 +2250,15 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ANYOFD_tb_p8: case ANYOF_tb_pb: case ANYOF_tb_p8: - if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) { /* We know that s is in the bitmap range since the target isn't * UTF-8, so what happens for out-of-range values is not relevant, * so exclude that from the flags */ - REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, - 0)); + REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s))); } else { - REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s))); + REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, + 0)); } break; @@ -7401,7 +7401,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS || locinput >= loceol) sayNO; if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput)) - && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP)) + && ! ANYOF_FLAGS(scan) + && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan)) { if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) { sayNO; @@ -10248,7 +10249,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, case ANYOFD_tb: case ANYOF_tb: - if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) { while ( scan < this_eol && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0)) scan++; @@ -10723,10 +10724,13 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that. */ if (!match) { - if (c >= NUM_ANYOF_CODE_POINTS - && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)) + if ( c >= NUM_ANYOF_CODE_POINTS + && ANYOF_ONLY_HAS_BITMAP(n) + && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)) { - match = TRUE; /* Everything above the bitmap matches */ + /* In this case, the ARG is set up so that the final bit indicates + * whether it matches or not */ + match = ARG(n) & 1; } else /* Here, the main way it could match is if the code point is @@ -10735,8 +10739,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * we need special handling. That handling doesn't come in to * effect for ANYOFD nodes unless the target string is UTF-8 and * that matters to code point being matched. */ - if ( ( c >= NUM_ANYOF_CODE_POINTS - && ARG(n) != ANYOF_ONLY_HAS_BITMAP) + if ( c >= NUM_ANYOF_CODE_POINTS || ( (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) && ( UNLIKELY(OP(n) != ANYOFD) || (utf8_target && ! isASCII_uvchr(c) @@ -10753,7 +10756,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * * First check if there is an inversion list and/or auxiliary data. * */ - if (ARG(n) != ANYOF_ONLY_HAS_BITMAP) { + if (ANYOF_HAS_AUX(n)) { SV* only_utf8_locale = NULL; /* This call will return in 'definition' the union of the base |