summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2022-06-29 11:09:50 -0600
committerKarl Williamson <khw@cpan.org>2022-07-10 11:56:49 -0600
commitbcdc9e1e7a1c864e7edbd9f18610fc988429fbcc (patch)
tree7acd316ab25352cc75040e12a01f564a3d79d6e2
parent543b9dd5e1ac26dc8f1c485eb2a370d5b37bc7a7 (diff)
downloadperl-bcdc9e1e7a1c864e7edbd9f18610fc988429fbcc.tar.gz
regex: Refactor bitmap vs non-bitmap of qr/[]/
A bracketed character class in a pattern is generally represented by some form of ANYOF node, with matches of characters in the Latin1 range handled by a bitmap, and an inversion list for higher code point matches. But some patterns only have low matches, and some only high, and some match everything that is high. This commit refactors a little so that the distinction between nothing high matches vs everything high matches is done through the same technique. Previously one was indicated by a flag, and the other by a special value in the node's structure. Now there are two special values, and the flag is freed up for a potential future use. In the past the meaning of the flags has had to be overloaded go accommodate all the needs. freeing of a flag means This all allows for some slight simplicfications.
-rw-r--r--embed.fnc1
-rw-r--r--embed.h1
-rw-r--r--invlist_inline.h36
-rw-r--r--proto.h7
-rw-r--r--regcomp.c81
-rw-r--r--regcomp.h48
-rw-r--r--regexec.c27
7 files changed, 123 insertions, 78 deletions
diff --git a/embed.fnc b/embed.fnc
index 73fd9cc7c2..8cedc5c52b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2031,6 +2031,7 @@ EiRT |bool |invlist_is_iterating|NN const SV* const invlist
EiR |SV* |invlist_contents|NN SV* const invlist \
|const bool traditional_style
EixRT |UV |invlist_lowest|NN SV* const invlist
+EixRT |UV |invlist_highest_range_start|NN SV* const invlist
ERS |SV* |make_exactf_invlist |NN RExC_state_t *pRExC_state \
|NN regnode *node
ES |regnode_offset|reg_la_NOTHING |NN RExC_state_t *pRExC_state \
diff --git a/embed.h b/embed.h
index 97fc9cae5d..1f7bdb58da 100644
--- a/embed.h
+++ b/embed.h
@@ -1026,6 +1026,7 @@
#define handle_regex_sets(a,b,c,d) S_handle_regex_sets(aTHX_ a,b,c,d)
#define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j)
#define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b)
+#define invlist_highest_range_start S_invlist_highest_range_start
#define invlist_is_iterating S_invlist_is_iterating
#define invlist_lowest S_invlist_lowest
#define is_ssc_worth_it S_is_ssc_worth_it
diff --git a/invlist_inline.h b/invlist_inline.h
index 9c6ee603af..f3f22382eb 100644
--- a/invlist_inline.h
+++ b/invlist_inline.h
@@ -161,6 +161,42 @@ S_invlist_highest(SV* const invlist)
: array[len - 1] - 1;
}
+# if defined(PERL_IN_REGCOMP_C)
+
+PERL_STATIC_INLINE UV
+S_invlist_highest_range_start(SV* const invlist)
+{
+ /* Returns the lowest code point of the highest range in the inversion list
+ * parameter. This API has an ambiguity, as it returns 0 under either the
+ * lowest is actually 0, or if the list is empty. If this distinction
+ * matters to you, check for emptiness before calling this function */
+
+ UV len = _invlist_len(invlist);
+ UV *array;
+
+ PERL_ARGS_ASSERT_INVLIST_HIGHEST_RANGE_START;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ array = invlist_array(invlist);
+
+ /* The last element in the array in the inversion list always starts a
+ * range that goes to infinity. That range may be for code points that are
+ * matched in the inversion list, or it may be for ones that aren't
+ * matched. In the first case, the lowest code point in the matching range
+ * is that the one that started the range. If the other case, the final
+ * matching range begins at the next element down (which may be 0 in the
+ * edge case). */
+ return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
+ ? array[len - 1]
+ : len == 1
+ ? 0
+ : array[len - 2];
+}
+
+# endif
#endif
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C)
diff --git a/proto.h b/proto.h
index 7446fd5148..3a93a5cad2 100644
--- a/proto.h
+++ b/proto.h
@@ -6334,6 +6334,13 @@ PERL_STATIC_INLINE SV* S_invlist_contents(pTHX_ SV* const invlist, const bool tr
#endif
#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE UV S_invlist_highest_range_start(SV* const invlist)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_INVLIST_HIGHEST_RANGE_START \
+ assert(invlist)
+#endif
+
+#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S_invlist_is_iterating(const SV* const invlist)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_INVLIST_IS_ITERATING \
diff --git a/regcomp.c b/regcomp.c
index 56940e897d..a677e65cb4 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1703,7 +1703,7 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
Zero(ssc, 1, regnode_ssc);
set_ANYOF_SYNTHETIC(ssc);
- ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
+ ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
ssc_anything(ssc);
/* If any portion of the regex is to operate under locale rules that aren't
@@ -1770,7 +1770,6 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
SV* invlist = NULL;
SV* only_utf8_locale_invlist = NULL;
- const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
const U8 flags = (PL_regkind[OP(node)] == ANYOF)
? ANYOF_FLAGS(node)
@@ -1779,8 +1778,12 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
/* Look at the data structure created by S_set_ANYOF_arg() */
- if (n != ANYOF_ONLY_HAS_BITMAP) {
- assert(RExC_rxi->data->what[n] == 's');
+ if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
+ invlist = sv_2mortal(_new_invlist(1));
+ invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
+ }
+ else if (ANYOF_HAS_AUX(node)) {
+ const U32 n = ARG(node);
SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
AV * const av = MUTABLE_AV(SvRV(rv));
SV **const ary = AvARRAY(av);
@@ -1855,7 +1858,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
}
/* Similarly for these */
- if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+ if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
_invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
}
@@ -1911,8 +1914,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
/* 'AND' a given class with another one. Can create false positives. 'ssc'
- * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
- * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
+ * should not be inverted. */
STATIC void
S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
@@ -15930,10 +15932,6 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
UV high;
int i;
- if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
- ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
- }
-
/* Quit if are above what we should change */
if (start >= NUM_ANYOF_CODE_POINTS) {
break;
@@ -15952,14 +15950,10 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
invlist_iterfinish(*invlist_ptr);
/* Done with loop; remove any code points that are in the bitmap from
- * *invlist_ptr; similarly for code points above the bitmap if we have
- * a flag to match all of them anyways */
+ * *invlist_ptr */
if (change_invlist) {
_invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
}
- if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
- _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
- }
/* If have completely emptied it, remove it completely */
if (_invlist_len(*invlist_ptr) == 0) {
@@ -20550,11 +20544,13 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
SV* const only_utf8_locale_list)
{
/* Sets the arg field of an ANYOF-type node 'node', using information about
- * the node passed-in. If there is nothing outside the node's bitmap, the
- * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
- * the count returned by add_data(), having allocated and stored an array,
- * av, as follows:
+ * the node passed-in. If only the bitmap is needed to determine what
+ * matches, the arg is set appropriately to either
+ * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
+ * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
*
+ * Otherwise, it sets the argument to the count returned by add_data(),
+ * having allocated and stored an array, av, as follows:
* av[0] stores the inversion list defining this class as far as known at
* this time, or PL_sv_undef if nothing definite is now known.
* av[1] stores the inversion list of code points that match only if the
@@ -20569,12 +20565,21 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
if ( PL_regkind[OP(node)] == ANYOF
&& ! runtime_defns
- && ! only_utf8_locale_list
- && ! cp_list)
+ && ! only_utf8_locale_list)
{
- ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
+ if (! cp_list) {
+ ARG_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
+ return;
+ }
+
+ if ( invlist_highest(cp_list) == UV_MAX
+ && invlist_highest_range_start(cp_list) <= NUM_ANYOF_CODE_POINTS)
+ {
+ ARG_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
+ return;
+ }
}
- else {
+
AV * const av = newAV();
SV *rv;
@@ -20599,7 +20604,6 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
n = add_data(pRExC_state, STR_WITH_LEN("s"));
RExC_rxi->data->data[n] = (void*)rv;
ARG_SET(node, n);
- }
}
SV *
@@ -21763,7 +21767,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
U8 flags;
char * bitmap;
- U32 arg;
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
@@ -21783,12 +21786,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
if (k != ANYOF) {
flags = 0;
bitmap = NULL;
- arg = 0;
}
else {
flags = ANYOF_FLAGS(o);
bitmap = ANYOF_BITMAP(o);
- arg = ARG(o);
}
if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
@@ -21830,17 +21831,22 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
ANYOFRbase(o) + ANYOFRdelta(o));
}
}
- else if (arg != ANYOF_ONLY_HAS_BITMAP) {
+ else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ NUM_ANYOF_CODE_POINTS,
+ UV_MAX);
+ }
+ else if (ANYOF_HAS_AUX(o)) {
(void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
&unresolved,
&only_utf8_locale_invlist,
&nonbitmap_invlist);
- /* The non-bitmap data may contain stuff that could fit in the
- * bitmap. This could come from a user-defined property being
- * finally resolved when this call was done; or much more likely
- * because there are matches that require UTF-8 to be valid, and so
- * aren't in the bitmap (or ANYOFR). This is teased apart later */
+ /* The aux data may contain stuff that could fit in the bitmap.
+ * This could come from a user-defined property being finally
+ * resolved when this call was done; or much more likely because
+ * there are matches that require UTF-8 to be valid, and so aren't
+ * in the bitmap (or ANYOFR). This is teased apart later */
_invlist_intersection(nonbitmap_invlist,
PL_InBitmap,
&bitmap_range_not_in_bitmap);
@@ -21850,13 +21856,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
&nonbitmap_invlist);
}
- /* Obey this flag to add all above-the-bitmap code points */
- if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
- nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
- NUM_ANYOF_CODE_POINTS,
- UV_MAX);
- }
-
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
diff --git a/regcomp.h b/regcomp.h
index 01ff59724c..7e2c03ce90 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -422,7 +422,21 @@ struct regnode_ssc {
* specify all possible matches (with nothing outside it matching), no
* inversion list is needed nor included, and the argument to the ANYOF node is
* set to the following: */
-#define ANYOF_ONLY_HAS_BITMAP ((U32) -1)
+
+#define ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE U32_MAX
+#define ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node) \
+ (ARG(node) == ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE)
+
+#define ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE \
+ /* Assumes ALL is odd */ (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE - 1)
+#define ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(node) \
+ (ARG(node) == ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE)
+
+#define ANYOF_ONLY_HAS_BITMAP_MASK ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
+#define ANYOF_ONLY_HAS_BITMAP(node) \
+ ((ARG(node) & ANYOF_ONLY_HAS_BITMAP_MASK) == ANYOF_ONLY_HAS_BITMAP_MASK)
+
+#define ANYOF_HAS_AUX(node) (! ANYOF_ONLY_HAS_BITMAP(node))
/* There are also ANYOFM nodes, used when the bit patterns representing the
* matched code points happen to be such that they can be checked by ANDing
@@ -511,27 +525,13 @@ struct regnode_ssc {
* supposed to be raised when matching certain categories of code points in the
* target string. Flags are set to indicate this. This adds up to a bunch of
* flags required, and we only have 8 available. That is why we share some.
- * At the moment, there is one spare flag bit, but this could be increased by
+ * At the moment, there are two spare flag bits, but this could be increased by
* various tricks:
*
- * If just one more bit is needed, as of this writing it seems to khw that the
- * simplest choice would be to remove the ANYOF_MATCHES_ALL_ABOVE_BITMAP flag,
- * and just add that range to the inversion list. But that would slow down
- * some common cases, so something like this could be created
- *
- * #define ANYOF_MATCHES_ALL_ABOVE_BITMAP ((U32) -2)
- *
- * and access it through the ARG like ANYOF_ONLY_HAS_BITMAP already is. The
- * reginclass() function call could be avoided through this for appropriate
- * inputs. However, some cases where everything matches above the bit map
- * still need an AV for some things within the bitmap. For those, this
- * wouldn't be used, but the inversion list in AV[0] would be extended to match
- * everything above the bitmap.
- *
- * Another possibility is based on the fact that ANYOF_MATCHES_POSIXL is
- * redundant with the node type ANYOFPOSIXL. That flag could be removed, but
- * at the expense of having to write extra code, which would take up space, and
- * writing this turns out to be not hard, but not trivial.
+ * ANYOF_MATCHES_POSIXL is redundant with the node type ANYOFPOSIXL. That flag
+ * could be removed, but at the expense of having to write extra code, which
+ * would take up space, and writing this turns out to be not hard, but not
+ * trivial.
*
* If this is done, an extension would be to make all ANYOFL nodes contain the
* extra 32 bits that ANYOFPOSIXL ones do, doubling each instance's size. The
@@ -570,9 +570,7 @@ struct regnode_ssc {
/* Spare: Be sure to change ANYOF_FLAGS_ALL if this gets used 0x10 */
-/* If set, the node matches every code point NUM_ANYOF_CODE_POINTS and above.
- * Can be in an SSC */
-#define ANYOF_MATCHES_ALL_ABOVE_BITMAP 0x20
+/* Spare: Be sure to change ANYOF_FLAGS_ALL if this gets used 0x20 */
/* Shared bit that indicates that there are potential additional matches stored
* outside the bitmap, as pointed to by the AV given by the node's argument.
@@ -612,9 +610,9 @@ struct regnode_ssc {
#define ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared 0x80
#define ANYOF_WARN_SUPER__shared 0x80
-#define ANYOF_FLAGS_ALL ((U8) ~0x10)
+#define ANYOF_FLAGS_ALL ((U8) ~(0x10|0x20))
-#define ANYOF_LOCALE_FLAGS ( ANYOFL_FOLD \
+#define ANYOF_LOCALE_FLAGS ( ANYOFL_FOLD \
| ANYOF_MATCHES_POSIXL \
| ANYOFL_UTF8_LOCALE_REQD)
diff --git a/regexec.c b/regexec.c
index 1fdaeec851..ecc3f45bce 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2250,15 +2250,15 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
case ANYOFD_tb_p8:
case ANYOF_tb_pb:
case ANYOF_tb_p8:
- if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+ if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) {
/* We know that s is in the bitmap range since the target isn't
* UTF-8, so what happens for out-of-range values is not relevant,
* so exclude that from the flags */
- REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
- 0));
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
}
else {
- REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
+ 0));
}
break;
@@ -7401,7 +7401,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
if (NEXTCHR_IS_EOS || locinput >= loceol)
sayNO;
if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
- && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
+ && ! ANYOF_FLAGS(scan)
+ && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan))
{
if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
sayNO;
@@ -10248,7 +10249,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
case ANYOFD_tb:
case ANYOF_tb:
- if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+ if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) {
while ( scan < this_eol
&& reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
scan++;
@@ -10723,10 +10724,13 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
/* If the bitmap didn't (or couldn't) match, and something outside the
* bitmap could match, try that. */
if (!match) {
- if (c >= NUM_ANYOF_CODE_POINTS
- && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
+ if ( c >= NUM_ANYOF_CODE_POINTS
+ && ANYOF_ONLY_HAS_BITMAP(n)
+ && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES))
{
- match = TRUE; /* Everything above the bitmap matches */
+ /* In this case, the ARG is set up so that the final bit indicates
+ * whether it matches or not */
+ match = ARG(n) & 1;
}
else
/* Here, the main way it could match is if the code point is
@@ -10735,8 +10739,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
* we need special handling. That handling doesn't come in to
* effect for ANYOFD nodes unless the target string is UTF-8 and
* that matters to code point being matched. */
- if ( ( c >= NUM_ANYOF_CODE_POINTS
- && ARG(n) != ANYOF_ONLY_HAS_BITMAP)
+ if ( c >= NUM_ANYOF_CODE_POINTS
|| ( (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)
&& ( UNLIKELY(OP(n) != ANYOFD)
|| (utf8_target && ! isASCII_uvchr(c)
@@ -10753,7 +10756,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
*
* First check if there is an inversion list and/or auxiliary data.
* */
- if (ARG(n) != ANYOF_ONLY_HAS_BITMAP) {
+ if (ANYOF_HAS_AUX(n)) {
SV* only_utf8_locale = NULL;
/* This call will return in 'definition' the union of the base