summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--regcomp.c108
-rw-r--r--regcomp.h25
2 files changed, 71 insertions, 62 deletions
diff --git a/regcomp.c b/regcomp.c
index a1cd981bc7..7201082b34 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -805,19 +805,6 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
DEBUG_STUDYDATA("commit: ",data,0);
}
-/* These macros set, clear and test whether the synthetic start class ('SSC',
- * given by the parameter) matches an empty string (EOS). This uses the
- * 'next_off' field in the node, to save a bit in the flags field. The ssc
- * stands alone, so there is never a next_off, so this field is otherwise
- * unused. The EOS information is used only for compilation, but theoretically
- * it could be passed on to the execution code. This could be used to store
- * more than one bit of information, but only this one is currently used. This
- * flag could be moved back to the bitmap instead, shared with INVERT, as no
- * SSC is ever inverted */
-#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
-#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
-#define TEST_SSC_EOS(node) cBOOL((node)->next_off)
-
/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
* list that describes which code points it matches */
@@ -832,7 +819,7 @@ S_ssc_anything(pTHX_ regnode_ssc *ssc)
ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
_append_range_to_invlist(ssc->invlist, 0, UV_MAX);
- SET_SSC_EOS(ssc); /* Plus match empty string */
+ ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
}
STATIC int
@@ -848,7 +835,7 @@ S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
assert(OP(ssc) == ANYOF_SYNTHETIC);
- if (! TEST_SSC_EOS(ssc)) {
+ if (! ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING) {
return FALSE;
}
@@ -1039,7 +1026,9 @@ STATIC void
S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
{
/* Take the flags 'and_with' and accumulate them anded into the flags for
- * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored. */
+ * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
+ * The flags 'and_with' should not come from another SSC (otherwise the
+ * EMPTY_STRING flag won't work) */
const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
@@ -1061,17 +1050,26 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
/* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
* another SSC or a regular ANYOF class. Can create false positives. */
- /* If 'and_with' is an SSC, we already have its inversion list; otherwise
- * have to calculate it */
- SV* anded_cp_list = (OP(and_with) == ANYOF_SYNTHETIC)
- ? and_with->invlist
- : get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl*) and_with);
+ SV* anded_cp_list;
+ U8 anded_flags;
PERL_ARGS_ASSERT_SSC_AND;
assert(OP(ssc) == ANYOF_SYNTHETIC);
- assert(! (ANYOF_FLAGS(ssc) & ANYOF_INVERT)); /* SSCs are never inverted */
+
+ /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
+ * the code point inversion list and just the relevant flags */
+ if (OP(and_with) == ANYOF_SYNTHETIC) {
+ anded_cp_list = and_with->invlist;
+ anded_flags = ANYOF_FLAGS(and_with);
+ }
+ else {
+ anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
+ (regnode_charclass_posixl*) and_with);
+ anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
+ }
+
+ ANYOF_FLAGS(ssc) &= anded_flags;
/* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
* C2 is the list of code points in 'and-with'; P2, its posix classes.
@@ -1104,11 +1102,11 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
* <= (C1 & ~C2) | (P1 & ~P2)
* */
- if (ANYOF_FLAGS(and_with) & ANYOF_INVERT) {
+ if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
+ && OP(and_with) != ANYOF_SYNTHETIC)
+ {
unsigned int i;
- assert(OP(and_with) != ANYOF_SYNTHETIC);
-
ssc_intersection(ssc,
anded_cp_list,
FALSE /* Has already been inverted */
@@ -1191,8 +1189,6 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
ssc_intersection(ssc, anded_cp_list, FALSE);
}
}
-
- ssc_flags_and(ssc, ANYOF_FLAGS(and_with));
}
STATIC void
@@ -1203,17 +1199,26 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
* another SSC or a regular ANYOF class. Can create false positives if
* 'or_with' is to be inverted. */
- /* If 'or_with' is an SSC, we already have its inversion list; otherwise
- * have to calculate it */
- SV* ored_cp_list = (OP(or_with) == ANYOF_SYNTHETIC)
- ? or_with->invlist
- : get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl*) or_with);
+ SV* ored_cp_list;
+ U8 ored_flags;
PERL_ARGS_ASSERT_SSC_OR;
assert(OP(ssc) == ANYOF_SYNTHETIC);
- assert(! (ANYOF_FLAGS(ssc) & ANYOF_INVERT));
+
+ /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
+ * the code point inversion list and just the relevant flags */
+ if (OP(or_with) == ANYOF_SYNTHETIC) {
+ ored_cp_list = or_with->invlist;
+ ored_flags = ANYOF_FLAGS(or_with);
+ }
+ else {
+ ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
+ (regnode_charclass_posixl*) or_with);
+ ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
+ }
+
+ ANYOF_FLAGS(ssc) |= ored_flags;
/* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
* C2 is the list of code points in 'or-with'; P2, its posix classes.
@@ -1233,11 +1238,9 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
* (which results in actually simpler code than the non-inverted case)
* */
- /* Use just the SSC-related flags from 'or_with' */
- ANYOF_FLAGS(ssc) |= (ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS);
-
- if (ANYOF_FLAGS(or_with) & ANYOF_INVERT) {
- assert(OP(or_with) != ANYOF_SYNTHETIC);
+ if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
+ && OP(or_with) != ANYOF_SYNTHETIC)
+ {
/* We ignore P2, leaving P1 going forward */
}
else { /* Not inverted */
@@ -1346,6 +1349,11 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
assert(OP(ssc) == ANYOF_SYNTHETIC);
+ /* The code in this file assumes that all but these flags aren't relevant
+ * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
+ * time we reach here */
+ assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
+
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
@@ -3921,7 +3929,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
* can't match null string */
if (flags & SCF_DO_STCLASS_AND) {
ssc_cp_and(data->start_class, uc);
- CLEAR_SSC_EOS(data->start_class);
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
ssc_clear_locale(data->start_class);
}
else if (flags & SCF_DO_STCLASS_OR) {
@@ -4039,7 +4047,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
}
}
if (flags & SCF_DO_STCLASS_AND) {
- CLEAR_SSC_EOS(data->start_class);
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
ANYOF_POSIXL_ZERO(data->start_class);
ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
}
@@ -4443,7 +4451,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
ssc_intersection(data->start_class,
PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
ssc_clear_locale(data->start_class);
- CLEAR_SSC_EOS(data->start_class); /* No match on empty */
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
}
else if (flags & SCF_DO_STCLASS_OR) {
ssc_union(data->start_class,
@@ -4476,7 +4484,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
U8 namedclass;
if (flags & SCF_DO_STCLASS_AND) {
- CLEAR_SSC_EOS(data->start_class); /* No match on empty */
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
}
/* Some of the logic below assumes that switching
@@ -4720,11 +4728,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
ssc_init(pRExC_state, data->start_class);
} else {
/* AND before and after: combine and continue */
- const int was = TEST_SSC_EOS(data->start_class);
-
ssc_and(pRExC_state, data->start_class, &intrnl);
- if (was)
- SET_SSC_EOS(data->start_class);
}
}
}
@@ -4792,11 +4796,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
*minnextp += min;
if (f & SCF_DO_STCLASS_AND) {
- const int was = TEST_SSC_EOS(data->start_class);
-
ssc_and(pRExC_state, data->start_class, &intrnl);
- if (was)
- SET_SSC_EOS(data->start_class);
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
@@ -6613,7 +6613,7 @@ reStudy:
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
- && ! TEST_SSC_EOS(data.start_class)
+ && ! ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING
&& !ssc_is_anything(data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
@@ -6687,7 +6687,7 @@ reStudy:
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
= r->float_substr = r->float_utf8 = NULL;
- if (! TEST_SSC_EOS(data.start_class)
+ if (! ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING
&& !ssc_is_anything(data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
diff --git a/regcomp.h b/regcomp.h
index eccb46690a..448b0e93bc 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -191,7 +191,9 @@ struct regnode_charclass_class {
U32 classflags; /* and run-time */
};
-/* Synthetic start class, is a regnode_charclass_class plus an SV* */
+/* Synthetic start class; is a regnode_charclass_class plus an SV*. Note that
+ * the 'next_off' field is unused, as the SSC stands alone, so there is never a
+ * next node. */
struct regnode_ssc {
U8 flags; /* ANYOF_POSIXL bit must go here */
U8 type;
@@ -314,14 +316,16 @@ struct regnode_ssc {
/* Flags for node->flags of ANYOF. These are in short supply, but there is one
* currently available. If more than this are needed, the ANYOF_LOCALE and
- * ANYOF_POSIXL bits could be shared, making a space penalty for all locale nodes.
- * Also, the ABOVE_LATIN1_ALL bit could be freed up by resorting to creating a
- * swash containing everything above 255. This introduces a performance
- * penalty. Better would be to split it off into a separate node, which
- * actually would improve performance a bit by allowing regexec.c to test for a
- * UTF-8 character being above 255 without having to call a function nor
+ * ANYOF_POSIXL bits could be shared, making a space penalty for all locale
+ * nodes. Also, the ABOVE_LATIN1_ALL bit could be freed up by resorting to
+ * creating a swash containing everything above 255. This introduces a
+ * performance penalty. Better would be to split it off into a separate node,
+ * which actually would improve performance a bit by allowing regexec.c to test
+ * for a UTF-8 character being above 255 without having to call a function nor
* calculate its code point value. However, this solution might need to have a
- * second node type, ANYOF_SYNTHETIC_ABOVE_LATIN1_ALL */
+ * second node type, ANYOF_SYNTHETIC_ABOVE_LATIN1_ALL. Several flags are not
+ * used in synthetic start class (SSC) nodes, so could be shared should new
+ * flags be needed for SSCs. */
#define ANYOF_LOCALE 0x01 /* /l modifier */
@@ -333,6 +337,11 @@ struct regnode_ssc {
#define ANYOF_INVERT 0x04
+/* For the SSC node only, which cannot be inverted, so is shared with that bit.
+ * This means "Does this SSC match an empty string?" This is used only during
+ * regex compilation. */
+#define ANYOF_EMPTY_STRING ANYOF_INVERT
+
/* Set if this is a regnode_charclass_posixl vs a regnode_charclass. This
* is used for runtime \d, \w, [:posix:], ..., which are used only in locale
* and the optimizer's synthetic start class. Non-locale \d, etc are resolved