summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2012-02-19 21:32:05 +0100
committerYves Orton <demerphq@gmail.com>2012-02-20 15:32:17 +0100
commitda181531d429d85afc7242ab6c2b763d42fad35f (patch)
tree0a3a185fe1d022fd91f0b9a635343a277f602f17
parent69be4c6300d5b40e0d7a3562b2bcd870cc5f205a (diff)
downloadperl-smoke-me/trie2.tar.gz
rework how the trie logic handles the newer EXACT nodetypessmoke-me/trie2
This cleans up and simplifies and extends how the trie logic interacts with the new node types. This change ultimately makes the EXACTFU, EXACTFU_SS, EXACTFU_NO_TRIE (renamed to EXACTFU_TRICKYFOLD) work properly with the trie engine regardless of whether the string is utf8 or latin1. This patch depends on the following: EXACT => utf8 or "binary" text EXACTFU => either pre-folded utf8, or latin1 that has to be folded as though it was utf8 EXACTFU_SS => special case of EXACTFU to handle \xDF/ss (affects latin1 treatment) EXACTFU_TRICKYFOLD => special case of EXACTFU to handle tricky non-latin1 fold rules EXACTF => "old style fold logic" untriable nodetype EXACTFA => (currently) untriable nodetype EXACTFL => (currently) untriable nodetype See the comments in regcomp.sym for these fold types. This patch involves a number of distinct, but related parts. Starting from compilation: * Simplify how we detect a triable sequence given the new nodetypes, this also probably fixed some "bugs" in how we detected certain sequences, like /||foo|bar/. * Simplify how we read EXACTFU nodes under utf8 by removing the now redundant folding logic (EXACTFU nodes under utf8 are prefolded). Also extend this logic to handle latin1 patterns properly (in conjunction with other changes) * Part of the problems associated with EXACTFU_SS and EXACTFU_TRICKYFOLD have to do with how the trie logic interacts with the minlen logic. This change handles both by pessimising the minlen when encounting these nodetypes. One observation is that the minlen logic is basically broken, and works only because it conflates bytes and codepoints in such a way that we more or less always get a value small enough that things work out anyway. Fixing that is properly is the job of another patch. * Part of the problem of doing folding under unicode rules is that there are a lot of foldings possible, some with strange rules. This means that the bitmap logic does not work correctly in all cases, as we currently do not have any way to populate it properly. So this patch disables the bitmap entirely when folding is involved until that is fixed. The end result of this is: we can TRIE/AHOCORASICK any sequence of EXACT, or EXACTFU (ish) nodes, regardless of utf8 or not, but we disable the bitmap when folding. A note for follow up relating to this patch is that the way EXACTFU_XXX nodes are currently dealt with we wont build the "maximal" trie because of their presence, instead creating a "jumptrie" consisting of either a leading EXACTFU node followed by a EXACTFU_XXX node, or vice versa. We should eventually address that.
-rw-r--r--embed.fnc4
-rw-r--r--embed.h4
-rw-r--r--proto.h14
-rw-r--r--regcomp.c269
-rw-r--r--regcomp.sym4
-rw-r--r--regexec.c145
-rw-r--r--regnodes.h10
-rw-r--r--t/re/fold_grind.t2
8 files changed, 262 insertions, 190 deletions
diff --git a/embed.fnc b/embed.fnc
index a7e004fe26..1394cff00f 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -604,7 +604,9 @@ Ap |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp
Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp
#ifdef PERL_IN_UTF8_C
sR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
-p |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
+#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+EXp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
#endif
#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
diff --git a/embed.h b/embed.h
index 541309e42e..bb9b76de16 100644
--- a/embed.h
+++ b/embed.h
@@ -983,6 +983,9 @@
#define reghop4 S_reghop4
# endif
# endif
+# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
+# endif
# if defined(PERL_OLD_COPY_ON_WRITE)
#define sv_setsv_cow(a,b) Perl_sv_setsv_cow(aTHX_ a,b)
# endif
@@ -1600,7 +1603,6 @@
#define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d)
# endif
# if defined(PERL_IN_UTF8_C)
-#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
#define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
#define is_utf8_char_slow S_is_utf8_char_slow
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index f01e7c3cdf..f696f7bc71 100644
--- a/proto.h
+++ b/proto.h
@@ -7115,12 +7115,6 @@ STATIC bool S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
#endif
#if defined(PERL_IN_UTF8_C)
-PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags)
- __attribute__nonnull__(pTHX_2)
- __attribute__nonnull__(pTHX_3);
-#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \
- assert(p); assert(lenp)
-
STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1)
@@ -7167,6 +7161,14 @@ PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *le
assert(p); assert(lenp)
#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags)
+ __attribute__nonnull__(pTHX_2)
+ __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \
+ assert(p); assert(lenp)
+
+#endif
#if defined(PERL_IN_UTIL_C)
STATIC bool S_ckwarn_common(pTHX_ U32 w);
STATIC const COP* S_closest_cop(pTHX_ const COP *cop, const OP *o)
diff --git a/regcomp.c b/regcomp.c
index dd5a37c118..e829d94d15 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1364,44 +1364,47 @@ is the recommended Unicode-aware way of saying
*(d++) = uv;
*/
-#define TRIE_STORE_REVCHAR \
+#define TRIE_STORE_REVCHAR(val) \
STMT_START { \
if (UTF) { \
- SV *zlopp = newSV(2); \
+ SV *zlopp = newSV(7); /* XXX: optimize me */ \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
+ unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
av_push(revcharmap, zlopp); \
} else { \
- char ooooff = (char)uvc; \
+ char ooooff = (char)val; \
av_push(revcharmap, newSVpvn(&ooooff, 1)); \
} \
} STMT_END
-#define TRIE_READ_CHAR STMT_START { \
- wordlen++; \
- if ( UTF ) { \
- if ( folder ) { \
- if ( foldlen > 0 ) { \
- uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- scan += len; \
- len = 0; \
- } else { \
- len = UTF8SKIP(uc);\
- uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
- foldlen -= UNISKIP( uvc ); \
- scan = foldbuf + UNISKIP( uvc ); \
- } \
- } else { \
- uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
- } \
- } else { \
- uvc = (U32)*uc; \
- len = 1; \
- } \
+#define TRIE_READ_CHAR STMT_START { \
+ wordlen++; \
+ if ( UTF ) { \
+ /* if it is UTF then it is either already folded, or does not need folding */ \
+ uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
+ } \
+ else if (folder == PL_fold_latin1) { \
+ /* if we use this folder we have to obey unicode rules on latin-1 data */ \
+ if ( foldlen > 0 ) { \
+ uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ scan += len; \
+ len = 0; \
+ } else { \
+ len = 1; \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
+ skiplen = UNISKIP(uvc); \
+ foldlen -= skiplen; \
+ scan = foldbuf + skiplen; \
+ } \
+ } else { \
+ /* raw data, will be folded later if needed */ \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
} STMT_END
@@ -1520,10 +1523,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
switch (flags) {
case EXACT: break;
case EXACTFA:
+ case EXACTFU_SS:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
- default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
+ default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
}
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
@@ -1532,7 +1537,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (!(UTF && folder))
+ if (flags == EXACT)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -1592,6 +1597,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
const U8 * const e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ STRLEN skiplen = 0;
const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
STRLEN chars = 0;
@@ -1601,28 +1607,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
trie->minlen= 0;
continue;
}
- if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+ if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
regardless of encoding */
-
+ if (OP( noper ) == EXACTFU_SS) {
+ /* false positives are ok, so just set this */
+ TRIE_BITMAP_SET(trie,0xDF);
+ }
+ }
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
chars++;
if ( uvc < 256 ) {
+ if ( folder ) {
+ U8 folded= folder[ (U8) uvc ];
+ if ( !trie->charmap[ folded ] ) {
+ trie->charmap[ folded ]=( ++trie->uniquecharcount );
+ TRIE_STORE_REVCHAR( folded );
+ }
+ }
if ( !trie->charmap[ uvc ] ) {
trie->charmap[ uvc ]=( ++trie->uniquecharcount );
- if ( folder )
- trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
- TRIE_STORE_REVCHAR;
+ TRIE_STORE_REVCHAR( uvc );
}
if ( set_bit ) {
/* store the codepoint in the bitmap, and its folded
* equivalent. */
- TRIE_BITMAP_SET(trie,uvc);
+ TRIE_BITMAP_SET(trie, uvc);
/* store the folded codepoint */
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+ if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
if ( !UTF ) {
/* store first byte of utf8 representation of
@@ -1645,17 +1660,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
if ( !SvTRUE( *svpp ) ) {
sv_setiv( *svpp, ++trie->uniquecharcount );
- TRIE_STORE_REVCHAR;
+ TRIE_STORE_REVCHAR(uvc);
}
}
}
if( cur == first ) {
- trie->minlen=chars;
- trie->maxlen=chars;
+ trie->minlen = chars;
+ trie->maxlen = chars;
} else if (chars < trie->minlen) {
- trie->minlen=chars;
+ trie->minlen = chars;
} else if (chars > trie->maxlen) {
- trie->maxlen=chars;
+ trie->maxlen = chars;
+ }
+ if (OP( noper ) == EXACTFU_SS) {
+ /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
+ if (trie->minlen > 1)
+ trie->minlen= 1;
+ }
+ if (OP( noper ) == EXACTFU_TRICKYFOLD) {
+ /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
+ * - We assume that any such sequence might match a 2 byte string */
+ if (trie->minlen > 2 )
+ trie->minlen= 2;
}
} /* end first pass */
@@ -1728,6 +1754,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ STRLEN skiplen = 0;
if (OP(noper) != NOTHING) {
for ( ; uc < e ; uc += len ) {
@@ -1928,8 +1955,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
+ STRLEN skiplen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+
if ( OP(noper) != NOTHING ) {
for ( ; uc < e ; uc += len ) {
@@ -2566,7 +2595,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode
* optimizer doesn't reject these possibilities based on size constraints.
* 2) These sequences are not currently correctly handled by the trie code
* either, so it changes the joined node type to ops that are not handled
- * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
+ * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
* 3) This is sufficient for the two Greek sequences (described below), but
* the one involving the Sharp s (\xDF) needs more. The node type
* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
@@ -2821,7 +2850,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
* is. (I (khw) think it doesn't matter in regexec.c
* for UTF patterns, but no need to change it */
if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_NO_TRIE;
+ OP(scan) = EXACTFU_TRICKYFOLD;
}
s += 6; /* We already know what this sequence is. Skip
the rest of it */
@@ -3183,7 +3212,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
regnode *first = (regnode *)NULL;
regnode *last = (regnode *)NULL;
regnode *tail = scan;
- U8 optype = 0;
+ U8 trietype = 0;
U32 count=0;
#ifdef DEBUGGING
@@ -3214,32 +3243,59 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
/*
- step through the branches, cur represents each
- branch, noper is the first thing to be matched
- as part of that branch and noper_next is the
- regnext() of that node. if noper is an EXACT
- and noper_next is the same as scan (our current
- position in the regex) then the EXACT branch is
- a possible optimization target. Once we have
- two or more consecutive such branches we can
- create a trie of the EXACT's contents and stich
- it in place. If the sequence represents all of
- the branches we eliminate the whole thing and
- replace it with a single TRIE. If it is a
- subsequence then we need to stitch it in. This
- means the first branch has to remain, and needs
- to be repointed at the item on the branch chain
- following the last branch optimized. This could
- be either a BRANCH, in which case the
- subsequence is internal, or it could be the
- item following the branch sequence in which
- case the subsequence is at the end.
+ Step through the branches
+ cur represents each branch,
+ noper is the first thing to be matched as part of that branch
+ noper_next is the regnext() of that node.
+
+ We normally handle a case like this /FOO[xyz]|BAR[pqr]/
+ via a "jump trie" but we also support building with NOJUMPTRIE,
+ which restricts the trie logic to structures like /FOO|BAR/.
+
+ If noper is a trieable nodetype then the branch is a possible optimization
+ target. If we are building under NOJUMPTRIE then we require that noper_next
+ is the same as scan (our current position in the regex program).
+
+ Once we have two or more consecutive such branches we can create a
+ trie of the EXACT's contents and stitch it in place into the program.
+
+ If the sequence represents all of the branches in the alternation we
+ replace the entire thing with a single TRIE node.
+
+ Otherwise when it is a subsequence we need to stitch it in place and
+ replace only the relevant branches. This means the first branch has
+ to remain as it is used by the alternation logic, and its next pointer,
+ and needs to be repointed at the item on the branch chain following
+ the last branch we have optimized away.
+
+ This could be either a BRANCH, in which case the subsequence is internal,
+ or it could be the item following the branch sequence in which case the
+ subsequence is at the end (which does not necessarily mean the first node
+ is the start of the alternation).
+
+ TRIE_TYPE(X) is a define which maps the optype to a trietype.
+
+ optype | trietype
+ ----------------+-----------
+ NOTHING | NOTHING
+ EXACT | EXACT
+ EXACTFU | EXACTFU
+ EXACTFU_SS | EXACTFU
+ EXACTFU_TRICKYFOLD | EXACTFU
+ EXACTFA | 0
+
*/
+#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
+ ( EXACT == (X) ) ? EXACT : \
+ ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
+ 0 )
/* dont use tail as the end marker for this traverse */
for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
regnode * const noper = NEXTOPER( cur );
+ U8 noper_type = OP( noper );
+ U8 noper_trietype = TRIE_TYPE( noper_type );
#if defined(DEBUGGING) || defined(NOJUMPTRIE)
regnode * const noper_next = regnext( noper );
#endif
@@ -3261,59 +3317,68 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
});
- if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
- : PL_regkind[ OP( noper ) ] == EXACT )
- || OP(noper) == NOTHING )
+
+ /* Is noper a trieable nodetype that can be merged with the
+ * current trie (if there is one)? */
+ if ( noper_trietype
+ &&
+ (
+ ( noper_trietype == NOTHING )
+ ||
+ ( trietype == NOTHING )
+ ||
+ ( trietype == noper_trietype )
+ )
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
&& count < U16_MAX)
{
+ /* Handle mergable triable node
+ * Either we are the first node in a new trieable sequence,
+ * in which case we do some bookkeeping, otherwise we update
+ * the end pointer. */
count++;
- if ( !first || optype == NOTHING ) {
- if (!first) first = cur;
- optype = OP( noper );
+ if ( !first ) {
+ first = cur;
+ trietype = noper_trietype;
} else {
+ if ( trietype == NOTHING )
+ trietype = noper_trietype;
last = cur;
}
- } else {
-/*
- Currently the trie logic handles case insensitive matching properly only
- when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
- semantics).
-
- If/when this is fixed the following define can be swapped
- in below to fully enable trie logic.
-
-#define TRIE_TYPE_IS_SAFE 1
-
-Note that join_exact() assumes that the other types of EXACTFish nodes are not
-used in tries, so that would have to be updated if this changed
-
-*/
-#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
-
- if ( last && TRIE_TYPE_IS_SAFE ) {
+ } /* end handle mergable triable node */
+ else {
+ /* handle unmergable node -
+ * noper may either be a triable node which can not be tried
+ * together with the current trie, or a non triable node */
+ if ( last && trietype != NOTHING ) {
+ /* if last is set then we have found at least two triable branch
+ * sequences in a row of a similar trietype so we can turn them
+ * into a trie */
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
- optype, depth+1 );
+ trietype, depth+1 );
+ last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
}
- if ( PL_regkind[ OP( noper ) ] == EXACT
+ if ( noper_trietype
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
){
+ /* noper is triable, so we can start a new trie sequence */
count = 1;
first = cur;
- optype = OP( noper );
- } else {
+ trietype = noper_trietype;
+ } else if (first) {
+ /* if we already saw a first but the current node is not triable then we have
+ * to reset the first information. */
count = 0;
first = NULL;
- optype = 0;
+ trietype = 0;
}
- last = NULL;
- }
- }
+ } /* end handle unmergable node */
+ } /* loop over branches */
DEBUG_OPTIMISE_r({
regprop(RExC_rx, mysv, cur);
PerlIO_printf( Perl_debug_log,
@@ -3321,9 +3386,11 @@ used in tries, so that would have to be updated if this changed
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
-
- if ( last && TRIE_TYPE_IS_SAFE ) {
- made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
+ if ( last && trietype != NOTHING ) {
+ /* the last branch of the sequence was part of a trie,
+ * so we have to construct it here outside of the loop
+ */
+ made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
#ifdef TRIE_STUDY_OPT
if ( ((made == MADE_EXACT_TRIE &&
startbranch == first)
@@ -3337,8 +3404,8 @@ used in tries, so that would have to be updated if this changed
}
}
#endif
- }
- }
+ } /* end if ( last) */
+ } /* TRIE_MAXBUF is non zero */
} /* do trie */
@@ -12027,7 +12094,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
case EXACTFA:
case EXACTFU:
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
diff --git a/regcomp.sym b/regcomp.sym
index f0ee701991..e6be6ac292 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -92,14 +92,14 @@ BRANCH BRANCH, node 0 V ; Match this alternative, or the next...
# not used
BACK BACK, no 0 V ; Match "", "next" ptr points backward.
-#*Literals
+#*Literals - NOTE the relative ordering of these types is important do not change it
EXACT EXACT, str ; Match this string (preceded by length).
EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len).
EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len).
EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len).
EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len).
-EXACTFU_NO_TRIE EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it
+EXACTFU_TRICKYFOLD EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it
EXACTFA EXACT, str ; Match this string (not guaranteed to be folded) using /iaa rules (w/len).
#*Do nothing types
diff --git a/regexec.c b/regexec.c
index 76eb7f9bed..8ccb6f7032 100644
--- a/regexec.c
+++ b/regexec.c
@@ -303,13 +303,13 @@
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
we don't need this definition. */
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
-#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
+#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
#else
/* ... so we use this as its faster. */
#define IS_TEXT(rn) ( OP(rn)==EXACT )
-#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA)
+#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
@@ -1187,58 +1187,61 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
#define DECL_TRIE_TYPE(scan) \
const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
- trie_type = (scan->flags != EXACT) \
- ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
- : (utf8_target ? trie_utf8 : trie_plain)
-
-#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
-uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
- switch (trie_type) { \
- case trie_utf8_fold: \
- if ( foldlen>0 ) { \
- uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- uscan += len; \
- len=0; \
- } else { \
- uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen ); \
- len = UTF8SKIP(uc); \
- foldlen -= UNISKIP( uvc ); \
- uscan = foldbuf + UNISKIP( uvc ); \
- } \
- break; \
- case trie_latin_utf8_fold: \
- if ( foldlen>0 ) { \
- uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- uscan += len; \
- len=0; \
- } else { \
- len = 1; \
- uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
- foldlen -= UNISKIP( uvc ); \
- uscan = foldbuf + UNISKIP( uvc ); \
- } \
- break; \
- case trie_utf8: \
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
- break; \
- case trie_plain: \
- uvc = (UV)*uc; \
- len = 1; \
- } \
- if (uvc < 256) { \
- charid = trie->charmap[ uvc ]; \
- } \
- else { \
- charid = 0; \
- if (widecharmap) { \
- SV** const svpp = hv_fetch(widecharmap, \
- (char*)&uvc, sizeof(UV), 0); \
- if (svpp) \
- charid = (U16)SvIV(*svpp); \
- } \
- } \
+ trie_type = ((scan->flags == EXACT) \
+ ? (utf8_target ? trie_utf8 : trie_plain) \
+ : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
+
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
+uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
+ STRLEN skiplen; \
+ switch (trie_type) { \
+ case trie_utf8_fold: \
+ if ( foldlen>0 ) { \
+ uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ uscan += len; \
+ len=0; \
+ } else { \
+ uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
+ len = UTF8SKIP(uc); \
+ skiplen = UNISKIP( uvc ); \
+ foldlen -= skiplen; \
+ uscan = foldbuf + skiplen; \
+ } \
+ break; \
+ case trie_latin_utf8_fold: \
+ if ( foldlen>0 ) { \
+ uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ uscan += len; \
+ len=0; \
+ } else { \
+ len = 1; \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
+ skiplen = UNISKIP( uvc ); \
+ foldlen -= skiplen; \
+ uscan = foldbuf + skiplen; \
+ } \
+ break; \
+ case trie_utf8: \
+ uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
+ break; \
+ case trie_plain: \
+ uvc = (UV)*uc; \
+ len = 1; \
+ } \
+ if (uvc < 256) { \
+ charid = trie->charmap[ uvc ]; \
+ } \
+ else { \
+ charid = 0; \
+ if (widecharmap) { \
+ SV** const svpp = hv_fetch(widecharmap, \
+ (char*)&uvc, sizeof(UV), 0); \
+ if (svpp) \
+ charid = (U16)SvIV(*svpp); \
+ } \
+ } \
} STMT_END
#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
@@ -1490,7 +1493,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
goto do_exactf_utf8;
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU:
if (UTF_PATTERN || utf8_target) {
utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -3267,16 +3270,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
/* In this case the charclass data is available inline so
we can fail fast without a lot of extra overhead.
*/
- if (scan->flags == EXACT || !utf8_target) {
- if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
- DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %sfailed to match trie start class...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
- );
- sayNO_SILENT;
- /* NOTREACHED */
- }
+ if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s %sfailed to match trie start class...%s\n",
+ REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+ );
+ sayNO_SILENT;
+ /* NOTREACHED */
}
/* FALL THROUGH */
case TRIE:
@@ -3334,9 +3335,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
U32 state = trie->startstate;
- if (trie->bitmap && trie_type != trie_utf8_fold &&
- !TRIE_BITMAP_TEST(trie,*locinput)
- ) {
+ if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
if (trie->states[ state ].wordnum) {
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
@@ -3671,7 +3670,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
goto do_exactf;
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU:
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
@@ -5085,7 +5084,7 @@ NULL
case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
default: ST.c2 = ST.c1;
@@ -5241,7 +5240,7 @@ NULL
case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
default: ST.c2 = ST.c1; break;
@@ -6035,7 +6034,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
goto do_exactf;
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU:
utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -6077,7 +6076,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
switch (OP(p)) {
case EXACTF: folded = PL_fold[c]; break;
case EXACTFA:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: folded = PL_fold_latin1[c]; break;
case EXACTFL: folded = PL_fold_locale[c]; break;
default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
diff --git a/regnodes.h b/regnodes.h
index 994159342f..5561243882 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -62,7 +62,7 @@
#define EXACTFL 50 /* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */
#define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
#define EXACTFU_SS 52 /* 0x34 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
-#define EXACTFU_NO_TRIE 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */
+#define EXACTFU_TRICKYFOLD 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */
#define EXACTFA 54 /* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
#define NOTHING 55 /* 0x37 Match empty string. */
#define TAIL 56 /* 0x38 Match empty string. Can jump here from outside. */
@@ -223,7 +223,7 @@ EXTCONST U8 PL_regkind[] = {
EXACT, /* EXACTFL */
EXACT, /* EXACTFU */
EXACT, /* EXACTFU_SS */
- EXACT, /* EXACTFU_NO_TRIE */
+ EXACT, /* EXACTFU_TRICKYFOLD */
EXACT, /* EXACTFA */
NOTHING, /* NOTHING */
NOTHING, /* TAIL */
@@ -384,7 +384,7 @@ static const U8 regarglen[] = {
0, /* EXACTFL */
0, /* EXACTFU */
0, /* EXACTFU_SS */
- 0, /* EXACTFU_NO_TRIE */
+ 0, /* EXACTFU_TRICKYFOLD */
0, /* EXACTFA */
0, /* NOTHING */
0, /* TAIL */
@@ -502,7 +502,7 @@ static const char reg_off_by_arg[] = {
0, /* EXACTFL */
0, /* EXACTFU */
0, /* EXACTFU_SS */
- 0, /* EXACTFU_NO_TRIE */
+ 0, /* EXACTFU_TRICKYFOLD */
0, /* EXACTFA */
0, /* NOTHING */
0, /* TAIL */
@@ -625,7 +625,7 @@ EXTCONST char * const PL_reg_name[] = {
"EXACTFL", /* 0x32 */
"EXACTFU", /* 0x33 */
"EXACTFU_SS", /* 0x34 */
- "EXACTFU_NO_TRIE", /* 0x35 */
+ "EXACTFU_TRICKYFOLD", /* 0x35 */
"EXACTFA", /* 0x36 */
"NOTHING", /* 0x37 */
"TAIL", /* 0x38 */
diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t
index 98fc3966c5..e2153e3186 100644
--- a/t/re/fold_grind.t
+++ b/t/re/fold_grind.t
@@ -735,7 +735,7 @@ foreach my $test (sort { numerically } keys %tests) {
if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) {
# Failed or debug; output the result
$count++;
- ok($res, $desc);
+ ok($res, "test $count - $desc");
} else {
# Just count the test as passed
$okays++;