diff options
author | Yves Orton <demerphq@gmail.com> | 2012-02-19 21:32:05 +0100 |
---|---|---|
committer | Yves Orton <demerphq@gmail.com> | 2012-03-03 13:27:28 +0100 |
commit | fab2782b37b5570d7f8f8065fd7d18621117ed49 (patch) | |
tree | 5c7ef3c0c69d4998c89e2469998092747584f68b | |
parent | 2f137bbd018b7f86a6a557d3552cbb7a760bb43a (diff) | |
download | perl-fab2782b37b5570d7f8f8065fd7d18621117ed49.tar.gz |
rework how the trie logic handles the newer EXACT nodetypes
This cleans up and simplifies and extends how the trie
logic interacts with the new node types. This change ultimately
makes the EXACTFU, EXACTFU_SS, EXACTFU_NO_TRIE (renamed to
EXACTFU_TRICKYFOLD) work properly with the trie engine regardless
of whether the string is utf8 or latin1.
This patch depends on the following:
EXACT => utf8 or "binary" text
EXACTFU => either pre-folded utf8, or latin1 that has to be folded as though it was utf8
EXACTFU_SS => special case of EXACTFU to handle \xDF/ss (affects latin1 treatment)
EXACTFU_TRICKYFOLD => special case of EXACTFU to handle tricky non-latin1 fold rules
EXACTF => "old style fold logic" untriable nodetype
EXACTFA => (currently) untriable nodetype
EXACTFL => (currently) untriable nodetype
See the comments in regcomp.sym for these fold types.
This patch involves a number of distinct, but related parts. Starting
from compilation:
* Simplify how we detect a triable sequence given the new nodetypes,
this also probably fixed some "bugs" in how we detected certain
sequences, like /||foo|bar/.
* Simplify how we read EXACTFU nodes under utf8 by removing the now
redundant folding logic (EXACTFU nodes under utf8 are prefolded).
Also extend this logic to handle latin1 patterns properly (in
conjunction with other changes)
* Part of the problems associated with EXACTFU_SS and EXACTFU_TRICKYFOLD
have to do with how the trie logic interacts with the minlen logic.
This change handles both by pessimising the minlen when encounting
these nodetypes. One observation is that the minlen logic is basically
broken, and works only because it conflates bytes and codepoints in
such a way that we more or less always get a value small enough that things work out
anyway. Fixing that is properly is the job of another patch.
* Part of the problem of doing folding under unicode rules is that
there are a lot of foldings possible, some with strange rules. This
means that the bitmap logic does not work correctly in all cases,
as we currently do not have any way to populate it properly.
So this patch disables the bitmap entirely when folding is involved
until that is fixed.
The end result of this is: we can TRIE/AHOCORASICK any sequence of
EXACT, or EXACTFU (ish) nodes, regardless of utf8 or not, but we disable
the bitmap when folding.
A note for follow up relating to this patch is that the way EXACTFU_XXX
nodes are currently dealt with we wont build the "maximal" trie because
of their presence, instead creating a "jumptrie" consisting of either a
leading EXACTFU node followed by a EXACTFU_XXX node, or vice versa. We
should eventually address that.
-rw-r--r-- | embed.fnc | 4 | ||||
-rw-r--r-- | embed.h | 4 | ||||
-rw-r--r-- | proto.h | 14 | ||||
-rw-r--r-- | regcomp.c | 269 | ||||
-rw-r--r-- | regcomp.sym | 4 | ||||
-rw-r--r-- | regexec.c | 145 | ||||
-rw-r--r-- | regnodes.h | 10 | ||||
-rw-r--r-- | t/re/fold_grind.t | 2 |
8 files changed, 262 insertions, 190 deletions
@@ -604,7 +604,9 @@ Ap |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp #ifdef PERL_IN_UTF8_C sR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp -p |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags +#endif +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +EXp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags #endif #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s @@ -983,6 +983,9 @@ #define reghop4 S_reghop4 # endif # endif +# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d) +# endif # if defined(PERL_OLD_COPY_ON_WRITE) #define sv_setsv_cow(a,b) Perl_sv_setsv_cow(aTHX_ a,b) # endif @@ -1600,7 +1603,6 @@ #define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_UTF8_C) -#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d) #define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d) #define is_utf8_char_slow S_is_utf8_char_slow #define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c) @@ -7114,12 +7114,6 @@ STATIC bool S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U #endif #if defined(PERL_IN_UTF8_C) -PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags) - __attribute__nonnull__(pTHX_2) - __attribute__nonnull__(pTHX_3); -#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \ - assert(p); assert(lenp) - STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1) @@ -7166,6 +7160,14 @@ PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *le assert(p); assert(lenp) #endif +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \ + assert(p); assert(lenp) + +#endif #if defined(PERL_IN_UTIL_C) STATIC bool S_ckwarn_common(pTHX_ U32 w); STATIC const COP* S_closest_cop(pTHX_ const COP *cop, const OP *o) @@ -1366,44 +1366,47 @@ is the recommended Unicode-aware way of saying *(d++) = uv; */ -#define TRIE_STORE_REVCHAR \ +#define TRIE_STORE_REVCHAR(val) \ STMT_START { \ if (UTF) { \ - SV *zlopp = newSV(2); \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ av_push(revcharmap, zlopp); \ } else { \ - char ooooff = (char)uvc; \ + char ooooff = (char)val; \ av_push(revcharmap, newSVpvn(&ooooff, 1)); \ } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - if ( folder ) { \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = UTF8SKIP(uc);\ - uvc = to_utf8_fold( uc, foldbuf, &foldlen); \ - foldlen -= UNISKIP( uvc ); \ - scan = foldbuf + UNISKIP( uvc ); \ - } \ - } else { \ - uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ - } \ - } else { \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need folding */ \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* if we use this folder we have to obey unicode rules on latin-1 data */ \ + if ( foldlen > 0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + scan += len; \ + len = 0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP(uvc); \ + foldlen -= skiplen; \ + scan = foldbuf + skiplen; \ + } \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1522,10 +1525,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs switch (flags) { case EXACT: break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; case EXACTFL: folder = PL_fold_locale; break; - default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags ); + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); @@ -1534,7 +1539,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (!(UTF && folder)) + if (flags == EXACT) trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( trie->wordcount+1, sizeof(reg_trie_wordinfo)); @@ -1594,6 +1599,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * const e = uc + STR_LEN( noper ); STRLEN foldlen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; const U8 *scan = (U8*)NULL; U32 wordlen = 0; /* required init */ STRLEN chars = 0; @@ -1603,28 +1609,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->minlen= 0; continue; } - if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte regardless of encoding */ - + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie,0xDF); + } + } for ( ; uc < e ; uc += len ) { TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; chars++; if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } if ( !trie->charmap[ uvc ] ) { trie->charmap[ uvc ]=( ++trie->uniquecharcount ); - if ( folder ) - trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; - TRIE_STORE_REVCHAR; + TRIE_STORE_REVCHAR( uvc ); } if ( set_bit ) { /* store the codepoint in the bitmap, and its folded * equivalent. */ - TRIE_BITMAP_SET(trie,uvc); + TRIE_BITMAP_SET(trie, uvc); /* store the folded codepoint */ - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); if ( !UTF ) { /* store first byte of utf8 representation of @@ -1647,17 +1662,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !SvTRUE( *svpp ) ) { sv_setiv( *svpp, ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR; + TRIE_STORE_REVCHAR(uvc); } } } if( cur == first ) { - trie->minlen=chars; - trie->maxlen=chars; + trie->minlen = chars; + trie->maxlen = chars; } else if (chars < trie->minlen) { - trie->minlen=chars; + trie->minlen = chars; } else if (chars > trie->maxlen) { - trie->maxlen=chars; + trie->maxlen = chars; + } + if (OP( noper ) == EXACTFU_SS) { + /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ + if (trie->minlen > 1) + trie->minlen= 1; + } + if (OP( noper ) == EXACTFU_TRICKYFOLD) { + /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" + * - We assume that any such sequence might match a 2 byte string */ + if (trie->minlen > 2 ) + trie->minlen= 2; } } /* end first pass */ @@ -1730,6 +1756,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; if (OP(noper) != NOTHING) { for ( ; uc < e ; uc += len ) { @@ -1930,8 +1957,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ + STRLEN skiplen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + if ( OP(noper) != NOTHING ) { for ( ; uc < e ; uc += len ) { @@ -2568,7 +2597,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * optimizer doesn't reject these possibilities based on size constraints. * 2) These sequences are not currently correctly handled by the trie code * either, so it changes the joined node type to ops that are not handled - * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE. + * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. * 3) This is sufficient for the two Greek sequences (described below), but * the one involving the Sharp s (\xDF) needs more. The node type * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" @@ -2823,7 +2852,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * is. (I (khw) think it doesn't matter in regexec.c * for UTF patterns, but no need to change it */ if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_NO_TRIE; + OP(scan) = EXACTFU_TRICKYFOLD; } s += 6; /* We already know what this sequence is. Skip the rest of it */ @@ -3185,7 +3214,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode *first = (regnode *)NULL; regnode *last = (regnode *)NULL; regnode *tail = scan; - U8 optype = 0; + U8 trietype = 0; U32 count=0; #ifdef DEBUGGING @@ -3216,32 +3245,59 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* - step through the branches, cur represents each - branch, noper is the first thing to be matched - as part of that branch and noper_next is the - regnext() of that node. if noper is an EXACT - and noper_next is the same as scan (our current - position in the regex) then the EXACT branch is - a possible optimization target. Once we have - two or more consecutive such branches we can - create a trie of the EXACT's contents and stich - it in place. If the sequence represents all of - the branches we eliminate the whole thing and - replace it with a single TRIE. If it is a - subsequence then we need to stitch it in. This - means the first branch has to remain, and needs - to be repointed at the item on the branch chain - following the last branch optimized. This could - be either a BRANCH, in which case the - subsequence is internal, or it could be the - item following the branch sequence in which - case the subsequence is at the end. + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this /FOO[xyz]|BAR[pqr]/ + via a "jump trie" but we also support building with NOJUMPTRIE, + which restricts the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is a possible optimization + target. If we are building under NOJUMPTRIE then we require that noper_next + is the same as scan (our current position in the regex program). + + Once we have two or more consecutive such branches we can create a + trie of the EXACT's contents and stitch it in place into the program. + + If the sequence represents all of the branches in the alternation we + replace the entire thing with a single TRIE node. + + Otherwise when it is a subsequence we need to stitch it in place and + replace only the relevant branches. This means the first branch has + to remain as it is used by the alternation logic, and its next pointer, + and needs to be repointed at the item on the branch chain following + the last branch we have optimized away. + + This could be either a BRANCH, in which case the subsequence is internal, + or it could be the item following the branch sequence in which case the + subsequence is at the end (which does not necessarily mean the first node + is the start of the alternation). + + TRIE_TYPE(X) is a define which maps the optype to a trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFU_TRICKYFOLD | EXACTFU + EXACTFA | 0 + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + 0 ) /* dont use tail as the end marker for this traverse */ for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); #if defined(DEBUGGING) || defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); #endif @@ -3263,59 +3319,68 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); }); - if ( (((first && optype!=NOTHING) ? OP( noper ) == optype - : PL_regkind[ OP( noper ) ] == EXACT ) - || OP(noper) == NOTHING ) + + /* Is noper a trieable nodetype that can be merged with the + * current trie (if there is one)? */ + if ( noper_trietype + && + ( + ( noper_trietype == NOTHING ) + || + ( trietype == NOTHING ) + || + ( trietype == noper_trietype ) + ) #ifdef NOJUMPTRIE && noper_next == tail #endif && count < U16_MAX) { + /* Handle mergable triable node + * Either we are the first node in a new trieable sequence, + * in which case we do some bookkeeping, otherwise we update + * the end pointer. */ count++; - if ( !first || optype == NOTHING ) { - if (!first) first = cur; - optype = OP( noper ); + if ( !first ) { + first = cur; + trietype = noper_trietype; } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; last = cur; } - } else { -/* - Currently the trie logic handles case insensitive matching properly only - when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode - semantics). - - If/when this is fixed the following define can be swapped - in below to fully enable trie logic. - -#define TRIE_TYPE_IS_SAFE 1 - -Note that join_exact() assumes that the other types of EXACTFish nodes are not -used in tries, so that would have to be updated if this changed - -*/ -#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT) - - if ( last && TRIE_TYPE_IS_SAFE ) { + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can not be tried + * together with the current trie, or a non triable node */ + if ( last && trietype != NOTHING ) { + /* if last is set then we have found at least two triable branch + * sequences in a row of a similar trietype so we can turn them + * into a trie */ make_trie( pRExC_state, startbranch, first, cur, tail, count, - optype, depth+1 ); + trietype, depth+1 ); + last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ } - if ( PL_regkind[ OP( noper ) ] == EXACT + if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ + /* noper is triable, so we can start a new trie sequence */ count = 1; first = cur; - optype = OP( noper ); - } else { + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the current node is not triable then we have + * to reset the first information. */ count = 0; first = NULL; - optype = 0; + trietype = 0; } - last = NULL; - } - } + } /* end handle unmergable node */ + } /* loop over branches */ DEBUG_OPTIMISE_r({ regprop(RExC_rx, mysv, cur); PerlIO_printf( Perl_debug_log, @@ -3323,9 +3388,11 @@ used in tries, so that would have to be updated if this changed "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - - if ( last && TRIE_TYPE_IS_SAFE ) { - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); + if ( last && trietype != NOTHING ) { + /* the last branch of the sequence was part of a trie, + * so we have to construct it here outside of the loop + */ + made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3339,8 +3406,8 @@ used in tries, so that would have to be updated if this changed } } #endif - } - } + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ } /* do trie */ @@ -12034,7 +12101,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); diff --git a/regcomp.sym b/regcomp.sym index f0ee701991..e6be6ac292 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -92,14 +92,14 @@ BRANCH BRANCH, node 0 V ; Match this alternative, or the next... # not used BACK BACK, no 0 V ; Match "", "next" ptr points backward. -#*Literals +#*Literals - NOTE the relative ordering of these types is important do not change it EXACT EXACT, str ; Match this string (preceded by length). EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len). EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). -EXACTFU_NO_TRIE EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it +EXACTFU_TRICKYFOLD EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it EXACTFA EXACT, str ; Match this string (not guaranteed to be folded) using /iaa rules (w/len). #*Do nothing types @@ -303,13 +303,13 @@ /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -1187,58 +1187,61 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, #define DECL_TRIE_TYPE(scan) \ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ - trie_type = (scan->flags != EXACT) \ - ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \ - : (utf8_target ? trie_utf8 : trie_plain) - -#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ -uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ - switch (trie_type) { \ - case trie_utf8_fold: \ - if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - uscan += len; \ - len=0; \ - } else { \ - uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen ); \ - len = UTF8SKIP(uc); \ - foldlen -= UNISKIP( uvc ); \ - uscan = foldbuf + UNISKIP( uvc ); \ - } \ - break; \ - case trie_latin_utf8_fold: \ - if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - uscan += len; \ - len=0; \ - } else { \ - len = 1; \ - uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \ - foldlen -= UNISKIP( uvc ); \ - uscan = foldbuf + UNISKIP( uvc ); \ - } \ - break; \ - case trie_utf8: \ - uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ - break; \ - case trie_plain: \ - uvc = (UV)*uc; \ - len = 1; \ - } \ - if (uvc < 256) { \ - charid = trie->charmap[ uvc ]; \ - } \ - else { \ - charid = 0; \ - if (widecharmap) { \ - SV** const svpp = hv_fetch(widecharmap, \ - (char*)&uvc, sizeof(UV), 0); \ - if (svpp) \ - charid = (U16)SvIV(*svpp); \ - } \ - } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + +#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ +uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ + STRLEN skiplen; \ + switch (trie_type) { \ + case trie_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + len = UTF8SKIP(uc); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_utf8: \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + break; \ + case trie_plain: \ + uvc = (UV)*uc; \ + len = 1; \ + } \ + if (uvc < 256) { \ + charid = trie->charmap[ uvc ]; \ + } \ + else { \ + charid = 0; \ + if (widecharmap) { \ + SV** const svpp = hv_fetch(widecharmap, \ + (char*)&uvc, sizeof(UV), 0); \ + if (svpp) \ + charid = (U16)SvIV(*svpp); \ + } \ + } \ } STMT_END #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ @@ -1490,7 +1493,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: if (UTF_PATTERN || utf8_target) { utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -3267,16 +3270,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* In this case the charclass data is available inline so we can fail fast without a lot of extra overhead. */ - if (scan->flags == EXACT || !utf8_target) { - if(!ANYOF_BITMAP_TEST(scan, *locinput)) { - DEBUG_EXECUTE_r( - PerlIO_printf(Perl_debug_log, - "%*s %sfailed to match trie start class...%s\n", - REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) - ); - sayNO_SILENT; - /* NOTREACHED */ - } + if(!ANYOF_BITMAP_TEST(scan, *locinput)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + /* NOTREACHED */ } /* FALL THROUGH */ case TRIE: @@ -3334,9 +3335,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); U32 state = trie->startstate; - if (trie->bitmap && trie_type != trie_utf8_fold && - !TRIE_BITMAP_TEST(trie,*locinput) - ) { + if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) { if (trie->states[ state ].wordnum) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3671,7 +3670,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) goto do_exactf; case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: folder = foldEQ_latin1; fold_array = PL_fold_latin1; @@ -5085,7 +5084,7 @@ NULL case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; @@ -5241,7 +5240,7 @@ NULL case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; break; @@ -6035,7 +6034,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) goto do_exactf; case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6077,7 +6076,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) switch (OP(p)) { case EXACTF: folded = PL_fold[c]; break; case EXACTFA: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: folded = PL_fold_latin1[c]; break; case EXACTFL: folded = PL_fold_locale[c]; break; default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); diff --git a/regnodes.h b/regnodes.h index 994159342f..5561243882 100644 --- a/regnodes.h +++ b/regnodes.h @@ -62,7 +62,7 @@ #define EXACTFL 50 /* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */ #define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ #define EXACTFU_SS 52 /* 0x34 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFU_NO_TRIE 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */ +#define EXACTFU_TRICKYFOLD 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */ #define EXACTFA 54 /* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ #define NOTHING 55 /* 0x37 Match empty string. */ #define TAIL 56 /* 0x38 Match empty string. Can jump here from outside. */ @@ -223,7 +223,7 @@ EXTCONST U8 PL_regkind[] = { EXACT, /* EXACTFL */ EXACT, /* EXACTFU */ EXACT, /* EXACTFU_SS */ - EXACT, /* EXACTFU_NO_TRIE */ + EXACT, /* EXACTFU_TRICKYFOLD */ EXACT, /* EXACTFA */ NOTHING, /* NOTHING */ NOTHING, /* TAIL */ @@ -384,7 +384,7 @@ static const U8 regarglen[] = { 0, /* EXACTFL */ 0, /* EXACTFU */ 0, /* EXACTFU_SS */ - 0, /* EXACTFU_NO_TRIE */ + 0, /* EXACTFU_TRICKYFOLD */ 0, /* EXACTFA */ 0, /* NOTHING */ 0, /* TAIL */ @@ -502,7 +502,7 @@ static const char reg_off_by_arg[] = { 0, /* EXACTFL */ 0, /* EXACTFU */ 0, /* EXACTFU_SS */ - 0, /* EXACTFU_NO_TRIE */ + 0, /* EXACTFU_TRICKYFOLD */ 0, /* EXACTFA */ 0, /* NOTHING */ 0, /* TAIL */ @@ -625,7 +625,7 @@ EXTCONST char * const PL_reg_name[] = { "EXACTFL", /* 0x32 */ "EXACTFU", /* 0x33 */ "EXACTFU_SS", /* 0x34 */ - "EXACTFU_NO_TRIE", /* 0x35 */ + "EXACTFU_TRICKYFOLD", /* 0x35 */ "EXACTFA", /* 0x36 */ "NOTHING", /* 0x37 */ "TAIL", /* 0x38 */ diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t index 98fc3966c5..e2153e3186 100644 --- a/t/re/fold_grind.t +++ b/t/re/fold_grind.t @@ -735,7 +735,7 @@ foreach my $test (sort { numerically } keys %tests) { if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) { # Failed or debug; output the result $count++; - ok($res, $desc); + ok($res, "test $count - $desc"); } else { # Just count the test as passed $okays++; |