diff options
-rw-r--r-- | embed.fnc | 4 | ||||
-rw-r--r-- | embed.h | 4 | ||||
-rw-r--r-- | proto.h | 14 | ||||
-rw-r--r-- | regcomp.c | 269 | ||||
-rw-r--r-- | regcomp.sym | 4 | ||||
-rw-r--r-- | regexec.c | 145 | ||||
-rw-r--r-- | regnodes.h | 10 | ||||
-rw-r--r-- | t/re/fold_grind.t | 2 |
8 files changed, 262 insertions, 190 deletions
@@ -604,7 +604,9 @@ Ap |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp #ifdef PERL_IN_UTF8_C sR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp -p |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags +#endif +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +EXp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags #endif #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s @@ -983,6 +983,9 @@ #define reghop4 S_reghop4 # endif # endif +# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d) +# endif # if defined(PERL_OLD_COPY_ON_WRITE) #define sv_setsv_cow(a,b) Perl_sv_setsv_cow(aTHX_ a,b) # endif @@ -1600,7 +1603,6 @@ #define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_UTF8_C) -#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d) #define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d) #define is_utf8_char_slow S_is_utf8_char_slow #define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c) @@ -7115,12 +7115,6 @@ STATIC bool S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U #endif #if defined(PERL_IN_UTF8_C) -PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags) - __attribute__nonnull__(pTHX_2) - __attribute__nonnull__(pTHX_3); -#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \ - assert(p); assert(lenp) - STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1) @@ -7167,6 +7161,14 @@ PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *le assert(p); assert(lenp) #endif +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \ + assert(p); assert(lenp) + +#endif #if defined(PERL_IN_UTIL_C) STATIC bool S_ckwarn_common(pTHX_ U32 w); STATIC const COP* S_closest_cop(pTHX_ const COP *cop, const OP *o) @@ -1362,44 +1362,47 @@ is the recommended Unicode-aware way of saying *(d++) = uv; */ -#define TRIE_STORE_REVCHAR \ +#define TRIE_STORE_REVCHAR(val) \ STMT_START { \ if (UTF) { \ - SV *zlopp = newSV(2); \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ av_push(revcharmap, zlopp); \ } else { \ - char ooooff = (char)uvc; \ + char ooooff = (char)val; \ av_push(revcharmap, newSVpvn(&ooooff, 1)); \ } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - if ( folder ) { \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = UTF8SKIP(uc);\ - uvc = to_utf8_fold( uc, foldbuf, &foldlen); \ - foldlen -= UNISKIP( uvc ); \ - scan = foldbuf + UNISKIP( uvc ); \ - } \ - } else { \ - uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ - } \ - } else { \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need folding */ \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* if we use this folder we have to obey unicode rules on latin-1 data */ \ + if ( foldlen > 0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + scan += len; \ + len = 0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP(uvc); \ + foldlen -= skiplen; \ + scan = foldbuf + skiplen; \ + } \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1518,10 +1521,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs switch (flags) { case EXACT: break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; case EXACTFL: folder = PL_fold_locale; break; - default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags ); + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); @@ -1530,7 +1535,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (!(UTF && folder)) + if (flags == EXACT) trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( trie->wordcount+1, sizeof(reg_trie_wordinfo)); @@ -1590,6 +1595,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * const e = uc + STR_LEN( noper ); STRLEN foldlen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; const U8 *scan = (U8*)NULL; U32 wordlen = 0; /* required init */ STRLEN chars = 0; @@ -1599,28 +1605,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->minlen= 0; continue; } - if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte regardless of encoding */ - + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie,0xDF); + } + } for ( ; uc < e ; uc += len ) { TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; chars++; if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } if ( !trie->charmap[ uvc ] ) { trie->charmap[ uvc ]=( ++trie->uniquecharcount ); - if ( folder ) - trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; - TRIE_STORE_REVCHAR; + TRIE_STORE_REVCHAR( uvc ); } if ( set_bit ) { /* store the codepoint in the bitmap, and its folded * equivalent. */ - TRIE_BITMAP_SET(trie,uvc); + TRIE_BITMAP_SET(trie, uvc); /* store the folded codepoint */ - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); if ( !UTF ) { /* store first byte of utf8 representation of @@ -1643,17 +1658,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !SvTRUE( *svpp ) ) { sv_setiv( *svpp, ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR; + TRIE_STORE_REVCHAR(uvc); } } } if( cur == first ) { - trie->minlen=chars; - trie->maxlen=chars; + trie->minlen = chars; + trie->maxlen = chars; } else if (chars < trie->minlen) { - trie->minlen=chars; + trie->minlen = chars; } else if (chars > trie->maxlen) { - trie->maxlen=chars; + trie->maxlen = chars; + } + if (OP( noper ) == EXACTFU_SS) { + /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ + if (trie->minlen > 1) + trie->minlen= 1; + } + if (OP( noper ) == EXACTFU_TRICKYFOLD) { + /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" + * - We assume that any such sequence might match a 2 byte string */ + if (trie->minlen > 2 ) + trie->minlen= 2; } } /* end first pass */ @@ -1726,6 +1752,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; if (OP(noper) != NOTHING) { for ( ; uc < e ; uc += len ) { @@ -1926,8 +1953,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ + STRLEN skiplen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + if ( OP(noper) != NOTHING ) { for ( ; uc < e ; uc += len ) { @@ -2564,7 +2593,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * optimizer doesn't reject these possibilities based on size constraints. * 2) These sequences are not currently correctly handled by the trie code * either, so it changes the joined node type to ops that are not handled - * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE. + * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. * 3) This is sufficient for the two Greek sequences (described below), but * the one involving the Sharp s (\xDF) needs more. The node type * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" @@ -2819,7 +2848,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * is. (I (khw) think it doesn't matter in regexec.c * for UTF patterns, but no need to change it */ if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_NO_TRIE; + OP(scan) = EXACTFU_TRICKYFOLD; } s += 6; /* We already know what this sequence is. Skip the rest of it */ @@ -3181,7 +3210,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode *first = (regnode *)NULL; regnode *last = (regnode *)NULL; regnode *tail = scan; - U8 optype = 0; + U8 trietype = 0; U32 count=0; #ifdef DEBUGGING @@ -3212,32 +3241,59 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* - step through the branches, cur represents each - branch, noper is the first thing to be matched - as part of that branch and noper_next is the - regnext() of that node. if noper is an EXACT - and noper_next is the same as scan (our current - position in the regex) then the EXACT branch is - a possible optimization target. Once we have - two or more consecutive such branches we can - create a trie of the EXACT's contents and stich - it in place. If the sequence represents all of - the branches we eliminate the whole thing and - replace it with a single TRIE. If it is a - subsequence then we need to stitch it in. This - means the first branch has to remain, and needs - to be repointed at the item on the branch chain - following the last branch optimized. This could - be either a BRANCH, in which case the - subsequence is internal, or it could be the - item following the branch sequence in which - case the subsequence is at the end. + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this /FOO[xyz]|BAR[pqr]/ + via a "jump trie" but we also support building with NOJUMPTRIE, + which restricts the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is a possible optimization + target. If we are building under NOJUMPTRIE then we require that noper_next + is the same as scan (our current position in the regex program). + + Once we have two or more consecutive such branches we can create a + trie of the EXACT's contents and stitch it in place into the program. + + If the sequence represents all of the branches in the alternation we + replace the entire thing with a single TRIE node. + + Otherwise when it is a subsequence we need to stitch it in place and + replace only the relevant branches. This means the first branch has + to remain as it is used by the alternation logic, and its next pointer, + and needs to be repointed at the item on the branch chain following + the last branch we have optimized away. + + This could be either a BRANCH, in which case the subsequence is internal, + or it could be the item following the branch sequence in which case the + subsequence is at the end (which does not necessarily mean the first node + is the start of the alternation). + + TRIE_TYPE(X) is a define which maps the optype to a trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFU_TRICKYFOLD | EXACTFU + EXACTFA | 0 + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + 0 ) /* dont use tail as the end marker for this traverse */ for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); #if defined(DEBUGGING) || defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); #endif @@ -3259,59 +3315,68 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); }); - if ( (((first && optype!=NOTHING) ? OP( noper ) == optype - : PL_regkind[ OP( noper ) ] == EXACT ) - || OP(noper) == NOTHING ) + + /* Is noper a trieable nodetype that can be merged with the + * current trie (if there is one)? */ + if ( noper_trietype + && + ( + ( noper_trietype == NOTHING ) + || + ( trietype == NOTHING ) + || + ( trietype == noper_trietype ) + ) #ifdef NOJUMPTRIE && noper_next == tail #endif && count < U16_MAX) { + /* Handle mergable triable node + * Either we are the first node in a new trieable sequence, + * in which case we do some bookkeeping, otherwise we update + * the end pointer. */ count++; - if ( !first || optype == NOTHING ) { - if (!first) first = cur; - optype = OP( noper ); + if ( !first ) { + first = cur; + trietype = noper_trietype; } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; last = cur; } - } else { -/* - Currently the trie logic handles case insensitive matching properly only - when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode - semantics). - - If/when this is fixed the following define can be swapped - in below to fully enable trie logic. - -#define TRIE_TYPE_IS_SAFE 1 - -Note that join_exact() assumes that the other types of EXACTFish nodes are not -used in tries, so that would have to be updated if this changed - -*/ -#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT) - - if ( last && TRIE_TYPE_IS_SAFE ) { + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can not be tried + * together with the current trie, or a non triable node */ + if ( last && trietype != NOTHING ) { + /* if last is set then we have found at least two triable branch + * sequences in a row of a similar trietype so we can turn them + * into a trie */ make_trie( pRExC_state, startbranch, first, cur, tail, count, - optype, depth+1 ); + trietype, depth+1 ); + last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ } - if ( PL_regkind[ OP( noper ) ] == EXACT + if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ + /* noper is triable, so we can start a new trie sequence */ count = 1; first = cur; - optype = OP( noper ); - } else { + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the current node is not triable then we have + * to reset the first information. */ count = 0; first = NULL; - optype = 0; + trietype = 0; } - last = NULL; - } - } + } /* end handle unmergable node */ + } /* loop over branches */ DEBUG_OPTIMISE_r({ regprop(RExC_rx, mysv, cur); PerlIO_printf( Perl_debug_log, @@ -3319,9 +3384,11 @@ used in tries, so that would have to be updated if this changed "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - - if ( last && TRIE_TYPE_IS_SAFE ) { - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); + if ( last && trietype != NOTHING ) { + /* the last branch of the sequence was part of a trie, + * so we have to construct it here outside of the loop + */ + made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3335,8 +3402,8 @@ used in tries, so that would have to be updated if this changed } } #endif - } - } + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ } /* do trie */ @@ -12023,7 +12090,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); diff --git a/regcomp.sym b/regcomp.sym index f0ee701991..e6be6ac292 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -92,14 +92,14 @@ BRANCH BRANCH, node 0 V ; Match this alternative, or the next... # not used BACK BACK, no 0 V ; Match "", "next" ptr points backward. -#*Literals +#*Literals - NOTE the relative ordering of these types is important do not change it EXACT EXACT, str ; Match this string (preceded by length). EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len). EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). -EXACTFU_NO_TRIE EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it +EXACTFU_TRICKYFOLD EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it EXACTFA EXACT, str ; Match this string (not guaranteed to be folded) using /iaa rules (w/len). #*Do nothing types @@ -303,13 +303,13 @@ /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -1187,58 +1187,61 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, #define DECL_TRIE_TYPE(scan) \ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ - trie_type = (scan->flags != EXACT) \ - ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \ - : (utf8_target ? trie_utf8 : trie_plain) - -#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ -uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ - switch (trie_type) { \ - case trie_utf8_fold: \ - if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - uscan += len; \ - len=0; \ - } else { \ - uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen ); \ - len = UTF8SKIP(uc); \ - foldlen -= UNISKIP( uvc ); \ - uscan = foldbuf + UNISKIP( uvc ); \ - } \ - break; \ - case trie_latin_utf8_fold: \ - if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - uscan += len; \ - len=0; \ - } else { \ - len = 1; \ - uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \ - foldlen -= UNISKIP( uvc ); \ - uscan = foldbuf + UNISKIP( uvc ); \ - } \ - break; \ - case trie_utf8: \ - uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ - break; \ - case trie_plain: \ - uvc = (UV)*uc; \ - len = 1; \ - } \ - if (uvc < 256) { \ - charid = trie->charmap[ uvc ]; \ - } \ - else { \ - charid = 0; \ - if (widecharmap) { \ - SV** const svpp = hv_fetch(widecharmap, \ - (char*)&uvc, sizeof(UV), 0); \ - if (svpp) \ - charid = (U16)SvIV(*svpp); \ - } \ - } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + +#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ +uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ + STRLEN skiplen; \ + switch (trie_type) { \ + case trie_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + len = UTF8SKIP(uc); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_utf8: \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + break; \ + case trie_plain: \ + uvc = (UV)*uc; \ + len = 1; \ + } \ + if (uvc < 256) { \ + charid = trie->charmap[ uvc ]; \ + } \ + else { \ + charid = 0; \ + if (widecharmap) { \ + SV** const svpp = hv_fetch(widecharmap, \ + (char*)&uvc, sizeof(UV), 0); \ + if (svpp) \ + charid = (U16)SvIV(*svpp); \ + } \ + } \ } STMT_END #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ @@ -1490,7 +1493,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: if (UTF_PATTERN || utf8_target) { utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -3267,16 +3270,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* In this case the charclass data is available inline so we can fail fast without a lot of extra overhead. */ - if (scan->flags == EXACT || !utf8_target) { - if(!ANYOF_BITMAP_TEST(scan, *locinput)) { - DEBUG_EXECUTE_r( - PerlIO_printf(Perl_debug_log, - "%*s %sfailed to match trie start class...%s\n", - REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) - ); - sayNO_SILENT; - /* NOTREACHED */ - } + if(!ANYOF_BITMAP_TEST(scan, *locinput)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + /* NOTREACHED */ } /* FALL THROUGH */ case TRIE: @@ -3334,9 +3335,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); U32 state = trie->startstate; - if (trie->bitmap && trie_type != trie_utf8_fold && - !TRIE_BITMAP_TEST(trie,*locinput) - ) { + if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) { if (trie->states[ state ].wordnum) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3671,7 +3670,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) goto do_exactf; case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: folder = foldEQ_latin1; fold_array = PL_fold_latin1; @@ -5085,7 +5084,7 @@ NULL case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; @@ -5241,7 +5240,7 @@ NULL case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; break; @@ -6035,7 +6034,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) goto do_exactf; case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6077,7 +6076,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) switch (OP(p)) { case EXACTF: folded = PL_fold[c]; break; case EXACTFA: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFU: folded = PL_fold_latin1[c]; break; case EXACTFL: folded = PL_fold_locale[c]; break; default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); diff --git a/regnodes.h b/regnodes.h index 994159342f..5561243882 100644 --- a/regnodes.h +++ b/regnodes.h @@ -62,7 +62,7 @@ #define EXACTFL 50 /* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */ #define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ #define EXACTFU_SS 52 /* 0x34 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFU_NO_TRIE 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */ +#define EXACTFU_TRICKYFOLD 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */ #define EXACTFA 54 /* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ #define NOTHING 55 /* 0x37 Match empty string. */ #define TAIL 56 /* 0x38 Match empty string. Can jump here from outside. */ @@ -223,7 +223,7 @@ EXTCONST U8 PL_regkind[] = { EXACT, /* EXACTFL */ EXACT, /* EXACTFU */ EXACT, /* EXACTFU_SS */ - EXACT, /* EXACTFU_NO_TRIE */ + EXACT, /* EXACTFU_TRICKYFOLD */ EXACT, /* EXACTFA */ NOTHING, /* NOTHING */ NOTHING, /* TAIL */ @@ -384,7 +384,7 @@ static const U8 regarglen[] = { 0, /* EXACTFL */ 0, /* EXACTFU */ 0, /* EXACTFU_SS */ - 0, /* EXACTFU_NO_TRIE */ + 0, /* EXACTFU_TRICKYFOLD */ 0, /* EXACTFA */ 0, /* NOTHING */ 0, /* TAIL */ @@ -502,7 +502,7 @@ static const char reg_off_by_arg[] = { 0, /* EXACTFL */ 0, /* EXACTFU */ 0, /* EXACTFU_SS */ - 0, /* EXACTFU_NO_TRIE */ + 0, /* EXACTFU_TRICKYFOLD */ 0, /* EXACTFA */ 0, /* NOTHING */ 0, /* TAIL */ @@ -625,7 +625,7 @@ EXTCONST char * const PL_reg_name[] = { "EXACTFL", /* 0x32 */ "EXACTFU", /* 0x33 */ "EXACTFU_SS", /* 0x34 */ - "EXACTFU_NO_TRIE", /* 0x35 */ + "EXACTFU_TRICKYFOLD", /* 0x35 */ "EXACTFA", /* 0x36 */ "NOTHING", /* 0x37 */ "TAIL", /* 0x38 */ diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t index 98fc3966c5..e2153e3186 100644 --- a/t/re/fold_grind.t +++ b/t/re/fold_grind.t @@ -735,7 +735,7 @@ foreach my $test (sort { numerically } keys %tests) { if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) { # Failed or debug; output the result $count++; - ok($res, $desc); + ok($res, "test $count - $desc"); } else { # Just count the test as passed $okays++; |