summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc4
-rw-r--r--embed.h4
-rw-r--r--proto.h14
-rw-r--r--regcomp.c269
-rw-r--r--regcomp.sym4
-rw-r--r--regexec.c145
-rw-r--r--regnodes.h10
-rw-r--r--t/re/fold_grind.t2
8 files changed, 262 insertions, 190 deletions
diff --git a/embed.fnc b/embed.fnc
index a7e004fe26..1394cff00f 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -604,7 +604,9 @@ Ap |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp
Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp
#ifdef PERL_IN_UTF8_C
sR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
-p |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
+#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+EXp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
#endif
#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
diff --git a/embed.h b/embed.h
index 541309e42e..bb9b76de16 100644
--- a/embed.h
+++ b/embed.h
@@ -983,6 +983,9 @@
#define reghop4 S_reghop4
# endif
# endif
+# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
+# endif
# if defined(PERL_OLD_COPY_ON_WRITE)
#define sv_setsv_cow(a,b) Perl_sv_setsv_cow(aTHX_ a,b)
# endif
@@ -1600,7 +1603,6 @@
#define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d)
# endif
# if defined(PERL_IN_UTF8_C)
-#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
#define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
#define is_utf8_char_slow S_is_utf8_char_slow
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index f01e7c3cdf..f696f7bc71 100644
--- a/proto.h
+++ b/proto.h
@@ -7115,12 +7115,6 @@ STATIC bool S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
#endif
#if defined(PERL_IN_UTF8_C)
-PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags)
- __attribute__nonnull__(pTHX_2)
- __attribute__nonnull__(pTHX_3);
-#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \
- assert(p); assert(lenp)
-
STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1)
@@ -7167,6 +7161,14 @@ PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *le
assert(p); assert(lenp)
#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags)
+ __attribute__nonnull__(pTHX_2)
+ __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \
+ assert(p); assert(lenp)
+
+#endif
#if defined(PERL_IN_UTIL_C)
STATIC bool S_ckwarn_common(pTHX_ U32 w);
STATIC const COP* S_closest_cop(pTHX_ const COP *cop, const OP *o)
diff --git a/regcomp.c b/regcomp.c
index a996049898..28301c31ce 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1362,44 +1362,47 @@ is the recommended Unicode-aware way of saying
*(d++) = uv;
*/
-#define TRIE_STORE_REVCHAR \
+#define TRIE_STORE_REVCHAR(val) \
STMT_START { \
if (UTF) { \
- SV *zlopp = newSV(2); \
+ SV *zlopp = newSV(7); /* XXX: optimize me */ \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
+ unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
av_push(revcharmap, zlopp); \
} else { \
- char ooooff = (char)uvc; \
+ char ooooff = (char)val; \
av_push(revcharmap, newSVpvn(&ooooff, 1)); \
} \
} STMT_END
-#define TRIE_READ_CHAR STMT_START { \
- wordlen++; \
- if ( UTF ) { \
- if ( folder ) { \
- if ( foldlen > 0 ) { \
- uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- scan += len; \
- len = 0; \
- } else { \
- len = UTF8SKIP(uc);\
- uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
- foldlen -= UNISKIP( uvc ); \
- scan = foldbuf + UNISKIP( uvc ); \
- } \
- } else { \
- uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
- } \
- } else { \
- uvc = (U32)*uc; \
- len = 1; \
- } \
+#define TRIE_READ_CHAR STMT_START { \
+ wordlen++; \
+ if ( UTF ) { \
+ /* if it is UTF then it is either already folded, or does not need folding */ \
+ uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
+ } \
+ else if (folder == PL_fold_latin1) { \
+ /* if we use this folder we have to obey unicode rules on latin-1 data */ \
+ if ( foldlen > 0 ) { \
+ uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ scan += len; \
+ len = 0; \
+ } else { \
+ len = 1; \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
+ skiplen = UNISKIP(uvc); \
+ foldlen -= skiplen; \
+ scan = foldbuf + skiplen; \
+ } \
+ } else { \
+ /* raw data, will be folded later if needed */ \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
} STMT_END
@@ -1518,10 +1521,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
switch (flags) {
case EXACT: break;
case EXACTFA:
+ case EXACTFU_SS:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
- default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
+ default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
}
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
@@ -1530,7 +1535,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (!(UTF && folder))
+ if (flags == EXACT)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -1590,6 +1595,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
const U8 * const e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ STRLEN skiplen = 0;
const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
STRLEN chars = 0;
@@ -1599,28 +1605,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
trie->minlen= 0;
continue;
}
- if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+ if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
regardless of encoding */
-
+ if (OP( noper ) == EXACTFU_SS) {
+ /* false positives are ok, so just set this */
+ TRIE_BITMAP_SET(trie,0xDF);
+ }
+ }
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
chars++;
if ( uvc < 256 ) {
+ if ( folder ) {
+ U8 folded= folder[ (U8) uvc ];
+ if ( !trie->charmap[ folded ] ) {
+ trie->charmap[ folded ]=( ++trie->uniquecharcount );
+ TRIE_STORE_REVCHAR( folded );
+ }
+ }
if ( !trie->charmap[ uvc ] ) {
trie->charmap[ uvc ]=( ++trie->uniquecharcount );
- if ( folder )
- trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
- TRIE_STORE_REVCHAR;
+ TRIE_STORE_REVCHAR( uvc );
}
if ( set_bit ) {
/* store the codepoint in the bitmap, and its folded
* equivalent. */
- TRIE_BITMAP_SET(trie,uvc);
+ TRIE_BITMAP_SET(trie, uvc);
/* store the folded codepoint */
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+ if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
if ( !UTF ) {
/* store first byte of utf8 representation of
@@ -1643,17 +1658,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
if ( !SvTRUE( *svpp ) ) {
sv_setiv( *svpp, ++trie->uniquecharcount );
- TRIE_STORE_REVCHAR;
+ TRIE_STORE_REVCHAR(uvc);
}
}
}
if( cur == first ) {
- trie->minlen=chars;
- trie->maxlen=chars;
+ trie->minlen = chars;
+ trie->maxlen = chars;
} else if (chars < trie->minlen) {
- trie->minlen=chars;
+ trie->minlen = chars;
} else if (chars > trie->maxlen) {
- trie->maxlen=chars;
+ trie->maxlen = chars;
+ }
+ if (OP( noper ) == EXACTFU_SS) {
+ /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
+ if (trie->minlen > 1)
+ trie->minlen= 1;
+ }
+ if (OP( noper ) == EXACTFU_TRICKYFOLD) {
+ /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
+ * - We assume that any such sequence might match a 2 byte string */
+ if (trie->minlen > 2 )
+ trie->minlen= 2;
}
} /* end first pass */
@@ -1726,6 +1752,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ STRLEN skiplen = 0;
if (OP(noper) != NOTHING) {
for ( ; uc < e ; uc += len ) {
@@ -1926,8 +1953,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
+ STRLEN skiplen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+
if ( OP(noper) != NOTHING ) {
for ( ; uc < e ; uc += len ) {
@@ -2564,7 +2593,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode
* optimizer doesn't reject these possibilities based on size constraints.
* 2) These sequences are not currently correctly handled by the trie code
* either, so it changes the joined node type to ops that are not handled
- * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
+ * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
* 3) This is sufficient for the two Greek sequences (described below), but
* the one involving the Sharp s (\xDF) needs more. The node type
* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
@@ -2819,7 +2848,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
* is. (I (khw) think it doesn't matter in regexec.c
* for UTF patterns, but no need to change it */
if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_NO_TRIE;
+ OP(scan) = EXACTFU_TRICKYFOLD;
}
s += 6; /* We already know what this sequence is. Skip
the rest of it */
@@ -3181,7 +3210,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
regnode *first = (regnode *)NULL;
regnode *last = (regnode *)NULL;
regnode *tail = scan;
- U8 optype = 0;
+ U8 trietype = 0;
U32 count=0;
#ifdef DEBUGGING
@@ -3212,32 +3241,59 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
/*
- step through the branches, cur represents each
- branch, noper is the first thing to be matched
- as part of that branch and noper_next is the
- regnext() of that node. if noper is an EXACT
- and noper_next is the same as scan (our current
- position in the regex) then the EXACT branch is
- a possible optimization target. Once we have
- two or more consecutive such branches we can
- create a trie of the EXACT's contents and stich
- it in place. If the sequence represents all of
- the branches we eliminate the whole thing and
- replace it with a single TRIE. If it is a
- subsequence then we need to stitch it in. This
- means the first branch has to remain, and needs
- to be repointed at the item on the branch chain
- following the last branch optimized. This could
- be either a BRANCH, in which case the
- subsequence is internal, or it could be the
- item following the branch sequence in which
- case the subsequence is at the end.
+ Step through the branches
+ cur represents each branch,
+ noper is the first thing to be matched as part of that branch
+ noper_next is the regnext() of that node.
+
+ We normally handle a case like this /FOO[xyz]|BAR[pqr]/
+ via a "jump trie" but we also support building with NOJUMPTRIE,
+ which restricts the trie logic to structures like /FOO|BAR/.
+
+ If noper is a trieable nodetype then the branch is a possible optimization
+ target. If we are building under NOJUMPTRIE then we require that noper_next
+ is the same as scan (our current position in the regex program).
+
+ Once we have two or more consecutive such branches we can create a
+ trie of the EXACT's contents and stitch it in place into the program.
+
+ If the sequence represents all of the branches in the alternation we
+ replace the entire thing with a single TRIE node.
+
+ Otherwise when it is a subsequence we need to stitch it in place and
+ replace only the relevant branches. This means the first branch has
+ to remain as it is used by the alternation logic, and its next pointer,
+ and needs to be repointed at the item on the branch chain following
+ the last branch we have optimized away.
+
+ This could be either a BRANCH, in which case the subsequence is internal,
+ or it could be the item following the branch sequence in which case the
+ subsequence is at the end (which does not necessarily mean the first node
+ is the start of the alternation).
+
+ TRIE_TYPE(X) is a define which maps the optype to a trietype.
+
+ optype | trietype
+ ----------------+-----------
+ NOTHING | NOTHING
+ EXACT | EXACT
+ EXACTFU | EXACTFU
+ EXACTFU_SS | EXACTFU
+ EXACTFU_TRICKYFOLD | EXACTFU
+ EXACTFA | 0
+
*/
+#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
+ ( EXACT == (X) ) ? EXACT : \
+ ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
+ 0 )
/* dont use tail as the end marker for this traverse */
for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
regnode * const noper = NEXTOPER( cur );
+ U8 noper_type = OP( noper );
+ U8 noper_trietype = TRIE_TYPE( noper_type );
#if defined(DEBUGGING) || defined(NOJUMPTRIE)
regnode * const noper_next = regnext( noper );
#endif
@@ -3259,59 +3315,68 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
});
- if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
- : PL_regkind[ OP( noper ) ] == EXACT )
- || OP(noper) == NOTHING )
+
+ /* Is noper a trieable nodetype that can be merged with the
+ * current trie (if there is one)? */
+ if ( noper_trietype
+ &&
+ (
+ ( noper_trietype == NOTHING )
+ ||
+ ( trietype == NOTHING )
+ ||
+ ( trietype == noper_trietype )
+ )
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
&& count < U16_MAX)
{
+ /* Handle mergable triable node
+ * Either we are the first node in a new trieable sequence,
+ * in which case we do some bookkeeping, otherwise we update
+ * the end pointer. */
count++;
- if ( !first || optype == NOTHING ) {
- if (!first) first = cur;
- optype = OP( noper );
+ if ( !first ) {
+ first = cur;
+ trietype = noper_trietype;
} else {
+ if ( trietype == NOTHING )
+ trietype = noper_trietype;
last = cur;
}
- } else {
-/*
- Currently the trie logic handles case insensitive matching properly only
- when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
- semantics).
-
- If/when this is fixed the following define can be swapped
- in below to fully enable trie logic.
-
-#define TRIE_TYPE_IS_SAFE 1
-
-Note that join_exact() assumes that the other types of EXACTFish nodes are not
-used in tries, so that would have to be updated if this changed
-
-*/
-#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
-
- if ( last && TRIE_TYPE_IS_SAFE ) {
+ } /* end handle mergable triable node */
+ else {
+ /* handle unmergable node -
+ * noper may either be a triable node which can not be tried
+ * together with the current trie, or a non triable node */
+ if ( last && trietype != NOTHING ) {
+ /* if last is set then we have found at least two triable branch
+ * sequences in a row of a similar trietype so we can turn them
+ * into a trie */
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
- optype, depth+1 );
+ trietype, depth+1 );
+ last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
}
- if ( PL_regkind[ OP( noper ) ] == EXACT
+ if ( noper_trietype
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
){
+ /* noper is triable, so we can start a new trie sequence */
count = 1;
first = cur;
- optype = OP( noper );
- } else {
+ trietype = noper_trietype;
+ } else if (first) {
+ /* if we already saw a first but the current node is not triable then we have
+ * to reset the first information. */
count = 0;
first = NULL;
- optype = 0;
+ trietype = 0;
}
- last = NULL;
- }
- }
+ } /* end handle unmergable node */
+ } /* loop over branches */
DEBUG_OPTIMISE_r({
regprop(RExC_rx, mysv, cur);
PerlIO_printf( Perl_debug_log,
@@ -3319,9 +3384,11 @@ used in tries, so that would have to be updated if this changed
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
-
- if ( last && TRIE_TYPE_IS_SAFE ) {
- made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
+ if ( last && trietype != NOTHING ) {
+ /* the last branch of the sequence was part of a trie,
+ * so we have to construct it here outside of the loop
+ */
+ made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
#ifdef TRIE_STUDY_OPT
if ( ((made == MADE_EXACT_TRIE &&
startbranch == first)
@@ -3335,8 +3402,8 @@ used in tries, so that would have to be updated if this changed
}
}
#endif
- }
- }
+ } /* end if ( last) */
+ } /* TRIE_MAXBUF is non zero */
} /* do trie */
@@ -12023,7 +12090,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
case EXACTFA:
case EXACTFU:
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
diff --git a/regcomp.sym b/regcomp.sym
index f0ee701991..e6be6ac292 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -92,14 +92,14 @@ BRANCH BRANCH, node 0 V ; Match this alternative, or the next...
# not used
BACK BACK, no 0 V ; Match "", "next" ptr points backward.
-#*Literals
+#*Literals - NOTE the relative ordering of these types is important do not change it
EXACT EXACT, str ; Match this string (preceded by length).
EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len).
EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len).
EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len).
EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len).
-EXACTFU_NO_TRIE EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it
+EXACTFU_TRICKYFOLD EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it
EXACTFA EXACT, str ; Match this string (not guaranteed to be folded) using /iaa rules (w/len).
#*Do nothing types
diff --git a/regexec.c b/regexec.c
index 76eb7f9bed..8ccb6f7032 100644
--- a/regexec.c
+++ b/regexec.c
@@ -303,13 +303,13 @@
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
we don't need this definition. */
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
-#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
+#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
#else
/* ... so we use this as its faster. */
#define IS_TEXT(rn) ( OP(rn)==EXACT )
-#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA)
+#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
@@ -1187,58 +1187,61 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
#define DECL_TRIE_TYPE(scan) \
const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
- trie_type = (scan->flags != EXACT) \
- ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
- : (utf8_target ? trie_utf8 : trie_plain)
-
-#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
-uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
- switch (trie_type) { \
- case trie_utf8_fold: \
- if ( foldlen>0 ) { \
- uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- uscan += len; \
- len=0; \
- } else { \
- uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen ); \
- len = UTF8SKIP(uc); \
- foldlen -= UNISKIP( uvc ); \
- uscan = foldbuf + UNISKIP( uvc ); \
- } \
- break; \
- case trie_latin_utf8_fold: \
- if ( foldlen>0 ) { \
- uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- uscan += len; \
- len=0; \
- } else { \
- len = 1; \
- uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
- foldlen -= UNISKIP( uvc ); \
- uscan = foldbuf + UNISKIP( uvc ); \
- } \
- break; \
- case trie_utf8: \
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
- break; \
- case trie_plain: \
- uvc = (UV)*uc; \
- len = 1; \
- } \
- if (uvc < 256) { \
- charid = trie->charmap[ uvc ]; \
- } \
- else { \
- charid = 0; \
- if (widecharmap) { \
- SV** const svpp = hv_fetch(widecharmap, \
- (char*)&uvc, sizeof(UV), 0); \
- if (svpp) \
- charid = (U16)SvIV(*svpp); \
- } \
- } \
+ trie_type = ((scan->flags == EXACT) \
+ ? (utf8_target ? trie_utf8 : trie_plain) \
+ : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
+
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
+uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
+ STRLEN skiplen; \
+ switch (trie_type) { \
+ case trie_utf8_fold: \
+ if ( foldlen>0 ) { \
+ uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ uscan += len; \
+ len=0; \
+ } else { \
+ uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
+ len = UTF8SKIP(uc); \
+ skiplen = UNISKIP( uvc ); \
+ foldlen -= skiplen; \
+ uscan = foldbuf + skiplen; \
+ } \
+ break; \
+ case trie_latin_utf8_fold: \
+ if ( foldlen>0 ) { \
+ uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ uscan += len; \
+ len=0; \
+ } else { \
+ len = 1; \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
+ skiplen = UNISKIP( uvc ); \
+ foldlen -= skiplen; \
+ uscan = foldbuf + skiplen; \
+ } \
+ break; \
+ case trie_utf8: \
+ uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
+ break; \
+ case trie_plain: \
+ uvc = (UV)*uc; \
+ len = 1; \
+ } \
+ if (uvc < 256) { \
+ charid = trie->charmap[ uvc ]; \
+ } \
+ else { \
+ charid = 0; \
+ if (widecharmap) { \
+ SV** const svpp = hv_fetch(widecharmap, \
+ (char*)&uvc, sizeof(UV), 0); \
+ if (svpp) \
+ charid = (U16)SvIV(*svpp); \
+ } \
+ } \
} STMT_END
#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
@@ -1490,7 +1493,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
goto do_exactf_utf8;
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU:
if (UTF_PATTERN || utf8_target) {
utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -3267,16 +3270,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
/* In this case the charclass data is available inline so
we can fail fast without a lot of extra overhead.
*/
- if (scan->flags == EXACT || !utf8_target) {
- if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
- DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %sfailed to match trie start class...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
- );
- sayNO_SILENT;
- /* NOTREACHED */
- }
+ if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s %sfailed to match trie start class...%s\n",
+ REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+ );
+ sayNO_SILENT;
+ /* NOTREACHED */
}
/* FALL THROUGH */
case TRIE:
@@ -3334,9 +3335,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
U32 state = trie->startstate;
- if (trie->bitmap && trie_type != trie_utf8_fold &&
- !TRIE_BITMAP_TEST(trie,*locinput)
- ) {
+ if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
if (trie->states[ state ].wordnum) {
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
@@ -3671,7 +3670,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
goto do_exactf;
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU:
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
@@ -5085,7 +5084,7 @@ NULL
case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
default: ST.c2 = ST.c1;
@@ -5241,7 +5240,7 @@ NULL
case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
default: ST.c2 = ST.c1; break;
@@ -6035,7 +6034,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
goto do_exactf;
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU:
utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -6077,7 +6076,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
switch (OP(p)) {
case EXACTF: folded = PL_fold[c]; break;
case EXACTFA:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: folded = PL_fold_latin1[c]; break;
case EXACTFL: folded = PL_fold_locale[c]; break;
default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
diff --git a/regnodes.h b/regnodes.h
index 994159342f..5561243882 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -62,7 +62,7 @@
#define EXACTFL 50 /* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */
#define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
#define EXACTFU_SS 52 /* 0x34 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
-#define EXACTFU_NO_TRIE 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */
+#define EXACTFU_TRICKYFOLD 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */
#define EXACTFA 54 /* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
#define NOTHING 55 /* 0x37 Match empty string. */
#define TAIL 56 /* 0x38 Match empty string. Can jump here from outside. */
@@ -223,7 +223,7 @@ EXTCONST U8 PL_regkind[] = {
EXACT, /* EXACTFL */
EXACT, /* EXACTFU */
EXACT, /* EXACTFU_SS */
- EXACT, /* EXACTFU_NO_TRIE */
+ EXACT, /* EXACTFU_TRICKYFOLD */
EXACT, /* EXACTFA */
NOTHING, /* NOTHING */
NOTHING, /* TAIL */
@@ -384,7 +384,7 @@ static const U8 regarglen[] = {
0, /* EXACTFL */
0, /* EXACTFU */
0, /* EXACTFU_SS */
- 0, /* EXACTFU_NO_TRIE */
+ 0, /* EXACTFU_TRICKYFOLD */
0, /* EXACTFA */
0, /* NOTHING */
0, /* TAIL */
@@ -502,7 +502,7 @@ static const char reg_off_by_arg[] = {
0, /* EXACTFL */
0, /* EXACTFU */
0, /* EXACTFU_SS */
- 0, /* EXACTFU_NO_TRIE */
+ 0, /* EXACTFU_TRICKYFOLD */
0, /* EXACTFA */
0, /* NOTHING */
0, /* TAIL */
@@ -625,7 +625,7 @@ EXTCONST char * const PL_reg_name[] = {
"EXACTFL", /* 0x32 */
"EXACTFU", /* 0x33 */
"EXACTFU_SS", /* 0x34 */
- "EXACTFU_NO_TRIE", /* 0x35 */
+ "EXACTFU_TRICKYFOLD", /* 0x35 */
"EXACTFA", /* 0x36 */
"NOTHING", /* 0x37 */
"TAIL", /* 0x38 */
diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t
index 98fc3966c5..e2153e3186 100644
--- a/t/re/fold_grind.t
+++ b/t/re/fold_grind.t
@@ -735,7 +735,7 @@ foreach my $test (sort { numerically } keys %tests) {
if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) {
# Failed or debug; output the result
$count++;
- ok($res, $desc);
+ ok($res, "test $count - $desc");
} else {
# Just count the test as passed
$okays++;