diff options
-rw-r--r-- | ext/re/re.pm | 18 | ||||
-rw-r--r-- | ext/re/t/lexical_debug.pl | 4 | ||||
-rw-r--r-- | ext/re/t/lexical_debug.t | 25 | ||||
-rw-r--r-- | ext/re/t/regop.pl | 2 | ||||
-rw-r--r-- | ext/re/t/regop.t | 101 | ||||
-rw-r--r-- | pp_ctl.c | 15 | ||||
-rw-r--r-- | regcomp.c | 60 | ||||
-rw-r--r-- | regcomp.h | 3 |
8 files changed, 148 insertions, 80 deletions
diff --git a/ext/re/re.pm b/ext/re/re.pm index c9ea5804e5..fe64d4ab50 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -23,16 +23,16 @@ re - Perl pragma to alter regular expression behaviour /foo${pat}bar/; # disallowed (with or without -T switch) } - use re 'debug'; # NOT lexically scoped (as others are) - /^(.*)$/s; # output debugging info during - # compile and run time + use re 'debug'; # output debugging info during + /^(.*)$/s; # compile and run time + use re 'debugcolor'; # same as 'debug', but with colored output ... use re qw(Debug All); # Finer tuned debugging options. - use re qw(Debug More); # Similarly not lexically scoped. - no re qw(Debug ALL); # Turn of all re dugging and unload the module. + use re qw(Debug More); + no re qw(Debug ALL); # Turn of all re dugging in this scope (We use $^X in these examples because it's tainted by default.) @@ -188,9 +188,9 @@ Enable TRIE_MORE and all execute compile and execute options. =back -The directive C<use re 'debug'> and its equivalents are I<not> lexically -scoped, as the other directives are. They have both compile-time and run-time -effects. +As of 5.9.5 the directive C<use re 'debug'> and its equivalents are +lexically scoped, as the other directives are. However they have both +compile-time and run-time effects. See L<perlmodlib/Pragmatic Modules>. @@ -297,7 +297,7 @@ sub bits { } else { require Carp; Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ", - join(", ",sort { $flags{$a} <=> $flags{$b} } keys %flags ) ); + join(", ",sort keys %flags ) ); } } _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS}); diff --git a/ext/re/t/lexical_debug.pl b/ext/re/t/lexical_debug.pl index c8b7c5bc67..6cdfa49b66 100644 --- a/ext/re/t/lexical_debug.pl +++ b/ext/re/t/lexical_debug.pl @@ -20,6 +20,10 @@ no re 'debug'; /fop/ and $count++; +use re 'debug'; +my $var='zoo|zil|zap'; +/($var)/ or $count++; + print "Count=$count\n"; diff --git a/ext/re/t/lexical_debug.t b/ext/re/t/lexical_debug.t index affa7c50fc..b6a3dcb8ab 100644 --- a/ext/re/t/lexical_debug.t +++ b/ext/re/t/lexical_debug.t @@ -11,20 +11,27 @@ BEGIN { } use strict; -require "./test.pl"; + +# must use a BEGIN or the prototypes wont be respected meaning + # tests could pass that shouldn't +BEGIN { require "./test.pl"; } my $out = runperl(progfile => "../ext/re/t/lexical_debug.pl", stderr => 1 ); -print "1..7\n"; +print "1..10\n"; # Each pattern will produce an EXACT node with a specific string in # it, so we will look for that. We can't just look for the string # alone as the string being matched against contains all of them. -ok( $out =~ /EXACT <foo>/, "Expect 'foo'"); -ok( $out !~ /EXACT <bar>/, "No 'bar'"); -ok( $out =~ /EXACT <baz>/, "Expect 'baz'"); -ok( $out !~ /EXACT <bop>/, "No 'bop'"); -ok( $out =~ /EXACT <fip>/, "Expect 'fip'"); -ok( $out !~ /EXACT <fop>/, "No 'baz'"); -ok( $out =~ /Count=6\n/,"Count is 6"); +ok( $out =~ /EXACT <foo>/, "Expect 'foo'" ); +ok( $out !~ /EXACT <bar>/, "No 'bar'" ); +ok( $out =~ /EXACT <baz>/, "Expect 'baz'" ); +ok( $out !~ /EXACT <bop>/, "No 'bop'" ); +ok( $out =~ /EXACT <fip>/, "Expect 'fip'" ); +ok( $out !~ /EXACT <fop>/, "No 'baz'" ); +ok( $out =~ /<zil>/, "Got 'zil'" ); # in a TRIE so no EXACT +ok( $out =~ /<zoo>/, "Got 'zoo'" ); # in a TRIE so no EXACT +ok( $out =~ /<zap>/, "Got 'zap'" ); # in a TRIE so no EXACT +ok( $out =~ /Count=7\n/, "Count is 7") + or diag($out); diff --git a/ext/re/t/regop.pl b/ext/re/t/regop.pl index 88f9f28cdb..8969335220 100644 --- a/ext/re/t/regop.pl +++ b/ext/re/t/regop.pl @@ -1,4 +1,4 @@ -use re Debug=>qw(DUMP EXECUTE OFFSETS); +use re Debug=>qw(DUMP EXECUTE OFFSETS TRIEC); my @tests=( XY => 'X(A|[B]Q||C|D)Y' , foobar => '[f][o][o][b][a][r]', diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t index be82dc925d..1ccf8b3eca 100644 --- a/ext/re/t/regop.t +++ b/ext/re/t/regop.t @@ -11,7 +11,7 @@ BEGIN { } use strict; -require "./test.pl"; +BEGIN { require "./test.pl"; } our $NUM_SECTS; chomp(my @strs= grep { !/^\s*\#/ } <DATA>); my $out = runperl(progfile => "../ext/re/t/regop.pl", stderr => 1 ); @@ -31,6 +31,7 @@ my $test= 1; foreach my $testout ( @tests ) { my ( $pattern )= $testout=~/Compiling REx "([^"]+)"/; ok( $pattern, "Pattern for test " . ($test++) ); + my $diaged; while (@strs) { local $_= shift @strs; last if !$_ @@ -38,7 +39,10 @@ foreach my $testout ( @tests ) { next if /^\s*#/; s/^\s+//; s/\s+$//; - ok( $testout=~/\Q$_\E/, "$_: /$pattern/" ); + ok( $testout=~/\Q$_\E/, "$_: /$pattern/" ) + or do { + !$diaged++ and diag("$_: /$pattern/\n$testout"); + }; } } @@ -85,7 +89,7 @@ __END__ #%MATCHED% #Freeing REx: "X(A|[B]Q||C|D)Y" Compiling REx "X(A|[B]Q||C|D)Y" -Start-Class:A-D] +[A-D] TRIE-EXACT <BQ> matched empty string @@ -95,9 +99,10 @@ Found anchored substr "X" at offset 0... Guessed: match at offset 0 checking floating minlen 2 -Words:5 -Unique:5 -States:6 +S:1/6 +W:5 +L:0/2 +C:5/5 %MATCHED% --- #Compiling REx "[f][o][o][b][a][r]" @@ -132,36 +137,60 @@ Freeing REx: "[f][o][o][b][a][r]" %FAILED% minlen 3 --- -#Compiling REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" -#size 20 nodes -# 1: EXACT <ABC>(3) -# 3: TRIE-EXACT(20) -# [Start:4 Words:6 Chars:24 Unique:7 States:10 Minlen:1 Maxlen:1 Start-Class:A-EGP] -# <ABCP> -# <ABCG> -# <ABCE> -# <ABCB> -# <ABCA> -# <ABCD> -# 19: TAIL(20) -# 20: END(0) -#minlen 4 -#Matching REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" against "ABCD" -# Setting an EVAL scope, savestack=140 -# 0 <> <ABCD> | 1: EXACT <ABC> -# 3 <ABC> <D> | 3: TRIE-EXACT -# only one match : #6 <ABCD> -# 4 <ABCD> <> | 20: END -#Match successful! -#POP STATE(1) -#%MATCHED% -#Freeing REx: "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" +# Compiling REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" +# Got 164 bytes for offset annotations. +# TRIE(NATIVE): W:6 C:24 Uq:7 Min:4 Max:4 +# Char : Match Base Ofs A B C P G E D +# State|--------------------------------------------------- +# # 1| @ 7 + 0[ 2 . . . . . .] +# # 2| @ 7 + 1[ . 3 . . . . .] +# # 3| @ 7 + 2[ . . 4 . . . .] +# # 4| @ A + 0[ 9 8 0 5 6 7 A] +# # 5| W 1 @ 0 +# # 6| W 2 @ 0 +# # 7| W 3 @ 0 +# # 8| W 4 @ 0 +# # 9| W 5 @ 0 +# # A| W 6 @ 0 +# Final program: +# 1: EXACT <ABC>(3) +# 3: TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP](20) +# <ABCP> +# <ABCG> +# <ABCE> +# <ABCB> +# <ABCA> +# <ABCD> +# 20: END(0) +# anchored "ABC" at 0 (checking anchored) minlen 4 +# Offsets: [20] +# 1:4[3] 3:4[15] 19:32[0] 20:34[0] +# Guessing start of match in sv for REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" against "ABCD" +# Found anchored substr "ABC" at offset 0... +# Guessed: match at offset 0 +# Matching REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" against "ABCD" +# 0 <> <ABCD> | 1:EXACT <ABC>(3) +# 3 <ABC> <D> | 3:TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP](20) +# 3 <ABC> <D> | State: 4 Accepted: 0 Charid: 7 CP: 44 After State: a +# 4 <ABCD> <> | State: a Accepted: 1 Charid: 6 CP: 0 After State: 0 +# got 1 possible matches +# only one match left: #6 <ABCD> +# 4 <ABCD> <> | 20:END(0) +# Match successful! +# %MATCHED% +# Freeing REx: "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" %MATCHED% EXACT <ABC> -Start-Class:A-EGP -only one match : #6 <ABCD> -Start:4 +TRIEC-EXACT +[A-EGP] +only one match left: #6 <ABCD> +S:4/10 +W:6 +L:1/1 +C:24/7 minlen 4 +(checking anchored) +anchored "ABC" at 0 --- #Compiling REx "(\.COM|\.EXE|\.BAT|\.CMD|\.VBS|\.VBE|\.JS|\.JSE|\.WSF|\.WSH|\.pyo|\.pyc|\.pyw|\.py)$" #size 48 nodes first at 3 @@ -202,12 +231,12 @@ minlen 4 #Freeing REx: "(\\.COM|\\.EXE|\\.BAT|\\.CMD|\\.VBS|\\.VBE|\\.JS|\\.JSE|\\."...... %MATCHED% floating ""$ at 3..4 (checking floating) -1:1[1] 3:2[1] 5:2[81] 45:83[1] 47:84[1] 48:85[0] -stclass "EXACTF <.>" minlen 3 +1:1[1] 3:2[1] 5:2[64] 45:83[1] 47:84[1] 48:85[0] +stclass EXACTF <.> minlen 3 Found floating substr ""$ at offset 30... Does not contradict STCLASS... Guessed: match at offset 26 -Matching stclass "EXACTF <.>" against ".exe" +Matching stclass EXACTF <.> against ".exe" --- #Compiling REx "[q]" #size 12 nodes Got 100 bytes for offset annotations. @@ -131,10 +131,19 @@ PP(pp_regcomp) if (!re || !re->precomp || re->prelen != (I32)len || memNE(re->precomp, t, len)) { + regexp_engine * eng = NULL; + if (re) { + eng = re->engine; ReREFCNT_dec(re); PM_SETRE(pm, NULL); /* crucial if regcomp aborts */ + } else if (PL_curcop->cop_hints_hash) { + SV *ptr = Perl_refcounted_he_fetch(aTHX_ PL_curcop->cop_hints_hash, 0, + "regcomp", 7, 0, 0); + if (ptr && SvIOK(ptr) && SvIV(ptr)) + eng = INT2PTR(regexp_engine*,SvIV(ptr)); } + if (PL_op->op_flags & OPf_SPECIAL) PL_reginterp_cnt = I32_MAX; /* Mark as safe. */ @@ -146,7 +155,11 @@ PP(pp_regcomp) if (pm->op_pmdynflags & PMdf_UTF8) t = (char*)bytes_to_utf8((U8*)t, &len); } - PM_SETRE(pm, CALLREGCOMP((char *)t, (char *)t + len, pm)); + if (eng) + PM_SETRE(pm, CALLREGCOMP_ENG(eng,(char *)t, (char *)t + len, pm)); + else + PM_SETRE(pm, CALLREGCOMP((char *)t, (char *)t + len, pm)); + if (!DO_UTF8(tmpstr) && (pm->op_pmdynflags & PMdf_UTF8)) Safefree(t); PL_reginterp_cnt = 0; /* XXXX Be extra paranoid - needed @@ -820,7 +820,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth) PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); PerlIO_printf( Perl_debug_log, "\n"); - for( state = 1 ; state < trie->laststate ; state++ ) { + for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); @@ -903,10 +903,13 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | PERL_PV_ESCAPE_FIRSTCHAR ) , - TRIE_LIST_ITEM(state,charid).forid, - (UV)TRIE_LIST_ITEM(state,charid).newstate - ); - } + TRIE_LIST_ITEM(state,charid).forid, + (UV)TRIE_LIST_ITEM(state,charid).newstate + ); + if (!(charid % 10)) + PerlIO_printf( Perl_debug_log, "\n%*s| ", + (depth * 2) + 14,""); + } } PerlIO_printf( Perl_debug_log, "\n"); } @@ -1098,10 +1101,11 @@ is the recommended Unicode-aware way of saying *(d++) = uv; */ -#define TRIE_STORE_REVCHAR \ +#define TRIE_STORE_REVCHAR \ STMT_START { \ - SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \ + SV *tmp = newSVpvs(""); \ if (UTF) SvUTF8_on(tmp); \ + Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \ av_push( TRIE_REVCHARMAP(trie), tmp ); \ } STMT_END @@ -1393,6 +1397,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN transcount = 1; + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using list compiler\n", + (int)depth * 2 + 2, "")); + Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state ); TRIE_LIST_NEW(1); next_alloc = 2; @@ -1455,13 +1463,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } /* end second pass */ - trie->laststate = next_alloc; + /* next alloc is the NEXT state to be allocated */ + trie->statecount = next_alloc; Renew( trie->states, next_alloc, reg_trie_state ); /* and now dump it out before we compress it */ DEBUG_TRIE_COMPILE_MORE_r( dump_trie_interim_list(trie,next_alloc,depth+1) - ); + ); Newxz( trie->trans, transcount ,reg_trie_trans ); { @@ -1570,7 +1579,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs use TRIE_NODENUM() to convert. */ - + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using table compiler\n", + (int)depth * 2 + 2, "")); Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1, reg_trie_trans ); @@ -1694,7 +1705,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U32 laststate = TRIE_NODENUM( next_alloc ); U32 state, charid; U32 pos = 0, zp=0; - trie->laststate = laststate; + trie->statecount = laststate; for ( state = 1 ; state < laststate ; state++ ) { U8 flag = 0; @@ -1731,7 +1742,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } } trie->lasttrans = pos + 1; - Renew( trie->states, laststate + 1, reg_trie_state); + Renew( trie->states, laststate, reg_trie_state); DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", @@ -1744,6 +1755,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } /* end table compress */ } + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + (int)depth * 2 + 2, "", + (UV)trie->statecount, + (UV)trie->lasttrans) + ); /* resize the trans array to remove unused space */ Renew( trie->trans, trie->lasttrans, reg_trie_trans); @@ -1799,12 +1816,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->startstate= 1; if ( trie->bitmap && !trie->widecharmap && !trie->jump ) { U32 state; - DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n", - (int)depth * 2 + 2, "", - (UV)trie->laststate) - ); - for ( state = 1 ; state < trie->laststate-1 ; state++ ) { + for ( state = 1 ; state < trie->statecount-1 ; state++ ) { U32 ofs = 0; I32 idx = -1; U32 count = 0; @@ -1981,7 +1993,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)]; U32 *q; const U32 ucharcount = trie->uniquecharcount; - const U32 numstates = trie->laststate; + const U32 numstates = trie->statecount; const U32 ubound = trie->lasttrans + ucharcount; U32 q_read = 0; U32 q_write = 0; @@ -2001,7 +2013,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode RExC_rx->data->data[ data_slot ] = (void*)aho; aho->trie=trie; aho->states=(reg_trie_state *)savepvn((const char*)trie->states, - (trie->laststate+1)*sizeof(reg_trie_state)); + numstates * sizeof(reg_trie_state)); Newxz( q, numstates, U32); Newxz( aho->fail, numstates, U32 ); aho->refcount = 1; @@ -2050,7 +2062,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode */ fail[ 0 ] = fail[ 1 ] = 0; DEBUG_TRIE_COMPILE_r({ - PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), ""); + PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0", + (int)(depth * 2), "", numstates + ); for( q_read=1; q_read<numstates; q_read++ ) { PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]); } @@ -3725,7 +3739,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) HV * const table = GvHV(PL_hintgv); if (table) { SV **ptr= hv_fetchs(table, "regcomp", FALSE); - if (ptr && SvIOK(*ptr)) { + if (ptr && SvIOK(*ptr) && SvIV(*ptr)) { const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr)); DEBUG_COMPILE_r({ PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", @@ -7703,7 +7717,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">", (UV)trie->startstate, - (IV)trie->laststate-1, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ (UV)trie->wordcount, (UV)trie->minlen, (UV)trie->maxlen, @@ -519,7 +519,8 @@ struct _reg_trie_data { for the node following a given word. */ U16 *nextword; /* optional 1 indexed array to support linked list of duplicate wordnums */ - U32 laststate; /* Build only */ + U32 statecount; /* Build only - number of states in the states array + (including the unused zero state) */ U32 wordcount; /* Build only */ #ifdef DEBUGGING STRLEN charcount; /* Build only */ |