diff options
author | Ilya Zakharevich <ilya@math.berkeley.edu> | 1998-08-20 11:19:50 -0400 |
---|---|---|
committer | Gurusamy Sarathy <gsar@cpan.org> | 1998-09-23 05:42:41 +0000 |
commit | 0f5d15d614a56a8b655f35e4e8c6a0dba34da106 (patch) | |
tree | c3140d2320518142ad6ef0414aa648605d09b0a6 | |
parent | 51882d45c6d36f86de12444364ecbbfe87b21ab4 (diff) | |
download | perl-0f5d15d614a56a8b655f35e4e8c6a0dba34da106.tar.gz |
patch to support computed regular subexpressions
Message-Id: <199808201919.PAA04692@monk.mps.ohio-state.edu>
Subject: [5.005_5* PATCH] Postponed RE - now!
p4raw-id: //depot/perl@1813
-rw-r--r-- | embedvar.h | 6 | ||||
-rw-r--r-- | objXSUB.h | 4 | ||||
-rw-r--r-- | pod/perlre.pod | 22 | ||||
-rw-r--r-- | regcomp.c | 23 | ||||
-rw-r--r-- | regexec.c | 162 | ||||
-rwxr-xr-x | t/op/pat.t | 23 | ||||
-rw-r--r-- | t/op/re_tests | 1 | ||||
-rw-r--r-- | thrdvar.h | 2 | ||||
-rw-r--r-- | toke.c | 14 |
9 files changed, 238 insertions, 19 deletions
diff --git a/embedvar.h b/embedvar.h index e1c9e9a550..8c1b786b97 100644 --- a/embedvar.h +++ b/embedvar.h @@ -59,8 +59,10 @@ #define PL_ofslen (PL_curinterp->Tofslen) #define PL_op (PL_curinterp->Top) #define PL_opsave (PL_curinterp->Topsave) +#define PL_reg_call_cc (PL_curinterp->Treg_call_cc) #define PL_reg_eval_set (PL_curinterp->Treg_eval_set) #define PL_reg_flags (PL_curinterp->Treg_flags) +#define PL_reg_re (PL_curinterp->Treg_re) #define PL_reg_start_tmp (PL_curinterp->Treg_start_tmp) #define PL_reg_start_tmpl (PL_curinterp->Treg_start_tmpl) #define PL_regbol (PL_curinterp->Tregbol) @@ -438,8 +440,10 @@ #define PL_Tofslen PL_ofslen #define PL_Top PL_op #define PL_Topsave PL_opsave +#define PL_Treg_call_cc PL_reg_call_cc #define PL_Treg_eval_set PL_reg_eval_set #define PL_Treg_flags PL_reg_flags +#define PL_Treg_re PL_reg_re #define PL_Treg_start_tmp PL_reg_start_tmp #define PL_Treg_start_tmpl PL_reg_start_tmpl #define PL_Tregbol PL_regbol @@ -563,8 +567,10 @@ #define PL_ofslen (thr->Tofslen) #define PL_op (thr->Top) #define PL_opsave (thr->Topsave) +#define PL_reg_call_cc (thr->Treg_call_cc) #define PL_reg_eval_set (thr->Treg_eval_set) #define PL_reg_flags (thr->Treg_flags) +#define PL_reg_re (thr->Treg_re) #define PL_reg_start_tmp (thr->Treg_start_tmp) #define PL_reg_start_tmpl (thr->Treg_start_tmpl) #define PL_regbol (thr->Tregbol) @@ -93,6 +93,10 @@ #define PL_regbol pPerl->PL_regbol #undef PL_regcc #define PL_regcc pPerl->PL_regcc +#undef PL_reg_call_cc +#define PL_reg_call_cc pPerl->PL_reg_call_cc +#undef PL_reg_re +#define PL_reg_re pPerl->PL_reg_re #undef PL_regcode #define PL_regcode pPerl->PL_regcode #undef PL_regcomp_parse diff --git a/pod/perlre.pod b/pod/perlre.pod index f696525155..6ecb7ad12a 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -399,6 +399,28 @@ checks, thus to allow $re in the above snippet to contain C<(?{})> I<with tainting enabled>, one needs both C<use re 'eval'> and untaint the $re. +=item C<(?p{ code })> + +I<Very experimental> "postponed" regular subexpression. C<code> is evaluated +at runtime, at the moment this subexpression may match. The result of +evaluation is considered as a regular expression, and matched as if it +were inserted instead of this construct. + +C<code> is not interpolated. Currently the rules to +determine where the C<code> ends are somewhat convoluted. + +The following regular expression matches matching parenthesized group: + + $re = qr{ + \( + (?: + (?> [^()]+ ) # Non-parens without backtracking + | + (?p{ $re }) # Group with matching parens + )* + \) + }x; + =item C<(?E<gt>pattern)> An "independent" subexpression. Matches the substring that a @@ -709,6 +709,13 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 if (data) data->flags |= SF_HAS_EVAL; } + else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(data); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + } /* Else: zero-length, ignore. */ scan = regnext(scan); } @@ -1082,6 +1089,7 @@ reg(I32 paren, I32 *flagp) if (*PL_regcomp_parse == '?') { U16 posflags = 0, negflags = 0; U16 *flagsp = &posflags; + int logical = 0; PL_regcomp_parse++; paren = *PL_regcomp_parse++; @@ -1112,6 +1120,10 @@ reg(I32 paren, I32 *flagp) nextchar(); *flagp = TRYAGAIN; return NULL; + case 'p': + logical = 1; + paren = *PL_regcomp_parse++; + /* FALL THROUGH */ case '{': { dTHR; @@ -1160,6 +1172,13 @@ reg(I32 paren, I32 *flagp) } nextchar(); + if (logical) { + ret = reg_node(LOGICAL); + if (!SIZE_ONLY) + ret->flags = 2; + regtail(ret, reganode(EVAL, n)); + return ret; + } return reganode(EVAL, n); } case '(': @@ -1171,6 +1190,8 @@ reg(I32 paren, I32 *flagp) I32 flag; ret = reg_node(LOGICAL); + if (!SIZE_ONLY) + ret->flags = 1; regtail(ret, reg(1, &flag)); goto insert_if; } @@ -3041,7 +3062,7 @@ regprop(SV *sv, regnode *o) sv_catpvf(sv, "GROUPP%d", ARG(o)); break; case LOGICAL: - p = "LOGICAL"; + sv_catpvf(sv, "LOGICAL[%d]", o->flags); break; case SUSPEND: p = "SUSPEND"; @@ -73,6 +73,8 @@ */ #include "EXTERN.h" #include "perl.h" +typedef MAGIC *my_magic; + #include "regcomp.h" #define RF_tainted 1 /* tainted information used? */ @@ -201,6 +203,25 @@ regcppop(void) return input; } +STATIC char * +regcp_set_to(I32 ss) +{ + I32 tmp = PL_savestack_ix; + + PL_savestack_ix = ss; + regcppop(); + PL_savestack_ix = tmp; +} + +typedef struct re_cc_state +{ + I32 ss; + regnode *node; + struct re_cc_state *prev; + CURCUR *cc; + regexp *re; +} re_cc_state; + #define regcpblow(cp) LEAVE_SCOPE(cp) /* @@ -222,6 +243,18 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, nosave ? 0 : REXEC_COPY_STR); } + +STATIC void +cache_re(regexp *prog) +{ + PL_regprecomp = prog->precomp; /* Needed for FAIL. */ +#ifdef DEBUGGING + PL_regprogram = prog->program; +#endif + PL_regnpar = prog->nparens; + PL_regdata = prog->data; + PL_reg_re = prog; +} /* - regexec_flags - match a regexp against a string @@ -254,10 +287,9 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, cc.oldcc = 0; PL_regcc = &cc; - PL_regprecomp = prog->precomp; /* Needed for error messages. */ + cache_re(prog); #ifdef DEBUGGING PL_regnarrate = PL_debug & 512; - PL_regprogram = prog->program; #endif /* Be paranoid... */ @@ -282,7 +314,6 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, FAIL("corrupted regexp program"); } - PL_regnpar = prog->nparens; PL_reg_flags = 0; PL_reg_eval_set = 0; @@ -299,6 +330,9 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, /* see how far we have to get to not match where we matched before */ PL_regtill = startpos+minend; + /* We start without call_cc context. */ + PL_reg_call_cc = 0; + /* If there is a "must appear" string, look for it. */ s = startpos; if (!(flags & REXEC_CHECKED) @@ -360,8 +394,6 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, (strend - startpos > 60 ? "..." : "")) ); - PL_regdata = prog->data; - /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ if (prog->reganch & ROPT_ANCH) { @@ -1552,15 +1584,91 @@ regmatch(regnode *prog) ret = POPs; PUTBACK; + PL_op = oop; + PL_curpad = ocurpad; + PL_curcop = ocurcop; if (logical) { - logical = 0; + if (logical == 2) { /* Postponed subexpression. */ + regexp *re; + my_magic mg = Null(my_magic); + re_cc_state state; + CURCUR cctmp; + CHECKPOINT cp, lastcp; + + if(SvROK(ret) || SvRMAGICAL(ret)) { + SV *sv = SvROK(ret) ? SvRV(ret) : ret; + + if(SvMAGICAL(sv)) + mg = mg_find(sv, 'r'); + } + if (mg) { + re = (regexp *)mg->mg_obj; + ReREFCNT_inc(re); + } + else { + STRLEN len; + char *t = SvPV(ret, len); + PMOP pm; + char *oprecomp = PL_regprecomp; + I32 osize = PL_regsize; + I32 onpar = PL_regnpar; + + pm.op_pmflags = 0; + re = CALLREGCOMP(t, t + len, &pm); + if (!(SvFLAGS(ret) + & (SVs_TEMP | SVs_PADTMP | SVf_READONLY))) + sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0); + PL_regprecomp = oprecomp; + PL_regsize = osize; + PL_regnpar = onpar; + } + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "Entering embedded `%s%.60s%s%s'\n", + PL_colors[0], + re->precomp, + PL_colors[1], + (strlen(re->precomp) > 60 ? "..." : "")) + ); + state.node = next; + state.prev = PL_reg_call_cc; + state.cc = PL_regcc; + state.re = PL_reg_re; + + cctmp.cur = 0; + cctmp.oldcc = 0; + PL_regcc = &cctmp; + + cp = regcppush(0); /* Save *all* the positions. */ + REGCP_SET; + cache_re(re); + state.ss = PL_savestack_ix; + *PL_reglastparen = 0; + PL_reg_call_cc = &state; + PL_reginput = locinput; + if (regmatch(re->program + 1)) { + ReREFCNT_dec(re); + regcpblow(cp); + sayYES; + } + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "%*s failed...\n", + REPORT_CODE_OFF+PL_regindent*2, "") + ); + ReREFCNT_dec(re); + REGCP_UNWIND; + regcppop(); + PL_reg_call_cc = state.prev; + PL_regcc = state.cc; + PL_reg_re = state.re; + sayNO; + } sw = SvTRUE(ret); + logical = 0; } else sv_setsv(save_scalar(PL_replgv), ret); - PL_op = oop; - PL_curpad = ocurpad; - PL_curcop = ocurcop; break; } case OPEN: @@ -1590,7 +1698,7 @@ regmatch(regnode *prog) } break; case LOGICAL: - logical = 1; + logical = scan->flags; break; case CURLYX: { CURCUR cc; @@ -2086,6 +2194,40 @@ regmatch(regnode *prog) sayNO; break; case END: + if (PL_reg_call_cc) { + re_cc_state *cur_call_cc = PL_reg_call_cc; + CURCUR *cctmp = PL_regcc; + regexp *re = PL_reg_re; + CHECKPOINT cp, lastcp; + + cp = regcppush(0); /* Save *all* the positions. */ + REGCP_SET; + regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of + the caller. */ + PL_reginput = locinput; /* Make position available to + the callcc. */ + cache_re(PL_reg_call_cc->re); + PL_regcc = PL_reg_call_cc->cc; + PL_reg_call_cc = PL_reg_call_cc->prev; + if (regmatch(cur_call_cc->node)) { + PL_reg_call_cc = cur_call_cc; + regcpblow(cp); + sayYES; + } + REGCP_UNWIND; + regcppop(); + PL_reg_call_cc = cur_call_cc; + PL_regcc = cctmp; + PL_reg_re = re; + cache_re(re); + + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "%*s continuation failed...\n", + REPORT_CODE_OFF+PL_regindent*2, "") + ); + sayNO; + } if (locinput < PL_regtill) sayNO; /* Cannot match: too short. */ /* Fall through */ diff --git a/t/op/pat.t b/t/op/pat.t index aec5f31d73..f588734062 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4,7 +4,7 @@ # the format supported by op/regexp.t. If you want to add a test # that does fit that format, add it to op/re_tests, not here. -print "1..158\n"; +print "1..161\n"; BEGIN { chdir 't' if -d 't'; @@ -363,6 +363,7 @@ sub matchit { /xg; } +@ans = (); push @ans, $res while $res = matchit; print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne "1 1 1"; @@ -375,6 +376,26 @@ print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne $expect; print "ok $test\n"; $test++; +my $matched; +$matched = qr/\((?:(?>[^()]+)|(?p{$matched}))*\)/; + +@ans = @ans1 = (); +push(@ans, $res), push(@ans1, $&) while $res = m/$matched/g; + +print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne "1 1 1"; +print "ok $test\n"; +$test++; + +print "# ans1='@ans1'\n# expect='$expect'\nnot " if "@ans1" ne $expect; +print "ok $test\n"; +$test++; + +@ans = m/$matched/g; + +print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne $expect; +print "ok $test\n"; +$test++; + @ans = ('a/b' =~ m%(.*/)?(.*)%); # Stack may be bad print "not " if "@ans" ne 'a/ b'; print "ok $test\n"; diff --git a/t/op/re_tests b/t/op/re_tests index b6f654fb82..d1b1cecba8 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -485,3 +485,4 @@ b\Z a\nb y - - b\z a\nb y - - (^|x)(c) ca y $2 c a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz x n - - +a(?{$a=2;$b=3;($b)=$a})b yabz y $b 2 @@ -153,6 +153,8 @@ PERLVAR(Tregnarrate, I32) /* from regexec.c */ PERLVAR(Tregprogram, regnode *) /* from regexec.c */ PERLVARI(Tregindent, int, 0) /* from regexec.c */ PERLVAR(Tregcc, CURCUR *) /* from regexec.c */ +PERLVAR(Treg_call_cc, struct re_cc_state *) /* from regexec.c */ +PERLVAR(Treg_re, regexp *) /* from regexec.c */ PERLVARI(Tregcompp, regcomp_t, FUNC_NAME_TO_PTR(pregcomp)) /* Pointer to RE compiler */ @@ -952,14 +952,16 @@ scan_const(char *start) /* if we get here, we're not doing a transliteration */ - /* skip for regexp comments /(?#comment)/ */ + /* skip for regexp comments /(?#comment)/ and code /(?{code})/, + except for the last char, which will be done separately. */ else if (*s == '(' && PL_lex_inpat && s[1] == '?') { if (s[2] == '#') { while (s < send && *s != ')') *d++ = *s++; - } else if (s[2] == '{') { /* This should march regcomp.c */ + } else if (s[2] == '{' + || s[2] == 'p' && s[3] == '{') { /* This should march regcomp.c */ I32 count = 1; - char *regparse = s + 3; + char *regparse = s + (s[2] == '{' ? 3 : 4); char c; while (count && (c = *regparse)) { @@ -971,11 +973,9 @@ scan_const(char *start) count--; regparse++; } - if (*regparse == ')') - regparse++; - else + if (*regparse != ')') yyerror("Sequence (?{...}) not terminated or not {}-balanced"); - while (s < regparse && *s != ')') + while (s < regparse) *d++ = *s++; } } |