summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIlya Zakharevich <ilya@math.berkeley.edu>1998-08-20 11:19:50 -0400
committerGurusamy Sarathy <gsar@cpan.org>1998-09-23 05:42:41 +0000
commit0f5d15d614a56a8b655f35e4e8c6a0dba34da106 (patch)
treec3140d2320518142ad6ef0414aa648605d09b0a6
parent51882d45c6d36f86de12444364ecbbfe87b21ab4 (diff)
downloadperl-0f5d15d614a56a8b655f35e4e8c6a0dba34da106.tar.gz
patch to support computed regular subexpressions
Message-Id: <199808201919.PAA04692@monk.mps.ohio-state.edu> Subject: [5.005_5* PATCH] Postponed RE - now! p4raw-id: //depot/perl@1813
-rw-r--r--embedvar.h6
-rw-r--r--objXSUB.h4
-rw-r--r--pod/perlre.pod22
-rw-r--r--regcomp.c23
-rw-r--r--regexec.c162
-rwxr-xr-xt/op/pat.t23
-rw-r--r--t/op/re_tests1
-rw-r--r--thrdvar.h2
-rw-r--r--toke.c14
9 files changed, 238 insertions, 19 deletions
diff --git a/embedvar.h b/embedvar.h
index e1c9e9a550..8c1b786b97 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -59,8 +59,10 @@
#define PL_ofslen (PL_curinterp->Tofslen)
#define PL_op (PL_curinterp->Top)
#define PL_opsave (PL_curinterp->Topsave)
+#define PL_reg_call_cc (PL_curinterp->Treg_call_cc)
#define PL_reg_eval_set (PL_curinterp->Treg_eval_set)
#define PL_reg_flags (PL_curinterp->Treg_flags)
+#define PL_reg_re (PL_curinterp->Treg_re)
#define PL_reg_start_tmp (PL_curinterp->Treg_start_tmp)
#define PL_reg_start_tmpl (PL_curinterp->Treg_start_tmpl)
#define PL_regbol (PL_curinterp->Tregbol)
@@ -438,8 +440,10 @@
#define PL_Tofslen PL_ofslen
#define PL_Top PL_op
#define PL_Topsave PL_opsave
+#define PL_Treg_call_cc PL_reg_call_cc
#define PL_Treg_eval_set PL_reg_eval_set
#define PL_Treg_flags PL_reg_flags
+#define PL_Treg_re PL_reg_re
#define PL_Treg_start_tmp PL_reg_start_tmp
#define PL_Treg_start_tmpl PL_reg_start_tmpl
#define PL_Tregbol PL_regbol
@@ -563,8 +567,10 @@
#define PL_ofslen (thr->Tofslen)
#define PL_op (thr->Top)
#define PL_opsave (thr->Topsave)
+#define PL_reg_call_cc (thr->Treg_call_cc)
#define PL_reg_eval_set (thr->Treg_eval_set)
#define PL_reg_flags (thr->Treg_flags)
+#define PL_reg_re (thr->Treg_re)
#define PL_reg_start_tmp (thr->Treg_start_tmp)
#define PL_reg_start_tmpl (thr->Treg_start_tmpl)
#define PL_regbol (thr->Tregbol)
diff --git a/objXSUB.h b/objXSUB.h
index eee11787c5..1e6bc80f9e 100644
--- a/objXSUB.h
+++ b/objXSUB.h
@@ -93,6 +93,10 @@
#define PL_regbol pPerl->PL_regbol
#undef PL_regcc
#define PL_regcc pPerl->PL_regcc
+#undef PL_reg_call_cc
+#define PL_reg_call_cc pPerl->PL_reg_call_cc
+#undef PL_reg_re
+#define PL_reg_re pPerl->PL_reg_re
#undef PL_regcode
#define PL_regcode pPerl->PL_regcode
#undef PL_regcomp_parse
diff --git a/pod/perlre.pod b/pod/perlre.pod
index f696525155..6ecb7ad12a 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -399,6 +399,28 @@ checks, thus to allow $re in the above snippet to contain C<(?{})>
I<with tainting enabled>, one needs both C<use re 'eval'> and untaint
the $re.
+=item C<(?p{ code })>
+
+I<Very experimental> "postponed" regular subexpression. C<code> is evaluated
+at runtime, at the moment this subexpression may match. The result of
+evaluation is considered as a regular expression, and matched as if it
+were inserted instead of this construct.
+
+C<code> is not interpolated. Currently the rules to
+determine where the C<code> ends are somewhat convoluted.
+
+The following regular expression matches matching parenthesized group:
+
+ $re = qr{
+ \(
+ (?:
+ (?> [^()]+ ) # Non-parens without backtracking
+ |
+ (?p{ $re }) # Group with matching parens
+ )*
+ \)
+ }x;
+
=item C<(?E<gt>pattern)>
An "independent" subexpression. Matches the substring that a
diff --git a/regcomp.c b/regcomp.c
index 8db8b8a58a..07822329ed 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -709,6 +709,13 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
if (data)
data->flags |= SF_HAS_EVAL;
}
+ else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded */
+ if (flags & SCF_DO_SUBSTR) {
+ scan_commit(data);
+ data->longest = &(data->longest_float);
+ }
+ is_inf = is_inf_internal = 1;
+ }
/* Else: zero-length, ignore. */
scan = regnext(scan);
}
@@ -1082,6 +1089,7 @@ reg(I32 paren, I32 *flagp)
if (*PL_regcomp_parse == '?') {
U16 posflags = 0, negflags = 0;
U16 *flagsp = &posflags;
+ int logical = 0;
PL_regcomp_parse++;
paren = *PL_regcomp_parse++;
@@ -1112,6 +1120,10 @@ reg(I32 paren, I32 *flagp)
nextchar();
*flagp = TRYAGAIN;
return NULL;
+ case 'p':
+ logical = 1;
+ paren = *PL_regcomp_parse++;
+ /* FALL THROUGH */
case '{':
{
dTHR;
@@ -1160,6 +1172,13 @@ reg(I32 paren, I32 *flagp)
}
nextchar();
+ if (logical) {
+ ret = reg_node(LOGICAL);
+ if (!SIZE_ONLY)
+ ret->flags = 2;
+ regtail(ret, reganode(EVAL, n));
+ return ret;
+ }
return reganode(EVAL, n);
}
case '(':
@@ -1171,6 +1190,8 @@ reg(I32 paren, I32 *flagp)
I32 flag;
ret = reg_node(LOGICAL);
+ if (!SIZE_ONLY)
+ ret->flags = 1;
regtail(ret, reg(1, &flag));
goto insert_if;
}
@@ -3041,7 +3062,7 @@ regprop(SV *sv, regnode *o)
sv_catpvf(sv, "GROUPP%d", ARG(o));
break;
case LOGICAL:
- p = "LOGICAL";
+ sv_catpvf(sv, "LOGICAL[%d]", o->flags);
break;
case SUSPEND:
p = "SUSPEND";
diff --git a/regexec.c b/regexec.c
index 603120f72b..0627e2b154 100644
--- a/regexec.c
+++ b/regexec.c
@@ -73,6 +73,8 @@
*/
#include "EXTERN.h"
#include "perl.h"
+typedef MAGIC *my_magic;
+
#include "regcomp.h"
#define RF_tainted 1 /* tainted information used? */
@@ -201,6 +203,25 @@ regcppop(void)
return input;
}
+STATIC char *
+regcp_set_to(I32 ss)
+{
+ I32 tmp = PL_savestack_ix;
+
+ PL_savestack_ix = ss;
+ regcppop();
+ PL_savestack_ix = tmp;
+}
+
+typedef struct re_cc_state
+{
+ I32 ss;
+ regnode *node;
+ struct re_cc_state *prev;
+ CURCUR *cc;
+ regexp *re;
+} re_cc_state;
+
#define regcpblow(cp) LEAVE_SCOPE(cp)
/*
@@ -222,6 +243,18 @@ pregexec(register regexp *prog, char *stringarg, register char *strend,
regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
nosave ? 0 : REXEC_COPY_STR);
}
+
+STATIC void
+cache_re(regexp *prog)
+{
+ PL_regprecomp = prog->precomp; /* Needed for FAIL. */
+#ifdef DEBUGGING
+ PL_regprogram = prog->program;
+#endif
+ PL_regnpar = prog->nparens;
+ PL_regdata = prog->data;
+ PL_reg_re = prog;
+}
/*
- regexec_flags - match a regexp against a string
@@ -254,10 +287,9 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
cc.oldcc = 0;
PL_regcc = &cc;
- PL_regprecomp = prog->precomp; /* Needed for error messages. */
+ cache_re(prog);
#ifdef DEBUGGING
PL_regnarrate = PL_debug & 512;
- PL_regprogram = prog->program;
#endif
/* Be paranoid... */
@@ -282,7 +314,6 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
FAIL("corrupted regexp program");
}
- PL_regnpar = prog->nparens;
PL_reg_flags = 0;
PL_reg_eval_set = 0;
@@ -299,6 +330,9 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
/* see how far we have to get to not match where we matched before */
PL_regtill = startpos+minend;
+ /* We start without call_cc context. */
+ PL_reg_call_cc = 0;
+
/* If there is a "must appear" string, look for it. */
s = startpos;
if (!(flags & REXEC_CHECKED)
@@ -360,8 +394,6 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
(strend - startpos > 60 ? "..." : ""))
);
- PL_regdata = prog->data;
-
/* Simplest case: anchored match need be tried only once. */
/* [unless only anchor is BOL and multiline is set] */
if (prog->reganch & ROPT_ANCH) {
@@ -1552,15 +1584,91 @@ regmatch(regnode *prog)
ret = POPs;
PUTBACK;
+ PL_op = oop;
+ PL_curpad = ocurpad;
+ PL_curcop = ocurcop;
if (logical) {
- logical = 0;
+ if (logical == 2) { /* Postponed subexpression. */
+ regexp *re;
+ my_magic mg = Null(my_magic);
+ re_cc_state state;
+ CURCUR cctmp;
+ CHECKPOINT cp, lastcp;
+
+ if(SvROK(ret) || SvRMAGICAL(ret)) {
+ SV *sv = SvROK(ret) ? SvRV(ret) : ret;
+
+ if(SvMAGICAL(sv))
+ mg = mg_find(sv, 'r');
+ }
+ if (mg) {
+ re = (regexp *)mg->mg_obj;
+ ReREFCNT_inc(re);
+ }
+ else {
+ STRLEN len;
+ char *t = SvPV(ret, len);
+ PMOP pm;
+ char *oprecomp = PL_regprecomp;
+ I32 osize = PL_regsize;
+ I32 onpar = PL_regnpar;
+
+ pm.op_pmflags = 0;
+ re = CALLREGCOMP(t, t + len, &pm);
+ if (!(SvFLAGS(ret)
+ & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
+ sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
+ PL_regprecomp = oprecomp;
+ PL_regsize = osize;
+ PL_regnpar = onpar;
+ }
+ DEBUG_r(
+ PerlIO_printf(Perl_debug_log,
+ "Entering embedded `%s%.60s%s%s'\n",
+ PL_colors[0],
+ re->precomp,
+ PL_colors[1],
+ (strlen(re->precomp) > 60 ? "..." : ""))
+ );
+ state.node = next;
+ state.prev = PL_reg_call_cc;
+ state.cc = PL_regcc;
+ state.re = PL_reg_re;
+
+ cctmp.cur = 0;
+ cctmp.oldcc = 0;
+ PL_regcc = &cctmp;
+
+ cp = regcppush(0); /* Save *all* the positions. */
+ REGCP_SET;
+ cache_re(re);
+ state.ss = PL_savestack_ix;
+ *PL_reglastparen = 0;
+ PL_reg_call_cc = &state;
+ PL_reginput = locinput;
+ if (regmatch(re->program + 1)) {
+ ReREFCNT_dec(re);
+ regcpblow(cp);
+ sayYES;
+ }
+ DEBUG_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s failed...\n",
+ REPORT_CODE_OFF+PL_regindent*2, "")
+ );
+ ReREFCNT_dec(re);
+ REGCP_UNWIND;
+ regcppop();
+ PL_reg_call_cc = state.prev;
+ PL_regcc = state.cc;
+ PL_reg_re = state.re;
+ sayNO;
+ }
sw = SvTRUE(ret);
+ logical = 0;
}
else
sv_setsv(save_scalar(PL_replgv), ret);
- PL_op = oop;
- PL_curpad = ocurpad;
- PL_curcop = ocurcop;
break;
}
case OPEN:
@@ -1590,7 +1698,7 @@ regmatch(regnode *prog)
}
break;
case LOGICAL:
- logical = 1;
+ logical = scan->flags;
break;
case CURLYX: {
CURCUR cc;
@@ -2086,6 +2194,40 @@ regmatch(regnode *prog)
sayNO;
break;
case END:
+ if (PL_reg_call_cc) {
+ re_cc_state *cur_call_cc = PL_reg_call_cc;
+ CURCUR *cctmp = PL_regcc;
+ regexp *re = PL_reg_re;
+ CHECKPOINT cp, lastcp;
+
+ cp = regcppush(0); /* Save *all* the positions. */
+ REGCP_SET;
+ regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
+ the caller. */
+ PL_reginput = locinput; /* Make position available to
+ the callcc. */
+ cache_re(PL_reg_call_cc->re);
+ PL_regcc = PL_reg_call_cc->cc;
+ PL_reg_call_cc = PL_reg_call_cc->prev;
+ if (regmatch(cur_call_cc->node)) {
+ PL_reg_call_cc = cur_call_cc;
+ regcpblow(cp);
+ sayYES;
+ }
+ REGCP_UNWIND;
+ regcppop();
+ PL_reg_call_cc = cur_call_cc;
+ PL_regcc = cctmp;
+ PL_reg_re = re;
+ cache_re(re);
+
+ DEBUG_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s continuation failed...\n",
+ REPORT_CODE_OFF+PL_regindent*2, "")
+ );
+ sayNO;
+ }
if (locinput < PL_regtill)
sayNO; /* Cannot match: too short. */
/* Fall through */
diff --git a/t/op/pat.t b/t/op/pat.t
index aec5f31d73..f588734062 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4,7 +4,7 @@
# the format supported by op/regexp.t. If you want to add a test
# that does fit that format, add it to op/re_tests, not here.
-print "1..158\n";
+print "1..161\n";
BEGIN {
chdir 't' if -d 't';
@@ -363,6 +363,7 @@ sub matchit {
/xg;
}
+@ans = ();
push @ans, $res while $res = matchit;
print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne "1 1 1";
@@ -375,6 +376,26 @@ print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne $expect;
print "ok $test\n";
$test++;
+my $matched;
+$matched = qr/\((?:(?>[^()]+)|(?p{$matched}))*\)/;
+
+@ans = @ans1 = ();
+push(@ans, $res), push(@ans1, $&) while $res = m/$matched/g;
+
+print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne "1 1 1";
+print "ok $test\n";
+$test++;
+
+print "# ans1='@ans1'\n# expect='$expect'\nnot " if "@ans1" ne $expect;
+print "ok $test\n";
+$test++;
+
+@ans = m/$matched/g;
+
+print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne $expect;
+print "ok $test\n";
+$test++;
+
@ans = ('a/b' =~ m%(.*/)?(.*)%); # Stack may be bad
print "not " if "@ans" ne 'a/ b';
print "ok $test\n";
diff --git a/t/op/re_tests b/t/op/re_tests
index b6f654fb82..d1b1cecba8 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -485,3 +485,4 @@ b\Z a\nb y - -
b\z a\nb y - -
(^|x)(c) ca y $2 c
a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz x n - -
+a(?{$a=2;$b=3;($b)=$a})b yabz y $b 2
diff --git a/thrdvar.h b/thrdvar.h
index 4ca3ccbd50..c247dc4d04 100644
--- a/thrdvar.h
+++ b/thrdvar.h
@@ -153,6 +153,8 @@ PERLVAR(Tregnarrate, I32) /* from regexec.c */
PERLVAR(Tregprogram, regnode *) /* from regexec.c */
PERLVARI(Tregindent, int, 0) /* from regexec.c */
PERLVAR(Tregcc, CURCUR *) /* from regexec.c */
+PERLVAR(Treg_call_cc, struct re_cc_state *) /* from regexec.c */
+PERLVAR(Treg_re, regexp *) /* from regexec.c */
PERLVARI(Tregcompp, regcomp_t, FUNC_NAME_TO_PTR(pregcomp))
/* Pointer to RE compiler */
diff --git a/toke.c b/toke.c
index 719867b9e4..2583a42559 100644
--- a/toke.c
+++ b/toke.c
@@ -952,14 +952,16 @@ scan_const(char *start)
/* if we get here, we're not doing a transliteration */
- /* skip for regexp comments /(?#comment)/ */
+ /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
+ except for the last char, which will be done separately. */
else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
if (s[2] == '#') {
while (s < send && *s != ')')
*d++ = *s++;
- } else if (s[2] == '{') { /* This should march regcomp.c */
+ } else if (s[2] == '{'
+ || s[2] == 'p' && s[3] == '{') { /* This should march regcomp.c */
I32 count = 1;
- char *regparse = s + 3;
+ char *regparse = s + (s[2] == '{' ? 3 : 4);
char c;
while (count && (c = *regparse)) {
@@ -971,11 +973,9 @@ scan_const(char *start)
count--;
regparse++;
}
- if (*regparse == ')')
- regparse++;
- else
+ if (*regparse != ')')
yyerror("Sequence (?{...}) not terminated or not {}-balanced");
- while (s < regparse && *s != ')')
+ while (s < regparse)
*d++ = *s++;
}
}