summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2006-11-13 00:29:41 +0100
committerSteve Peters <steve@fisharerojo.org>2006-11-13 02:19:12 +0000
commitde8c53012b7e614137ab875e0d58a92474b317ce (patch)
treecc24fc09cc1af2e140a8d29a1bcd652cba6c4b00
parent7834bb7eff465724a885b368420973bce2d27483 (diff)
downloadperl-de8c53012b7e614137ab875e0d58a92474b317ce.tar.gz
Regex Utility Functions and Substituion Fix (XML::Twig core dump)
Message-ID: <9b18b3110611121429g1fc9d6c1t4007dc711f9e8396@mail.gmail.com> Plus a couple tweaks to ext/re/re.pm and t/op/pat.t to those patches to apply cleanly. p4raw-id: //depot/perl@29252
-rw-r--r--MANIFEST1
-rw-r--r--embed.fnc2
-rw-r--r--embed.h8
-rw-r--r--ext/re/re.pm341
-rw-r--r--ext/re/re.xs118
-rw-r--r--ext/re/re_top.h1
-rw-r--r--ext/re/t/re_funcs.t24
-rw-r--r--lib/ExtUtils/ParseXS.pm3
-rw-r--r--perl.h3
-rw-r--r--pod/perlreguts.pod30
-rw-r--r--pp.c2
-rw-r--r--pp_hot.c10
-rw-r--r--proto.h7
-rw-r--r--regcomp.c119
-rw-r--r--regcomp.h1
-rw-r--r--regexp.h4
-rw-r--r--sv.c95
-rwxr-xr-xt/op/pat.t14
18 files changed, 550 insertions, 233 deletions
diff --git a/MANIFEST b/MANIFEST
index 9987b37548..732627320d 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -987,6 +987,7 @@ ext/re/t/lexical_debug.t test that lexical re 'debug' works
ext/re/t/regop.pl generate debug output for various patterns
ext/re/t/regop.t test RE optimizations by scraping debug output
ext/re/t/re.t see if re pragma works
+ext/re/t/re_funcs.t see if exportable funcs from re.pm work
ext/Safe/t/safe1.t See if Safe works
ext/Safe/t/safe2.t See if Safe works
ext/Safe/t/safe3.t See if Safe works
diff --git a/embed.fnc b/embed.fnc
index 5b254b5abf..3ac4bc48d5 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -668,6 +668,7 @@ Ap |I32 |pregexec |NN regexp* prog|NN char* stringarg \
|NN char* strend|NN char* strbeg|I32 minend \
|NN SV* screamer|U32 nosave
Ap |void |pregfree |NULLOK struct regexp* r
+p |char * |reg_stringify |NN MAGIC *mg|NULLOK STRLEN *lp|NULLOK U32 *flags|NULLOK I32 *haseval
#if defined(USE_ITHREADS)
Ap |regexp*|regdupe |NN const regexp* r|NN CLONE_PARAMS* param
#endif
@@ -1429,7 +1430,6 @@ s |void |utf8_mg_pos_cache_update|NN SV *sv|NN MAGIC **mgp \
|STRLEN byte|STRLEN utf8|STRLEN blen
s |STRLEN |sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \
|NN const U8 *end|STRLEN endu
-s |char * |stringify_regexp|NN SV *sv|NN MAGIC *mg|NULLOK STRLEN *lp
sn |char * |F0convert |NV nv|NN char *endbuf|NN STRLEN *len
# if defined(PERL_OLD_COPY_ON_WRITE)
sM |void |sv_release_COW |NN SV *sv|NN const char *pvx|STRLEN len|NN SV *after
diff --git a/embed.h b/embed.h
index fea5b27915..c4bf329a09 100644
--- a/embed.h
+++ b/embed.h
@@ -680,6 +680,9 @@
#define regclass_swash Perl_regclass_swash
#define pregexec Perl_pregexec
#define pregfree Perl_pregfree
+#ifdef PERL_CORE
+#define reg_stringify Perl_reg_stringify
+#endif
#if defined(USE_ITHREADS)
#define regdupe Perl_regdupe
#endif
@@ -1429,7 +1432,6 @@
#define sv_pos_u2b_cached S_sv_pos_u2b_cached
#define utf8_mg_pos_cache_update S_utf8_mg_pos_cache_update
#define sv_pos_b2u_midway S_sv_pos_b2u_midway
-#define stringify_regexp S_stringify_regexp
#define F0convert S_F0convert
#endif
# if defined(PERL_OLD_COPY_ON_WRITE)
@@ -2887,6 +2889,9 @@
#define regclass_swash(a,b,c,d,e) Perl_regclass_swash(aTHX_ a,b,c,d,e)
#define pregexec(a,b,c,d,e,f,g) Perl_pregexec(aTHX_ a,b,c,d,e,f,g)
#define pregfree(a) Perl_pregfree(aTHX_ a)
+#ifdef PERL_CORE
+#define reg_stringify(a,b,c,d) Perl_reg_stringify(aTHX_ a,b,c,d)
+#endif
#if defined(USE_ITHREADS)
#define regdupe(a,b) Perl_regdupe(aTHX_ a,b)
#endif
@@ -3632,7 +3637,6 @@
#define sv_pos_u2b_cached(a,b,c,d,e,f,g) S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g)
#define utf8_mg_pos_cache_update(a,b,c,d,e) S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e)
#define sv_pos_b2u_midway(a,b,c,d) S_sv_pos_b2u_midway(aTHX_ a,b,c,d)
-#define stringify_regexp(a,b,c) S_stringify_regexp(aTHX_ a,b,c)
#define F0convert S_F0convert
#endif
# if defined(PERL_OLD_COPY_ON_WRITE)
diff --git a/ext/re/re.pm b/ext/re/re.pm
index dfdfe8668a..e9d710f1b4 100644
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -1,6 +1,170 @@
package re;
-our $VERSION = 0.06_03;
+# pragma for controlling the regex engine
+use strict;
+use warnings;
+
+our $VERSION = "0.06_03";
+our @ISA = qw(Exporter);
+our @EXPORT_OK = qw(is_regexp regexp_pattern);
+our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
+
+# *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING ***
+#
+# If you modify these values see comment below!
+
+my %bitmask = (
+ taint => 0x00100000, # HINT_RE_TAINT
+ eval => 0x00200000, # HINT_RE_EVAL
+);
+
+# - File::Basename contains a literal for 'taint' as a fallback. If
+# taint is changed here, File::Basename must be updated as well.
+#
+# - ExtUtils::ParseXS uses a hardcoded
+# BEGIN { $^H |= 0x00200000 }
+# in it to allow re.xs to be built. So if 'eval' is changed here then
+# ExtUtils::ParseXS must be changed as well.
+#
+# *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING ***
+
+sub setcolor {
+ eval { # Ignore errors
+ require Term::Cap;
+
+ my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
+ my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
+ my @props = split /,/, $props;
+ my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
+
+ $colors =~ s/\0//g;
+ $ENV{PERL_RE_COLORS} = $colors;
+ };
+ if ($@) {
+ $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
+ }
+
+}
+
+my %flags = (
+ COMPILE => 0x0000FF,
+ PARSE => 0x000001,
+ OPTIMISE => 0x000002,
+ TRIEC => 0x000004,
+ DUMP => 0x000008,
+
+ EXECUTE => 0x00FF00,
+ INTUIT => 0x000100,
+ MATCH => 0x000200,
+ TRIEE => 0x000400,
+
+ EXTRA => 0xFF0000,
+ TRIEM => 0x010000,
+ OFFSETS => 0x020000,
+ OFFSETSDBG => 0x040000,
+ STATE => 0x080000,
+ OPTIMISEM => 0x100000,
+ STACK => 0x280000,
+);
+$flags{ALL} = -1;
+$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
+$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE};
+$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
+$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
+$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
+
+my $installed;
+my $installed_error;
+
+sub _do_install {
+ if ( ! defined($installed) ) {
+ require XSLoader;
+ $installed = eval { XSLoader::load('re', $VERSION) } || 0;
+ $installed_error = $@;
+ }
+}
+
+sub _load_unload {
+ my ($on)= @_;
+ if ($on) {
+ _do_install();
+ if ( ! $installed ) {
+ die "'re' not installed!? ($installed_error)";
+ } else {
+ # We call install() every time, as if we didn't, we wouldn't
+ # "see" any changes to the color environment var since
+ # the last time it was called.
+
+ # install() returns an integer, which if casted properly
+ # in C resolves to a structure containing the regex
+ # hooks. Setting it to a random integer will guarantee
+ # segfaults.
+ $^H{regcomp} = install();
+ }
+ } else {
+ delete $^H{regcomp};
+ }
+}
+
+sub bits {
+ my $on = shift;
+ my $bits = 0;
+ unless (@_) {
+ require Carp;
+ Carp::carp("Useless use of \"re\" pragma");
+ }
+ foreach my $idx (0..$#_){
+ my $s=$_[$idx];
+ if ($s eq 'Debug' or $s eq 'Debugcolor') {
+ setcolor() if $s =~/color/i;
+ ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
+ for my $idx ($idx+1..$#_) {
+ if ($flags{$_[$idx]}) {
+ if ($on) {
+ ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
+ } else {
+ ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
+ }
+ } else {
+ require Carp;
+ Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
+ join(", ",sort keys %flags ) );
+ }
+ }
+ _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
+ last;
+ } elsif ($s eq 'debug' or $s eq 'debugcolor') {
+ setcolor() if $s =~/color/i;
+ _load_unload($on);
+ } elsif (exists $bitmask{$s}) {
+ $bits |= $bitmask{$s};
+ } elsif ($EXPORT_OK{$s}) {
+ _do_install();
+ require Exporter;
+ re->export_to_level(2, 're', $s);
+ } else {
+ require Carp;
+ Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
+ join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
+ ")");
+ }
+ }
+ $bits;
+}
+
+sub import {
+ shift;
+ $^H |= bits(1, @_);
+}
+
+sub unimport {
+ shift;
+ $^H &= ~ bits(0, @_);
+}
+
+1;
+
+__END__
=head1 NAME
@@ -33,17 +197,29 @@ re - Perl pragma to alter regular expression behaviour
use re qw(Debug All); # Finer tuned debugging options.
use re qw(Debug More);
no re qw(Debug ALL); # Turn of all re debugging in this scope
+
+ use re qw(is_regexp regexp_pattern); # import utility functions
+ my ($pat,$mods)=regexp_pattern(qr/foo/i);
+ if (is_regexp($obj)) {
+ print "Got regexp: ",
+ scalar regexp_pattern($obj); # just as perl would stringify it
+ } # but no hassle with blessed re's.
+
(We use $^X in these examples because it's tainted by default.)
=head1 DESCRIPTION
+=head2 'taint' mode
+
When C<use re 'taint'> is in effect, and a tainted string is the target
of a regex, the regex memories (or values returned by the m// operator
in list context) are tainted. This feature is useful when regex operations
on tainted data aren't meant to extract safe substrings, but to perform
other transformations.
+=head2 'eval' mode
+
When C<use re 'eval'> is in effect, a regex is allowed to contain
C<(?{ ... })> zero-width assertions even if regular expression contains
variable interpolation. That is normally disallowed, since it is a
@@ -60,6 +236,8 @@ interpolation. Thus:
I<is> allowed if $pat is a precompiled regular expression, even
if $pat contains C<(?{ ... })> assertions.
+=head2 'debug' mode
+
When C<use re 'debug'> is in effect, perl emits debugging messages when
compiling and using regular expressions. The output is the same as that
obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
@@ -71,6 +249,14 @@ comma-separated list of C<termcap> properties to use for highlighting
strings on/off, pre-point part on/off.
See L<perldebug/"Debugging regular expressions"> for additional info.
+As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
+lexically scoped, as the other directives are. However they have both
+compile-time and run-time effects.
+
+See L<perlmodlib/Pragmatic Modules>.
+
+=head2 'Debug' mode
+
Similarly C<use re 'Debug'> produces debugging output, the difference
being that it allows the fine tuning of what debugging output will be
emitted. Options are divided into three groups, those related to
@@ -208,141 +394,50 @@ As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
lexically scoped, as the other directives are. However they have both
compile-time and run-time effects.
-See L<perlmodlib/Pragmatic Modules>.
+=head2 Exportable Functions
-=cut
+As of perl 5.9.5 're' debug contains a number of utility functions that
+may be optionally exported into the callers namespace. They are listed
+below.
-# N.B. File::Basename contains a literal for 'taint' as a fallback. If
-# taint is changed here, File::Basename must be updated as well.
-my %bitmask = (
-taint => 0x00100000, # HINT_RE_TAINT
-eval => 0x00200000, # HINT_RE_EVAL
-);
+=over 4
-sub setcolor {
- eval { # Ignore errors
- require Term::Cap;
+=item is_regexp($ref)
- my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
- my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
- my @props = split /,/, $props;
- my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
+Returns true if the argument is a compiled regular expression as returned
+by C<qr//>, false if it is not.
- $colors =~ s/\0//g;
- $ENV{PERL_RE_COLORS} = $colors;
- };
- if ($@) {
- $ENV{PERL_RE_COLORS}||=qq'\t\t> <\t> <\t\t'
- }
+This function will not be confused by overloading or blessing. In
+internals terms this extracts the regexp pointer out of the
+PERL_MAGIC_qr structure so it it cannot be fooled.
-}
+=item regexp_pattern($ref)
-my %flags = (
- COMPILE => 0x0000FF,
- PARSE => 0x000001,
- OPTIMISE => 0x000002,
- TRIEC => 0x000004,
- DUMP => 0x000008,
+If the argument is a compiled regular expression as returned by C<qr//>
+then this function returns the pattern.
- EXECUTE => 0x00FF00,
- INTUIT => 0x000100,
- MATCH => 0x000200,
- TRIEE => 0x000400,
+In list context it returns a two element list, the first element
+containing the pattern and the second containing the modifiers used when
+the pattern was compiled.
- EXTRA => 0xFF0000,
- TRIEM => 0x010000,
- OFFSETS => 0x020000,
- OFFSETSDBG => 0x040000,
- STATE => 0x080000,
- OPTIMISEM => 0x100000,
- STACK => 0x280000,
-);
-$flags{ALL} = -1;
-$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
-$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE};
-$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
-$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
-$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
+ my ($pat,$mods)=regexp_pattern($ref);
-my $installed;
-my $installed_error;
+In scalar context it returns the same as perl would when strigifying a
+raw qr// with the same pattern inside. If the argument is not a
+compiled reference then this routine returns false but defined in scalar
+context, and the empty list in list context. Thus the following
-sub _load_unload {
- my ($on)= @_;
- if ($on) {
- if ( ! defined($installed) ) {
- require XSLoader;
- $installed = eval { XSLoader::load('re') } || 0;
- $installed_error = $@;
- }
- if ( ! $installed ) {
- die "'re' not installed!? ($installed_error)";
- } else {
- # We call install() every time, as if we didn't, we wouldn't
- # "see" any changes to the color environment var since
- # the last time it was called.
+ if (regexp_pattern($ref) eq '(?i-xsm:foo)')
- # install() returns an integer, which if casted properly
- # in C resolves to a structure containing the regex
- # hooks. Setting it to a random integer will guarantee
- # segfaults.
- $^H{regcomp} = install();
- }
- } else {
- delete $^H{regcomp};
- }
-}
+will be warning free regardless of what $ref actually is.
-sub bits {
- my $on = shift;
- my $bits = 0;
- unless (@_) {
- require Carp;
- Carp::carp("Useless use of \"re\" pragma");
- }
- foreach my $idx (0..$#_){
- my $s=$_[$idx];
- if ($s eq 'Debug' or $s eq 'Debugcolor') {
- setcolor() if $s =~/color/i;
- ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
- for my $idx ($idx+1..$#_) {
- if ($flags{$_[$idx]}) {
- if ($on) {
- ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
- } else {
- ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
- }
- } else {
- require Carp;
- Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
- join(", ",sort keys %flags ) );
- }
- }
- _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
- last;
- } elsif ($s eq 'debug' or $s eq 'debugcolor') {
- setcolor() if $s =~/color/i;
- _load_unload($on);
- } elsif (exists $bitmask{$s}) {
- $bits |= $bitmask{$s};
- } else {
- require Carp;
- Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
- join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
- ")");
- }
- }
- $bits;
-}
+Like c<is_regexp> this function will not be confused by overloading
+or blessing of the object.
-sub import {
- shift;
- $^H |= bits(1, @_);
-}
+=back
-sub unimport {
- shift;
- $^H &= ~ bits(0, @_);
-}
+=head1 SEE ALSO
-1;
+L<perlmodlib/Pragmatic Modules>.
+
+=cut
diff --git a/ext/re/re.xs b/ext/re/re.xs
index 58fb124e10..b82062a8b0 100644
--- a/ext/re/re.xs
+++ b/ext/re/re.xs
@@ -19,6 +19,7 @@ extern char* my_re_intuit_start (pTHX_ regexp *prog, SV *sv, char *strpos,
char *strend, U32 flags,
struct re_scream_pos_data_s *data);
extern SV* my_re_intuit_string (pTHX_ regexp *prog);
+extern char* my_reg_stringify (pTHX_ MAGIC *mg, U32 *flags, STRLEN *lp, I32 *haseval);
#if defined(USE_ITHREADS)
extern regexp* my_regdupe (pTHX_ const regexp *r, CLONE_PARAMS *param);
@@ -30,6 +31,7 @@ EXTERN_C const struct regexp_engine my_reg_engine = {
my_re_intuit_start,
my_re_intuit_string,
my_regfree,
+ my_reg_stringify,
#if defined(USE_ITHREADS)
my_regdupe
#endif
@@ -46,3 +48,119 @@ install()
/* PL_debug |= DEBUG_r_FLAG; */
XPUSHs(sv_2mortal(newSViv(PTR2IV(&my_reg_engine))));
+
+void
+is_regexp(sv)
+ SV * sv
+PROTOTYPE: $
+PREINIT:
+ MAGIC *mg;
+PPCODE:
+{
+ if (SvMAGICAL(sv))
+ mg_get(sv);
+ if (SvROK(sv) &&
+ (sv = (SV*)SvRV(sv)) && /* assign deliberate */
+ SvTYPE(sv) == SVt_PVMG &&
+ (mg = mg_find(sv, PERL_MAGIC_qr))) /* assign deliberate */
+ {
+ XSRETURN_YES;
+ } else {
+ XSRETURN_NO;
+ }
+ /* NOTREACHED */
+}
+
+void
+regexp_pattern(sv)
+ SV * sv
+PROTOTYPE: $
+PREINIT:
+ MAGIC *mg;
+PPCODE:
+{
+ /*
+ Checks if a reference is a regex or not. If the parameter is
+ not a ref, or is not the result of a qr// then returns false
+ in scalar context and an empty list in list context.
+ Otherwise in list context it returns the pattern and the
+ modifiers, in scalar context it returns the pattern just as it
+ would if the qr// was stringified normally, regardless as
+ to the class of the variable and any strigification overloads
+ on the object.
+ */
+
+ if (SvMAGICAL(sv))
+ mg_get(sv);
+ if (SvROK(sv) &&
+ (sv = (SV*)SvRV(sv)) && /* assign deliberate */
+ SvTYPE(sv) == SVt_PVMG &&
+ (mg = mg_find(sv, PERL_MAGIC_qr))) /* assign deliberate */
+ {
+
+ /* Housten, we have a regex! */
+ SV *pattern;
+ regexp *re = (regexp *)mg->mg_obj;
+ STRLEN patlen = 0;
+ STRLEN left = 0;
+ char reflags[6];
+
+ if ( GIMME_V == G_ARRAY ) {
+ /*
+ we are in list context so stringify
+ the modifiers that apply. We ignore "negative
+ modifiers" in this scenario.
+ */
+
+ char *fptr = "msix";
+ char ch;
+ U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
+
+ while((ch = *fptr++)) {
+ if(reganch & 1) {
+ reflags[left++] = ch;
+ }
+ reganch >>= 1;
+ }
+
+ pattern = sv_2mortal(newSVpvn(re->precomp,re->prelen));
+ if (re->reganch & ROPT_UTF8) SvUTF8_on(pattern);
+
+ /* return the pattern and the modifiers */
+ XPUSHs(pattern);
+ XPUSHs(sv_2mortal(newSVpvn(reflags,left)));
+ XSRETURN(2);
+ } else {
+ /* Scalar, so use the string that Perl would return */
+ if (!mg->mg_ptr)
+ CALLREG_STRINGIFY(mg,0,0);
+
+ /* return the pattern in (?msix:..) format */
+ pattern = sv_2mortal(newSVpvn(mg->mg_ptr,mg->mg_len));
+ if (re->reganch & ROPT_UTF8)
+ SvUTF8_on(pattern);
+ XPUSHs(pattern);
+ XSRETURN(1);
+ }
+ } else {
+ /* It ain't a regexp folks */
+ if ( GIMME_V == G_ARRAY ) {
+ /* return the empty list */
+ XSRETURN_UNDEF;
+ } else {
+ /* Because of the (?:..) wrapping involved in a
+ stringified pattern it is impossible to get a
+ result for a real regexp that would evaluate to
+ false. Therefore we can return PL_sv_no to signify
+ that the object is not a regex, this means that one
+ can say
+
+ if (regex($might_be_a_regex) eq '(?:foo)') { }
+
+ and not worry about undefined values.
+ */
+ XSRETURN_NO;
+ }
+ }
+ /* NOT-REACHED */
+} \ No newline at end of file
diff --git a/ext/re/re_top.h b/ext/re/re_top.h
index af729aed9f..39b7fd122f 100644
--- a/ext/re/re_top.h
+++ b/ext/re/re_top.h
@@ -16,6 +16,7 @@
#define Perl_pregfree my_regfree
#define Perl_re_intuit_string my_re_intuit_string
#define Perl_regdupe my_regdupe
+#define Perl_reg_stringify my_reg_stringify
#define PERL_NO_GET_CONTEXT
diff --git a/ext/re/t/re_funcs.t b/ext/re/t/re_funcs.t
new file mode 100644
index 0000000000..16ab86412a
--- /dev/null
+++ b/ext/re/t/re_funcs.t
@@ -0,0 +1,24 @@
+#!./perl
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '../lib';
+ require Config;
+ if (($Config::Config{'extensions'} !~ /\bre\b/) ){
+ print "1..0 # Skip -- Perl configured without re module\n";
+ exit 0;
+ }
+}
+
+use strict;
+
+use Test::More tests => 6;
+use re qw(is_regexp regexp_pattern);
+my $qr=qr/foo/i;
+
+ok(is_regexp($qr),'is_regexp($qr)');
+ok(!is_regexp(''),'is_regexp("")');
+is((regexp_pattern($qr))[0],'foo','regexp_pattern[0]');
+is((regexp_pattern($qr))[1],'i','regexp_pattern[1]');
+is(regexp_pattern($qr),'(?i-xsm:foo)','scalar regexp_pattern');
+ok(!regexp_pattern(''),'!regexp_pattern("")');
diff --git a/lib/ExtUtils/ParseXS.pm b/lib/ExtUtils/ParseXS.pm
index b6f422031b..0729397e20 100644
--- a/lib/ExtUtils/ParseXS.pm
+++ b/lib/ExtUtils/ParseXS.pm
@@ -203,7 +203,8 @@ sub process_file {
$size = qr[,\s* (??{ $bal }) ]x; # Third arg (to setpvn)
foreach my $key (keys %output_expr) {
- use re 'eval';
+ #use re 'eval';
+ BEGIN { $^H |= 0x00200000};
my ($t, $with_size, $arg, $sarg) =
($output_expr{$key} =~
diff --git a/perl.h b/perl.h
index d708f81706..51f26e4fd0 100644
--- a/perl.h
+++ b/perl.h
@@ -210,6 +210,9 @@
CALL_FPTR((prog)->engine->checkstr)(aTHX_ (prog))
#define CALLREGFREE(prog) \
if(prog) CALL_FPTR((prog)->engine->free)(aTHX_ (prog))
+#define CALLREG_AS_STR(mg,lp,flags,haseval) \
+ CALL_FPTR(((regexp *)((mg)->mg_obj))->engine->as_str)(aTHX_ (mg), (lp), (flags), (haseval))
+#define CALLREG_STRINGIFY(mg,lp,flags) CALLREG_AS_STR(mg,lp,flags,0)
#if defined(USE_ITHREADS)
#define CALLREGDUPE(prog,param) \
(prog ? CALL_FPTR((prog)->engine->dupe)(aTHX_ (prog),(param)) \
diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod
index 4ee2be172f..937565745c 100644
--- a/pod/perlreguts.pod
+++ b/pod/perlreguts.pod
@@ -759,7 +759,8 @@ F<regexp.h> contains the base structure definition:
U32 *offsets; /* offset annotations 20001228 MJD */
I32 sublen; /* Length of string pointed by subbeg */
I32 refcnt;
- I32 minlen; /* mininum possible length of $& */
+ I32 minlen; /* mininum length of string to match */
+ I32 minlenret; /* mininum possible length of $& */
I32 prelen; /* length of precomp */
U32 nparens; /* number of parentheses */
U32 lastparen; /* last paren matched */
@@ -838,13 +839,28 @@ that handles this is called C<find_by_class()>. Sometimes this field
points at a regop embedded in the program, and sometimes it points at
an independent synthetic regop that has been constructed by the optimiser.
-=item C<minlen>
+=item C<minlen> C<minlenret>
-The minimum possible length of the final matching string. This is used
-to prune the search space by not bothering to match any closer to the
-end of a string than would allow a match. For instance there is no point
-in even starting the regex engine if the minlen is 10 but the string
-is only 5 characters long. There is no way that the pattern can match.
+C<minlen> is the minimum string length required for the pattern to match.
+This is used to prune the search space by not bothering to match any
+closer to the end of a string than would allow a match. For instance
+there is no point in even starting the regex engine if the minlen is
+10 but the string is only 5 characters long. There is no way that the
+pattern can match.
+
+C<minlenret> is the minimum length of the string that would be found
+in $& after a match.
+
+The difference between C<minlen> and C<minlenret> can be seen in the
+following pattern:
+
+ /ns(?=\d)/
+
+where the C<minlen> would be 3 but the minlen ret would only be 2 as
+the \d is required to match but is not actually included in the matched
+content. This distinction is particularly important as the substitution
+logic uses the C<minlenret> to tell whether it can do in-place substition
+which can result in considerable speedup.
=item C<reganch>
diff --git a/pp.c b/pp.c
index a9ca2366a6..0ec54bf005 100644
--- a/pp.c
+++ b/pp.c
@@ -4652,7 +4652,7 @@ PP(pp_split)
const int tail = (rx->reganch & RE_INTUIT_TAIL);
SV * const csv = CALLREG_INTUIT_STRING(rx);
- len = rx->minlen;
+ len = rx->minlenret;
if (len == 1 && !(rx->reganch & ROPT_UTF8) && !tail) {
const char c = *SvPV_nolen_const(csv);
while (--limit) {
diff --git a/pp_hot.c b/pp_hot.c
index d2e8e87b18..025e9571aa 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1497,11 +1497,11 @@ yup: /* Confirmed by INTUIT */
rx->subbeg = (char *) truebase;
rx->startp[0] = s - truebase;
if (RX_MATCH_UTF8(rx)) {
- char * const t = (char*)utf8_hop((U8*)s, rx->minlen);
+ char * const t = (char*)utf8_hop((U8*)s, rx->minlenret);
rx->endp[0] = t - truebase;
}
else {
- rx->endp[0] = s - truebase + rx->minlen;
+ rx->endp[0] = s - truebase + rx->minlenret;
}
rx->sublen = strend - truebase;
goto gotcha;
@@ -1531,11 +1531,11 @@ yup: /* Confirmed by INTUIT */
rx->sublen = strend - t;
RX_MATCH_COPIED_on(rx);
off = rx->startp[0] = s - t;
- rx->endp[0] = off + rx->minlen;
+ rx->endp[0] = off + rx->minlenret;
}
else { /* startp/endp are used by @- @+. */
rx->startp[0] = s - truebase;
- rx->endp[0] = s - truebase + rx->minlen;
+ rx->endp[0] = s - truebase + rx->minlenret;
}
rx->nparens = rx->lastparen = rx->lastcloseparen = 0; /* used by @-, @+, and $^N */
LEAVE_SCOPE(oldsave);
@@ -2188,7 +2188,7 @@ PP(pp_subst)
#ifdef PERL_OLD_COPY_ON_WRITE
&& !is_cow
#endif
- && (I32)clen <= rx->minlen && (once || !(r_flags & REXEC_COPY_STR))
+ && (I32)clen <= rx->minlenret && (once || !(r_flags & REXEC_COPY_STR))
&& !(rx->reganch & ROPT_LOOKBEHIND_SEEN)
&& (!doutf8 || SvUTF8(TARG))) {
if (!CALLREGEXEC(rx, s, strend, orig, 0, TARG, NULL,
diff --git a/proto.h b/proto.h
index ce25ca0188..11a5fc4f0e 100644
--- a/proto.h
+++ b/proto.h
@@ -1830,6 +1830,9 @@ PERL_CALLCONV I32 Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* stren
__attribute__nonnull__(pTHX_6);
PERL_CALLCONV void Perl_pregfree(pTHX_ struct regexp* r);
+PERL_CALLCONV char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval)
+ __attribute__nonnull__(pTHX_1);
+
#if defined(USE_ITHREADS)
PERL_CALLCONV regexp* Perl_regdupe(pTHX_ const regexp* r, CLONE_PARAMS* param)
__attribute__nonnull__(pTHX_1)
@@ -3859,10 +3862,6 @@ STATIC STRLEN S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, con
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);
-STATIC char * S_stringify_regexp(pTHX_ SV *sv, MAGIC *mg, STRLEN *lp)
- __attribute__nonnull__(pTHX_1)
- __attribute__nonnull__(pTHX_2);
-
STATIC char * S_F0convert(NV nv, char *endbuf, STRLEN *len)
__attribute__nonnull__(2)
__attribute__nonnull__(3);
diff --git a/regcomp.c b/regcomp.c
index 46851dde43..4a2e52ba45 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -4216,8 +4216,7 @@ reStudy:
* it happens that c_offset_min has been invalidated, since the
* earlier string may buy us something the later one won't.]
*/
- minlen = 0;
-
+
data.longest_fixed = newSVpvs("");
data.longest_float = newSVpvs("");
data.last_found = newSVpvs("");
@@ -4230,7 +4229,7 @@ reStudy:
} else /* XXXX Check for BOUND? */
stclass_flag = 0;
data.last_closep = &last_close;
-
+
minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
&data, -1, NULL, NULL,
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
@@ -4408,9 +4407,10 @@ reStudy:
data.start_class = &ch_class;
data.last_closep = &last_close;
+
minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
&data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
-
+
CHECK_RESTUDY_GOTO;
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
@@ -4437,6 +4437,11 @@ reStudy:
/* Guard against an embedded (?=) or (?<=) with a longer minlen than
the "real" pattern. */
+ DEBUG_OPTIMISE_r({
+ PerlIO_printf(Perl_debug_log,"minlen: %d r->minlen:%d\n",
+ minlen, r->minlen);
+ });
+ r->minlenret = minlen;
if (r->minlen < minlen)
r->minlen = minlen;
@@ -8561,6 +8566,7 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
ret->precomp = SAVEPVN(r->precomp, r->prelen);
ret->refcnt = r->refcnt;
ret->minlen = r->minlen;
+ ret->minlenret = r->minlenret;
ret->prelen = r->prelen;
ret->nparens = r->nparens;
ret->lastparen = r->lastparen;
@@ -8586,6 +8592,111 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
}
#endif
+/*
+ reg_stringify()
+
+ converts a regexp embedded in a MAGIC struct to its stringified form,
+ caching the converted form in the struct and returns the cached
+ string.
+
+ If lp is nonnull then it is used to return the length of the
+ resulting string
+
+ If flags is nonnull and the returned string contains UTF8 then
+ (flags & 1) will be true.
+
+ If haseval is nonnull then it is used to return whether the pattern
+ contains evals.
+
+ Normally called via macro:
+
+ CALLREG_STRINGIFY(mg,0,0);
+
+ And internally with
+
+ CALLREG_AS_STR(mg,lp,flags,haseval)
+
+ See sv_2pv_flags() in sv.c for an example of internal usage.
+
+ */
+
+char *
+Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
+ dVAR;
+ const regexp * const re = (regexp *)mg->mg_obj;
+
+ if (!mg->mg_ptr) {
+ const char *fptr = "msix";
+ char reflags[6];
+ char ch;
+ int left = 0;
+ int right = 4;
+ bool need_newline = 0;
+ U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
+
+ while((ch = *fptr++)) {
+ if(reganch & 1) {
+ reflags[left++] = ch;
+ }
+ else {
+ reflags[right--] = ch;
+ }
+ reganch >>= 1;
+ }
+ if(left != 4) {
+ reflags[left] = '-';
+ left = 5;
+ }
+
+ mg->mg_len = re->prelen + 4 + left;
+ /*
+ * If /x was used, we have to worry about a regex ending with a
+ * comment later being embedded within another regex. If so, we don't
+ * want this regex's "commentization" to leak out to the right part of
+ * the enclosing regex, we must cap it with a newline.
+ *
+ * So, if /x was used, we scan backwards from the end of the regex. If
+ * we find a '#' before we find a newline, we need to add a newline
+ * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
+ * we don't need to add anything. -jfriedl
+ */
+ if (PMf_EXTENDED & re->reganch) {
+ const char *endptr = re->precomp + re->prelen;
+ while (endptr >= re->precomp) {
+ const char c = *(endptr--);
+ if (c == '\n')
+ break; /* don't need another */
+ if (c == '#') {
+ /* we end while in a comment, so we need a newline */
+ mg->mg_len++; /* save space for it */
+ need_newline = 1; /* note to add it */
+ break;
+ }
+ }
+ }
+
+ Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
+ mg->mg_ptr[0] = '(';
+ mg->mg_ptr[1] = '?';
+ Copy(reflags, mg->mg_ptr+2, left, char);
+ *(mg->mg_ptr+left+2) = ':';
+ Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
+ if (need_newline)
+ mg->mg_ptr[mg->mg_len - 2] = '\n';
+ mg->mg_ptr[mg->mg_len - 1] = ')';
+ mg->mg_ptr[mg->mg_len] = 0;
+ }
+ if (haseval)
+ *haseval = re->program[0].next_off;
+ if (flags)
+ *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
+
+ if (lp)
+ *lp = mg->mg_len;
+ return mg->mg_ptr;
+}
+
+
#ifndef PERL_IN_XSUB_RE
/*
- regnext - dig the "next" pointer out of a node
diff --git a/regcomp.h b/regcomp.h
index 2774a27ef9..e3d671d70e 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -398,6 +398,7 @@ EXTCONST regexp_engine PL_core_reg_engine = {
Perl_re_intuit_start,
Perl_re_intuit_string,
Perl_pregfree,
+ Perl_reg_stringify,
#if defined(USE_ITHREADS)
Perl_regdupe
#endif
diff --git a/regexp.h b/regexp.h
index f74f2af05b..5e3e947cb9 100644
--- a/regexp.h
+++ b/regexp.h
@@ -47,7 +47,8 @@ typedef struct regexp {
U32 *offsets; /* offset annotations 20001228 MJD */
I32 sublen; /* Length of string pointed by subbeg */
I32 refcnt;
- I32 minlen; /* mininum possible length of $& */
+ I32 minlen; /* mininum possible length of string to match */
+ I32 minlenret; /* mininum possible length of $& */
I32 prelen; /* length of precomp */
U32 nparens; /* number of parentheses */
U32 lastparen; /* last paren matched */
@@ -76,6 +77,7 @@ typedef struct regexp_engine {
struct re_scream_pos_data_s *data);
SV* (*checkstr) (pTHX_ regexp *prog);
void (*free) (pTHX_ struct regexp* r);
+ char* (*as_str) (pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval);
#ifdef USE_ITHREADS
regexp* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param);
#endif
diff --git a/sv.c b/sv.c
index ad31ce15ef..6696e9a185 100644
--- a/sv.c
+++ b/sv.c
@@ -2542,87 +2542,6 @@ S_uiv_2buf(char *buf, IV iv, UV uv, int is_uv, char **peob)
return ptr;
}
-/* stringify_regexp(): private routine for use by sv_2pv_flags(): converts
- * a regexp to its stringified form.
- */
-
-static char *
-S_stringify_regexp(pTHX_ SV *sv, MAGIC *mg, STRLEN *lp) {
- dVAR;
- const regexp * const re = (regexp *)mg->mg_obj;
-
- if (!mg->mg_ptr) {
- const char *fptr = "msix";
- char reflags[6];
- char ch;
- int left = 0;
- int right = 4;
- bool need_newline = 0;
- U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
-
- while((ch = *fptr++)) {
- if(reganch & 1) {
- reflags[left++] = ch;
- }
- else {
- reflags[right--] = ch;
- }
- reganch >>= 1;
- }
- if(left != 4) {
- reflags[left] = '-';
- left = 5;
- }
-
- mg->mg_len = re->prelen + 4 + left;
- /*
- * If /x was used, we have to worry about a regex ending with a
- * comment later being embedded within another regex. If so, we don't
- * want this regex's "commentization" to leak out to the right part of
- * the enclosing regex, we must cap it with a newline.
- *
- * So, if /x was used, we scan backwards from the end of the regex. If
- * we find a '#' before we find a newline, we need to add a newline
- * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
- * we don't need to add anything. -jfriedl
- */
- if (PMf_EXTENDED & re->reganch) {
- const char *endptr = re->precomp + re->prelen;
- while (endptr >= re->precomp) {
- const char c = *(endptr--);
- if (c == '\n')
- break; /* don't need another */
- if (c == '#') {
- /* we end while in a comment, so we need a newline */
- mg->mg_len++; /* save space for it */
- need_newline = 1; /* note to add it */
- break;
- }
- }
- }
-
- Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
- mg->mg_ptr[0] = '(';
- mg->mg_ptr[1] = '?';
- Copy(reflags, mg->mg_ptr+2, left, char);
- *(mg->mg_ptr+left+2) = ':';
- Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
- if (need_newline)
- mg->mg_ptr[mg->mg_len - 2] = '\n';
- mg->mg_ptr[mg->mg_len - 1] = ')';
- mg->mg_ptr[mg->mg_len] = 0;
- }
- PL_reginterp_cnt += re->program[0].next_off;
-
- if (re->reganch & ROPT_UTF8)
- SvUTF8_on(sv);
- else
- SvUTF8_off(sv);
- if (lp)
- *lp = mg->mg_len;
- return mg->mg_ptr;
-}
-
/*
=for apidoc sv_2pv_flags
@@ -2740,8 +2659,18 @@ Perl_sv_2pv_flags(pTHX_ register SV *sv, STRLEN *lp, I32 flags)
&& ((SvFLAGS(referent) &
(SVs_OBJECT|SVf_OK|SVs_GMG|SVs_SMG|SVs_RMG))
== (SVs_OBJECT|SVs_SMG))
- && (mg = mg_find(referent, PERL_MAGIC_qr))) {
- return stringify_regexp(sv, mg, lp);
+ && (mg = mg_find(referent, PERL_MAGIC_qr)))
+ {
+ char *str = NULL;
+ I32 haseval = 0;
+ I32 flags = 0;
+ (str) = CALLREG_AS_STR(mg,lp,&flags,&haseval);
+ if (flags & 1)
+ SvUTF8_on(sv);
+ else
+ SvUTF8_off(sv);
+ PL_reginterp_cnt += haseval;
+ return str;
} else {
const char *const typestr = sv_reftype(referent, 0);
const STRLEN typelen = strlen(typestr);
diff --git a/t/op/pat.t b/t/op/pat.t
index 5405cf6099..0de3b14b41 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3993,8 +3993,20 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
"Regexp /^(??{'(.)'x 100})/ crashes older perls")
or print "# Unexpected outcome: should pass or crash perl\n";
+{
+ $_="ns1ns1ns1";
+ s/ns(?=\d)/ns_/g;
+ iseq($_,"ns_1ns_1ns_1");
+ $_="ns1";
+ s/ns(?=\d)/ns_/;
+ iseq($_,"ns_1");
+ $_="123";
+ s/(?=\d+)|(?<=\d)/!Bang!/g;
+ iseq($_,"!Bang!1!Bang!2!Bang!3!Bang!");
+}
+
# Put new tests above the line, not here.
# Don't forget to update this!
-BEGIN{print "1..1344\n"};
+BEGIN{print "1..1347\n"};