regexec.c - fix accept in CURLYX/WHILEM construct.

The ACCEPT logic didnt know how to handle WHILEM, which for some reason does not have a next_off defined. I am not sure why. This was revealed by forcing CURLYX optimisations off. This includes a patch to test what happens if we embed an eval group in the tests run by regexp.t when run via regexp_normal.t, which disabled CURLYX -> CURLYN and CURLYM optimisations and revealed this issue. This adds t/re/regexp_normal.t which test "normalized" forms of the patterns in t/re/re_tests by munging them in various ways to see if they still behave as expected. For instance injecting a (?{}) can disable optimisations.
author: Yves Orton <demerphq@gmail.com> 2023-01-09 20:37:28 +0100
committer: Yves Orton <demerphq@gmail.com> 2023-01-15 17:21:03 +0100
commit: 5c6240fadac873b60c46677b4d5b180f4fb6074b (patch)
tree: ce9346482bac081a0cbfd59d2edc2610c740c2d5
parent: 37040543d024b3ecb0aecd78849bd5af61408d02 (diff)
download: perl-5c6240fadac873b60c46677b4d5b180f4fb6074b.tar.gz
6 files changed, 62 insertions, 12 deletions
diff --git a/MANIFEST b/MANIFEST
index a153ce7a06..f6b78371a8 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -6207,6 +6207,7 @@ t/re/regex_sets_compat.t	Test (?[ ]) is compatible with old [ ]
 t/re/regexp.t			See if regular expressions work
 t/re/regexp_noamp.t		See if regular expressions work with optimizations
 t/re/regexp_nonull.t		See if regexps work without trailing nulls
+t/re/regexp_normal.t		See if regexps work when expressions are normalized in various ways
 t/re/regexp_notrie.t		See if regular expressions work without trie optimisation
 t/re/regexp_qr.t		See if regular expressions work as qr//
 t/re/regexp_qr_embed.t		See if regular expressions work with embedded qr//
diff --git a/regexec.c b/regexec.c
index d2c020feb2..a022098380 100644
--- a/regexec.c
+++ b/regexec.c
@@ -8527,7 +8527,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 for (
                     cursor = scan;
                     cursor && ( OP(cursor) != END );
-                    cursor = ( REGNODE_TYPE( OP(cursor) ) == END )
+                    cursor = (
+                               REGNODE_TYPE( OP(cursor) ) == END
+                               || REGNODE_TYPE( OP(cursor) ) == WHILEM
+                             )
                              ? REGNODE_AFTER(cursor)
                              : regnext(cursor)
                 ){
@@ -8741,7 +8744,6 @@ NULL
             );
 
             /* First just match a string of min A's. */
-
             if (n < min) {
                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
                 cur_curlyx->u.curlyx.lastloc = locinput;
diff --git a/t/re/pat_re_eval.t b/t/re/pat_re_eval.t
index b626f8eae1..95c2e33322 100644
--- a/t/re/pat_re_eval.t
+++ b/t/re/pat_re_eval.t
@@ -24,7 +24,7 @@ BEGIN {
 
 our @global;
 
-plan tests => 508;  # Update this when adding/deleting tests.
+plan tests => 510;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -126,7 +126,13 @@ sub run_tests {
                "Postponed UTF-8 string in non-UTF-8 re doesn't match non-UTF-8";
         }
     }
-
+    {
+        our $this_counter;
+        ok( "ABDE" =~ /(A(A|B(*ACCEPT)|C)+D)(E)(?{ $this_counter++ })/,
+            "ACCEPT/CURLYX/EVAL - pattern should match");
+        is( "$1-$2", "AB-B",
+            "Make sure that ACCEPT works in CURLYX by using EVAL");
+    }
 
     {
         # Test if $^N and $+ work in (?{})
diff --git a/t/re/re_tests b/t/re/re_tests
index a585e30bce..e961fcae17 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -964,8 +964,8 @@ tt+$	xxxtt	y	-	-
 (?i)		y	-	-
 (?a:((?u)\w)\W)	\xC0\xC0	y	$&	\xC0\xC0
 '(?!\A)x'm	a\nxb\n	y	-	-
-^(a(b)?)+$	aba	y	-$1-$2-	-a--
-^(aa(bb)?)+$	aabbaa	y	-$1-$2-	-aa--
+^(a(b)?)+$	aba	y	-$1-$2-	-a--		# !normal
+^(aa(bb)?)+$	aabbaa	y	-$1-$2-	-aa--		# !normal
 '^.{9}abc.*\n'm	123\nabcabcabcabc\n	y	-	-
 ^(a)?a$	a	y	-$1-	--
 ^(a)?(?(1)a|b)+$	a	n	-	-
diff --git a/t/re/regexp.t b/t/re/regexp.t
index 03a7178157..e536429e85 100644
--- a/t/re/regexp.t
+++ b/t/re/regexp.t
@@ -144,7 +144,7 @@ $nulnul = "\0" x 2;
 my $OP = $qr ? 'qr' : 'm';
 
 $| = 1;
-
+$::normalize_pat = $::normalize_pat; # silence warning
 TEST:
 foreach (@tests) {
     $test_num++;
@@ -186,6 +186,22 @@ foreach (@tests) {
     my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
     my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
     ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
+    
+    if ($::normalize_pat) {
+        my $opat= $pat;
+        # Convert (x)? to (?:(x)|) and (x)+ to (?:(x))+ and (x)* to (?:(x))*
+        $pat =~ s/\(([\w|.]+)\)\?(?![+*?])/(?:($1)|)/g;
+        $pat =~ s/\(([\w|.]+)\)([+*])(?![+*?])/(?:($1))$2/g;
+        if ($opat eq $pat) {
+            # we didn't change anything, no point in testing it again.
+            $skip++;
+            $reason = "Test not valid for $0";
+        } elsif ($comment=~/!\s*normal/) {
+            $result .= "T";
+            $comment = "# Known to be broken under $0";
+        }
+    }
+
     if ($result =~ s/ ( [Ss] ) //x) {
         if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
             $skip++;
@@ -421,9 +437,28 @@ foreach (@tests) {
             $pat = $modified;
         }
     }
+    if ($::normalize_pat){
+        if (!$skip && ($result eq "y" or $result eq "n")) {
+            my $opat= $pat;
+            # Convert (x)? to (?:(x)|) and (x)+ to (?:(x))+ and (x)* to (?:(x))*
+            $pat =~ s/\(([\w|.]+)\)\?(?![+*?])/(?:($1)|)/g;
+            $pat =~ s/\(([\w|.]+)\)([+*])(?![+*?])/(?:($1))$2/g;
+            # inject an EVAL into the front of the pattern.
+            # this should disable all optimizations.
+            $pat =~ s/\A(.)/$1(?{ \$the_counter++ })/
+                or die $pat;
+        } elsif (!$skip) {
+            $skip = $reason = "Test not applicable to $0";
+        }
+    }
 
     for my $study ('', 'study $subject;', 'utf8::upgrade($subject);',
 		   'utf8::upgrade($subject); study $subject;') {
+        if ( $skip ) {
+            print "ok $testname # skipped", length($reason) ? ".  $reason" : '', "\n";
+            next TEST;
+        }
+        our $the_counter = 0; # used in normalization tests
 	# Need to make a copy, else the utf8::upgrade of an already studied
 	# scalar confuses things.
 	my $subject = $subject;
@@ -486,11 +521,7 @@ EOFCODE
 	    eval $code;
 	}
 	chomp( my $err = $@ );
-	if ( $skip ) {
-	    print "ok $testname # skipped", length($reason) ? ".  $reason" : '', "\n";
-	    next TEST;
-	}
-	elsif ($result eq 'c') {
+	if ($result eq 'c') {
 	    if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST }
 	    last;  # no need to study a syntax error
 	}
diff --git a/t/re/regexp_normal.t b/t/re/regexp_normal.t
new file mode 100644
index 0000000000..ca945bd8a6
--- /dev/null
+++ b/t/re/regexp_normal.t
@@ -0,0 +1,10 @@
+#!./perl
+
+$::normalize_pat = 1;
+for $file ('./re/regexp.t', './t/re/regexp.t', ':re:regexp.t') {
+    if (-r $file) {
+	do $file or die $@;
+	exit;
+    }
+}
+die "Cannot find ./re/regexp.t or ./t/re/regexp.t\n";
author	Yves Orton <demerphq@gmail.com>	2023-01-09 20:37:28 +0100
committer	Yves Orton <demerphq@gmail.com>	2023-01-15 17:21:03 +0100
commit	5c6240fadac873b60c46677b4d5b180f4fb6074b (patch)
tree	ce9346482bac081a0cbfd59d2edc2610c740c2d5
parent	37040543d024b3ecb0aecd78849bd5af61408d02 (diff)
download	perl-5c6240fadac873b60c46677b4d5b180f4fb6074b.tar.gz