pp_subst: optimize by not calling utf8_length

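The character length isn't needed here: the length in bytes is a valid upper
bound for the maxiters limit, and computing utf8_length often took more
CPU time than the actual regex.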
author: Loren Merritt <pengvado@videolan.org> 2022-07-14 00:09:06 +0000
committer: ℕicolas ℝ <nicolas@atoomic.org> 2022-07-20 15:03:07 -0600
commit: be76ad45a5a937ec83906e666e2318c0351115b4 (patch)
tree: 8bc97d8c1b33b15fe81ea192c063f270853b3918
parent: 7d47ba27dfa1ef9db23c4e6f934b0698d7fd539f (diff)
download: perl-be76ad45a5a937ec83906e666e2318c0351115b4.tar.gz
4 files changed, 10 insertions, 6 deletions
diff --git a/.mailmap b/.mailmap
index 71c997cddb..c879ff464c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1110,6 +1110,7 @@ Lincoln D. Stein <lstein@cshl.org> Lincoln Stein <lstein@formaggio.cshl.org>
 Lincoln D. Stein <lstein@cshl.org> Lincoln Stein <lstein@genome.wi.mit.edu>
 Linda Walsh <unknown> Linda Walsh <unknown>
 Lionel Cons <lionel.cons@cern.ch> Lionel Cons <lionel.cons@cern.ch>
+Loren Merritt <pengvado@videolan.org> Loren Merritt <pengvado@videolan.org>
 Louis Strous <louis.strous@gmail.com> Louis Strous <louis.strous@gmail.com>
 Lubomir Rintel <lkundrak@v3.sk> Lubomir Rintel (GoodData) <lubo.rintel@gooddata.com>
 Lubomir Rintel <lkundrak@v3.sk> Lubomir Rintel <lkundrak@v3.sk>
diff --git a/AUTHORS b/AUTHORS
index 44e587bfe8..bd3984f578 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -827,6 +827,7 @@ Lesley Binks                   <lesley.binks@gmail.com>
 Lincoln D. Stein               <lstein@cshl.org>
 Linda Walsh
 Lionel Cons                    <lionel.cons@cern.ch>
+Loren Merritt                  <pengvado@videolan.org>
 Louis Strous                   <louis.strous@gmail.com>
 Lubomir Rintel                 <lkundrak@v3.sk>
 Luc St-Louis                   <luc.st-louis@ca.transport.bombardier.com>
diff --git a/pp_hot.c b/pp_hot.c
index f583261558..97985b7b5e 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -4250,7 +4250,6 @@ PP(pp_subst)
     STRLEN len;
     int force_on_match = 0;
     const I32 oldsave = PL_savestack_ix;
-    STRLEN slen;
     bool doutf8 = FALSE; /* whether replacement is in utf8 */
 #ifdef PERL_ANY_COW
     bool was_cow;
@@ -4316,10 +4315,12 @@ PP(pp_subst)
         DIE(aTHX_ "panic: pp_subst, pm=%p, orig=%p", pm, orig);
 
     strend = orig + len;
-    slen = DO_UTF8(TARG) ? utf8_length((U8*)orig, (U8*)strend) : len;
-    maxiters = 2 * slen + 10;	/* We can match twice at each
-                                   position, once with zero-length,
-                                   second time with non-zero. */
+    /* We can match twice at each position, once with zero-length,
+     * second time with non-zero.
+     * Don't handle utf8 specially; we can use length-in-bytes as an
+     * upper bound on length-in-characters, and avoid the cpu-cost of
+     * computing a tighter bound. */
+    maxiters = 2 * len + 10;
 
     /* handle the empty pattern */
     if (!RX_PRELEN(rx) && PL_curpm && !prog->mother_re) {
diff --git a/t/re/pat_rt_report.t b/t/re/pat_rt_report.t
index ced4fe670b..895da8ea81 100644
--- a/t/re/pat_rt_report.t
+++ b/t/re/pat_rt_report.t
@@ -1076,10 +1076,11 @@ SKIP: {
 	unless $Config{extensions} =~ / Encode /;
 
     # Test case cut down by jhi
-    fresh_perl_like(<<'EOP', qr!Malformed UTF-8 character \(unexpected end of string\) in substitution \(s///\) at!, {}, 'Segfault using HTML::Entities');
+    fresh_perl_like(<<'EOP', qr!Malformed UTF-8 character \(unexpected end of string\)!, {}, 'Segfault using HTML::Entities');
 use Encode;
 my $t = ord('A') == 193 ? "\xEA" : "\xE9";
 Encode::_utf8_on($t);
+substr($t,0);
 $t =~ s/([^a])//ge;
 EOP
     }
author	Loren Merritt <pengvado@videolan.org>	2022-07-14 00:09:06 +0000
committer	ℕicolas ℝ <nicolas@atoomic.org>	2022-07-20 15:03:07 -0600
commit	be76ad45a5a937ec83906e666e2318c0351115b4 (patch)
tree	8bc97d8c1b33b15fe81ea192c063f270853b3918
parent	7d47ba27dfa1ef9db23c4e6f934b0698d7fd539f (diff)
download	perl-be76ad45a5a937ec83906e666e2318c0351115b4.tar.gz