Properly handle UTF8-flagged strings when assigning to $0.

Issue #19331: Use of SvPV_const and SvPV_force in S_set_dollarzero() wrote the PV internals directly to argv, which causes an improper UTF-8 encode if the SV is UTF8-flagged/upgraded. This fixes that doing a downgrade prior to those SvPV* calls. If the string contains wide characters (and thus cannot be downgraded) a warning is thrown; this mirrors preexisting behavior with %ENV, print, and other output channels that convert Perl SVs to bytes. The relevant documentation is also updated.
author: Felipe Gasper <felipe@felipegasper.com> 2022-01-07 14:28:39 -0500
committer: Karl Williamson <khw@cpan.org> 2022-03-05 21:06:05 -0700
commit: 2d3b3561a39bb17fd6003fb262f52b3bc800770e (patch)
tree: 4d7e46f04c3e64e68e2621655a9322cb62bb9c11
parent: 286a1bfd696eb9d8dc1fa337234d1512f63beb67 (diff)
download: perl-2d3b3561a39bb17fd6003fb262f52b3bc800770e.tar.gz
4 files changed, 100 insertions, 1 deletions
diff --git a/mg.c b/mg.c
index 10813a7316..5f3eeae4fe 100644
--- a/mg.c
+++ b/mg.c
@@ -3367,6 +3367,16 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
         else sv_setiv(mg->mg_obj, (IV)PerlProc_getpid());
         break;
     case '0':
+        if (!sv_utf8_downgrade(sv, /* fail_ok */ TRUE)) {
+
+            /* Since we are going to set the string's UTF8-encoded form
+               as the process name we should update $0 itself to contain
+               that same (UTF8-encoded) value. */
+            sv_utf8_encode(GvSV(mg->mg_obj));
+
+            Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "Wide character in %s", "$0");
+        }
+
         LOCK_DOLLARZERO_MUTEX;
         S_set_dollarzero(aTHX_ sv);
         UNLOCK_DOLLARZERO_MUTEX;
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index cf427e0f5e..7e0a8ca873 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -212,6 +212,13 @@ and New Warnings
 
 XXX L<message|perldiag/"message">
 
+=item *
+
+L<Wide character in $0|perldiag/"Wide character in %s">
+
+Attempts to put wide characters into the program name (C<$0>) now
+provoke this warning.
+
 =back
 
 =head3 New Warnings
diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index ecd870ecae..f0a6187f5e 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -257,6 +257,10 @@ legacy process name such as ps, top and killall will recognize the
 name you set when assigning to C<$0>.  The string you supply will be
 cut off at 16 bytes, this is a limitation imposed by Linux.
 
+Wide characters are invalid in C<$0> values. For historical reasons,
+though, Perl accepts them and encodes them to UTF-8. When this
+happens a wide-character warning is triggered.
+
 Mnemonic: same as B<sh> and B<ksh>.
 
 =item $REAL_GROUP_ID
diff --git a/t/op/magic.t b/t/op/magic.t
index e2e0b30211..8354e32252 100644
--- a/t/op/magic.t
+++ b/t/op/magic.t
@@ -5,7 +5,7 @@ BEGIN {
     chdir 't' if -d 't';
     require './test.pl';
     set_up_inc( '../lib' );
-    plan (tests => 197); # some tests are run in BEGIN block
+    plan (tests => 208); # some tests are run in BEGIN block
 }
 
 # Test that defined() returns true for magic variables created on the fly,
@@ -433,6 +433,84 @@ EOP
   }
 }
 
+# Check that assigning to $0 properly handles UTF-8-stored strings:
+{
+
+  # Test both ASCII and EBCDIC systems:
+  my $char = chr( utf8::native_to_unicode(0xe9) );
+
+  # We want $char_with_utf8_pv's PV to be UTF-8-encoded because we need to
+  # test that Perl translates UTF-8-stored code points to plain octets when
+  # assigning to $0.
+  #
+  my $char_with_utf8_pv = $char;
+  utf8::upgrade($char_with_utf8_pv);
+
+  # This will be the same logical code point as $char_with_utf8_pv, but
+  # implemented in Perl internally as a raw byte rather than UTF-8.
+  # (NB: $char is *probably* already utf8::downgrade()d, but let's not
+  # assume that to be the case.)
+  #
+  my $char_with_plain_pv = $char;
+  utf8::downgrade($char_with_plain_pv);
+
+  $0 = $char_with_utf8_pv;
+
+  # In case the assignment to $0 changed $char_with_utf8_pv, ensure that
+  # it is still interally double-UTF-8-encoded:
+  #
+  utf8::upgrade($char_with_utf8_pv);
+
+  is ($0, $char_with_utf8_pv, 'compare $0 to UTF8-flagged');
+  is ($0, $char_with_plain_pv, 'compare $0 to non-UTF8-flagged');
+
+  my $linux_cmdline_cr = sub {
+    open my $rfh, '<', "/proc/$$/cmdline" or skip "open: $!", 1;
+    return do { local $/; <$rfh> };
+  };
+
+  SKIP: {
+    skip "Test is for Linux, not $^O", 2 if $^O ne 'linux';
+
+    my $slurp = $linux_cmdline_cr->();
+    like( $slurp, qr<\A$char_with_utf8_pv\0>, '/proc cmdline shows as expected (compare to UTF8-flagged)' );
+    like( $slurp, qr<\A$char_with_plain_pv\0>, '/proc cmdline shows as expected (compare to non-UTF8-flagged)' );
+  }
+
+  my $name_unicode = "haha\x{100}hoho";
+
+  my $name_utf8_bytes = $name_unicode;
+  utf8::encode($name_utf8_bytes);
+
+  my @warnings;
+  {
+    local $SIG{'__WARN__'} = sub { push @warnings, @_ };
+    $0 = $name_unicode;
+  }
+
+  is( 0 + @warnings, 1, 'warning after assignment of wide character' );
+  like( $warnings[0], qr<wide>i, '.. and the warning is about a wide character' );
+  is( $0, $name_utf8_bytes, '.. and the UTF-8 version is written' );
+
+  SKIP: {
+    skip "Test is for Linux, not $^O" if $^O ne 'linux';
+    is( $linux_cmdline_cr->(), "$name_utf8_bytes\0", '.. and /proc cmdline shows that');
+  }
+
+  @warnings = ();
+  local $SIG{'__WARN__'} = sub { push @warnings, @_ };
+  { local $0 = "alpha"; }
+  is( 0 + @warnings, 0, '$0 from wide -> local non-wide: no warning');
+
+  { local $0 = "$name_unicode-redux" }
+  is( 0 + @warnings, 1, 'one warning: wide -> local wide' );
+
+  $0 = "aaaa";
+  @warnings = ();
+  { local $0 = "$name_unicode-redux" }
+  is( 0 + @warnings, 1, 'one warning: non-wide -> local wide' );
+}
+
 {
     my $ok = 1;
     my $warn = '';
author	Felipe Gasper <felipe@felipegasper.com>	2022-01-07 14:28:39 -0500
committer	Karl Williamson <khw@cpan.org>	2022-03-05 21:06:05 -0700
commit	2d3b3561a39bb17fd6003fb262f52b3bc800770e (patch)
tree	4d7e46f04c3e64e68e2621655a9322cb62bb9c11
parent	286a1bfd696eb9d8dc1fa337234d1512f63beb67 (diff)
download	perl-2d3b3561a39bb17fd6003fb262f52b3bc800770e.tar.gz