summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFelipe Gasper <felipe@felipegasper.com>2022-01-07 14:28:39 -0500
committerKarl Williamson <khw@cpan.org>2022-03-05 21:06:05 -0700
commit2d3b3561a39bb17fd6003fb262f52b3bc800770e (patch)
tree4d7e46f04c3e64e68e2621655a9322cb62bb9c11
parent286a1bfd696eb9d8dc1fa337234d1512f63beb67 (diff)
downloadperl-2d3b3561a39bb17fd6003fb262f52b3bc800770e.tar.gz
Properly handle UTF8-flagged strings when assigning to $0.
Issue #19331: Use of SvPV_const and SvPV_force in S_set_dollarzero() wrote the PV internals directly to argv, which causes an improper UTF-8 encode if the SV is UTF8-flagged/upgraded. This fixes that doing a downgrade prior to those SvPV* calls. If the string contains wide characters (and thus cannot be downgraded) a warning is thrown; this mirrors preexisting behavior with %ENV, print, and other output channels that convert Perl SVs to bytes. The relevant documentation is also updated.
-rw-r--r--mg.c10
-rw-r--r--pod/perldelta.pod7
-rw-r--r--pod/perlvar.pod4
-rw-r--r--t/op/magic.t80
4 files changed, 100 insertions, 1 deletions
diff --git a/mg.c b/mg.c
index 10813a7316..5f3eeae4fe 100644
--- a/mg.c
+++ b/mg.c
@@ -3367,6 +3367,16 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
else sv_setiv(mg->mg_obj, (IV)PerlProc_getpid());
break;
case '0':
+ if (!sv_utf8_downgrade(sv, /* fail_ok */ TRUE)) {
+
+ /* Since we are going to set the string's UTF8-encoded form
+ as the process name we should update $0 itself to contain
+ that same (UTF8-encoded) value. */
+ sv_utf8_encode(GvSV(mg->mg_obj));
+
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "Wide character in %s", "$0");
+ }
+
LOCK_DOLLARZERO_MUTEX;
S_set_dollarzero(aTHX_ sv);
UNLOCK_DOLLARZERO_MUTEX;
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index cf427e0f5e..7e0a8ca873 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -212,6 +212,13 @@ and New Warnings
XXX L<message|perldiag/"message">
+=item *
+
+L<Wide character in $0|perldiag/"Wide character in %s">
+
+Attempts to put wide characters into the program name (C<$0>) now
+provoke this warning.
+
=back
=head3 New Warnings
diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index ecd870ecae..f0a6187f5e 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -257,6 +257,10 @@ legacy process name such as ps, top and killall will recognize the
name you set when assigning to C<$0>. The string you supply will be
cut off at 16 bytes, this is a limitation imposed by Linux.
+Wide characters are invalid in C<$0> values. For historical reasons,
+though, Perl accepts them and encodes them to UTF-8. When this
+happens a wide-character warning is triggered.
+
Mnemonic: same as B<sh> and B<ksh>.
=item $REAL_GROUP_ID
diff --git a/t/op/magic.t b/t/op/magic.t
index e2e0b30211..8354e32252 100644
--- a/t/op/magic.t
+++ b/t/op/magic.t
@@ -5,7 +5,7 @@ BEGIN {
chdir 't' if -d 't';
require './test.pl';
set_up_inc( '../lib' );
- plan (tests => 197); # some tests are run in BEGIN block
+ plan (tests => 208); # some tests are run in BEGIN block
}
# Test that defined() returns true for magic variables created on the fly,
@@ -433,6 +433,84 @@ EOP
}
}
+# Check that assigning to $0 properly handles UTF-8-stored strings:
+{
+
+ # Test both ASCII and EBCDIC systems:
+ my $char = chr( utf8::native_to_unicode(0xe9) );
+
+ # We want $char_with_utf8_pv's PV to be UTF-8-encoded because we need to
+ # test that Perl translates UTF-8-stored code points to plain octets when
+ # assigning to $0.
+ #
+ my $char_with_utf8_pv = $char;
+ utf8::upgrade($char_with_utf8_pv);
+
+ # This will be the same logical code point as $char_with_utf8_pv, but
+ # implemented in Perl internally as a raw byte rather than UTF-8.
+ # (NB: $char is *probably* already utf8::downgrade()d, but let's not
+ # assume that to be the case.)
+ #
+ my $char_with_plain_pv = $char;
+ utf8::downgrade($char_with_plain_pv);
+
+ $0 = $char_with_utf8_pv;
+
+ # In case the assignment to $0 changed $char_with_utf8_pv, ensure that
+ # it is still interally double-UTF-8-encoded:
+ #
+ utf8::upgrade($char_with_utf8_pv);
+
+ is ($0, $char_with_utf8_pv, 'compare $0 to UTF8-flagged');
+ is ($0, $char_with_plain_pv, 'compare $0 to non-UTF8-flagged');
+
+ my $linux_cmdline_cr = sub {
+ open my $rfh, '<', "/proc/$$/cmdline" or skip "open: $!", 1;
+ return do { local $/; <$rfh> };
+ };
+
+ SKIP: {
+ skip "Test is for Linux, not $^O", 2 if $^O ne 'linux';
+
+ my $slurp = $linux_cmdline_cr->();
+ like( $slurp, qr<\A$char_with_utf8_pv\0>, '/proc cmdline shows as expected (compare to UTF8-flagged)' );
+ like( $slurp, qr<\A$char_with_plain_pv\0>, '/proc cmdline shows as expected (compare to non-UTF8-flagged)' );
+ }
+
+ my $name_unicode = "haha\x{100}hoho";
+
+ my $name_utf8_bytes = $name_unicode;
+ utf8::encode($name_utf8_bytes);
+
+ my @warnings;
+ {
+ local $SIG{'__WARN__'} = sub { push @warnings, @_ };
+ $0 = $name_unicode;
+ }
+
+ is( 0 + @warnings, 1, 'warning after assignment of wide character' );
+ like( $warnings[0], qr<wide>i, '.. and the warning is about a wide character' );
+ is( $0, $name_utf8_bytes, '.. and the UTF-8 version is written' );
+
+ SKIP: {
+ skip "Test is for Linux, not $^O" if $^O ne 'linux';
+ is( $linux_cmdline_cr->(), "$name_utf8_bytes\0", '.. and /proc cmdline shows that');
+ }
+
+ @warnings = ();
+ local $SIG{'__WARN__'} = sub { push @warnings, @_ };
+ { local $0 = "alpha"; }
+ is( 0 + @warnings, 0, '$0 from wide -> local non-wide: no warning');
+
+ { local $0 = "$name_unicode-redux" }
+ is( 0 + @warnings, 1, 'one warning: wide -> local wide' );
+
+ $0 = "aaaa";
+ @warnings = ();
+ { local $0 = "$name_unicode-redux" }
+ is( 0 + @warnings, 1, 'one warning: non-wide -> local wide' );
+}
+
{
my $ok = 1;
my $warn = '';