diff options
author | Felipe Gasper <felipe@felipegasper.com> | 2022-01-07 14:28:39 -0500 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-03-05 21:06:05 -0700 |
commit | 2d3b3561a39bb17fd6003fb262f52b3bc800770e (patch) | |
tree | 4d7e46f04c3e64e68e2621655a9322cb62bb9c11 | |
parent | 286a1bfd696eb9d8dc1fa337234d1512f63beb67 (diff) | |
download | perl-2d3b3561a39bb17fd6003fb262f52b3bc800770e.tar.gz |
Properly handle UTF8-flagged strings when assigning to $0.
Issue #19331: Use of SvPV_const and SvPV_force in S_set_dollarzero()
wrote the PV internals directly to argv, which causes an improper UTF-8
encode if the SV is UTF8-flagged/upgraded.
This fixes that doing a downgrade prior to those SvPV* calls. If the
string contains wide characters (and thus cannot be downgraded) a
warning is thrown; this mirrors preexisting behavior with %ENV, print,
and other output channels that convert Perl SVs to bytes.
The relevant documentation is also updated.
-rw-r--r-- | mg.c | 10 | ||||
-rw-r--r-- | pod/perldelta.pod | 7 | ||||
-rw-r--r-- | pod/perlvar.pod | 4 | ||||
-rw-r--r-- | t/op/magic.t | 80 |
4 files changed, 100 insertions, 1 deletions
@@ -3367,6 +3367,16 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg) else sv_setiv(mg->mg_obj, (IV)PerlProc_getpid()); break; case '0': + if (!sv_utf8_downgrade(sv, /* fail_ok */ TRUE)) { + + /* Since we are going to set the string's UTF8-encoded form + as the process name we should update $0 itself to contain + that same (UTF8-encoded) value. */ + sv_utf8_encode(GvSV(mg->mg_obj)); + + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "Wide character in %s", "$0"); + } + LOCK_DOLLARZERO_MUTEX; S_set_dollarzero(aTHX_ sv); UNLOCK_DOLLARZERO_MUTEX; diff --git a/pod/perldelta.pod b/pod/perldelta.pod index cf427e0f5e..7e0a8ca873 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -212,6 +212,13 @@ and New Warnings XXX L<message|perldiag/"message"> +=item * + +L<Wide character in $0|perldiag/"Wide character in %s"> + +Attempts to put wide characters into the program name (C<$0>) now +provoke this warning. + =back =head3 New Warnings diff --git a/pod/perlvar.pod b/pod/perlvar.pod index ecd870ecae..f0a6187f5e 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -257,6 +257,10 @@ legacy process name such as ps, top and killall will recognize the name you set when assigning to C<$0>. The string you supply will be cut off at 16 bytes, this is a limitation imposed by Linux. +Wide characters are invalid in C<$0> values. For historical reasons, +though, Perl accepts them and encodes them to UTF-8. When this +happens a wide-character warning is triggered. + Mnemonic: same as B<sh> and B<ksh>. =item $REAL_GROUP_ID diff --git a/t/op/magic.t b/t/op/magic.t index e2e0b30211..8354e32252 100644 --- a/t/op/magic.t +++ b/t/op/magic.t @@ -5,7 +5,7 @@ BEGIN { chdir 't' if -d 't'; require './test.pl'; set_up_inc( '../lib' ); - plan (tests => 197); # some tests are run in BEGIN block + plan (tests => 208); # some tests are run in BEGIN block } # Test that defined() returns true for magic variables created on the fly, @@ -433,6 +433,84 @@ EOP } } +# Check that assigning to $0 properly handles UTF-8-stored strings: +{ + + # Test both ASCII and EBCDIC systems: + my $char = chr( utf8::native_to_unicode(0xe9) ); + + # We want $char_with_utf8_pv's PV to be UTF-8-encoded because we need to + # test that Perl translates UTF-8-stored code points to plain octets when + # assigning to $0. + # + my $char_with_utf8_pv = $char; + utf8::upgrade($char_with_utf8_pv); + + # This will be the same logical code point as $char_with_utf8_pv, but + # implemented in Perl internally as a raw byte rather than UTF-8. + # (NB: $char is *probably* already utf8::downgrade()d, but let's not + # assume that to be the case.) + # + my $char_with_plain_pv = $char; + utf8::downgrade($char_with_plain_pv); + + $0 = $char_with_utf8_pv; + + # In case the assignment to $0 changed $char_with_utf8_pv, ensure that + # it is still interally double-UTF-8-encoded: + # + utf8::upgrade($char_with_utf8_pv); + + is ($0, $char_with_utf8_pv, 'compare $0 to UTF8-flagged'); + is ($0, $char_with_plain_pv, 'compare $0 to non-UTF8-flagged'); + + my $linux_cmdline_cr = sub { + open my $rfh, '<', "/proc/$$/cmdline" or skip "open: $!", 1; + return do { local $/; <$rfh> }; + }; + + SKIP: { + skip "Test is for Linux, not $^O", 2 if $^O ne 'linux'; + + my $slurp = $linux_cmdline_cr->(); + like( $slurp, qr<\A$char_with_utf8_pv\0>, '/proc cmdline shows as expected (compare to UTF8-flagged)' ); + like( $slurp, qr<\A$char_with_plain_pv\0>, '/proc cmdline shows as expected (compare to non-UTF8-flagged)' ); + } + + my $name_unicode = "haha\x{100}hoho"; + + my $name_utf8_bytes = $name_unicode; + utf8::encode($name_utf8_bytes); + + my @warnings; + { + local $SIG{'__WARN__'} = sub { push @warnings, @_ }; + $0 = $name_unicode; + } + + is( 0 + @warnings, 1, 'warning after assignment of wide character' ); + like( $warnings[0], qr<wide>i, '.. and the warning is about a wide character' ); + is( $0, $name_utf8_bytes, '.. and the UTF-8 version is written' ); + + SKIP: { + skip "Test is for Linux, not $^O" if $^O ne 'linux'; + is( $linux_cmdline_cr->(), "$name_utf8_bytes\0", '.. and /proc cmdline shows that'); + } + + @warnings = (); + local $SIG{'__WARN__'} = sub { push @warnings, @_ }; + { local $0 = "alpha"; } + is( 0 + @warnings, 0, '$0 from wide -> local non-wide: no warning'); + + { local $0 = "$name_unicode-redux" } + is( 0 + @warnings, 1, 'one warning: wide -> local wide' ); + + $0 = "aaaa"; + @warnings = (); + { local $0 = "$name_unicode-redux" } + is( 0 + @warnings, 1, 'one warning: non-wide -> local wide' ); +} + { my $ok = 1; my $warn = ''; |