diff options
-rw-r--r-- | lib/utf8.pm | 74 | ||||
-rw-r--r-- | pod/perlapi.pod | 23 | ||||
-rw-r--r-- | sv.c | 27 | ||||
-rw-r--r-- | t/op/utftaint.t | 119 |
4 files changed, 186 insertions, 57 deletions
diff --git a/lib/utf8.pm b/lib/utf8.pm index 486a4faa88..0b57d6d968 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -113,45 +113,59 @@ you should not say that unless you really want to have UTF-8 source code. =item * $num_octets = utf8::upgrade($string) -Converts (in-place) internal representation of string to Perl's -internal I<UTF-X> form. Returns the number of octets necessary to -represent the string as I<UTF-X>. Can be used to make sure that the -UTF-8 flag is on, so that C<\w> or C<lc()> work as expected on strings -containing characters in the range 0x80-0xFF (oon ASCII and -derivatives). Note that this should not be used to convert a legacy -byte encoding to Unicode: use Encode for that. Affected by the -encoding pragma. +Converts in-place the octet sequence in the native encoding +(Latin-1 or EBCDIC) to the equivalent character sequence in I<UTF-X>. +I<$string> already encoded as characters does no harm. +Returns the number of octets necessary to represent the string as I<UTF-X>. +Can be used to make sure that the UTF-8 flag is on, +so that C<\w> or C<lc()> work as Unicode on strings +containing characters in the range 0x80-0xFF (on ASCII and +derivatives). + +B<Note that this function does not handle arbitrary encodings.> +Therefore I<Encode.pm> is recommended for the general purposes. + +Affected by the encoding pragma. =item * $success = utf8::downgrade($string[, FAIL_OK]) -Converts (in-place) internal representation of string to be un-encoded -bytes. Returns true on success. On failure dies or, if the value of -FAIL_OK is true, returns false. Can be used to make sure that the -UTF-8 flag is off, e.g. when you want to make sure that the substr() -or length() function works with the usually faster byte algorithm. -Note that this should not be used to convert Unicode back to a legacy -byte encoding: use Encode for that. B<Not> affected by the encoding -pragma. +Converts in-place the character sequence in I<UTF-X> +to the equivalent octet sequence in the native encoding (Latin-1 or EBCDIC). +I<$string> already encoded as octets does no harm. +Returns true on success. On failure dies or, if the value of +C<FAIL_OK> is true, returns false. +Can be used to make sure that the UTF-8 flag is off, +e.g. when you want to make sure that the substr() or length() function +works with the usually faster byte algorithm. + +B<Note that this function does not handle arbitrary encodings.> +Therefore I<Encode.pm> is recommended for the general purposes. + +B<Not> affected by the encoding pragma. + +B<NOTE:> this function is experimental and may change +or be removed without notice. =item * utf8::encode($string) -Converts in-place the octets of the I<$string> to the octet sequence -in Perl's I<UTF-X> encoding. Returns nothing. B<Note that this does -not change the "type" of I<$string> to UTF-8>, and that this handles -only ISO 8859-1 (or EBCDIC) as the source character set. Therefore -this should not be used to convert a legacy 8-bit encoding to Unicode: -use Encode::decode() for that. In the very limited case of wanting to -handle just ISO 8859-1 (or EBCDIC), you could use utf8::upgrade(). +Converts in-place the character sequence to the corresponding octet sequence +in I<UTF-X>. The UTF-8 flag is turned off. Returns nothing. + +B<Note that this function does not handle arbitrary encodings.> +Therefore I<Encode.pm> is recommended for the general purposes. =item * utf8::decode($string) -Attempts to convert I<$string> in-place from Perl's I<UTF-X> encoding -into octets. Returns nothing. B<Note that this does not change the -"type" of <$string> from UTF-8>, and that this handles only ISO 8859-1 -(or EBCDIC) as the destination character set. Therefore this should -not be used to convert Unicode back to a legacy 8-bit encoding: -use Encode::encode() for that. In the very limited case of wanting -to handle just ISO 8859-1 (or EBCDIC), you could use utf8::downgrade(). +Attempts to convert in-place the octet sequence in I<UTF-X> +to the corresponding character sequence. The UTF-8 flag is turned on +only if the source string contains multiple-byte I<UTF-X> characters. +If I<$string> is invalid as I<UTF-X>, returns false; otherwise returns true. + +B<Note that this function does not handle arbitrary encodings.> +Therefore I<Encode.pm> is recommended for the general purposes. + +B<NOTE:> this function is experimental and may change +or be removed without notice. =item * $flag = utf8::is_utf8(STRING) diff --git a/pod/perlapi.pod b/pod/perlapi.pod index bef662421e..75056528bf 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -4834,9 +4834,11 @@ Found in file sv.c =item sv_utf8_decode -Convert the octets in the PV from UTF-8 to chars. Scan for validity and then -turn off SvUTF8 if needed so that we see characters. Used as a building block -for decode_utf8 in Encode.xs +If the PV of the SV is an octet sequence in UTF-8 +and contains a multiple-byte character, the C<SvUTF8> flag is turned on +so that it looks like a character. If the PV contains only single-byte +characters, the C<SvUTF8> flag stays being off. +Scans PV for validity and returns false if the PV is invalid UTF-8. NOTE: this function is experimental and may change or be removed without notice. @@ -4848,9 +4850,9 @@ Found in file sv.c =item sv_utf8_downgrade -Attempt to convert the PV of an SV from UTF-8-encoded to byte encoding. -This may not be possible if the PV contains non-byte encoding characters; -if this is the case, either returns false or, if C<fail_ok> is not +Attempts to convert the PV of an SV from characters to bytes. +If the PV contains a character beyond byte, this conversion will fail; +in this case, either returns false or, if C<fail_ok> is not true, croaks. This is not as a general purpose Unicode to byte encoding interface: @@ -4866,9 +4868,8 @@ Found in file sv.c =item sv_utf8_encode -Convert the PV of an SV to UTF-8-encoded, but then turn off the C<SvUTF8> -flag so that it looks like octets again. Used as a building block -for encode_utf8 in Encode.xs +Converts the PV of an SV to UTF-8, but then turns the C<SvUTF8> +flag off so that it looks like octets again. void sv_utf8_encode(SV *sv) @@ -4877,7 +4878,7 @@ Found in file sv.c =item sv_utf8_upgrade -Convert the PV of an SV to its UTF-8-encoded form. +Converts the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even if all the bytes have hibit clear. @@ -4892,7 +4893,7 @@ Found in file sv.c =item sv_utf8_upgrade_flags -Convert the PV of an SV to its UTF-8-encoded form. +Converts the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even if all the bytes have hibit clear. If C<flags> has C<SV_GMAGIC> bit set, @@ -3907,7 +3907,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) /* =for apidoc sv_utf8_upgrade -Convert the PV of an SV to its UTF-8-encoded form. +Converts the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even if all the bytes have hibit clear. @@ -3917,7 +3917,7 @@ use the Encode extension for that. =for apidoc sv_utf8_upgrade_flags -Convert the PV of an SV to its UTF-8-encoded form. +Converts the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even if all the bytes have hibit clear. If C<flags> has C<SV_GMAGIC> bit set, @@ -3986,9 +3986,9 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) /* =for apidoc sv_utf8_downgrade -Attempt to convert the PV of an SV from UTF-8-encoded to byte encoding. -This may not be possible if the PV contains non-byte encoding characters; -if this is the case, either returns false or, if C<fail_ok> is not +Attempts to convert the PV of an SV from characters to bytes. +If the PV contains a character beyond byte, this conversion will fail; +in this case, either returns false or, if C<fail_ok> is not true, croaks. This is not as a general purpose Unicode to byte encoding interface: @@ -4000,7 +4000,7 @@ use the Encode extension for that. bool Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok) { - if (SvPOK(sv) && SvUTF8(sv)) { + if (SvPOKp(sv) && SvUTF8(sv)) { if (SvCUR(sv)) { U8 *s; STRLEN len; @@ -4030,9 +4030,8 @@ Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok) /* =for apidoc sv_utf8_encode -Convert the PV of an SV to UTF-8-encoded, but then turn off the C<SvUTF8> -flag so that it looks like octets again. Used as a building block -for encode_utf8 in Encode.xs +Converts the PV of an SV to UTF-8, but then turns the C<SvUTF8> +flag off so that it looks like octets again. =cut */ @@ -4053,9 +4052,11 @@ Perl_sv_utf8_encode(pTHX_ register SV *sv) /* =for apidoc sv_utf8_decode -Convert the octets in the PV from UTF-8 to chars. Scan for validity and then -turn off SvUTF8 if needed so that we see characters. Used as a building block -for decode_utf8 in Encode.xs +If the PV of the SV is an octet sequence in UTF-8 +and contains a multiple-byte character, the C<SvUTF8> flag is turned on +so that it looks like a character. If the PV contains only single-byte +characters, the C<SvUTF8> flag stays being off. +Scans PV for validity and returns false if the PV is invalid UTF-8. =cut */ @@ -4063,7 +4064,7 @@ for decode_utf8 in Encode.xs bool Perl_sv_utf8_decode(pTHX_ register SV *sv) { - if (SvPOK(sv)) { + if (SvPOKp(sv)) { U8 *c; U8 *e; diff --git a/t/op/utftaint.t b/t/op/utftaint.t index 692c908668..cd44503e74 100644 --- a/t/op/utftaint.t +++ b/t/op/utftaint.t @@ -23,12 +23,17 @@ BEGIN { use Scalar::Util qw(tainted); use Test; -plan tests => 3*10; +plan tests => 3*10 + 3*8 + 2*16; my $cnt = 0; my $arg = $ENV{PATH}; # a tainted value use constant UTF8 => "\x{1234}"; +sub is_utf8 { + my $s = shift; + return 0xB6 != ord pack('a*', chr(0xB6).$s); +} + for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) { my $encode = $ary->[0]; my $string = $ary->[1]; @@ -40,7 +45,7 @@ for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) { my $lconcat = $taint; $lconcat .= UTF8; - print $lconcat eq $string."\x{1234}" + print $lconcat eq $string.UTF8 ? "ok " : "not ok ", ++$cnt, " # compare: $encode, concat left\n"; print tainted($lconcat) == tainted($arg) @@ -48,7 +53,7 @@ for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) { my $rconcat = UTF8; $rconcat .= $taint; - print $rconcat eq "\x{1234}".$string + print $rconcat eq UTF8.$string ? "ok " : "not ok ", ++$cnt, " # compare: $encode, concat right\n"; print tainted($rconcat) == tainted($arg) @@ -71,3 +76,111 @@ for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) { print tainted($taint) == tainted($arg) ? "ok " : "not ok ", ++$cnt, " # tainted: $encode, after test\n"; } + + +for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) { + my $encode = $ary->[0]; + + my $utf8 = pack('U*') . $ary->[1]; + my $byte = pack('C0a*', $utf8); + + my $taint = $arg; substr($taint, 0) = $utf8; + utf8::encode($taint); + + print $taint eq $byte + ? "ok " : "not ok ", ++$cnt, " # compare: $encode, encode utf8\n"; + + print pack('a*',$taint) eq pack('a*',$byte) + ? "ok " : "not ok ", ++$cnt, " # bytecmp: $encode, encode utf8\n"; + + print !is_utf8($taint) + ? "ok " : "not ok ", ++$cnt, " # is_utf8: $encode, encode utf8\n"; + + print tainted($taint) == tainted($arg) + ? "ok " : "not ok ", ++$cnt, " # tainted: $encode, encode utf8\n"; + + my $taint = $arg; substr($taint, 0) = $byte; + utf8::decode($taint); + + print $taint eq $utf8 + ? "ok " : "not ok ", ++$cnt, " # compare: $encode, decode byte\n"; + + print pack('a*',$taint) eq pack('a*',$utf8) + ? "ok " : "not ok ", ++$cnt, " # bytecmp: $encode, decode byte\n"; + + print is_utf8($taint) eq ($encode ne 'ascii') + ? "ok " : "not ok ", ++$cnt, " # is_utf8: $encode, decode byte\n"; + + print tainted($taint) == tainted($arg) + ? "ok " : "not ok ", ++$cnt, " # tainted: $encode, decode byte\n"; +} + + +for my $ary ([ascii => 'perl'], [latin1 => "\xB6"]) { + my $encode = $ary->[0]; + + my $up = pack('U*') . $ary->[1]; + my $down = pack('C0a*', $ary->[1]); + + my $taint = $arg; substr($taint, 0) = $up; + utf8::upgrade($taint); + + print $taint eq $up + ? "ok " : "not ok ", ++$cnt, " # compare: $encode, upgrade up\n"; + + print pack('a*',$taint) eq pack('a*',$up) + ? "ok " : "not ok ", ++$cnt, " # bytecmp: $encode, upgrade up\n"; + + print is_utf8($taint) + ? "ok " : "not ok ", ++$cnt, " # is_utf8: $encode, upgrade up\n"; + + print tainted($taint) == tainted($arg) + ? "ok " : "not ok ", ++$cnt, " # tainted: $encode, upgrade up\n"; + + my $taint = $arg; substr($taint, 0) = $down; + utf8::upgrade($taint); + + print $taint eq $up + ? "ok " : "not ok ", ++$cnt, " # compare: $encode, upgrade down\n"; + + print pack('a*',$taint) eq pack('a*',$up) + ? "ok " : "not ok ", ++$cnt, " # bytecmp: $encode, upgrade down\n"; + + print is_utf8($taint) + ? "ok " : "not ok ", ++$cnt, " # is_utf8: $encode, upgrade down\n"; + + print tainted($taint) == tainted($arg) + ? "ok " : "not ok ", ++$cnt, " # tainted: $encode, upgrade down\n"; + + my $taint = $arg; substr($taint, 0) = $up; + utf8::downgrade($taint); + + print $taint eq $down + ? "ok " : "not ok ", ++$cnt, " # compare: $encode, downgrade up\n"; + + print pack('a*',$taint) eq pack('a*',$down) + ? "ok " : "not ok ", ++$cnt, " # bytecmp: $encode, downgrade up\n"; + + print !is_utf8($taint) + ? "ok " : "not ok ", ++$cnt, " # is_utf8: $encode, downgrade up\n"; + + print tainted($taint) == tainted($arg) + ? "ok " : "not ok ", ++$cnt, " # tainted: $encode, downgrade up\n"; + + my $taint = $arg; substr($taint, 0) = $down; + utf8::downgrade($taint); + + print $taint eq $down + ? "ok " : "not ok ", ++$cnt, " # compare: $encode, downgrade down\n"; + + print pack('a*',$taint) eq pack('a*',$down) + ? "ok " : "not ok ", ++$cnt, " # bytecmp: $encode, downgrade down\n"; + + print !is_utf8($taint) + ? "ok " : "not ok ", ++$cnt, " # is_utf8: $encode, downgrade down\n"; + + print tainted($taint) == tainted($arg) + ? "ok " : "not ok ", ++$cnt, " # tainted: $encode, downgrade down\n"; +} + + |