diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-01-18 09:35:52 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-01-21 10:02:55 -0700 |
commit | 66cbab2c91fca8c9abc65a7231a053898208efe3 (patch) | |
tree | cb9e838d32b251f9f52082d29bb7009f074d192f | |
parent | e439cacbc5a93fb9e6c524e31ac41772af51dfa0 (diff) | |
download | perl-66cbab2c91fca8c9abc65a7231a053898208efe3.tar.gz |
Add :not_characters parameter to 'use locale'
This adds the parameter handling, tests, and documentation for this new
feature which allows locale and Unicode to play well with each other.
-rw-r--r-- | lib/locale.pm | 49 | ||||
-rw-r--r-- | lib/locale.t | 522 | ||||
-rw-r--r-- | numeric.c | 4 | ||||
-rw-r--r-- | op.c | 5 | ||||
-rw-r--r-- | perl.h | 13 | ||||
-rw-r--r-- | pod/perldelta.pod | 17 | ||||
-rw-r--r-- | pod/perlfunc.pod | 15 | ||||
-rw-r--r-- | pod/perllocale.pod | 315 | ||||
-rw-r--r-- | pod/perlop.pod | 6 | ||||
-rw-r--r-- | pod/perlre.pod | 10 | ||||
-rw-r--r-- | pod/perlunicode.pod | 10 | ||||
-rw-r--r-- | pod/perluniintro.pod | 24 | ||||
-rw-r--r-- | t/porting/known_pod_issues.dat | 2 | ||||
-rw-r--r-- | utf8.h | 4 |
14 files changed, 817 insertions, 179 deletions
diff --git a/lib/locale.pm b/lib/locale.pm index 2398599595..e57a5fded2 100644 --- a/lib/locale.pm +++ b/lib/locale.pm @@ -2,6 +2,8 @@ package locale; our $VERSION = '1.01'; +$Carp::Internal{ (__PACKAGE__) } = 1; + =head1 NAME locale - Perl pragma to use or avoid POSIX locales for built-in operations @@ -23,19 +25,62 @@ expressions, LC_COLLATE for string comparison, and LC_NUMERIC for number formatting). Each "use locale" or "no locale" affects statements to the end of the enclosing BLOCK. +Starting in Perl 5.16, a hybrid mode for this pragma is available, + + use locale ':not_characters'; + +which enables only the portions of locales that don't affect the character +set (that is, all except LC_COLLATE and LC_CTYPE). This is useful when mixing +Unicode and locales, including UTF-8 locales. + + use locale ':not_characters'; + use open ":locale"; # Convert I/O to/from Unicode + use POSIX qw(locale_h); # Import the LC_ALL constant + setlocale(LC_ALL, ""); # Required for the next statement + # to take effect + printf "%.2f\n", 12345.67; # Locale-defined formatting + @x = sort @y; # Unicode-defined sorting order. + # (Note that you will get better + # results using Unicode::Collate.) + See L<perllocale> for more detailed information on how Perl supports locales. =cut +# A separate bit is used for each of the two forms of the pragma, as they are +# mostly independent, and interact with each other and the unicode_strings +# feature. 
This allows for fast determination of which one(s) of the three +# are to be used at any given point, and no code has to be written to deal +# with coming in and out of scopes--it falls automatically out from the hint +# handling + $locale::hint_bits = 0x4; +$locale::not_chars_hint_bits = 0x10; sub import { - $^H |= $locale::hint_bits; + shift; # should be 'locale'; not checked + my $found_not_chars = 0; + while (defined (my $arg = shift)) { + if ($arg eq ":not_characters") { + $^H |= $locale::not_chars_hint_bits; + + # This form of the pragma overrides the other + $^H &= ~$locale::hint_bits; + $found_not_chars = 1; + } + else { + require Carp; + Carp::croak("Unknown parameter '$arg' to 'use locale'"); + } + } + + # Use the plain form if not doing the :not_characters one. + $^H |= $locale::hint_bits unless $found_not_chars; } sub unimport { - $^H &= ~$locale::hint_bits; + $^H &= ~($locale::hint_bits|$locale::not_chars_hint_bits); } 1; diff --git a/lib/locale.t b/lib/locale.t index 81be59ea60..d2b5619622 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -1,5 +1,10 @@ #!./perl -wT +# This tests plain 'use locale' and adorned 'use locale ":not_characters"' +# Because these pragmas are compile time, and I (khw) am trying to test +# without using 'eval' as much as possible, which might cloud the issue, the +# crucial parts of the code are duplicated in a block for each pragma. + binmode STDOUT, ':utf8'; binmode STDERR, ':utf8'; @@ -248,6 +253,170 @@ check_taint_not $2; check_taint_not $a; +{ # This is just the previous tests copied here with a different + # compile-time pragma. 
+ + use locale ':not_characters'; # engage restricted locale with different + # tainting rules + + check_taint_not $a; + + check_taint_not uc($a); + check_taint_not "\U$a"; + check_taint_not ucfirst($a); + check_taint_not "\u$a"; + check_taint_not lc($a); + check_taint_not "\L$a"; + check_taint_not lcfirst($a); + check_taint_not "\l$a"; + + check_taint_not sprintf('%e', 123.456); + check_taint_not sprintf('%f', 123.456); + check_taint_not sprintf('%g', 123.456); + check_taint_not sprintf('%d', 123.456); + check_taint_not sprintf('%x', 123.456); + + $_ = $a; # untaint $_ + + $_ = uc($a); # taint $_ + + check_taint_not $_; + + /(\w)/; # taint $&, $`, $', $+, $1. + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + /(\W)/; # taint $&, $`, $', $+, $1. + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + /(\s)/; # taint $&, $`, $', $+, $1. + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + /(\S)/; # taint $&, $`, $', $+, $1. 
+ check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + $_ = $a; # untaint $_ + + check_taint_not $_; + + /(b)/; # this must not taint + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + $_ = $a; # untaint $_ + + check_taint_not $_; + + $b = uc($a); # taint $b + s/(.+)/$b/; # this must taint only the $_ + + check_taint_not $_; + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + $_ = $a; # untaint $_ + + s/(.+)/b/; # this must not taint + check_taint_not $_; + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + $b = $a; # untaint $b + + ($b = $a) =~ s/\w/$&/; + check_taint_not $b; # $b should be tainted. + check_taint_not $a; # $a should be not. + + $_ = $a; # untaint $_ + + s/(\w)/\l$1/; # this must taint + check_taint_not $_; + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + $_ = $a; # untaint $_ + + s/(\w)/\L$1/; # this must taint + check_taint_not $_; + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + $_ = $a; # untaint $_ + + s/(\w)/\u$1/; # this must taint + check_taint_not $_; + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + $_ = $a; # untaint $_ + + s/(\w)/\U$1/; # this must taint + check_taint_not $_; + check_taint_not $&; + check_taint_not $`; + check_taint_not $'; + check_taint_not $+; + check_taint_not $1; + check_taint_not $2; + + # After all this tainting $a should be cool. 
+ + check_taint_not $a; +} + +# Here are in scope of 'use locale' + # I think we've seen quite enough of taint. # Let us do some *real* locale work now, # unless setlocale() is missing (i.e. minitest). @@ -517,15 +686,24 @@ foreach $Locale (@Locale) { next; } + # We test UTF-8 locales only under ':not_characters'; otherwise they have + # documented deficiencies. Non- UTF-8 locales are tested only under plain + # 'use locale', as otherwise we would have to convert everything in them + # to Unicode. + my $is_utf8_locale = $Locale =~ /UTF-?8/i; + + my %UPPER = (); + my %lower = (); + my %BoThCaSe = (); + + if (! $is_utf8_locale) { + use locale; @Alnum_ = sort grep /\w/, map { chr } 0..255; debug "# w = ", join("",@Alnum_), "\n"; # Sieve the uppercase and the lowercase. - my %UPPER = (); - my %lower = (); - my %BoThCaSe = (); for (@Alnum_) { if (/[^\d_]/) { # skip digits and the _ if (uc($_) eq $_) { @@ -536,6 +714,22 @@ foreach $Locale (@Locale) { } } } + } + else { + use locale ':not_characters'; + @Alnum_ = sort grep /\w/, map { chr } 0..255; + debug "# w = ", join("",@Alnum_), "\n"; + for (@Alnum_) { + if (/[^\d_]/) { # skip digits and the _ + if (uc($_) eq $_) { + $UPPER{$_} = $_; + } + if (lc($_) eq $_) { + $lower{$_} = $_; + } + } + } + } foreach (keys %UPPER) { $BoThCaSe{$_}++ if exists $lower{$_}; } @@ -583,26 +777,34 @@ foreach $Locale (@Locale) { ++$locales_test_number; $test_names{$locales_test_number} = 'Verify that alnums outside the C locale match \w'; - if ($Locale =~ /utf-?8/i) { - push @{$Okay{$locales_test_number}}, $Locale; - debug "# unknown whether locale and Unicode have the same \\w, skipping test $locales_test_number for locale '$Locale'\n"; - } else { - if ($word =~ /^(\w+)$/) { - tryneoalpha($Locale, $locales_test_number, 1); - } else { - tryneoalpha($Locale, $locales_test_number, 0); - } - } + my $ok; + if ($is_utf8_locale) { + use locale ':not_characters'; + $ok = $word =~ /^(\w+)$/; + } + else { + # Already in 'use locale'; this tests that 
exiting scopes works + $ok = $word =~ /^(\w+)$/; + } + tryneoalpha($Locale, $locales_test_number, $ok); # Cross-check the whole 8-bit character set. ++$locales_test_number; $test_names{$locales_test_number} = 'Verify that \w and \W are mutually exclusive, as are \d, \D; \s, \S'; for (map { chr } 0..255) { - tryneoalpha($Locale, $locales_test_number, - (/\w/ xor /\W/) || + if ($is_utf8_locale) { + use locale ':not_characters'; + $ok = (/\w/ xor /\W/) || + (/\d/ xor /\D/) || + (/\s/ xor /\S/); + } + else { + $ok = (/\w/ xor /\W/) || (/\d/ xor /\D/) || - (/\s/ xor /\S/)); + (/\s/ xor /\S/); + } + tryneoalpha($Locale, $locales_test_number, $ok); } # Test for read-only scalars' locale vs non-locale comparisons. @@ -610,11 +812,16 @@ foreach $Locale (@Locale) { { no locale; $a = "qwerty"; - { - use locale; - tryneoalpha($Locale, ++$locales_test_number, ($a cmp "qwerty") == 0); - $test_names{$locales_test_number} = 'Verify that cmp works with a read-only scalar; no- vs locale'; - } + if ($is_utf8_locale) { + use locale ':not_characters'; + $ok = ($a cmp "qwerty") == 0; + } + else { + use locale; + $ok = ($a cmp "qwerty") == 0; + } + tryneoalpha($Locale, ++$locales_test_number, $ok); + $test_names{$locales_test_number} = 'Verify that cmp works with a read-only scalar; no- vs locale'; } { @@ -634,9 +841,18 @@ foreach $Locale (@Locale) { $from++; $to++; $to = $#Alnum_ if ($to > $#Alnum_); $greater = join('', @Alnum_[$from..$to]); + if ($is_utf8_locale) { + use locale ':not_characters'; + ($yes, $no, $sign) = ($lesser lt $greater + ? (" ", "not ", 1) + : ("not ", " ", -1)); + } + else { + use locale; ($yes, $no, $sign) = ($lesser lt $greater ? (" ", "not ", 1) : ("not ", " ", -1)); + } # all these tests should FAIL (return 0). # Exact lt or gt cannot be tested because # in some locales, say, eacute and E may test equal. 
@@ -656,7 +872,14 @@ foreach $Locale (@Locale) { @test{@test} = 0 x @test; $test = 0; for my $ti (@test) { + if ($is_utf8_locale) { + use locale ':not_characters'; + $test{$ti} = eval $ti; + } + else { + # Already in 'use locale'; $test{$ti} = eval $ti; + } $test ||= $test{$ti} } tryneoalpha($Locale, $locales_test_number, $test == 0); @@ -691,6 +914,27 @@ foreach $Locale (@Locale) { ); } + my $ok1; + my $ok2; + my $ok3; + my $ok4; + my $ok5; + my $ok6; + my $ok7; + my $ok8; + my $ok9; + my $ok10; + my $ok11; + my $ok12; + my $ok13; + + my $c; + my $d; + my $e; + my $f; + my $g; + + if (! $is_utf8_locale) { use locale; my ($x, $y) = (1.23, 1.23); @@ -698,23 +942,14 @@ foreach $Locale (@Locale) { $a = "$x"; printf ''; # printf used to reset locale to "C" $b = "$y"; + $ok1 = $a eq $b; - tryneoalpha($Locale, ++$locales_test_number, $a eq $b); - $test_names{$locales_test_number} = 'Verify that an intervening printf doesn\'t change assignment results'; - my $first_a_test = $locales_test_number; - - debug "# $first_a_test..$locales_test_number: \$a = $a, \$b = $b, Locale = $Locale\n"; - - my $c = "$x"; + $c = "$x"; my $z = sprintf ''; # sprintf used to reset locale to "C" - my $d = "$y"; - - - tryneoalpha($Locale, ++$locales_test_number, $c eq $d); - $test_names{$locales_test_number} = 'Verify that an intervening sprintf doesn\'t change assignment results'; - my $first_c_test = $locales_test_number; - + $d = "$y"; + $ok2 = $c eq $d; { + use warnings; my $w = 0; local $SIG{__WARN__} = @@ -726,20 +961,12 @@ foreach $Locale (@Locale) { # The == (among other ops) used to warn for locales # that had something else than "." as the radix character. 
- tryneoalpha($Locale, ++$locales_test_number, $c == 1.23); - $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a constant'; - - tryneoalpha($Locale, ++$locales_test_number, $c == $x); - $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a scalar'; - - tryneoalpha($Locale, ++$locales_test_number, $c == $d); - $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a scalar and an intervening sprintf'; - - debug "# $first_c_test..$locales_test_number: \$c = $c, \$d = $d, Locale = $Locale\n"; - + $ok3 = $c == 1.23; + $ok4 = $c == $x; + $ok5 = $c == $d; { no locale; - + # The earlier test was $e = "$x". But this fails [perl #108378], # and the "no locale" was commented out. But doing that made all # the tests in the block after this one meaningless, as originally @@ -749,48 +976,123 @@ foreach $Locale (@Locale) { # work to add TODOs instead. Should this be fixed, the following # test names would need to be revised; they mostly don't really # test anything currently. 
- my $e = $x; + $e = $x; + + $ok6 = $e == 1.23; + $ok7 = $e == $x; + $ok8 = $e == $c; + } + + $f = "1.23"; + $g = 2.34; + + $ok9 = $f == 1.23; + $ok10 = $f == $x; + $ok11 = $f == $c; + $ok12 = abs(($f + $g) - 3.57) < 0.01; + $ok13 = $w == 0; + } + } + else { + use locale ':not_characters'; + + my ($x, $y) = (1.23, 1.23); + $a = "$x"; + printf ''; # printf used to reset locale to "C" + $b = "$y"; + $ok1 = $a eq $b; + + $c = "$x"; + my $z = sprintf ''; # sprintf used to reset locale to "C" + $d = "$y"; + $ok2 = $c eq $d; + { + use warnings; + my $w = 0; + local $SIG{__WARN__} = + sub { + print "# @_\n"; + $w++; + }; + $ok3 = $c == 1.23; + $ok4 = $c == $x; + $ok5 = $c == $d; + { + no locale; + $e = $x; + + $ok6 = $e == 1.23; + $ok7 = $e == $x; + $ok8 = $e == $c; + } + + $f = "1.23"; + $g = 2.34; + + $ok9 = $f == 1.23; + $ok10 = $f == $x; + $ok11 = $f == $c; + $ok12 = abs(($f + $g) - 3.57) < 0.01; + $ok13 = $w == 0; + } + } + + tryneoalpha($Locale, ++$locales_test_number, $ok1); + $test_names{$locales_test_number} = 'Verify that an intervening printf doesn\'t change assignment results'; + my $first_a_test = $locales_test_number; + + debug "# $first_a_test..$locales_test_number: \$a = $a, \$b = $b, Locale = $Locale\n"; + + tryneoalpha($Locale, ++$locales_test_number, $ok2); + $test_names{$locales_test_number} = 'Verify that an intervening sprintf doesn\'t change assignment results'; + + my $first_c_test = $locales_test_number; + + tryneoalpha($Locale, ++$locales_test_number, $ok3); + $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a constant'; - tryneoalpha($Locale, ++$locales_test_number, $e == 1.23); + tryneoalpha($Locale, ++$locales_test_number, $ok4); + $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a scalar'; + + tryneoalpha($Locale, ++$locales_test_number, $ok5); + $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" 
with a scalar and an intervening sprintf'; + + debug "# $first_c_test..$locales_test_number: \$c = $c, \$d = $d, Locale = $Locale\n"; + + tryneoalpha($Locale, ++$locales_test_number, $ok6); $test_names{$locales_test_number} = 'Verify that can assign numerically under inner no-locale block'; my $first_e_test = $locales_test_number; - tryneoalpha($Locale, ++$locales_test_number, $e == $x); + tryneoalpha($Locale, ++$locales_test_number, $ok7); $test_names{$locales_test_number} = 'Verify that "==" with a scalar still works in inner no locale'; - - tryneoalpha($Locale, ++$locales_test_number, $e == $c); + + tryneoalpha($Locale, ++$locales_test_number, $ok8); $test_names{$locales_test_number} = 'Verify that "==" with a scalar and an intervening sprintf still works in inner no locale'; - debug "# $first_e_test..$locales_test_number: e = \$e, no locale\n"; - } - - my $f = "1.23"; - my $g = 2.34; + debug "# $first_e_test..$locales_test_number: \$e = $e, no locale\n"; - tryneoalpha($Locale, ++$locales_test_number, $f == 1.23); + tryneoalpha($Locale, ++$locales_test_number, $ok9); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a constant'; my $first_f_test = $locales_test_number; - tryneoalpha($Locale, ++$locales_test_number, $f == $x); + tryneoalpha($Locale, ++$locales_test_number, $ok10); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a scalar'; - - tryneoalpha($Locale, ++$locales_test_number, $f == $c); + + tryneoalpha($Locale, ++$locales_test_number, $ok11); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a scalar and an intervening sprintf'; - tryneoalpha($Locale, ++$locales_test_number, abs(($f + $g) - 3.57) < 0.01); + tryneoalpha($Locale, ++$locales_test_number, $ok12); $test_names{$locales_test_number} = 'Verify that 
after a no-locale block, a different locale radix can participate in an addition and function call as numeric'; - tryneoalpha($Locale, ++$locales_test_number, $w == 0); + tryneoalpha($Locale, ++$locales_test_number, $ok13); $test_names{$locales_test_number} = 'Verify that don\'t get warning under "==" even if radix is not a dot'; debug "# $first_f_test..$locales_test_number: \$f = $f, \$g = $g, back to locale = $Locale\n"; - } - # Does taking lc separately differ from taking # the lc "in-line"? (This was the bug 19990704.002, change #3568.) # The bug was in the caching of the 'o'-magic. - { + if (! $is_utf8_locale) { use locale; sub lcA { @@ -810,8 +1112,29 @@ foreach $Locale (@Locale) { tryneoalpha($Locale, ++$locales_test_number, lcA($x, $y) == 1 && lcB($x, $y) == 1 || lcA($x, $z) == 0 && lcB($x, $z) == 0); - $test_names{$locales_test_number} = 'Verify "lc(foo) cmp lc(bar)" is the same as using intermediaries for the cmp'; } + else { + use locale ':not_characters'; + + sub lcC { + my $lc0 = lc $_[0]; + my $lc1 = lc $_[1]; + return $lc0 cmp $lc1; + } + + sub lcD { + return lc($_[0]) cmp lc($_[1]); + } + + my $x = "ab"; + my $y = "aa"; + my $z = "AB"; + + tryneoalpha($Locale, ++$locales_test_number, + lcC($x, $y) == 1 && lcD($x, $y) == 1 || + lcC($x, $z) == 0 && lcD($x, $z) == 0); + } + $test_names{$locales_test_number} = 'Verify "lc(foo) cmp lc(bar)" is the same as using intermediaries for the cmp'; # Does lc of an UPPER (if different from the UPPER) match # case-insensitively the UPPER, and does the UPPER match @@ -825,6 +1148,7 @@ foreach $Locale (@Locale) { ++$locales_test_number; $test_names{$locales_test_number} = 'Verify case insensitive matching works'; foreach my $x (keys %UPPER) { + if (! $is_utf8_locale) { my $y = lc $x; next unless uc $y eq $x; print "# UPPER $x lc $y ", @@ -861,9 +1185,23 @@ foreach $Locale (@Locale) { # With utf8 both will fail since the locale concept # of upper/lower does not work well in Unicode. 
push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + } + else { + use locale ':not_characters'; + my $y = lc $x; + next unless uc $y eq $x; + print "# UPPER $x lc $y ", + $x =~ /$y/i ? 1 : 0, " ", + $y =~ /$x/i ? 1 : 0, "\n" if 0; + + # Here, we can fully test things, unlike plain 'use locale', + # because this form does work well with Unicode + push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; + } } foreach my $x (keys %lower) { + if (! $is_utf8_locale) { my $y = uc $x; next unless lc $y eq $x; print "# lower $x uc $y ", @@ -876,6 +1214,16 @@ foreach $Locale (@Locale) { # With utf8 both will fail since the locale concept # of upper/lower does not work well in Unicode. push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + } + else { + use locale ':not_characters'; + my $y = uc $x; + next unless lc $y eq $x; + print "# lower $x uc $y ", + $x =~ /$y/i ? 1 : 0, " ", + $y =~ /$x/i ? 1 : 0, "\n" if 0; + push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; + } } tryneoalpha($Locale, $locales_test_number, @f == 0); if (@f) { @@ -1021,10 +1369,14 @@ setlocale(LC_ALL, "C"); $ascii_case_change_delta = +32; $above_latin1_case_change_delta = +1; } + foreach my $is_utf8_locale (0 .. 1) { foreach my $j (0 .. $#list) { my $char = $list[$j]; utf8::upgrade($char); - my $should_be = ($j == $#list) + my $should_be; + my $changed; + if (! $is_utf8_locale) { + $should_be = ($j == $#list) ? chr(ord($char) + $above_latin1_case_change_delta) : (length $char == 0 || ord($char) > 127) ? $char @@ -1032,7 +1384,7 @@ setlocale(LC_ALL, "C"); # This monstrosity is in order to avoid using an eval, which might # perturb the results - my $changed = ($function eq "uc") + $changed = ($function eq "uc") ? uc($char) : ($function eq "ucfirst") ? ucfirst($char) @@ -1041,15 +1393,43 @@ setlocale(LC_ALL, "C"); : ($function eq "lcfirst") ? 
lcfirst($char) : die("Unexpected function \"$function\""); - ok($changed eq $should_be, "$function(\"$char\") in C locale should be \"$should_be\", got \"$changed\""); + } + else { + { + no locale; + + # For utf8-locales the case changing functions should work + # just like they do outside of locale. Can use eval here + # because not testing it when not in locale. + $should_be = eval "$function('$char')"; + die "Unexpected eval error $@ from 'eval \"$function('$char')\"'" if $@; + + } + use locale ':not_characters'; + $changed = ($function eq "uc") + ? uc($char) + : ($function eq "ucfirst") + ? ucfirst($char) + : ($function eq "lc") + ? lc($char) + : ($function eq "lcfirst") + ? lcfirst($char) + : die("Unexpected function \"$function\""); + } + ok($changed eq $should_be, "$function(\"$char\") in C locale " + . (($is_utf8_locale) + ? "(use locale ':not_characters')" + : "(use locale)") + . " should be \"$should_be\", got \"$changed\""); # Tainting shouldn't happen for empty strings, or those characters # above 255. - (length($char) > 0 && ord($char) < 256) + (! $is_utf8_locale && length($char) > 0 && ord($char) < 256) ? check_taint($changed) : check_taint_not($changed); } } + } } print "1..$test_num\n"; @@ -533,7 +533,7 @@ Perl_grok_numeric_radix(pTHX_ const char **sp, const char *send) PERL_ARGS_ASSERT_GROK_NUMERIC_RADIX; - if (PL_numeric_radix_sv && IN_LOCALE) { + if (PL_numeric_radix_sv && IN_SOME_LOCALE_FORM) { STRLEN len; const char * const radix = SvPV(PL_numeric_radix_sv, len); if (*sp + len <= send && memEQ(*sp, radix, len)) { @@ -847,7 +847,7 @@ Perl_my_atof(pTHX_ const char* s) PERL_ARGS_ASSERT_MY_ATOF; - if (PL_numeric_local && IN_LOCALE) { + if (PL_numeric_local && IN_SOME_LOCALE_FORM) { NV y; /* Scan the number twice; once using locale and once without; @@ -4102,7 +4102,10 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags) if (IN_LOCALE_COMPILETIME) { set_regex_charset(&(pmop->op_pmflags), REGEX_LOCALE_CHARSET); } - else if ((! 
(PL_hints & HINT_BYTES)) && (PL_hints & HINT_UNI_8_BIT)) { + else if ((! (PL_hints & HINT_BYTES)) + /* Both UNI_8_BIT and locale :not_characters imply Unicode */ + && (PL_hints & (HINT_UNI_8_BIT|HINT_LOCALE_NOT_CHARS))) + { set_regex_charset(&(pmop->op_pmflags), REGEX_UNICODE_CHARSET); } if (PL_hints & HINT_RE_FLAGS) { @@ -4824,6 +4824,7 @@ typedef enum { #define HINT_STRICT_REFS 0x00000002 /* strict pragma */ #define HINT_LOCALE 0x00000004 /* locale pragma */ #define HINT_BYTES 0x00000008 /* bytes pragma */ +#define HINT_LOCALE_NOT_CHARS 0x00000010 /* locale ':not_characters' pragma */ /* Note: 20,40,80 used for NATIVE_HINTS */ /* currently defined by vms/vmsish.h */ @@ -5264,11 +5265,23 @@ typedef struct am_table_short AMTS; #define SET_NUMERIC_LOCAL() \ set_numeric_local(); +/* Returns non-zero If the plain locale pragma without a parameter is in effect + */ #define IN_LOCALE_RUNTIME (CopHINTS_get(PL_curcop) & HINT_LOCALE) + +/* Returns non-zero If either form of the locale pragma is in effect */ +#define IN_SOME_LOCALE_FORM_RUNTIME \ + (CopHINTS_get(PL_curcop) & (HINT_LOCALE|HINT_LOCALE_NOT_CHARS)) + #define IN_LOCALE_COMPILETIME (PL_hints & HINT_LOCALE) +#define IN_SOME_LOCALE_FORM_COMPILETIME \ + (PL_hints & (HINT_LOCALE|HINT_LOCALE_NOT_CHARS)) #define IN_LOCALE \ (IN_PERL_COMPILETIME ? IN_LOCALE_COMPILETIME : IN_LOCALE_RUNTIME) +#define IN_SOME_LOCALE_FORM \ + (IN_PERL_COMPILETIME ? IN_SOME_LOCALE_FORM_COMPILETIME \ + : IN_SOME_LOCALE_FORM_RUNTIME) #define STORE_NUMERIC_LOCAL_SET_STANDARD() \ bool was_local = PL_numeric_local && IN_LOCALE; \ diff --git a/pod/perldelta.pod b/pod/perldelta.pod index ada04beddf..fd1a358d9a 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -28,6 +28,23 @@ here, but most should go in the L</Performance Enhancements> section. 
[ List each enhancement as a =head2 entry ] +=head2 Improved ability to mix locales and Unicode, including UTF-8 locales + +An optional parameter has been added to C<use locale> + + use locale ':not_characters'; + +which tells Perl to use all but the C<LC_CTYPE> and C<LC_COLLATE> +portions of the current locale. Instead, the character set is assumed +to be Unicode. This allows locales and Unicode to be seamlessly mixed, +including the increasingly frequent UTF-8 locales. When using this +hybrid form of locales, the C<:locale> layer to the L<open> pragma can +be used to interface with the file system, and there are CPAN modules +available for ARGV and environment variable conversions. + +Full details are in L<perllocale>. + + =head1 Security XXX Any security-related notices go here. In particular, any security diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 56c74521e7..a39ccfd357 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -2997,7 +2997,7 @@ respectively. =back -=item Otherwise, if C<use locale> is in effect +=item Otherwise, if C<use locale> (but not C<use locale ':not_characters'>) is in effect: Respects current LC_CTYPE locale for code points < 256; and uses Unicode semantics for the remaining code points (this last can only happen if @@ -3013,11 +3013,11 @@ exists in the locale, much less what code point it is. Perl returns the input character unchanged, for all instances (and there aren't many) where the 255/256 boundary would otherwise be crossed. -=item Otherwise, If EXPR has the UTF8 flag set +=item Otherwise, If EXPR has the UTF8 flag set: Unicode semantics are used for the case change. -=item Otherwise, if C<use feature 'unicode_strings'> is in effect: +=item Otherwise, if C<use feature 'unicode_strings'> or C<use locale ':not_characters'> is in effect: Unicode semantics are used for the case change. @@ -4804,7 +4804,8 @@ list will be interpreted as the C<printf> format. 
See L<sprintf|/sprintf FORMAT, LIST> for an explanation of the format argument. If you omit the LIST, C<$_> is used; to use FILEHANDLE without a LIST, you must use a real filehandle like -C<FH>, not an indirect one like C<$fh>. If C<use locale> is in effect and +C<FH>, not an indirect one like C<$fh>. If C<use locale> (including +C<use locale ':not_characters'>) is in effect and POSIX::setlocale() has been called, the character used for the decimal separator in formatted floating-point numbers is affected by the LC_NUMERIC locale setting. See L<perllocale> and L<POSIX>. @@ -6013,7 +6014,8 @@ be modified. You also cannot exit out of the sort block or subroutine using any of the loop control operators described in L<perlsyn> or with C<goto>. -When C<use locale> is in effect, C<sort LIST> sorts LIST according to the +When C<use locale> (but not C<use locale ':not_characters'>) is in +effect, C<sort LIST> sorts LIST according to the current collation locale. See L<perllocale>. sort() returns aliases into the original list, much as a for loop's index @@ -6710,7 +6712,8 @@ index, the C<$> may need escaping: =back -If C<use locale> is in effect and POSIX::setlocale() has been called, +If C<use locale> (including C<use locale ':not_characters'>) is in effect +and POSIX::setlocale() has been called, the character used for the decimal separator in formatted floating-point numbers is affected by the LC_NUMERIC locale. See L<perllocale> and L<POSIX>. diff --git a/pod/perllocale.pod b/pod/perllocale.pod index 8549baf980..57d8a4fd0b 100644 --- a/pod/perllocale.pod +++ b/pod/perllocale.pod @@ -1,40 +1,101 @@ +=encoding utf8 + =head1 NAME perllocale - Perl locale handling (internationalization and localization) =head1 DESCRIPTION -Locales these days have been mostly been supplanted by Unicode, but Perl -continues to support them. See L</Unicode and UTF-8> below. 
- -Perl supports language-specific notions of data such as "is this -a letter", "what is the uppercase equivalent of this letter", and -"which of these letters comes first". These are important issues, -especially for languages other than English--but also for English: it -would be naE<iuml>ve to imagine that C<A-Za-z> defines all the "letters" -needed to write correct English. Perl is also aware that some character other -than "." may be preferred as a decimal point, and that output date -representations may be language-specific. The process of making an -application take account of its users' preferences in such matters is -called B<internationalization> (often abbreviated as B<i18n>); telling -such an application about a particular set of preferences is known as -B<localization> (B<l10n>). - -Perl can understand language-specific data via the standardized (ISO C, -XPG4, POSIX 1.c) method called "the locale system". The locale system is -controlled per application using one pragma, one function call, and -several environment variables. - -B<NOTE>: This feature is new in Perl 5.004, and does not apply unless an -application specifically requests it--see L<Backward compatibility>. -The one exception is that write() now B<always> uses the current locale -- see L<"NOTES">. +In the beginning there was ASCII, the "American Standard Code for +Information Interchange", which works quite well for Americans with +their English alphabet and dollar-denominated currency. But it doesn't +work so well even for other English speakers, who may use different +currencies, such as the pound sterling (as the symbol for that currency +is not in ASCII); and it's hopelessly inadequate for many of the +thousands of the world's other languages. + +To address these deficiencies, the concept of locales was invented +(formally the ISO C, XPG4, POSIX 1.c "locale system"). And applications +were and are being written that use the locale mechanism. 
The process of +making such an application take account of its users' preferences in +these kinds of matters is called B<internationalization> (often +abbreviated as B<i18n>); telling such an application about a particular +set of preferences is known as B<localization> (B<l10n>). + +Perl was extended, starting in 5.004, to support the locale system. This +is controlled per application by using one pragma, one function call, +and several environment variables. + +Unfortunately, there are quite a few deficiencies with the design (and +often, the implementations) of locales, and their use for character sets +has mostly been supplanted by Unicode (see L<perlunitut> for an +introduction to that, and keep on reading here for how Unicode interacts +with locales in Perl). + +Perl continues to support the old locale system, and starting in 5.16, +provides a hybrid way to use the Unicode character set, along with the +other portions of locales that may not be so problematic. +(Unicode is also creating C<CLDR>, the "Common Locale Data Repository", +L<http://cldr.unicode.org/> which includes more types of information than +are available in the POSIX locale system. At the time of this writing, +there was no CPAN module that provides access to this XML-encoded data. +However, many of its locales have the POSIX-only data extracted, and are +available at L<http://unicode.org/Public/cldr/latest/>.) + +=head1 WHAT IS A LOCALE + +A locale is a set of data that describes various aspects of how various +communities in the world categorize their world. These categories are +broken down into the following types (some of which include a brief +note here): + +=over + +=item Category LC_NUMERIC: Numeric formatting + +This indicates how numbers should be formatted for human readability, +for example the character used as the decimal point. 
+ +=item Category LC_MONETARY: Formatting of monetary amounts + +=for comment +The nbsp below makes this look better + +E<160> + +=item Category LC_TIME: Date/Time formatting + +=for comment +The nbsp below makes this look better + +E<160> + +=item Category LC_MESSAGES: Error and other messages + +This for the most part is beyond the scope of Perl. + +=item Category LC_COLLATE: Collation + +This indicates the ordering of letters for comparison and sorting. +In Latin alphabets, for example, "b" generally follows "a". + +=item Category LC_CTYPE: Character Types + +This indicates, for example, if a character is an uppercase letter. + +=back + +More details on the categories are given below in L</LOCALE CATEGORIES>. + +Together, these categories go a long way towards being able to customize +a single program to run in many different locations. But there are +deficiencies, so keep reading. =head1 PREPARING TO USE LOCALES -If Perl applications are to understand and present your data -correctly according a locale of your choice, B<all> of the following -must be true: +Perl will not use locales unless specifically requested to (see L</NOTES> below +for the partial exception of C<write()>). But even if there is such a +request, B<all> of the following must be true for it to work properly: =over 4 @@ -74,7 +135,7 @@ appropriate, and B<at least one> of the following must be true: =item 1 -B<The locale-determining environment variables (see L<"ENVIRONMENT">) +B<The locale-determining environment variables (see L</"ENVIRONMENT">) must be correctly set up> at the time the application is started, either by yourself or by whomever set up your system account; or @@ -90,13 +151,25 @@ L<The setlocale function>. =head2 The use locale pragma By default, Perl ignores the current locale. The S<C<use locale>> -pragma tells Perl to use the -current locale for some operations (C</l> for just pattern matching). +pragma tells Perl to use the current locale for some operations. 
+Starting in 5.16, there is an optional parameter to this pragma: + + use locale ':not_characters'; + +This parameter allows better mixing of locales and Unicode, and is +described fully in L</Unicode and UTF-8>, but briefly, it tells Perl to +not use the character portions of the locale definition, that is +the C<LC_CTYPE> and C<LC_COLLATE> categories. Instead it will use the +native (extended by Unicode) character set. When using this parameter, +you are responsible for getting the external character set translated +into the native/Unicode one (which it already will be if it is one of +the increasingly popular UTF-8 locales). There are convenient ways of +doing this, as described in L</Unicode and UTF-8>. The current locale is set at execution time by L<setlocale()|/The setlocale function> described below. If that function hasn't yet been called in the course of the program's execution, the -current locale is that which was determined by the L<"ENVIRONMENT"> in +current locale is that which was determined by the L</"ENVIRONMENT"> in effect at the start of the program, except that C<L<LC_NUMERIC|/Category LC_NUMERIC: Numeric Formatting>> is always initialized to the C locale (mentioned under L<Finding locales>). @@ -107,6 +180,31 @@ The operations that are affected by locale are: =over 4 +=item B<Under C<use locale ':not_characters';>> + +=over 4 + +=item * + +B<Format declarations> (format()) use C<LC_NUMERIC> + +=item * + +B<The POSIX date formatting function> (strftime()) uses C<LC_TIME>. + +=back + +=for comment +The nbsp below makes this look better + +E<160> + +=item B<Under just plain C<use locale;>> + +The above operations are affected, as well as the following: + +=over 4 + =item * B<The comparison operators> (C<lt>, C<le>, C<cmp>, C<ge>, and C<gt>) and @@ -129,21 +227,15 @@ L<Category LC_COLLATE: Collation>. 
B<Regular expressions and case-modification functions> (uc(), lc(), ucfirst(), and lcfirst()) use C<LC_CTYPE> -=item * - -B<Format declarations> (format()) use C<LC_NUMERIC> - -=item * - -B<The POSIX date formatting function> (strftime()) uses C<LC_TIME>. - =back -C<LC_COLLATE>, C<LC_CTYPE>, and so on, are discussed further in -L<LOCALE CATEGORIES>. +=back The default behavior is restored with the S<C<no locale>> pragma, or upon reaching the end of the block enclosing C<use locale>. +Note that C<use locale> and C<use locale ':not_characters'> may be +nested, and that what is in effect within an inner scope will revert to +the outer scope's rules at the end of the inner scope. The string result of any operation that uses locale information is tainted, as it is possible for a locale to be @@ -178,7 +270,7 @@ POSIX::setlocale() function: The first argument of setlocale() gives the B<category>, the second the B<locale>. The category tells in what aspect of data processing you want to apply locale-specific rules. Category names are discussed in -L<LOCALE CATEGORIES> and L<"ENVIRONMENT">. The locale is the name of a +L</LOCALE CATEGORIES> and L</"ENVIRONMENT">. The locale is the name of a collection of customization information corresponding to a particular combination of language, country or territory, and codeset. Read on for hints on the naming of locales: not all systems name locales as in the @@ -212,6 +304,9 @@ be noticed, depending on your system's C library. If the second argument does not correspond to a valid locale, the locale for the category is not changed, and the function returns I<undef>. +Note that Perl ignores the current C<LC_CTYPE> and C<LC_COLLATE> locales +within the scope of a C<use locale ':not_characters'>. + For further information about the categories, consult setlocale(3). =head2 Finding locales @@ -482,7 +577,8 @@ basic category at a time. See L<"ENVIRONMENT"> for a discussion of these. 
=head2 Category LC_COLLATE: Collation -In the scope of S<C<use locale>>, Perl looks to the C<LC_COLLATE> +In the scope of S<C<use locale>> (but not a +C<use locale ':not_characters'>), Perl looks to the C<LC_COLLATE> environment variable to determine the application's notions on collation (ordering) of characters. For example, "b" follows "a" in Latin alphabets, but where do "E<aacute>" and "E<aring>" belong? And while @@ -562,7 +658,8 @@ results, and so always obey the current C<LC_COLLATE> locale. =head2 Category LC_CTYPE: Character Types -In the scope of S<C<use locale>>, Perl obeys the C<LC_CTYPE> locale +In the scope of S<C<use locale>> (but not a +C<use locale ':not_characters'>), Perl obeys the C<LC_CTYPE> locale setting. This controls the application's notion of which characters are alphabetic. This affects Perl's C<\w> regular expression metanotation, which stands for alphanumeric characters--that is, alphabetic, @@ -745,7 +842,7 @@ Scalar true/false (or less/equal/greater) result is never tainted. B<Case-mapping interpolation> (with C<\l>, C<\L>, C<\u> or C<\U>) Result string containing interpolated material is tainted if -C<use locale> is in effect. +C<use locale> (but not S<C<use locale ':not_characters'>>) is in effect. =item * @@ -754,7 +851,8 @@ B<Matching operator> (C<m//>): Scalar true/false result never tainted. Subpatterns, either delivered as a list-context result or as $1 etc. -are tainted if C<use locale> is in effect, and the subpattern regular +are tainted if C<use locale> (but not S<C<use locale ':not_characters'>>) +is in effect, and the subpattern regular expression contains C<\w> (to match an alphanumeric character), C<\W> (non-alphanumeric character), C<\s> (whitespace character), or C<\S> (non whitespace character). The matched-pattern variable, $&, $` @@ -767,8 +865,9 @@ C<\W>, C<\s>, or C<\S>. B<Substitution operator> (C<s///>): Has the same behavior as the match operator. 
Also, the left -operand of C<=~> becomes tainted when C<use locale> in effect -if modified as a result of a substitution based on a regular +operand of C<=~> becomes tainted when C<use locale> +(but not S<C<use locale ':not_characters'>>) is in effect if modified as +a result of a substitution based on a regular expression match involving C<\w>, C<\W>, C<\s>, or C<\S>; or of case-mapping with C<\l>, C<\L>,C<\u> or C<\U>. @@ -784,7 +883,8 @@ effect. B<Case-mapping functions> (lc(), lcfirst(), uc(), ucfirst()): -Results are tainted if C<use locale> is in effect. +Results are tainted if C<use locale> (but not +S<C<use locale ':not_characters'>>) is in effect. =item * @@ -998,6 +1098,11 @@ is called. =head2 Freely available locale definitions +The Unicode CLDR project extracts the POSIX portion of many of its +locales, available at + + http://unicode.org/Public/cldr/latest/ + There is a large collection of locale definitions at: http://std.dkuug.dk/i18n/WG15-collection/locales/ @@ -1028,10 +1133,44 @@ into bankers, bikers, gamers, and so on. =head1 Unicode and UTF-8 The support of Unicode is new starting from Perl version 5.6, and more fully -implemented in version 5.8 and later. See L<perluniintro>. Perl tries to -work with both Unicode and locales--but of course, there are problems. - -Perl does not handle multi-byte locales, such as have been used for various +implemented in version 5.8 and later. See L<perluniintro>. It is +strongly recommended that when combining Unicode and locale (starting in +5.16), you use + + use locale ':not_characters'; + +When this form of the pragma is used, only the non-character portions of +locales are used by Perl, for example C<LC_NUMERIC>. Perl assumes that +you have translated all the characters it is to operate on into Unicode +(actually the platform's native character set (ASCII or EBCDIC) plus +Unicode). 
For data in files, this can conveniently be done by also +specifying + + use open ':locale'; + +This pragma arranges for all inputs from files to be translated into +Unicode from the current locale as specified in the environment (see +L</ENVIRONMENT>), and all outputs to files to be translated back +into the locale. (See L<open>). On a per-filehandle basis, you can +instead use the L<PerlIO::locale> module, or the L<Encode::Locale> +module, both available from CPAN. The latter module also has methods to +ease the handling of C<ARGV> and environment variables, and can be used +on individual strings. Also, if you know that all your locales will be +UTF-8, as many are these days, you can use the L<B<-C>|perlrun/-C> +command line switch. + +This form of the pragma allows essentially seamless handling of locales +with Unicode. The collation order will be Unicode's. It is strongly +recommended that when you need to order and sort strings that you use +the standard module L<Unicode::Collate> which gives much better results +in many instances than you can get with the old-style locale handling. + +For pre-5.16 Perls, or if you use the locale pragma without the +C<:not_characters> parameter, Perl tries to work with both Unicode and +locales--but there are problems. + +Perl does not handle multi-byte locales in this case, such as have been +used for various Asian languages, such as Big5 or Shift JIS. However, the increasingly common multi-byte UTF-8 locales, if properly implemented, may work reasonably well (depending on your C library implementation) in this @@ -1045,14 +1184,13 @@ library may not work for UTF-8 locales with those functions, instead only working under the newer wide library functions like C<iswalnum()>. Perl generally takes the tack to use locale rules on code points that can fit -in a single byte, and Unicode rules for those that can't (though this wasn't -uniformly applied prior to Perl 5.14). This prevents many problems in locales -that aren't UTF-8. 
Suppose the locale is ISO8859-7, Greek. The character at -0xD7 there is a capital Chi. But in the ISO8859-1 locale, Latin1, it is a -multiplication sign. The POSIX regular expression character class -C<[[:alpha:]]> will magically match 0xD7 in the Greek locale but not in the -Latin one, even if the string is encoded in UTF-8, which would normally imply -Unicode semantics. (The "U" in UTF-8 stands for Unicode.) +in a single byte, and Unicode rules for those that can't (though this +isn't uniformly applied, see the note at the end of this section). This +prevents many problems in locales that aren't UTF-8. Suppose the locale +is ISO8859-7, Greek. The character at 0xD7 there is a capital Chi. But +in the ISO8859-1 locale, Latin1, it is a multiplication sign. The POSIX +regular expression character class C<[[:alpha:]]> will magically match +0xD7 in the Greek locale but not in the Latin one. However, there are places where this breaks down. Certain constructs are for Unicode only, such as C<\p{Alpha}>. They assume that 0xD7 always has its @@ -1060,11 +1198,20 @@ Unicode meaning (or the equivalent on EBCDIC platforms). Since Latin1 is a subset of Unicode and 0xD7 is the multiplication sign in both Latin1 and Unicode, C<\p{Alpha}> will never match it, regardless of locale. A similar issue occurs with C<\N{...}>. It is therefore a bad idea to use C<\p{}> or -C<\N{}> under C<use locale>--I<unless> you can guarantee that the locale will -be a ISO8859-1 or UTF-8 one. Use POSIX character classes instead. - - -The same problem ensues if you enable automatic UTF-8-ification of your +C<\N{}> under plain C<use locale>--I<unless> you can guarantee that the +locale will be an ISO8859-1 one. Use POSIX character classes instead. + +Another problem with this approach is that operations that cross the +single byte/multiple byte boundary are not well-defined, and so are +disallowed. (This boundary is between the codepoints at 255/256.) 
+For example, lower casing LATIN CAPITAL LETTER Y WITH DIAERESIS (U+0178) +should return LATIN SMALL LETTER Y WITH DIAERESIS (U+00FF). But in the +Greek locale, for example, there is no character at 0xFF, and Perl +has no way of knowing what the character at 0xFF is really supposed to +represent. Thus it disallows the operation. In this mode, the +lowercase of U+0178 is itself. + +The same problems ensue if you enable automatic UTF-8-ification of your standard file handles, default C<open()> layer, and C<@ARGV> on non-ISO8859-1, non-UTF-8 locales (by using either the B<-C> command line switch or the C<PERL_UNICODE> environment variable; see L<perlrun>). @@ -1072,19 +1219,37 @@ Things are read in as UTF-8, which would normally imply a Unicode interpretation, but the presence of a locale causes them to be interpreted in that locale instead. For example, a 0xD7 code point in the Unicode input, which should mean the multiplication sign, won't be interpreted by -Perl that way under the Greek locale. Again, this is not a problem +Perl that way under the Greek locale. This is not a problem I<provided> you make certain that all locales will always and only be either -an ISO8859-1 or a UTF-8 locale. +an ISO8859-1, or, if you don't have a deficient C library, a UTF-8 locale. Vendor locales are notoriously buggy, and it is difficult for Perl to test its locale-handling code because this interacts with code that Perl has no control over; therefore the locale-handling code in Perl may be buggy as -well. But if you I<do> have locales that work, using them may be -worthwhile for certain specific purposes, as long as you keep in mind the -gotchas already mentioned. For example, collation runs faster under -locales than under L<Unicode::Collate> (albeit with less flexibility), and -you gain access to such things as the local currency symbol and the names -of the months and days of the week. +well. 
(However, the Unicode-supplied locales should be better, and +there is a feedback mechanism to correct any problems. See +L</Freely available locale definitions>.) + +If you have Perl 5.16, the problems mentioned above go away if you use +the C<:not_characters> parameter to the locale pragma (except for vendor +bugs in the non-character portions). If you don't have 5.16, and you +I<do> have locales that work, using them may be worthwhile for certain +specific purposes, as long as you keep in mind the gotchas already +mentioned. For example, if the collation for your locales works, it +runs faster under locales than under L<Unicode::Collate>; and you gain +access to such things as the local currency symbol and the names of the +months and days of the week. (But to hammer home the point, in 5.16, +you get this access without the downsides of locales by using the +C<:not_characters> form of the pragma.) + +Note: The policy of using locale rules for code points that can fit in a +byte, and Unicode rules for those that can't is not uniformly applied. +Pre-5.12, it was somewhat haphazard; in 5.12 it was applied fairly +consistently to regular expression matching except for bracketed +character classes; in 5.14 it was extended to all regex matches; and in +5.16 to the casing operations such as C<"\L"> and C<uc()>. For +collation, in all releases, the system's C<strxfrm()> function is called, +and whatever it does is what you get. =head1 BUGS diff --git a/pod/perlop.pod b/pod/perlop.pod index 607f631d45..369164af95 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -484,7 +484,8 @@ is described in the next section. X<~~> "lt", "le", "ge", "gt" and "cmp" use the collation (sort) order specified -by the current locale if a legacy C<use locale> is in effect. See +by the current locale if a legacy C<use locale> (but not +C<use locale ':not_characters'>) is in effect. See L<perllocale>. Do not mix these with Unicode, only with legacy binary encodings. 
The standard L<Unicode::Collate> and L<Unicode::Collate::Locale> modules offer much more powerful solutions to @@ -1509,7 +1510,8 @@ C<\E> for each. For example: say"This \Qquoting \ubusiness \Uhere isn't quite\E done yet,\E is it?"; This quoting\ Business\ HERE\ ISN\'T\ QUITE\ done\ yet\, is it? -If C<use locale> is in effect, the case map used by C<\l>, C<\L>, +If C<use locale> is in effect (but not C<use locale ':not_characters'>), +the case map used by C<\l>, C<\L>, C<\u>, and C<\U> is taken from the current locale. See L<perllocale>. If Unicode (for example, C<\N{}> or code points of 0x100 or beyond) is being used, the case map used by C<\l>, C<\L>, C<\u>, and diff --git a/pod/perlre.pod b/pod/perlre.pod index a536525096..1a0581bc97 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -292,7 +292,8 @@ the C<LATIN CAPITAL LETTER SHARP S> will match any of C<SS>, C<Ss>, C<sS>, and C<ss>, otherwise not. This modifier may be specified to be the default by C<use feature -'unicode_strings> or C<L<use 5.012|perlfunc/use VERSION>> (or higher), +'unicode_strings'>, C<use locale ':not_characters'>, or +C<L<use 5.012|perlfunc/use VERSION>> (or higher), but see L</Which character set modifier is in effect?>. X</u> @@ -429,10 +430,13 @@ within its scope. This pragma has precedence over the other pragmas listed below that also change the defaults. Otherwise, C<L<use locale|perllocale>> sets the default modifier to C</l>; -and C<L<use feature 'unicode_strings|feature>> or +and C<L<use feature 'unicode_strings'|feature>>, or C<L<use 5.012|perlfunc/use VERSION>> (or higher) set the default to C</u> when not in the same scope as either C<L<use locale|perllocale>> -or C<L<use bytes|bytes>>. Unlike the mechanisms mentioned above, these +or C<L<use bytes|bytes>>. +(C<L<use locale ':not_characters'|perllocale/Unicode and UTF-8>> also +sets the default to C</u>, overriding any plain C<use locale>.) 
+Unlike the mechanisms mentioned above, these affect operations besides regular expressions pattern matching, and so give more consistent results with other operators, including using C<\U>, C<\l>, etc. in substitution replacements. diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 1e1dc535d1..3a9ddd6dab 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -90,10 +90,12 @@ semantics. For operations where this determination cannot be made without additional information from the user, Perl decides in favor of compatibility and chooses to use byte semantics. -When C<use locale> is in effect (which overrides -C<use feature 'unicode_strings'> in the same scope), Perl uses the -semantics associated -with the current locale. Otherwise, Perl uses the platform's native +When C<use locale> (but not C<use locale ':not_characters'>) is in +effect, Perl uses the semantics associated with the current locale. +(C<use locale> overrides C<use feature 'unicode_strings'> in the same scope; +while C<use locale ':not_characters'> effectively also selects +C<use feature 'unicode_strings'> in its scope; see L<perllocale>.) +Otherwise, Perl uses the platform's native byte semantics for characters whose code points are less than 256, and Unicode semantics for those greater than 255. On EBCDIC platforms, this is almost seamless, as the EBCDIC code pages that Perl handles are diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index edd1ab48ce..63e2119ad2 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -801,17 +801,19 @@ L<http://www.cl.cam.ac.uk/~mgk25/unicode.html> How Does Unicode Work With Traditional Locales? -Perl tries to keep the two separated. Code points that are above 255 -are treated as Unicode; those below 256, generally as locale. This -works reasonably well except in some case-insensitive regular expression -pattern matches that in Unicode would cross the 255/256 boundary. These -are disallowed. 
-Also, the C<\p{}> and C<\N{}> constructs silently assume Unicode values -even for code points below 256. -See also L<perlrun> for the -description of the C<-C> switch and its environment counterpart, -C<$ENV{PERL_UNICODE}> to see how to enable various Unicode features, -for example by using locale settings. +Starting in Perl 5.16, you can specify + + use locale ':not_characters'; + +to get Perl to work well with traditional locales. The catch is that you +have to translate from the locale character set to/from Unicode +yourself. See L</Unicode IE<sol>O> above for how to + + use open ':locale'; + +to accomplish this, but full details are in L<perllocale/Unicode and +UTF-8>, including gotchas that happen if you don't specify +C<:not_characters>. =back diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat index 2e517c06ef..7ef0eeb444 100644 --- a/t/porting/known_pod_issues.dat +++ b/t/porting/known_pod_issues.dat @@ -39,6 +39,7 @@ Devel::NYTProf Devel::PPPort Devel::SawAmpersand dirname(1) +Encode::Locale Exporter::Easy ExtUtils::Constant::ProxySubs fetch(1) @@ -81,6 +82,7 @@ passwd(1) perl(1) Perl::Unsafe::Signals perlbug(1) +PerlIO::locale PerlIO::via::Base64 PerlIO::via::StripHTML perllexwarn(1) @@ -251,8 +251,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES) -#define IN_UNI_8_BIT ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT) \ - && ! IN_LOCALE_RUNTIME && ! IN_BYTES) +#define IN_UNI_8_BIT \ + (CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT|HINT_LOCALE_NOT_CHARS)) #define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */ |