diff options
-rw-r--r-- | MANIFEST | 2 | ||||
-rw-r--r-- | lib/utf8_heavy.pl | 18 | ||||
-rw-r--r-- | pod/perldelta.pod | 11 | ||||
-rw-r--r-- | pod/perldiag.pod | 9 | ||||
-rw-r--r-- | pod/perlfunc.pod | 8 | ||||
-rw-r--r-- | pod/perlunicode.pod | 207 | ||||
-rw-r--r-- | t/op/lc_user.t | 33 | ||||
-rw-r--r-- | t/op/turkish.t | 100 |
8 files changed, 20 insertions, 368 deletions
@@ -4970,7 +4970,6 @@ t/op/int.t See if int works t/op/join.t See if join works t/op/kill0.t See if kill(0, $pid) works t/op/lc.t See if lc, uc, lcfirst, ucfirst, quotemeta work -t/op/lc_user.t See if user-defined lc et alia work t/op/leaky-magic.t See whether vars' magic leaks into packages t/op/length.t See if length works t/op/lex_assign.t See if ops involving lexicals or pad temps work @@ -5056,7 +5055,6 @@ t/op/tie.t See if tie/untie functions work t/op/time_loop.t Test that very large values don't hang gmtime and localtime. t/op/time.t See if time functions work t/op/tr.t See if tr works -t/op/turkish.t See if we can implement Turkish casing t/op/undef.t See if undef works t/op/universal.t See if UNIVERSAL class works t/op/unlink.t See if unlink works diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index 3af914918b..4953127295 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -397,24 +397,6 @@ sub croak { require Carp; Carp::croak(@_) } print STDERR __LINE__, ": didn't find $property_and_table\n" if DEBUG; ## - ## See if it's a user-level "To". - ## - - my $caller0 = caller(0); - - if (defined $caller0 && $type =~ /^To(?:\w+)$/) { - my $map = $caller0 . "::" . $type; - - if (exists &{$map}) { - no strict 'refs'; - - $list = &{$map}; - warnings::warnif('deprecated', "User-defined case-mapping '$type' is deprecated"); - last GETFILE; - } - } - - ## ## Last attempt -- see if it's a standard "To" name ## (e.g. "ToLower") ToTitle is used by ucfirst(). ## The user-level way to access ToDigit() and ToFold() diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 98f8c207d7..7939c2587e 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -38,12 +38,13 @@ L</Selected Bug Fixes> section. =head1 Incompatible Changes -XXX For a release on a stable branch, this section aspires to be: +=head2 User-defined case-changing operations - There are no changes intentionally incompatible with 5.XXX.XXX - If any exist, they are bugs and reports are welcome. 
- -[ List each incompatible change as a =head2 entry ] +This feature was deprecated in Perl 5.14, and has now been removed. +The CPAN module L<Unicode::Casing> provides better functionality without +the drawbacks that this feature had, as are detailed in the 5.14 +documentation: +L<http://perldoc.perl.org/5.14.0/perlunicode.html#User-Defined-Case-Mappings-%28for-serious-hackers-only%29> =head2 XSUBs are now 'static' diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 8c9055f407..97f16a4b17 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -5331,15 +5331,6 @@ currently reserved for future use, as the exact behaviour has not been decided. (Simply returning the boolean opposite of the modified string is usually not particularly useful.) -=item User-defined case-mapping '%s' is deprecated - -(W deprecated) You defined a function, such as C<ToLower> that overrides -the standard case mapping, such as C<lc()> gives. This feature is being -deprecated due to its many issues, as documented in -L<perlunicode/User-Defined Case Mappings (for serious hackers only)>. -It is planned to remove this feature in Perl 5.16. A CPAN module -providing improved functionality is being prepared. - =item UTF-16 surrogate U+%X (W utf8, surrogate) You had a UTF-16 surrogate in a context where they are diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 04c6a05bf9..2beec4a95c 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -2910,10 +2910,7 @@ respectively. =item Otherwise, If EXPR has the UTF8 flag set -If the current package has a subroutine named C<ToLower>, it will be used to -change the case -(See L<perlunicode/"User-Defined Case Mappings (for serious hackers only)">.) -Otherwise Unicode semantics are used for the case change. +Unicode semantics are used for the case change. =item Otherwise, if C<use locale> is in effect @@ -2921,8 +2918,7 @@ Respects current LC_CTYPE locale. See L<perllocale>. 
=item Otherwise, if C<use feature 'unicode_strings'> is in effect: -Unicode semantics are used for the case change. Any subroutine named -C<ToLower> will be ignored. +Unicode semantics are used for the case change. =item Otherwise: diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 4779cc5dca..5e1ff36074 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -260,11 +260,12 @@ complement B<and> the full character-wide bit complement. =item * -You can define your own mappings to be used in C<lc()>, -C<lcfirst()>, C<uc()>, and C<ucfirst()> (or their double-quoted string inlined -versions such as C<\U>). See -L<User-Defined Case-Mappings|/"User-Defined Case Mappings (for serious hackers only)"> -for more details. +There is a CPAN module, L<Unicode::Casing>, which allows you to define +your own mappings to be used in C<lc()>, C<lcfirst()>, C<uc()>, and +C<ucfirst()> (or their double-quoted string inlined versions such as +C<\U>). (Prior to Perl 5.16, this functionality was partially provided +in the Perl core, but suffered from a number of insurmountable +drawbacks, so the CPAN module was written instead.) =back @@ -915,190 +916,12 @@ would be intersecting with nothing, resulting in an empty set. =head2 User-Defined Case Mappings (for serious hackers only) -B<This featured is deprecated and is scheduled to be removed in Perl -5.16.> -The CPAN module L<Unicode::Casing> provides better functionality -without the drawbacks described below. - -You can define your own mappings to be used in C<lc()>, -C<lcfirst()>, C<uc()>, and C<ucfirst()> (or their string-inlined versions, -C<\L>, C<\l>, C<\U>, and C<\u>). The mappings are currently only valid -on strings encoded in UTF-8, but see below for a partial workaround for -this restriction. - -The principle is similar to that of user-defined character -properties: define subroutines that do the mappings. 
-C<ToLower> is used for C<lc()>, C<\L>, C<lcfirst()>, and C<\l>; C<ToTitle> for -C<ucfirst()> and C<\u>; and C<ToUpper> for C<uc()> and C<\U>. - -C<ToUpper()> should look something like this: - - sub ToUpper { - return <<END; - 0061\t007A\t0041 - 0101\t\t0100 - END - } - -This sample C<ToUpper()> has the effect of mapping "a-z" to "A-Z", 0x101 -to 0x100, and all other characters map to themselves. The first -returned line means to map the code point at 0x61 ("a") to 0x41 ("A"), -the code point at 0x62 ("b") to 0x42 ("B"), ..., 0x7A ("z") to 0x5A -("Z"). The second line maps just the code point 0x101 to 0x100. Since -there are no other mappings defined, all other code points map to -themselves. - -This mechanism is not well behaved as far as affecting other packages -and scopes. All non-threaded programs have exactly one uppercasing -behavior, one lowercasing behavior, and one titlecasing behavior in -effect for utf8-encoded strings for the duration of the program. Each -of these behaviors is irrevocably determined the first time the -corresponding function is called to change a utf8-encoded string's case. -If a corresponding C<To-> function has been defined in the package that -makes that first call, the mapping defined by that function will be the -mapping used for the duration of the program's execution across all -packages and scopes. If no corresponding C<To-> function has been -defined in that package, the standard official mapping will be used for -all packages and scopes, and any corresponding C<To-> function anywhere -will be ignored. Threaded programs have similar behavior. If the -program's casing behavior has been decided at the time of a thread's -creation, the thread will inherit that behavior. But, if the behavior -hasn't been decided, the thread gets to decide for itself, and its -decision does not affect other threads nor its creator. 
- -As shown by the example above, you have to furnish a complete mapping; -you can't just override a couple of characters and leave the rest -unchanged. You can find all the official mappings in the directory -C<$Config{privlib}>F</unicore/To/>. The mapping data is returned as the -here-document. The C<utf8::ToSpecI<Foo>> hashes in those files are special -exception mappings derived from -C<$Config{privlib}>F</unicore/SpecialCasing.txt>. (The "Digit" and -"Fold" mappings that one can see in the directory are not directly -user-accessible, one can use either the L<Unicode::UCD> module, or just match -case-insensitively, which is what uses the "Fold" mapping. Neither are user -overridable.) - -If you have many mappings to change, you can take the official mapping data, -change by hand the affected code points, and place the whole thing into your -subroutine. But this will only be valid on Perls that use the same Unicode -version. Another option would be to have your subroutine read the official -mapping files and overwrite the affected code points. - -If you have only a few mappings to change, starting in 5.14 you can use the -following trick, here illustrated for Turkish. - - use Config; - use charnames ":full"; - - sub ToUpper { - my $official = do "$Config{privlib}/unicore/To/Upper.pl"; - $utf8::ToSpecUpper{'i'} = - "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"; - return $official; - } - -This takes the official mappings and overrides just one, for "LATIN SMALL -LETTER I". The keys to the hash must be the bytes that form the UTF-8 -(on EBCDIC platforms, UTF-EBCDIC) of the character, as illustrated by -the inverse function. - - sub ToLower { - my $official = do $lower; - $utf8::ToSpecLower{"\xc4\xb0"} = "i"; - return $official; - } - -This example is for an ASCII platform, and C<\xc4\xb0> is the string of -bytes that together form the UTF-8 that represents C<\N{LATIN CAPITAL -LETTER I WITH DOT ABOVE}>, C<U+0130>. 
You can avoid having to figure out -these bytes, and at the same time make it work on all platforms by -instead writing: - - sub ToLower { - my $official = do $lower; - my $sequence = "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"; - utf8::encode($sequence); - $utf8::ToSpecLower{$sequence} = "i"; - return $official; - } - -This works because C<utf8::encode()> takes the single character and -converts it to the sequence of bytes that constitute it. Note that we took -advantage of the fact that C<"i"> is the same in UTF-8 or UTF_EBCIDIC as not; -otherwise we would have had to write - - $utf8::ToSpecLower{$sequence} = "\N{LATIN SMALL LETTER I}"; - -in the ToLower example, and in the ToUpper example, use - - my $sequence = "\N{LATIN SMALL LETTER I}"; - utf8::encode($sequence); - -A big caveat to the above trick and to this whole mechanism in general, -is that they work only on strings encoded in UTF-8. You can partially -get around this by using C<use subs>. (But better to just convert to -use L<Unicode::Casing>.) For example: -(The trick illustrated here does work in earlier releases, but only if all the -characters you want to override have ordinal values of 256 or higher, or -if you use the other tricks given just below.) - -The mappings are in effect only for the package they are defined in, and only -on scalars that have been marked as having Unicode characters, for example by -using C<utf8::upgrade()>. Although probably not advisable, you can -cause the mappings to be used globally by importing into C<CORE::GLOBAL> -(see L<CORE>). - -You can partially get around the restriction that the source strings -must be in utf8 by using C<use subs> (or by importing into C<CORE::GLOBAL>) by: - - use subs qw(uc ucfirst lc lcfirst); - - sub uc($) { - my $string = shift; - utf8::upgrade($string); - return CORE::uc($string); - } - - sub lc($) { - my $string = shift; - utf8::upgrade($string); - - # Unless an I is before a dot_above, it turns into a dotless i. 
- # (The character class with the combining classes matches non-above - # marks following the I. Any number of these may be between the - # 'I'and the dot_above, and the dot_above will still apply to the - # 'I'. - use charnames ":full"; - $string =~ - s/I - (?! [^\p{ccc=0}\p{ccc=Above}]* \N{COMBINING DOT ABOVE} ) - /\N{LATIN SMALL LETTER DOTLESS I}/gx; - - # But when the I is followed by a dot_above, remove the - # dot_above so the end result will be i. - $string =~ s/I - ([^\p{ccc=0}\p{ccc=Above}]* ) - \N{COMBINING DOT ABOVE} - /i$1/gx; - return CORE::lc($string); - } - -These examples (also for Turkish) make sure the input is in UTF-8, and then -call the corresponding official function, which will use the C<ToUpper()> and -C<ToLower()> functions you have defined. -(For Turkish, there are other required functions: C<ucfirst>, C<lcfirst>, -and C<ToTitle>. These are very similar to the ones given above.) - -The reason this is only a partial fix is that it doesn't affect the C<\l>, -C<\L>, C<\u>, and C<\U> case-change operations in regular expressions, -which still require the source to be encoded in utf8 (see L</The "Unicode -Bug">). (Again, use L<Unicode::Casing> instead.) - -The C<lc()> example shows how you can add context-dependent casing. Note -that context-dependent casing suffers from the problem that the string -passed to the casing function may not have sufficient context to make -the proper choice. Also, it will not be called for C<\l>, C<\L>, C<\u>, -and C<\U>. +B<This feature has been removed as of Perl 5.16.> +The CPAN module L<Unicode::Casing> provides better functionality without +the drawbacks that this feature had. 
If you are using a Perl earlier +than 5.16, this feature was most fully documented in the 5.14 version of +this pod: +L<http://perldoc.perl.org/5.14.0/perlunicode.html#User-Defined-Case-Mappings-%28for-serious-hackers-only%29> =head2 Character Encodings for Input and Output @@ -1557,12 +1380,6 @@ In C<quotemeta> or its inline equivalent C<\Q>, no characters code points above 127 are quoted in UTF-8 encoded strings, but in byte encoded strings, code points between 128-255 are always quoted. -=item * - -User-defined case change mappings. You can create a C<ToUpper()> function, for -example, which overrides Perl's built-in case mappings. The scalar must be -encoded in utf8 for your function to actually be invoked. - =back This behavior can lead to unexpected results in which a string's semantics diff --git a/t/op/lc_user.t b/t/op/lc_user.t deleted file mode 100644 index 664cc6c3ea..0000000000 --- a/t/op/lc_user.t +++ /dev/null @@ -1,33 +0,0 @@ -BEGIN { - chdir 't'; - @INC = '../lib'; - require './test.pl'; -} - -plan tests => 5; - -%utf8::ToSpecUpper = ( -"s" => "SS", # Make sure can handle weird ASCII translations -); - -sub ToUpper { - return <<END; -0061 0063 0041 -END -} - -is("\Ufoo\x{101}", "foo\x{101}", "no changes on 'foo'"); -is("\Ubar\x{101}", "BAr\x{101}", "changing 'ab' on 'bar' "); -my $s = 's'; -utf8::upgrade $s; -is(uc($s), "SS", "Verify uc('s') is 'SS' with our weird xlation, and utf8"); - -sub ToLower { - return <<END; -0041 0061 -END -} - -is("\LFOO\x{100}", "FOO\x{100}", "no changes on 'FOO'"); -is("\LBAR\x{100}", "BaR\x{100}", "changing 'A' on 'BAR' "); - diff --git a/t/op/turkish.t b/t/op/turkish.t deleted file mode 100644 index 628fcdc38a..0000000000 --- a/t/op/turkish.t +++ /dev/null @@ -1,100 +0,0 @@ -# Verifies that can implement Turkish casing as defined by Unicode 5.2. 
- -use Config; - -BEGIN { - chdir 't'; - @INC = '../lib'; - require './test.pl'; -} - -use subs qw(lc lcfirst uc ucfirst); - -sub uc($) { - my $string = shift; - utf8::upgrade($string); - return CORE::uc($string); -} - -sub ucfirst($) { - my $string = shift; - utf8::upgrade($string); - return CORE::ucfirst($string); -} - -sub lc($) { - my $string = shift; - utf8::upgrade($string); - - # Unless an I is before a dot_above, it turns into a dotless i. - $string =~ s/I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/gx; - - # But when the I is followed by a dot_above, remove the dot_above so - # the end result will be i. - $string =~ s/I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/gx; - return CORE::lc($string); -} - -sub lcfirst($) { - my $string = shift; - utf8::upgrade($string); - - # Unless an I is before a dot_above, it turns into a dotless i. - $string =~ s/^I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/x; - - # But when the I is followed by a dot_above, remove the dot_above so - # the end result will be i. 
- $string =~ s/^I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/x; - return CORE::lcfirst($string); -} - -plan tests => 22; - -my $map_directory = "../lib/unicore/To"; -my $upper = "$map_directory/Upper.pl"; -my $lower = "$map_directory/Lower.pl"; -my $title = "$map_directory/Title.pl"; - -sub ToUpper { - my $official = do $upper; - $utf8::ToSpecUpper{'i'} = "\x{0130}"; - return $official; -} - -sub ToTitle { - my $official = do $title; - $utf8::ToSpecTitle{'i'} = "\x{0130}"; - return $official; -} - -sub ToLower { - my $official = do $lower; - $utf8::ToSpecLower{"\xc4\xb0"} = "i"; - return $official; -} - -is(uc("\x{DF}\x{DF}"), "SSSS", "Verify that uc of non-overridden multi-char works"); -is(uc("aa"), "AA", "Verify that uc of non-overridden ASCII works"); -is(uc("\x{101}\x{101}"), "\x{100}\x{100}", "Verify that uc of non-overridden utf8 works"); -is(uc("ii"), "\x{130}\x{130}", "Verify uc('ii') eq \\x{130}\\x{130}"); - -is(ucfirst("\x{DF}\x{DF}"), "Ss\x{DF}", "Verify that ucfirst of non-overridden multi-char works"); -is(ucfirst("\x{101}\x{101}"), "\x{100}\x{101}", "Verify that ucfirst of non-overridden utf8 works"); -is(ucfirst("aa"), "Aa", "Verify that ucfirst of non-overridden ASCII works"); -is(ucfirst("ii"), "\x{130}i", "Verify ucfirst('ii') eq \"\\x{130}i\""); - -is(lc("AA"), "aa", "Verify that lc of non-overridden ASCII works"); -is(lc("\x{C0}\x{C0}"), "\x{E0}\x{E0}", "Verify that lc of non-overridden latin1 works"); -is(lc("\x{0178}\x{0178}"), "\x{FF}\x{FF}", "Verify that lc of non-overridden utf8 works"); -is(lc("II"), "\x{131}\x{131}", "Verify that lc('I') eq \\x{131}"); -is(lc("IG\x{0307}IG\x{0307}"), "\x{131}g\x{0307}\x{131}g\x{0307}", "Verify that lc(\"I...\\x{0307}\") eq \"\\x{131}...\\x{0307}\""); -is(lc("I\x{0307}I\x{0307}"), "ii", "Verify that lc(\"I\\x{0307}\") removes the \\x{0307}, leaving 'i'"); -is(lc("\x{130}\x{130}"), "ii", "Verify that lc(\"\\x{130}\\x{130}\") eq 'ii'"); - -is(lcfirst("AA"), "aA", "Verify that lcfirst of non-overridden ASCII 
works"); -is(lcfirst("\x{C0}\x{C0}"), "\x{E0}\x{C0}", "Verify that lcfirst of non-overridden latin1 works"); -is(lcfirst("\x{0178}\x{0178}"), "\x{FF}\x{0178}", "Verify that lcfirst of non-overridden utf8 works"); -is(lcfirst("I"), "\x{131}", "Verify that lcfirst('II') eq \"\\x{131}I\""); -is(lcfirst("IG\x{0307}"), "\x{131}G\x{0307}", "Verify that lcfirst(\"I...\\x{0307}\") eq \"\\x{131}...\\x{0307}\""); -is(lcfirst("I\x{0307}I\x{0307}"), "iI\x{0307}", "Verify that lcfirst(\"I\\x{0307}I\\x{0307}\") removes the first \\x{0307}, leaving 'iI\\x{0307}'"); -is(lcfirst("\x{130}\x{130}"), "i\x{130}", "Verify that lcfirst(\"\\x{130}\\x{130}\") eq \"i\\x{130}\""); |