diff options
author | Karl Williamson <khw@cpan.org> | 2015-08-23 10:30:02 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-08-24 12:11:33 -0600 |
commit | a0bd1a30d379f2625c307657d63fc50173d7a56d (patch) | |
tree | 7526c17ccb6fcaa679f4fe7cb4bbddf9a4eab466 /t | |
parent | 2d3d6e6e7c2d50b1cc47032cf089151823fb20a6 (diff) | |
download | perl-a0bd1a30d379f2625c307657d63fc50173d7a56d.tar.gz |
Make qr/(?[ ])/ work in UTF-8 locales
Previously use of this under /l regex rules was a compile time error.
Now it works like \b{wb} and \b{sb}, which compile under locale rules
and always work like Unicode says they should. A UTF-8 locale implies
Unicode rules, and the goal is for it to work seamlessly with the rest
of perl. This construct was the only one I am aware of that didn't work
seamlessly (not counting OS interfaces) under UTF-8 LC_CTYPE locales.
For all three of these constructs, use with a non-UTF-8 runtime locale
raises a warning, and Unicode rules are used anyway.
UTF-8 locale collation still has problems, but this is low priority to
fix, as it's a lot of work, and if one really cares, one should be using
Unicode::Collate.
Diffstat (limited to 't')
-rw-r--r-- | t/lib/warnings/regexec | 47 | ||||
-rw-r--r-- | t/re/reg_mesg.t | 2 | ||||
-rw-r--r-- | t/re/regex_sets.t | 41 |
3 files changed, 87 insertions, 3 deletions
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec index 750880e4db..1f3b65b167 100644 --- a/t/lib/warnings/regexec +++ b/t/lib/warnings/regexec @@ -212,3 +212,50 @@ Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 16. Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. +######## +# NAME (?[ ]) in non-UTF-8 locale +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +no warnings 'experimental::regex_sets'; +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, "C"); +"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i; +"K" =~ /(?[ \N{KELVIN SIGN} ])/i; +"k" =~ /(?[ \N{KELVIN SIGN} ])/i; +":" =~ /(?[ \: ])/; +no warnings 'locale'; +EXPECT +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 9. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 9. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 10. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 10. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 11. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 11. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 12. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 12. +######## +# NAME (?[ ]) in UTF-8 locale +require '../loc_tools.pl'; +unless (locales_enabled()) { + print("SKIPPED\n# locales not available\n"),exit; +} +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +my $utf8_locale = find_utf8_ctype_locale(); +unless ($utf8_locale) { + print("SKIPPED\n# No UTF-8 locale available\n"),exit; +} +no warnings 'experimental::regex_sets'; +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, $utf8_locale); +"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i; +"K" =~ /(?[ \N{KELVIN SIGN} ])/i; +"k" =~ /(?[ \N{KELVIN SIGN} ])/i; +":" =~ /(?[ \: ])/; +EXPECT diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index a0588241f3..d9d9d7437b 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -205,7 +205,6 @@ my @death = '/(?[[[:w:]]])/' => "POSIX class [:w:] unknown {#} m/(?[[[:w:]{#}]])/", '/(?[[:w:]])/' => "POSIX class [:w:] unknown {#} m/(?[[:w:]{#}])/", '/(?[a])/' => 'Unexpected character {#} m/(?[a{#}])/', - '/(?[\t])/l' => '(?[...]) not valid in locale {#} m/(?[{#}\t])/', '/(?[ + \t ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ +{#} \t ])/', '/(?[ \cK - ( + \t ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ \cK - ( +{#} \t ) ])/', '/(?[ \cK ( \t ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/(?[ \cK ({#} \t ) ])/', @@ -410,7 +409,6 @@ my @death_utf8 = mark_as_utf8( '/ネ(?[[[:ネ:]]])ネ/' => "POSIX class [:ネ:] unknown {#} m/ネ(?[[[:ネ:]{#}]])ネ/", '/ネ(?[[:ネ:]])ネ/' => "POSIX class [:ネ:] unknown {#} m/ネ(?[[:ネ:]{#}])ネ/", '/ネ(?[ネ])ネ/' => 'Unexpected character {#} m/ネ(?[ネ{#}])ネ/', - '/ネ(?[ネ])/l' => '(?[...]) not valid in locale {#} m/ネ(?[{#}ネ])/', '/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/', '/ネ(?[ \cK - ( + [ネ] ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ \cK - ( +{#} [ネ] ) ])/', '/ネ(?[ \cK ( [ネ] ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/ネ(?[ \cK ({#} [ネ] ) ])/', diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t index 48a4f00b8e..c85fde6f2d 100644 --- a/t/re/regex_sets.t +++ b/t/re/regex_sets.t @@ -9,7 +9,8 @@ BEGIN { chdir 't' if -d 't'; @INC = ('../lib','.','../ext/re'); require './test.pl'; - require './test.pl'; require './charset_tools.pl'; + require './charset_tools.pl'; + require './loc_tools.pl'; skip_all_without_unicode_tables(); } @@ -96,6 +97,44 @@ like("k", $still_fold, "/i on interpolated (?[ ]) is retained in outer without / eval 'my $x = qr/(?[ [a] ])/; qr/(?[ $x ])/'; is($@, "", 'qr/(?[ [a] ])/ can be interpolated'); +if (! is_miniperl() && locales_enabled('LC_CTYPE')) { + my $utf8_locale = find_utf8_ctype_locale; + SKIP: { + skip("No utf8 locale available on this platform", 8) unless $utf8_locale; + + setlocale(&POSIX::LC_ALL, "C"); + use locale; + + $kelvin_fold = qr/(?[ \N{KELVIN SIGN} ])/i; + my $single_char_class = qr/(?[ \: ])/; + + setlocale(&POSIX::LC_ALL, $utf8_locale); + + like("\N{KELVIN SIGN}", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches itself under /i in UTF8-locale'); + like("K", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in UTF8-locale'); + like("k", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in UTF8-locale'); + like(":", $single_char_class, + '(?[ : ]) matches itself in UTF8-locale (a single character class)'); + + setlocale(&POSIX::LC_ALL, "C"); + + # These should generate warnings (the above 4 shouldn't), but like() + # suppresses them, so the warnings tests are in t/lib/warnings/regexec + like("\N{KELVIN SIGN}", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches itself under /i in C locale'); + like("K", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in C locale'); + like("k", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in C locale'); + like(":", $single_char_class, + '(?[ : ]) matches itself in C locale (a single character class)'); + } +} + + done_testing(); 1; |