Make qr/(?[ ])/ work in UTF-8 locales

Previously use of this under /l regex rules was a compile time error. Now it works like \b{wb} and \b{sb}, which compile under locale rules and always work like Unicode says they should. A UTF-8 locale implies Unicode rules, and the goal is for it to work seamlessly with the rest of perl. This construct was the only one I am aware of that didn't work seamlessly (not counting OS interfaces) under UTF-8 LC_CTYPE locales. For all three of these constructs, use with a non-UTF-8 runtime locale raises a warning, and Unicode rules are used anyway. UTF-8 locale collation still has problems, but this is low priority to fix, as it's a lot of work, and if one really cares, one should be using Unicode::Collate.
author: Karl Williamson <khw@cpan.org> 2015-08-23 10:30:02 -0600
committer: Karl Williamson <khw@cpan.org> 2015-08-24 12:11:33 -0600
commit: a0bd1a30d379f2625c307657d63fc50173d7a56d (patch)
tree: 7526c17ccb6fcaa679f4fe7cb4bbddf9a4eab466 /t
parent: 2d3d6e6e7c2d50b1cc47032cf089151823fb20a6 (diff)
download: perl-a0bd1a30d379f2625c307657d63fc50173d7a56d.tar.gz
3 files changed, 87 insertions, 3 deletions
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec
index 750880e4db..1f3b65b167 100644
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -212,3 +212,50 @@ Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at -
 Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 16.
 Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 17.
 Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 17.
+########
+# NAME (?[ ]) in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+    print("SKIPPED\n# no POSIX\n"),exit;
+}
+no warnings 'experimental::regex_sets';
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i;
+"K" =~ /(?[ \N{KELVIN SIGN} ])/i;
+"k" =~ /(?[ \N{KELVIN SIGN} ])/i;
+":" =~ /(?[ \: ])/;
+no warnings 'locale';
+EXPECT
+Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 9.
+Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 9.
+Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 10.
+Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 10.
+Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 11.
+Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 11.
+Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 12.
+Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 12.
+########
+# NAME (?[ ]) in UTF-8 locale
+require '../loc_tools.pl';
+unless (locales_enabled()) {
+    print("SKIPPED\n# locales not available\n"),exit;
+}
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+    print("SKIPPED\n# no POSIX\n"),exit;
+}
+my $utf8_locale = find_utf8_ctype_locale();
+unless ($utf8_locale) {
+    print("SKIPPED\n# No UTF-8 locale available\n"),exit;
+}
+no warnings 'experimental::regex_sets';
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, $utf8_locale);
+"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i;
+"K" =~ /(?[ \N{KELVIN SIGN} ])/i;
+"k" =~ /(?[ \N{KELVIN SIGN} ])/i;
+":" =~ /(?[ \: ])/;
+EXPECT
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index a0588241f3..d9d9d7437b 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -205,7 +205,6 @@ my @death =
  '/(?[[[:w:]]])/' => "POSIX class [:w:] unknown {#} m/(?[[[:w:]{#}]])/",
  '/(?[[:w:]])/' => "POSIX class [:w:] unknown {#} m/(?[[:w:]{#}])/",
  '/(?[a])/' =>  'Unexpected character {#} m/(?[a{#}])/',
- '/(?[\t])/l' => '(?[...]) not valid in locale {#} m/(?[{#}\t])/',
  '/(?[ + \t ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ +{#} \t ])/',
  '/(?[ \cK - ( + \t ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ \cK - ( +{#} \t ) ])/',
  '/(?[ \cK ( \t ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/(?[ \cK ({#} \t ) ])/',
@@ -410,7 +409,6 @@ my @death_utf8 = mark_as_utf8(
  '/ネ(?[[[:ネ:]]])ネ/' => "POSIX class [:ネ:] unknown {#} m/ネ(?[[[:ネ:]{#}]])ネ/",
  '/ネ(?[[:ネ:]])ネ/' => "POSIX class [:ネ:] unknown {#} m/ネ(?[[:ネ:]{#}])ネ/",
  '/ネ(?[ネ])ネ/' =>  'Unexpected character {#} m/ネ(?[ネ{#}])ネ/',
- '/ネ(?[ネ])/l' => '(?[...]) not valid in locale {#} m/ネ(?[{#}ネ])/',
  '/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/',
  '/ネ(?[ \cK - ( + [ネ] ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ \cK - ( +{#} [ネ] ) ])/',
  '/ネ(?[ \cK ( [ネ] ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/ネ(?[ \cK ({#} [ネ] ) ])/',
diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t
index 48a4f00b8e..c85fde6f2d 100644
--- a/t/re/regex_sets.t
+++ b/t/re/regex_sets.t
@@ -9,7 +9,8 @@ BEGIN {
     chdir 't' if -d 't';
     @INC = ('../lib','.','../ext/re');
     require './test.pl';
-    require './test.pl'; require './charset_tools.pl';
+    require './charset_tools.pl';
+    require './loc_tools.pl';
     skip_all_without_unicode_tables();
 }
 
@@ -96,6 +97,44 @@ like("k", $still_fold, "/i on interpolated (?[ ]) is retained in outer without /
 eval 'my $x = qr/(?[ [a] ])/; qr/(?[ $x ])/';
 is($@, "", 'qr/(?[ [a] ])/ can be interpolated');
 
+if (! is_miniperl() && locales_enabled('LC_CTYPE')) {
+    my $utf8_locale = find_utf8_ctype_locale;
+    SKIP: {
+        skip("No utf8 locale available on this platform", 8) unless $utf8_locale;
+
+        setlocale(&POSIX::LC_ALL, "C");
+        use locale;
+
+        $kelvin_fold = qr/(?[ \N{KELVIN SIGN} ])/i;
+        my $single_char_class = qr/(?[ \: ])/;
+
+        setlocale(&POSIX::LC_ALL, $utf8_locale);
+
+        like("\N{KELVIN SIGN}", $kelvin_fold,
+             '(?[ \N{KELVIN SIGN} ]) matches itself under /i in UTF8-locale');
+        like("K", $kelvin_fold,
+                '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in UTF8-locale');
+        like("k", $kelvin_fold,
+                '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in UTF8-locale');
+        like(":", $single_char_class,
+             '(?[ : ]) matches itself in UTF8-locale (a single character class)');
+
+        setlocale(&POSIX::LC_ALL, "C");
+
+        # These should generate warnings (the above 4 shouldn't), but like()
+        # suppresses them, so the warnings tests are in t/lib/warnings/regexec
+        like("\N{KELVIN SIGN}", $kelvin_fold,
+             '(?[ \N{KELVIN SIGN} ]) matches itself under /i in C locale');
+        like("K", $kelvin_fold,
+                '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in C locale');
+        like("k", $kelvin_fold,
+                '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in C locale');
+        like(":", $single_char_class,
+             '(?[ : ]) matches itself in C locale (a single character class)');
+    }
+}
+
+
 done_testing();
 
 1;
author	Karl Williamson <khw@cpan.org>	2015-08-23 10:30:02 -0600
committer	Karl Williamson <khw@cpan.org>	2015-08-24 12:11:33 -0600
commit	a0bd1a30d379f2625c307657d63fc50173d7a56d (patch)
tree	7526c17ccb6fcaa679f4fe7cb4bbddf9a4eab466 /t
parent	2d3d6e6e7c2d50b1cc47032cf089151823fb20a6 (diff)
download	perl-a0bd1a30d379f2625c307657d63fc50173d7a56d.tar.gz