Introduce C<use feature "unicode_strings">

This turns on the unicode semantics for uc/lc/ucfirst/lcfirst operations on strings without the UTF8 bit set but with ASCII characters higher than 127. This replaces the "legacy" pragma experiment. Note that currently this feature sets both a bit in $^H and a (unused) key in %^H. The bit in $^H could be replaced by a flag on the uc/lc/etc op. It's probably not feasible to test a key in %^H in pp_uc in friends each time we want to know which semantics to apply.
author: Rafael Garcia-Suarez <rgs@consttype.org> 2009-12-20 16:23:36 +0100
committer: Rafael Garcia-Suarez <rgs@consttype.org> 2009-12-20 16:28:36 +0100
commit: 1863b87966ed39b042c45e12d1b4e0b90b9cc071 (patch)
tree: eae5c03c697269b036352d4b007f9c1294f189c9 /lib/feature
parent: 1d5fe431325abdb0f3947d563ebdef67bd4cb7cd (diff)
download: perl-1863b87966ed39b042c45e12d1b4e0b90b9cc071.tar.gz
1 files changed, 142 insertions, 0 deletions
diff --git a/lib/feature/unicode_strings.t b/lib/feature/unicode_strings.t
new file mode 100644
index 0000000000..dce34bd0a7
--- /dev/null
+++ b/lib/feature/unicode_strings.t
@@ -0,0 +1,142 @@
+use warnings;
+use strict;
+
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+    require './test.pl';
+}
+
+plan(13312);    # Determined by experimentation
+
+# Test the upper/lower/title case mappings for all characters 0-255.
+
+# First compute the case mappings without resorting to the functions we're
+# testing.
+
+# Initialize the arrays so each $i maps to itself.
+my @posix_to_upper;
+for my $i (0 .. 255) {
+    $posix_to_upper[$i] = chr($i);
+}
+my @posix_to_lower
+= my @posix_to_title
+= my @latin1_to_upper
+= my @latin1_to_lower
+= my @latin1_to_title
+= @posix_to_upper;
+
+# Override the elements in the to_lower arrays that have different lower case 
+# mappings
+for my $i (0x41 .. 0x5A) {
+    $posix_to_lower[$i] = chr(ord($posix_to_lower[$i]) + 32);
+    $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32);
+}
+
+# Same for upper and title
+for my $i (0x61 .. 0x7A) {
+    $posix_to_upper[$i] = chr(ord($posix_to_upper[$i]) - 32);
+    $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32);
+    $posix_to_title[$i] = chr(ord($posix_to_title[$i]) - 32);
+    $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32);
+}
+
+# And the same for those in the latin1 range
+for my $i (0xC0 .. 0xD6, 0xD8 .. 0xDE) {
+    $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32);
+}
+for my $i (0xE0 .. 0xF6, 0xF8 .. 0xFE) {
+    $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32);
+    $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32);
+}
+
+# Override the abnormal cases.
+$latin1_to_upper[0xB5] = chr(0x39C);
+$latin1_to_title[0xB5] = chr(0x39C);
+$latin1_to_upper[0xDF] = 'SS';
+$latin1_to_title[0xDF] = 'Ss';
+$latin1_to_upper[0xFF] = chr(0x178);
+$latin1_to_title[0xFF] = chr(0x178);
+
+my $repeat = 25;    # Length to make strings.
+
+# Create hashes of strings in several ranges, both for uc and lc.
+my %posix;
+$posix{'uc'} = 'A' x $repeat;
+$posix{'lc'} = 'a' x $repeat ;
+
+my %cyrillic;
+$cyrillic{'uc'} = chr(0x42F) x $repeat;
+$cyrillic{'lc'} = chr(0x44F) x $repeat;
+
+my %latin1;
+$latin1{'uc'} = chr(0xD8) x $repeat;
+$latin1{'lc'} = chr(0xF8) x $repeat;
+
+my %empty;
+$empty{'lc'} = $empty{'uc'} = "";
+
+# Loop so prefix each character being tested with nothing, and the various
+# strings; then loop for suffixes of those strings as well.
+for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
+    for my  $suffix (\%empty, \%posix, \%cyrillic, \%latin1) {
+        for my $i (0 .. 255) {  # For each possible posix or latin1 character
+            my $cp = sprintf "U+%04X", $i;
+
+            # First try using latin1 (Unicode) semantics.
+            use feature "unicode_strings";    
+
+            my $phrase = 'with uni8bit';
+            my $char = chr($i);
+            my $pre_lc = $prefix->{'lc'};
+            my $pre_uc = $prefix->{'uc'};
+            my $post_lc = $suffix->{'lc'};
+            my $post_uc = $suffix->{'uc'};
+            my $to_upper = $pre_lc . $char . $post_lc;
+            my $expected_upper = $pre_uc . $latin1_to_upper[$i] . $post_uc;
+            my $to_lower = $pre_uc . $char . $post_uc;
+            my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc;
+
+            is (uc($to_upper), $expected_upper,
+                display("$cp: $phrase: uc($to_upper) eq $expected_upper"));
+            is (lc($to_lower), $expected_lower,
+                display("$cp: $phrase: lc($to_lower) eq $expected_lower"));
+
+            if ($pre_uc eq "") {    # Title case if null prefix.
+                my $expected_title = $latin1_to_title[$i] . $post_lc;
+                is (ucfirst($to_upper), $expected_title,
+                    display("$cp: $phrase: ucfirst($to_upper) eq $expected_title"));
+                my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
+                is (lcfirst($to_lower), $expected_lcfirst,
+                    display("$cp: $phrase: lcfirst($to_lower) eq $expected_lcfirst"));
+            }
+
+            # Then try with posix semantics.
+            no feature "unicode_strings";
+            $phrase = 'no uni8bit';
+
+            # These don't contribute anything in this case.
+            next if $suffix == \%cyrillic;
+            next if $suffix == \%latin1;
+            next if $prefix == \%cyrillic;
+            next if $prefix == \%latin1;
+
+            $expected_upper = $pre_uc . $posix_to_upper[$i] . $post_uc;
+            $expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc;
+
+            is (uc($to_upper), $expected_upper,
+                display("$cp: $phrase: uc($to_upper) eq $expected_upper"));
+            is (lc($to_lower), $expected_lower,
+                display("$cp: $phrase: lc($to_lower) eq $expected_lower"));
+
+            if ($pre_uc eq "") {
+                my $expected_title = $posix_to_title[$i] . $post_lc;
+                is (ucfirst($to_upper), $expected_title,
+                    display("$cp: $phrase: ucfirst($to_upper) eq $expected_title"));
+                my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
+                is (lcfirst($to_lower), $expected_lcfirst,
+                    display("$cp: $phrase: lcfirst($to_lower) eq $expected_lcfirst"));
+            }
+        }
+    }
+}
author	Rafael Garcia-Suarez <rgs@consttype.org>	2009-12-20 16:23:36 +0100
committer	Rafael Garcia-Suarez <rgs@consttype.org>	2009-12-20 16:28:36 +0100
commit	1863b87966ed39b042c45e12d1b4e0b90b9cc071 (patch)
tree	eae5c03c697269b036352d4b007f9c1294f189c9 /lib/feature
parent	1d5fe431325abdb0f3947d563ebdef67bd4cb7cd (diff)
download	perl-1863b87966ed39b042c45e12d1b4e0b90b9cc071.tar.gz