summaryrefslogtreecommitdiff
path: root/lib/feature
diff options
context:
space:
mode:
authorRafael Garcia-Suarez <rgs@consttype.org>2009-12-20 16:23:36 +0100
committerRafael Garcia-Suarez <rgs@consttype.org>2009-12-20 16:28:36 +0100
commit1863b87966ed39b042c45e12d1b4e0b90b9cc071 (patch)
treeeae5c03c697269b036352d4b007f9c1294f189c9 /lib/feature
parent1d5fe431325abdb0f3947d563ebdef67bd4cb7cd (diff)
downloadperl-1863b87966ed39b042c45e12d1b4e0b90b9cc071.tar.gz
Introduce C<use feature "unicode_strings">
This turns on the unicode semantics for uc/lc/ucfirst/lcfirst operations on strings without the UTF8 bit set but with ASCII characters higher than 127. This replaces the "legacy" pragma experiment. Note that currently this feature sets both a bit in $^H and a (unused) key in %^H. The bit in $^H could be replaced by a flag on the uc/lc/etc op. It's probably not feasible to test a key in %^H in pp_uc in friends each time we want to know which semantics to apply.
Diffstat (limited to 'lib/feature')
-rw-r--r--lib/feature/unicode_strings.t142
1 files changed, 142 insertions, 0 deletions
diff --git a/lib/feature/unicode_strings.t b/lib/feature/unicode_strings.t
new file mode 100644
index 0000000000..dce34bd0a7
--- /dev/null
+++ b/lib/feature/unicode_strings.t
@@ -0,0 +1,142 @@
+use warnings;
+use strict;
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '../lib';
+ require './test.pl';
+}
+
+plan(13312); # Determined by experimentation
+
+# Test the upper/lower/title case mappings for all characters 0-255.
+
+# First compute the case mappings without resorting to the functions we're
+# testing.
+
+# Initialize the arrays so each $i maps to itself.
+my @posix_to_upper;
+for my $i (0 .. 255) {
+ $posix_to_upper[$i] = chr($i);
+}
+my @posix_to_lower
+= my @posix_to_title
+= my @latin1_to_upper
+= my @latin1_to_lower
+= my @latin1_to_title
+= @posix_to_upper;
+
+# Override the elements in the to_lower arrays that have different lower case
+# mappings
+for my $i (0x41 .. 0x5A) {
+ $posix_to_lower[$i] = chr(ord($posix_to_lower[$i]) + 32);
+ $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32);
+}
+
+# Same for upper and title
+for my $i (0x61 .. 0x7A) {
+ $posix_to_upper[$i] = chr(ord($posix_to_upper[$i]) - 32);
+ $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32);
+ $posix_to_title[$i] = chr(ord($posix_to_title[$i]) - 32);
+ $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32);
+}
+
+# And the same for those in the latin1 range
+for my $i (0xC0 .. 0xD6, 0xD8 .. 0xDE) {
+ $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32);
+}
+for my $i (0xE0 .. 0xF6, 0xF8 .. 0xFE) {
+ $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32);
+ $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32);
+}
+
+# Override the abnormal cases.
+$latin1_to_upper[0xB5] = chr(0x39C);
+$latin1_to_title[0xB5] = chr(0x39C);
+$latin1_to_upper[0xDF] = 'SS';
+$latin1_to_title[0xDF] = 'Ss';
+$latin1_to_upper[0xFF] = chr(0x178);
+$latin1_to_title[0xFF] = chr(0x178);
+
+my $repeat = 25; # Length to make strings.
+
+# Create hashes of strings in several ranges, both for uc and lc.
+my %posix;
+$posix{'uc'} = 'A' x $repeat;
+$posix{'lc'} = 'a' x $repeat ;
+
+my %cyrillic;
+$cyrillic{'uc'} = chr(0x42F) x $repeat;
+$cyrillic{'lc'} = chr(0x44F) x $repeat;
+
+my %latin1;
+$latin1{'uc'} = chr(0xD8) x $repeat;
+$latin1{'lc'} = chr(0xF8) x $repeat;
+
+my %empty;
+$empty{'lc'} = $empty{'uc'} = "";
+
+# Loop so prefix each character being tested with nothing, and the various
+# strings; then loop for suffixes of those strings as well.
+for my $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
+ for my $suffix (\%empty, \%posix, \%cyrillic, \%latin1) {
+ for my $i (0 .. 255) { # For each possible posix or latin1 character
+ my $cp = sprintf "U+%04X", $i;
+
+ # First try using latin1 (Unicode) semantics.
+ use feature "unicode_strings";
+
+ my $phrase = 'with uni8bit';
+ my $char = chr($i);
+ my $pre_lc = $prefix->{'lc'};
+ my $pre_uc = $prefix->{'uc'};
+ my $post_lc = $suffix->{'lc'};
+ my $post_uc = $suffix->{'uc'};
+ my $to_upper = $pre_lc . $char . $post_lc;
+ my $expected_upper = $pre_uc . $latin1_to_upper[$i] . $post_uc;
+ my $to_lower = $pre_uc . $char . $post_uc;
+ my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc;
+
+ is (uc($to_upper), $expected_upper,
+ display("$cp: $phrase: uc($to_upper) eq $expected_upper"));
+ is (lc($to_lower), $expected_lower,
+ display("$cp: $phrase: lc($to_lower) eq $expected_lower"));
+
+ if ($pre_uc eq "") { # Title case if null prefix.
+ my $expected_title = $latin1_to_title[$i] . $post_lc;
+ is (ucfirst($to_upper), $expected_title,
+ display("$cp: $phrase: ucfirst($to_upper) eq $expected_title"));
+ my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
+ is (lcfirst($to_lower), $expected_lcfirst,
+ display("$cp: $phrase: lcfirst($to_lower) eq $expected_lcfirst"));
+ }
+
+ # Then try with posix semantics.
+ no feature "unicode_strings";
+ $phrase = 'no uni8bit';
+
+ # These don't contribute anything in this case.
+ next if $suffix == \%cyrillic;
+ next if $suffix == \%latin1;
+ next if $prefix == \%cyrillic;
+ next if $prefix == \%latin1;
+
+ $expected_upper = $pre_uc . $posix_to_upper[$i] . $post_uc;
+ $expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc;
+
+ is (uc($to_upper), $expected_upper,
+ display("$cp: $phrase: uc($to_upper) eq $expected_upper"));
+ is (lc($to_lower), $expected_lower,
+ display("$cp: $phrase: lc($to_lower) eq $expected_lower"));
+
+ if ($pre_uc eq "") {
+ my $expected_title = $posix_to_title[$i] . $post_lc;
+ is (ucfirst($to_upper), $expected_title,
+ display("$cp: $phrase: ucfirst($to_upper) eq $expected_title"));
+ my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
+ is (lcfirst($to_lower), $expected_lcfirst,
+ display("$cp: $phrase: lcfirst($to_lower) eq $expected_lcfirst"));
+ }
+ }
+ }
+}