diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-01-15 14:13:24 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-01-16 08:20:30 -0700 |
commit | e8ed1101676693f3e8f3603aec4830e76ebe886e (patch) | |
tree | 1ab22eacb026c79cf4a47c60386899d5688ace16 /t/re/charset.t | |
parent | 003331de6a07527e753edc8c78b0fbc53b2ed0f8 (diff) | |
download | perl-e8ed1101676693f3e8f3603aec4830e76ebe886e.tar.gz |
Deliver t/re/charset.t
Diffstat (limited to 't/re/charset.t')
-rw-r--r-- | t/re/charset.t | 224 |
1 files changed, 224 insertions, 0 deletions
diff --git a/t/re/charset.t b/t/re/charset.t new file mode 100644 index 0000000000..756a905ddb --- /dev/null +++ b/t/re/charset.t @@ -0,0 +1,224 @@ +# Test the /a, /d, etc regex modifiers + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + require './test.pl'; +} + +use strict; +use warnings; + +# Each case is a valid element of its hash key. Choose, where available, an +# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point. +my %testcases = ( + '\w' => [ ord("A"), 0xE2, 0x16B ], # Below expects these to all be alpha + '\d' => [ ord("0"), 0x0662 ], + '\s' => [ ord("\t"), 0xA0, 0x1680 ], # Below expects these to be [:blank:] + '[:cntrl:]' => [ 0x00, 0x88 ], + '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be + # [:print:] + '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ], + '[:punct:]' => [ ord("!"), 0xBF, 0x055C ], + '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ], + '[:xdigit:]' => [ ord("4"), 0xFF15 ], +); + +$testcases{'[:digit:]'} = $testcases{'\d'}; +$testcases{'[:alnum:]'} = $testcases{'\w'}; +$testcases{'[:alpha:]'} = $testcases{'\w'}; +$testcases{'[:blank:]'} = $testcases{'\s'}; +$testcases{'[:print:]'} = $testcases{'[:graph:]'}; +$testcases{'[:space:]'} = $testcases{'\s'}; +$testcases{'[:word:]'} = $testcases{'\w'}; + +# For each possible character set... +foreach my $charset ("d", "u") { + + # And in utf8 or not + foreach my $upgrade ("", 'utf8::upgrade($a); ') { + + # reverse gets the, \w, \s, \d first. + for my $class (reverse sort keys %testcases) { + + # The complement of \w is \W; of [:posix:] is [:^posix:] + my $complement = $class; + if ($complement !~ s/ ( \[: ) /$1^/x) { + $complement = uc($class); + } + + # For each test case + foreach my $ord (@{$testcases{$class}}) { + my $char = display(chr($ord)); + + # > 255 already implies upgraded. Skip the ones that don't + # have an explicit upgradei. This shows more clearly in the + # output which tests are in utf8, or not. + next if $ord > 255 && ! $upgrade; + + my $reason = ""; # Explanation output with each test + my $match = 1; # Calculated whether test regex should + # match or not + + # Everything always matches in ASCII, or under /u + if ($ord < 128 || $charset eq 'u') { + $reason = "\"$char\" is a $class under /$charset"; + } + elsif ($charset eq "a") { + $match = 0; + $reason = "\"$char\" is non-ASCII, which can't be a $class under /a"; + } + elsif ($ord > 255) { + $reason = "\"$char\" is a $class under /$charset"; + } + elsif ($charset eq 'l') { + + # We are using the C locale, which is essentially ASCII, + # but under utf8, the above-latin1 chars are treated as + # Unicode) + $reason = "\"$char\" is not a $class in this locale under /l"; + $match = 0; + } + elsif ($upgrade) { + $reason = "\"$char\" is a $class in utf8 under /d"; + } + else { + $reason = "\"$char\" is latin1, which requires utf8 to be a $class under /d"; + $match = 0; + } + $reason = "; $reason" if $reason; + + my $op; + my $neg_op; + if ($match) { + $op = '=~'; + $neg_op = '!~'; + } + else { + $op = '!~'; + $neg_op = '=~'; + } + + # In [...] or not + foreach my $bracketed (0, 1) { + my $lb = ""; + my $rb = ""; + if ($bracketed) { + + # Adds an extra char to the character class to make sure + # that the class doesn't get optimized away. + $lb = ($bracketed) ? '[_' : ""; + $rb = ($bracketed) ? ']' : ""; + } + else { # [:posix:] must be inside outer [ ] + next if $class =~ /\[/; + } + + my $length = 10; # For regexec.c regrepeat() cases by + # matching more than one item + # Test both class and its complement, and with one or more + # than one item to match. + foreach my $eval ( + qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: $lb$class$rb ) /x], + qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb ) /x], + qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset: $lb$class$rb\{$length} ) /x], + qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb\{$length} ) /x], + ) { + ok (eval $eval, $eval . $reason); + } + } + + next if $class ne '\w'; + + # Test \b, \B at beginning and end of string + foreach my $eval ( + qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: ^ \\b . ) /x], + qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: . \\b \$) /x], + qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: ^ \\B . ) /x], + qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: . \\B \$ ) /x], + ) { + ok (eval $eval, $eval . $reason); + } + + # Test \b, \B adjacent to a non-word char, both before it and + # after. We test with ASCII, Latin1 and Unicode non-word chars + foreach my $space_ord (@{$testcases{'\s'}}) { + + # Useless to try to test non-utf8 when the ord itself + # forces utf8 + next if $space_ord > 255 && ! $upgrade; + + my $space = display(chr $space_ord); + + foreach my $eval ( + qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x], + qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x], + qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x], + qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x], + ) { + ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w"); + } + } + + # Test \b, \B in the middle of two nominally word chars, but + # one or both may be considered non-word depending on range + # and charset. + foreach my $other_ord (@{$testcases{'\w'}}) { + next if $other_ord > 255 && ! $upgrade; + my $other = display(chr $other_ord); + + # Determine if the other char is a word char in current + # circumstances + my $other_is_word = 1; + my $other_reason = "\"$other\" is a $class under /$charset"; + if ($other_ord > 127 + && $charset ne 'u' + && ($charset eq "a" + || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade)))) + { + $other_is_word = 0; + $other_reason = "\"$other\" is not a $class under /$charset"; + } + my $both_reason = $reason; + $both_reason .= "; $other_reason" if $other_ord != $ord; + + # If both are the same wordness, then \b will fail; \B + # succeed + if ($match == $other_is_word) { + $op = '!~'; + $neg_op = '=~'; + } + else { + $op = '=~'; + $neg_op = '!~'; + } + + foreach my $eval ( + qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: $other \\b $char ) /x], + qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: $char \\b $other ) /x], + qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: $other \\B $char ) /x], + qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: $char \\B $other ) /x], + ) { + ok (eval $eval, $eval . $both_reason); + } + + next if $other_ord == $ord; + + # These start with the \b or \B. They are included, based + # on source code analysis, to force the testing of the FBC + # (find_by_class) portions of regexec.c. + foreach my $eval ( + qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: \\b $char ) /x], + qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: \\b $other ) /x], + qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: \\B $char ) /x], + qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: \\B $other ) /x], + ) { + ok (eval $eval, $eval . $both_reason); + } + } + } # End of each test case in a class + } # End of \w, \s, ... + } # End of utf8 upgraded or not +} + +plan(curr_test() - 1); |