summaryrefslogtreecommitdiff
path: root/t
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2008-11-07 20:20:21 +0000
committerYves Orton <demerphq@gmail.com>2008-11-07 20:20:21 +0000
commitda7fcca4b8d6fb4dc88e0305bf9830bf24912ebd (patch)
treed05a14842c3d234ee9e4f5d1f692c20733133eb1 /t
parent463559e728b65f7b60e46efa081b43ff1b4b6fa4 (diff)
downloadperl-da7fcca4b8d6fb4dc88e0305bf9830bf24912ebd.tar.gz
create new unicode props as defined in POSIX spec (optionally use them in the regex engine)
Perlbug #60156 and #49302 (and probably others) resolve down to the problem that the definition of \s and \w and \d and the POSIX charclasses are different for unicode strings and for non-unicode strings. This broke the character class logic in the regex engine. The easiest fix to make the character class logic sane again is to define new properties which do match. This change creates new property classes that can be used instead of the traditional ones (it does not change the previously defined ones). If the define in regcomp.h: #define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1 is changed to 0, then the new mappings will be used. This will fix a bunch of bugs that are reported as TODO items in the new reg_posixcc.t test file. p4raw-id: //depot/perl@34769
Diffstat (limited to 't')
-rwxr-xr-xt/op/pat.t3
-rw-r--r--t/op/reg_posixcc.t127
2 files changed, 130 insertions, 0 deletions
diff --git a/t/op/pat.t b/t/op/pat.t
index a2d49ac0b4..798a3da70e 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4617,6 +4617,9 @@ sub kt
}
SKIP: {
+ # XXX: This set of tests is essentially broken, POSIX character classes
+ # should not have differing definitions under unicode.
+ # There are property names for that.
unless ($ordA == 65) { skip("Assumes ASCII", 4) }
my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/}
diff --git a/t/op/reg_posixcc.t b/t/op/reg_posixcc.t
new file mode 100644
index 0000000000..73353993b2
--- /dev/null
+++ b/t/op/reg_posixcc.t
@@ -0,0 +1,127 @@
+#!perl
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '../lib';
+}
+
+use strict;
+use warnings;
+use Test::More tests => 1;
+my @pats=(
+ "\\w",
+ "\\W",
+ "\\s",
+ "\\S",
+ "\\d",
+ "\\D",
+ "[:alnum:]",
+ "[:^alnum:]",
+ "[:alpha:]",
+ "[:^alpha:]",
+ "[:ascii:]",
+ "[:^ascii:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
+ "[:graph:]",
+ "[:^graph:]",
+ "[:lower:]",
+ "[:^lower:]",
+ "[:print:]",
+ "[:^print:]",
+ "[:punct:]",
+ "[:^punct:]",
+ "[:upper:]",
+ "[:^upper:]",
+ "[:xdigit:]",
+ "[:^xdigit:]",
+ "[:space:]",
+ "[:^space:]",
+ "[:blank:]",
+ "[:^blank:]" );
+sub rangify {
+ my $ary= shift;
+ my $fmt= shift || '%d';
+ my $sep= shift || ' ';
+ my $rng= shift || '..';
+
+
+ my $first= $ary->[0];
+ my $last= $ary->[0];
+ my $ret= sprintf $fmt, $first;
+ for my $idx (1..$#$ary) {
+ if ( $ary->[$idx] != $last + 1) {
+ if ($last!=$first) {
+ $ret.=sprintf "%s$fmt",$rng, $last;
+ }
+ $first= $last= $ary->[$idx];
+ $ret.=sprintf "%s$fmt",$sep,$first;
+ } else {
+ $last= $ary->[$idx];
+ }
+ }
+ if ( $last != $first) {
+ $ret.=sprintf "%s$fmt",$rng, $last;
+ }
+ return $ret;
+}
+
+my $description = "";
+while (@pats) {
+ my ($yes,$no)= splice @pats,0,2;
+
+ my %err_by_type;
+ my %singles;
+ foreach my $b (0..255) {
+ my %got;
+ for my $type ('unicode','not-unicode') {
+ my $str=chr($b).chr($b);
+ if ($type eq 'unicode') {
+ $str.=chr(256);
+ chop $str;
+ }
+ if ($str=~/[$yes][$no]/) {
+ push @{$err_by_type{$type}},$b;
+ }
+ $got{"[$yes]"}{$type} = $str=~/[$yes]/ ? 1 : 0;
+ $got{"[$no]"}{$type} = $str=~/[$no]/ ? 1 : 0;
+ $got{"[^$yes]"}{$type} = $str=~/[^$yes]/ ? 1 : 0;
+ $got{"[^$no]"}{$type} = $str=~/[^$no]/ ? 1 : 0;
+ }
+ foreach my $which ("[$yes]","[$no]","[^$yes]","[^$no]") {
+ if ($got{$which}{'unicode'} != $got{$which}{'not-unicode'}) {
+ push @{$singles{$which}},$b;
+ }
+ }
+ }
+
+
+ if (%err_by_type || %singles) {
+ $description||=" Error:\n";
+ $description .= "/[$yes][$no]/\n";
+ if (%err_by_type) {
+ foreach my $type (keys %err_by_type) {
+ $description .= "\tmatches $type codepoints:\t";
+ $description .= rangify($err_by_type{$type});
+ $description .= "\n";
+ }
+ $description .= "\n";
+ }
+ if (%singles) {
+ $description .= "Unicode/Nonunicode mismatches:\n";
+ foreach my $type (keys %singles) {
+ $description .= "\t$type:\t";
+ $description .= rangify($singles{$type});
+ $description .= "\n";
+ }
+ $description .= "\n";
+ }
+
+ }
+
+}
+TODO: {
+ local $TODO = "Only works under PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS = 0";
+ is( $description, "", "POSIX and perl charclasses should not depend on string type");
+};
+__DATA__