summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2007-04-24 18:46:05 +0200
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2007-04-26 10:23:30 +0000
commit32e6a07c84b153f78f946de50870bc0ee030624f (patch)
tree80c02123a3e592ec2c9397c178cd62e38136d870
parent0f68039566ac464bc1d4ff8f5b574153a1f6e9e9 (diff)
downloadperl-32e6a07c84b153f78f946de50870bc0ee030624f.tar.gz
Re: Analysis of problems with mixed encoding case insensitive matches in regex engine.
Message-ID: <9b18b3110704240746u461e4bdcl208ef7d7f9c5ef64@mail.gmail.com> p4raw-id: //depot/perl@31081
-rw-r--r--Porting/regcharclass.pl63
-rw-r--r--regcharclass.h83
-rw-r--r--regcomp.sym2
-rw-r--r--regexec.c27
-rw-r--r--regnodes.h17
-rwxr-xr-xt/op/pat.t35
6 files changed, 185 insertions, 42 deletions
diff --git a/Porting/regcharclass.pl b/Porting/regcharclass.pl
index c895440585..8f5b3f13f2 100644
--- a/Porting/regcharclass.pl
+++ b/Porting/regcharclass.pl
@@ -247,7 +247,8 @@ sub combine {
? sprintf("$alu == $hex_fmt",$_->[0])
: sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
return $txt unless @_;
- return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
+ return sprintf "( %s ||( %s > 0x%02X &&\n%s ) )",
+ $txt,$alu,$_->[1],combine($alu,@_);
}
# recursively convert a trie to an optree represented by
@@ -302,11 +303,15 @@ sub make_optree {
$size=1 if $type eq 'c';
if ( !$type ) {
my ( $u, $l );
- for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
- $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
+ if ($self->{trie}{u}) {
+ for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
+ $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
+ }
}
- for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
- $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
+ if ($self->{trie}{l}) {
+ for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
+ $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
+ }
}
if ( $u ) {
$else= [ '(is_utf8)', $u, $l || 0 ];
@@ -314,9 +319,13 @@ sub make_optree {
$else= [ '(!is_utf8)', $l, 0 ];
}
$type= 'n';
- $size-- while !$self->{trie}{n}{$size};
}
- return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
+ if (!$self->{trie}{$type}) {
+ return $else;
+ } else {
+ $size-- while $size>0 && !$self->{trie}{$type}{$size};
+ return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
+ }
}
# construct the optree for a type with length checks to prevent buffer
@@ -427,18 +436,23 @@ sub ternary {
return "/*** GENERATED CODE ***/\n"
. _macro "#define is_$self->{op}$ext($args)\n$code";
}
-
+$|++;
my $path=shift @ARGV;
+
if (!$path) {
$path= "regcharclass.h";
if (!-e $path) { $path="../$path" }
if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
}
-
-rename $path,"$path.bak";
-open my $out_fh,">",$path
- or die "Can't write to '$path':$!";
-binmode $out_fh; # want unix line endings even when run on win32.
+my $out_fh;
+if ($path eq '-') {
+ $out_fh= \*STDOUT;
+} else {
+ rename $path,"$path.bak";
+ open $out_fh,">",$path
+ or die "Can't write to '$path':$!";
+ binmode $out_fh; # want unix line endings even when run on win32.
+}
my ($zero) = $0=~/([^\\\/]+)$/;
print $out_fh <<"HEADER";
/* -*- buffer-read-only: t -*-
@@ -458,17 +472,22 @@ print $out_fh <<"HEADER";
HEADER
-my ($op,$title,@strs,@txt);
+my ($op,$title,@strs,@txt,$type);
my $doit= sub {
return unless $op;
my $o= __PACKAGE__->new($title,$op,@strs);
print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
print $out_fh join "\n",@txt,"*/","";
- for ('', 'U', 'L') {
- print $out_fh $o->ternary( $_ );
- print $out_fh $o->ternary( $_,'_safe' );
+ $type||="U L c _safe";
+ my @ext=("");
+ my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } }
+ split /\s+/,$type);
+ for my $type (@types) {
+ for my $ext (@ext) {
+ next if $type eq 'c' and $ext eq '_safe';
+ print $out_fh $o->ternary( $type,$ext );
+ }
}
- print $out_fh $o->ternary( 'c' );
};
while (<DATA>) {
next unless /\S/;
@@ -477,6 +496,9 @@ while (<DATA>) {
$doit->();
($op,$title)=split /\s*:\s*/,$_,2;
@txt=@strs=();
+ $type="";
+ } elsif (/^=(.*)/) {
+ $type.=$1;
} else {
push @txt, "\t$_";
s/#.*$//;
@@ -489,7 +511,6 @@ while (<DATA>) {
}
$doit->();
print $out_fh "/* ex: set ro: */\n";
-print "$path has been updated\n";
__DATA__
LNBREAK: Line Break: \R
@@ -532,3 +553,7 @@ VERTWS: Vertical Whitespace: \v \V
0x2028 # LINE SEPARATOR
0x2029 # PARAGRAPH SEPARATOR
+TRICKYFOLD: Problematic fold case letters.
+0x00DF # LATIN SMALL LETTER SHARP S
+0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
diff --git a/regcharclass.h b/regcharclass.h
index 40d21bf5ff..8425693b0b 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -9,7 +9,7 @@
*
* !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
* This file is built by Porting/regcharclass.pl.
- * (Generated at: Mon Apr 23 15:30:51 2007 GMT)
+ * (Generated at: Tue Apr 24 12:19:13 2007 GMT)
* Any changes made here will be lost!
*/
@@ -105,9 +105,9 @@
/*** GENERATED CODE ***/
#define is_LNBREAK_cp(cp) \
-( (0x0A <= cp && cp <= 0x0D) || ( cp > 13 && \
-( cp == 0x85 || ( cp > 133 && \
-( cp == 0x2028 || ( cp > 8232 && \
+( (0x0A <= cp && cp <= 0x0D) ||( cp > 0x0D && \
+( cp == 0x85 ||( cp > 0x85 && \
+( cp == 0x2028 ||( cp > 0x2028 && \
cp == 0x2029 ) ) ) ) ) )
/*
@@ -227,14 +227,14 @@ cp == 0x2029 ) ) ) ) ) )
/*** GENERATED CODE ***/
#define is_HORIZWS_cp(cp) \
-( cp == 0x09 || ( cp > 9 && \
-( cp == 0x20 || ( cp > 32 && \
-( cp == 0xA0 || ( cp > 160 && \
-( cp == 0x1680 || ( cp > 5760 && \
-( cp == 0x180E || ( cp > 6158 && \
-( (0x2000 <= cp && cp <= 0x200A) || ( cp > 8202 && \
-( cp == 0x202F || ( cp > 8239 && \
-( cp == 0x205F || ( cp > 8287 && \
+( cp == 0x09 ||( cp > 0x09 && \
+( cp == 0x20 ||( cp > 0x20 && \
+( cp == 0xA0 ||( cp > 0xA0 && \
+( cp == 0x1680 ||( cp > 0x1680 && \
+( cp == 0x180E ||( cp > 0x180E && \
+( (0x2000 <= cp && cp <= 0x200A) ||( cp > 0x200A && \
+( cp == 0x202F ||( cp > 0x202F && \
+( cp == 0x205F ||( cp > 0x205F && \
cp == 0x3000 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) )
/*
@@ -310,9 +310,62 @@ cp == 0x3000 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) )
/*** GENERATED CODE ***/
#define is_VERTWS_cp(cp) \
-( (0x0A <= cp && cp <= 0x0D) || ( cp > 13 && \
-( cp == 0x85 || ( cp > 133 && \
-( cp == 0x2028 || ( cp > 8232 && \
+( (0x0A <= cp && cp <= 0x0D) ||( cp > 0x0D && \
+( cp == 0x85 ||( cp > 0x85 && \
+( cp == 0x2028 ||( cp > 0x2028 && \
cp == 0x2029 ) ) ) ) ) )
+/*
+ TRICKYFOLD: Problematic fold case letters.
+
+ 0x00DF # LATIN SMALL LETTER SHARP S
+ 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+*/
+/*** GENERATED CODE ***/
+#define is_TRICKYFOLD(s,is_utf8) \
+( (is_utf8) ? \
+ ( ( ((U8*)s)[0] == 0xC3 ) ? \
+ ( ( ((U8*)s)[1] == 0x9F ) ? 2 : 0 ) : \
+ ((( ((U8*)s)[0] == 0xCE ) && ( ((U8*)s)[1] == 0x90 || ((U8*)s)[1] == 0xB0 )) ? 2 : 0) ) :\
+ ( ((U8*)s)[0] == 0xDF ) )
+
+/*** GENERATED CODE ***/
+#define is_TRICKYFOLD_safe(s,e,is_utf8) \
+( ( (e) - (s) > 1 ) ? \
+( (is_utf8) ? \
+ ( ( ((U8*)s)[0] == 0xC3 ) ? \
+ ( ( ((U8*)s)[1] == 0x9F ) ? 2 : 0 ) : \
+ ((( ((U8*)s)[0] == 0xCE ) && ( ((U8*)s)[1] == 0x90 || ((U8*)s)[1] == 0xB0 )) ? 2 : 0) ) :\
+ ( ((U8*)s)[0] == 0xDF ) ) : \
+((( (e) - (s) > 0 ) && (!is_utf8)) ? ( ((U8*)s)[0] == 0xDF ) : 0) )
+
+/*** GENERATED CODE ***/
+#define is_TRICKYFOLD_utf8(s) \
+( ( ((U8*)s)[0] == 0xC3 ) ? \
+ ( ( ((U8*)s)[1] == 0x9F ) ? 2 : 0 ) : \
+ ((( ((U8*)s)[0] == 0xCE ) && ( ((U8*)s)[1] == 0x90 || ((U8*)s)[1] == 0xB0 )) ? 2 : 0) )
+
+/*** GENERATED CODE ***/
+#define is_TRICKYFOLD_utf8_safe(s,e) \
+( ( (e) - (s) > 1 ) ? \
+ ( ( ((U8*)s)[0] == 0xC3 ) ? \
+ ( ( ((U8*)s)[1] == 0x9F ) ? 2 : 0 ) : \
+ ((( ((U8*)s)[0] == 0xCE ) && ( ((U8*)s)[1] == 0x90 || ((U8*)s)[1] == 0xB0 )) ? 2 : 0) ) : 0 )
+
+/*** GENERATED CODE ***/
+#define is_TRICKYFOLD_latin1(s) \
+( ((U8*)s)[0] == 0xDF )
+
+/*** GENERATED CODE ***/
+#define is_TRICKYFOLD_latin1_safe(s,e) \
+( ( (e) - (s) > 0 ) ? \
+ ( ((U8*)s)[0] == 0xDF ) : 0 )
+
+/*** GENERATED CODE ***/
+#define is_TRICKYFOLD_cp(cp) \
+( cp == 0xDF ||( cp > 0xDF && \
+( cp == 0x390 ||( cp > 0x390 && \
+cp == 0x3B0 ) ) ) )
+
/* ex: set ro: */
diff --git a/regcomp.sym b/regcomp.sym
index 070fe98a63..b0b9faff7e 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -192,6 +192,8 @@ NVERTWS NVERTWS, none not vertical whitespace (Perl 6)
HORIZWS HORIZWS, none horizontal whitespace (Perl 6)
NHORIZWS NHORIZWS, none not horizontal whitespace (Perl 6)
+FOLDCHAR FOLDCHAR, codepoint 1 codepoint with tricky case folding properties.
+
# NEW STUFF ABOVE THIS LINE
################################################################################
diff --git a/regexec.c b/regexec.c
index d84190b0d6..374d480be7 100644
--- a/regexec.c
+++ b/regexec.c
@@ -5004,7 +5004,34 @@ NULL
sayNO;
/* NOTREACHED */
#undef ST
+ case FOLDCHAR:
+ n = ARG(scan);
+ if (nextchr==n) {
+ locinput += UTF8SKIP(locinput);
+ } else {
+ /* This malarky is to handle LATIN SMALL LETTER SHARP S
+ properly. Sigh */
+ if (0xDF==n && (UTF||do_utf8) &&
+ toLOWER(locinput[0])=='s' && toLOWER(locinput[1])=='s')
+ {
+ locinput += 2;
+ } else if (do_utf8) {
+ U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+ STRLEN tmplen1;
+ U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
+ STRLEN tmplen2;
+ to_uni_fold(n, tmpbuf1, &tmplen1);
+ to_utf8_fold(locinput, tmpbuf2, &tmplen2);
+ if (tmplen1!=tmplen2 || !strnEQ(tmpbuf1,tmpbuf2,tmplen1))
+ sayNO;
+ else
+ locinput += UTF8SKIP(locinput);
+ } else
+ sayNO;
+ }
+ nextchr = UCHARAT(locinput);
+ break;
case LNBREAK:
if ((n=is_LNBREAK(locinput,do_utf8))) {
locinput += n;
diff --git a/regnodes.h b/regnodes.h
index 3c3a5d6d29..4e0f44d5ca 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
/* Regops and State definitions */
-#define REGNODE_MAX 89
-#define REGMATCH_STATE_MAX 129
+#define REGNODE_MAX 90
+#define REGMATCH_STATE_MAX 130
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -97,8 +97,9 @@
#define NVERTWS 85 /* 0x55 not vertical whitespace (Perl 6) */
#define HORIZWS 86 /* 0x56 horizontal whitespace (Perl 6) */
#define NHORIZWS 87 /* 0x57 not horizontal whitespace (Perl 6) */
-#define OPTIMIZED 88 /* 0x58 Placeholder for dump. */
-#define PSEUDO 89 /* 0x59 Pseudo opcode for internal use. */
+#define FOLDCHAR 88 /* 0x58 codepoint with tricky case folding properties. */
+#define OPTIMIZED 89 /* 0x59 Placeholder for dump. */
+#define PSEUDO 90 /* 0x5a Pseudo opcode for internal use. */
/* ------------ States ------------- */
#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */
#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */
@@ -235,6 +236,7 @@ EXTCONST U8 PL_regkind[] = {
NVERTWS, /* NVERTWS */
HORIZWS, /* HORIZWS */
NHORIZWS, /* NHORIZWS */
+ FOLDCHAR, /* FOLDCHAR */
NOTHING, /* OPTIMIZED */
PSEUDO, /* PSEUDO */
/* ------------ States ------------- */
@@ -373,6 +375,7 @@ static const U8 regarglen[] = {
0, /* NVERTWS */
0, /* HORIZWS */
0, /* NHORIZWS */
+ EXTRA_SIZE(struct regnode_1), /* FOLDCHAR */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -468,6 +471,7 @@ static const char reg_off_by_arg[] = {
0, /* NVERTWS */
0, /* HORIZWS */
0, /* NHORIZWS */
+ 0, /* FOLDCHAR */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -568,8 +572,9 @@ EXTCONST char * const PL_reg_name[] = {
"NVERTWS", /* 0x55 */
"HORIZWS", /* 0x56 */
"NHORIZWS", /* 0x57 */
- "OPTIMIZED", /* 0x58 */
- "PSEUDO", /* 0x59 */
+ "FOLDCHAR", /* 0x58 */
+ "OPTIMIZED", /* 0x59 */
+ "PSEUDO", /* 0x5a */
/* ------------ States ------------- */
"TRIE_next", /* REGNODE_MAX +0x01 */
"TRIE_next_fail", /* REGNODE_MAX +0x02 */
diff --git a/t/op/pat.t b/t/op/pat.t
index a5b98f6c6c..056e26a267 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4346,7 +4346,38 @@ sub kt
}
}
}
-
+{
+ # test that \xDF matches properly. this is pretty hacky stuff,
+ # but its actually needed. the malarky with '-' is to prevent
+ # compilation caching from playing any role in the test.
+ my @df= (chr(0xDF),'-',chr(0xDF));
+ utf8::upgrade($df[2]);
+ my @strs= ('ss','sS','Ss','SS',chr(0xDF));
+ my @ss= map { ("$_", "$_") } @strs;
+ utf8::upgrade($ss[$_*2+1]) for 0..$#strs;
+
+ for my $ssi (0..$#ss) {
+ for my $dfi (0..$#df) {
+ my $pat= $df[$dfi];
+ my $str= $ss[$ssi];
+ my $utf_df= ($dfi > 1) ? 'utf8' : '';
+ my $utf_ss= ($ssi % 2) ? 'utf8' : '';
+ (my $sstr=$str)=~s/\xDF/\\xDF/;
+
+ if ($utf_df || $utf_ss || length($ss[$ssi])==1) {
+ my $ret= $str=~/$pat/i;
+ next if $pat eq '-';
+ ok($ret,
+ "\"$sstr\"=~/\\xDF/i (str is @{[$utf_ss||'latin']}, pat is @{[$utf_df||'latin']})");
+ } else {
+ my $ret= $str !~ /$pat/i;
+ next if $pat eq '-';
+ ok($ret,
+ "\"$sstr\"!~/\\xDF/i (str is @{[$utf_ss||'latin']}, pat is @{[$utf_df||'latin']})");
+ }
+ }
+ }
+}
# Test counter is at bottom of file. Put new tests above here.
#-------------------------------------------------------------------
# Keep the following tests last -- they may crash perl
@@ -4428,7 +4459,7 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/);
iseq(0+$::test,$::TestCount,"Got the right number of tests!");
# Don't forget to update this!
BEGIN {
- $::TestCount = 1928;
+ $::TestCount = 1948;
print "1..$::TestCount\n";
}