summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2007-12-17 16:21:46 +0100
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2007-12-17 16:01:32 +0000
commita0a388a13daad79062b3c4b126f22d2f46fc82dd (patch)
treec1f1562353529ecc4edfee3469657379d1adf79d
parent5cf57fc61e657066a9798cca42453d9e2f71bd8c (diff)
downloadperl-a0a388a13daad79062b3c4b126f22d2f46fc82dd.tar.gz
Fix various bugs in regex engine with mixed utf8/latin pattern and strings. Related to [perl #36207] among others
Message-ID: <9b18b3110712170621h41de2c76k331971e3660abcb0@mail.gmail.com> p4raw-id: //depot/perl@32628
-rw-r--r--MANIFEST1
-rw-r--r--regcomp.c27
-rw-r--r--regexec.c88
-rwxr-xr-xt/op/pat.t23
-rw-r--r--t/op/reg_fold.t37
-rw-r--r--utf8.c5
6 files changed, 147 insertions, 34 deletions
diff --git a/MANIFEST b/MANIFEST
index 810f749510..490dc10bf0 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3786,6 +3786,7 @@ t/op/read.t See if read() works
t/op/recurse.t See if deep recursion works
t/op/ref.t See if refs and objects work
t/op/reg_email.t See if regex recursion works by parsing email addresses
+t/op/reg_fold.t See if case folding works properly
t/op/regexp_noamp.t See if regular expressions work with optimizations
t/op/regexp_notrie.t See if regular expressions work without trie optimisation
t/op/regexp_qr_embed.t See if regular expressions work with embedded qr//
diff --git a/regcomp.c b/regcomp.c
index 19d6f15985..5a175ba807 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2791,7 +2791,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
last = cur;
}
} else {
- if ( last ) {
+/*
+ Currently we assume that the trie can handle unicode and ascii
+ matches fold cased matches. If this proves true then the following
+ define will prevent tries in this situation.
+
+ #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+*/
+#define TRIE_TYPE_IS_SAFE 1
+ if ( last && TRIE_TYPE_IS_SAFE ) {
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
optype, depth+1 );
@@ -2819,7 +2827,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
- if ( last ) {
+
+ if ( last && TRIE_TYPE_IS_SAFE ) {
made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
#ifdef TRIE_STUDY_OPT
if ( ((made == MADE_EXACT_TRIE &&
@@ -6867,6 +6876,7 @@ tryagain:
case 0xDF:
case 0xC3:
case 0xCE:
+ do_foldchar:
if (!LOC && FOLD) {
U32 len,cp;
len=0; /* silence a spurious compiler warning */
@@ -6893,7 +6903,11 @@ tryagain:
required, as the default for this switch is to jump to the
literal text handling code.
*/
- switch (*++RExC_parse) {
+ switch ((U8)*++RExC_parse) {
+ case 0xDF:
+ case 0xC3:
+ case 0xCE:
+ goto do_foldchar;
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
@@ -7211,8 +7225,13 @@ tryagain:
an unescaped equivalent literal.
*/
- switch (*++p) {
+ switch ((U8)*++p) {
/* These are all the special escapes. */
+ case 0xDF:
+ case 0xC3:
+ case 0xCE:
+ if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
+ goto normal_default;
case 'A': /* Start assertion */
case 'b': case 'B': /* Word-boundary assertion*/
case 'C': /* Single char !DANGEROUS! */
diff --git a/regexec.c b/regexec.c
index 1b9fca1165..6afcea4abf 100644
--- a/regexec.c
+++ b/regexec.c
@@ -989,7 +989,11 @@ Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
return NULL;
}
-
+#define DECL_TRIE_TYPE(scan) \
+ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
+ trie_type = (scan->flags != EXACT) \
+ ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
+ : (do_utf8 ? trie_utf8 : trie_plain)
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
@@ -1007,6 +1011,19 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
uscan = foldbuf + UNISKIP( uvc ); \
} \
break; \
+ case trie_latin_utf8_fold: \
+ if ( foldlen>0 ) { \
+ uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ uscan += len; \
+ len=0; \
+ } else { \
+ len = 1; \
+ uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
+ foldlen -= UNISKIP( uvc ); \
+ uscan = foldbuf + UNISKIP( uvc ); \
+ } \
+ break; \
case trie_utf8: \
uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
break; \
@@ -1029,12 +1046,14 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
} \
} STMT_END
-#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
+#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
+{ \
+ char *my_strend= (char *)strend; \
if ( (CoNd) \
&& (ln == len || \
- ibcmp_utf8(s, NULL, 0, do_utf8, \
+ !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
m, NULL, ln, (bool)UTF)) \
- && (!reginfo || regtry(reginfo, &s)) ) \
+ && (!reginfo || regtry(reginfo, &s)) ) \
goto got_it; \
else { \
U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
@@ -1042,15 +1061,14 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
if ( f != c \
&& (f == c1 || f == c2) \
- && (ln == foldlen || \
- !ibcmp_utf8((char *) foldbuf, \
- NULL, foldlen, do_utf8, \
- m, \
- NULL, ln, (bool)UTF)) \
- && (!reginfo || regtry(reginfo, &s)) ) \
+ && (ln == len || \
+ !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
+ m, NULL, ln, (bool)UTF)) \
+ && (!reginfo || regtry(reginfo, &s)) ) \
goto got_it; \
} \
- s += len
+} \
+s += len
#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
STMT_START { \
@@ -1210,14 +1228,26 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
const U32 uniflags = UTF8_ALLOW_DEFAULT;
-
- to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
- to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
-
+
+ /* XXX: Since the node will be case folded at compile
+ time this logic is a little odd, although im not
+ sure that its actually wrong. --dmq */
+
+ c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
+ c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
+
+ /* XXX: This is kinda strange. to_utf8_XYZ returns the
+ codepoint of the first character in the converted
+ form, yet originally we did the extra step.
+ No tests fail by commenting this code out however
+ so Ive left it out. -- dmq.
+
c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
0, uniflags);
c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
0, uniflags);
+ */
+
lnc = 0;
while (sm < ((U8 *) m + ln)) {
lnc++;
@@ -1252,24 +1282,33 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
* matching (called "loose matching" in Unicode).
* ibcmp_utf8() will do just that. */
- if (do_utf8) {
+ if (do_utf8 || UTF) {
UV c, f;
U8 tmpbuf [UTF8_MAXBYTES+1];
- STRLEN len, foldlen;
+ STRLEN len = 1;
+ STRLEN foldlen;
const U32 uniflags = UTF8_ALLOW_DEFAULT;
if (c1 == c2) {
/* Upper and lower of 1st char are equal -
* probably not a "letter". */
while (s <= e) {
- c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
+ if (do_utf8) {
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
uniflags);
+ } else {
+ c = *((U8*)s);
+ }
REXEC_FBC_EXACTISH_CHECK(c == c1);
}
}
else {
while (s <= e) {
- c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
+ if (do_utf8) {
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
uniflags);
+ } else {
+ c = *((U8*)s);
+ }
/* Handle some of the three Greek sigmas cases.
* Note that not all the possible combinations
@@ -1287,6 +1326,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
}
else {
+ /* Neither pattern nor string are UTF8 */
if (c1 == c2)
REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
else
@@ -1461,10 +1501,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
case AHOCORASICKC:
case AHOCORASICK:
{
- const enum { trie_plain, trie_utf8, trie_utf8_fold }
- trie_type = do_utf8 ?
- (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
- : trie_plain;
+ DECL_TRIE_TYPE(c);
/* what trie are we using right now */
reg_ac_data *aho
= (reg_ac_data*)progi->data->data[ ARG( c ) ];
@@ -2872,10 +2909,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
case TRIE:
{
/* what type of TRIE am I? (utf8 makes this contextual) */
- const enum { trie_plain, trie_utf8, trie_utf8_fold }
- trie_type = do_utf8 ?
- (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
- : trie_plain;
+ DECL_TRIE_TYPE(scan);
/* what trie are we using right now */
reg_trie_data * const trie
diff --git a/t/op/pat.t b/t/op/pat.t
index 2697157195..7d03eb6b82 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3406,9 +3406,9 @@ if (!$ENV{PERL_SKIP_PSYCHO_TEST}){
ok($utf8 =~ /(abc|\xe9)/i, "utf8/latin trie");
ok($utf8 =~ /(abc|$latin1)/i, "utf8/latin trie runtime");
- ok("\xe9" =~ /$utf8/i, "# TODO latin/utf8");
+ ok("\xe9" =~ /$utf8/i, "# latin/utf8");
ok("\xe9" =~ /(abc|$utf8)/i, "# latin/utf8 trie");
- ok($latin1 =~ /$utf8/i, "# TODO latin/utf8 runtime");
+ ok($latin1 =~ /$utf8/i, "# latin/utf8 runtime");
ok($latin1 =~ /(abc|$utf8)/i, "# latin/utf8 trie runtime");
}
@@ -4487,6 +4487,23 @@ sub kt
iseq($1,"\xd6","#45605");
}
+{
+ # Regardless of utf8ness any character matches itself when
+ # doing a case insensitive match. See also [perl #36207]
+ for my $o (0..255) {
+ my @ch=(chr($o),chr($o));
+ utf8::upgrade($ch[1]);
+ for my $u_str (0,1) {
+ for my $u_pat (0,1) {
+ ok( $ch[$u_str]=~/\Q$ch[$u_pat]\E/i,
+ "\$c=~/\$c/i : chr($o) : u_str=$u_str u_pat=$u_pat");
+ ok( $ch[$u_str]=~/\Q$ch[$u_pat]\E|xyz/i,
+ "# \$c=~/\$c|xyz/i : chr($o) : u_str=$u_str u_pat=$u_pat");
+ }
+ }
+ }
+}
+
# Test counter is at bottom of file. Put new tests above here.
#-------------------------------------------------------------------
# Keep the following tests last -- they may crash perl
@@ -4545,6 +4562,6 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/);
iseq(0+$::test,$::TestCount,"Got the right number of tests!");
# Don't forget to update this!
BEGIN {
- $::TestCount = 1965;
+ $::TestCount = 4013;
print "1..$::TestCount\n";
}
diff --git a/t/op/reg_fold.t b/t/op/reg_fold.t
new file mode 100644
index 0000000000..6064ecfa69
--- /dev/null
+++ b/t/op/reg_fold.t
@@ -0,0 +1,37 @@
+use strict;
+use warnings;
+use Test::More;
+my $count=1;
+my @tests;
+use Cwd;
+
+my $file="../lib/unicore/CaseFolding.txt";
+open my $fh,"<",$file
+ or die "Failed to read '$file' from '".cwd()."': $!";
+while (<$fh>) {
+ chomp;
+ my ($line,$comment)= split/\s+#\s+/, $_;
+ my ($cp,$type,@fc)=split/[\s;]+/,$line||'';
+ next unless $type and ($type eq 'F' or $type eq 'C');
+ $_="\\x{$_}" for @fc;
+ my $cpv=hex("0x$cp");
+ my $chr="chr(0x$cp)";
+ my @str;
+ push @str,$chr if $cpv<128 or $cpv>256;
+ if ($cpv<256) {
+ push @str,"do{my \$c=$chr; utf8::upgrade(\$c); \$c}"
+ }
+
+ foreach my $str ( @str ) {
+ my $expr="$str=~/@fc/ix";
+ my $t=($cpv > 256 || $str=~/^do/) ? "unicode" : "latin";
+ push @tests,
+ qq[ok($expr,'$chr=~/@fc/ix - $comment ($t string)')];
+ $tests[-1]="TODO: { local \$TODO='[13:41] <BinGOs> cue *It is all Greek to me* joke.';\n$tests[-1] }"
+ if $cp eq '0390' or $cp eq '03B0';
+ $count++;
+ }
+}
+eval join ";\n","plan tests=>".($count-1),@tests,"1"
+ or die $@;
+__DATA__
diff --git a/utf8.c b/utf8.c
index e21cb4f6b5..7bc2b099e8 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2254,13 +2254,18 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
if (pe1)
e1 = *(U8**)pe1;
+ /* assert(e1 || l1); */
if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
f1 = (const U8*)s1 + l1;
if (pe2)
e2 = *(U8**)pe2;
+ /* assert(e2 || l2); */
if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
f2 = (const U8*)s2 + l2;
+ /* This shouldn't happen. However, putting an assert() there makes some
+ * tests fail. */
+ /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
return 1; /* mismatch; possible infinite loop or false positive */