index and rindex couldn't correctly handle surprises from UTF-8

overloading. p4raw-id: //depot/perl@28022
author: Nicholas Clark <nick@ccl4.org> 2006-04-30 11:14:04 +0000
committer: Nicholas Clark <nick@ccl4.org> 2006-04-30 11:14:04 +0000
commit: 73ee8be2712c500c98e5976864ba96726bf311e2 (patch)
tree: 69af13d297713caa615426fcc1c9262ed09a8540
parent: d34786ba12ee5b96d9e34dd6fcdda158d7d2597b (diff)
download: perl-73ee8be2712c500c98e5976864ba96726bf311e2.tar.gz
3 files changed, 77 insertions, 24 deletions
diff --git a/pp.c b/pp.c
index b937e0ab94..65e1d50649 100644
--- a/pp.c
+++ b/pp.c
@@ -3183,8 +3183,8 @@ PP(pp_index)
     STRLEN llen = 0;
     I32 offset;
     I32 retval;
-    const char *tmps;
-    const char *tmps2;
+    const char *big_p;
+    const char *little_p;
     const I32 arybase = CopARYBASE_get(PL_curcop);
     bool big_utf8;
     bool little_utf8;
@@ -3197,6 +3197,9 @@ PP(pp_index)
     }
     little = POPs;
     big = POPs;
+    big_p = SvPV_const(big, biglen);
+    little_p = SvPV_const(little, llen);
+
     big_utf8 = DO_UTF8(big);
     little_utf8 = DO_UTF8(little);
     if (big_utf8 ^ little_utf8) {
@@ -3204,9 +3207,7 @@ PP(pp_index)
 	if (little_utf8 && !PL_encoding) {
 	    /* Well, maybe instead we might be able to downgrade the small
 	       string?  */
-	    STRLEN little_len;
-	    const U8 * const little_pv = (U8*) SvPV_const(little, little_len);
-	    char * const pv = (char*)bytes_from_utf8(little_pv, &little_len,
+	    char * const pv = (char*)bytes_from_utf8(little_p, &llen,
 						     &little_utf8);
 	    if (little_utf8) {
 		/* If the large string is ISO-8859-1, and it's not possible to
@@ -3219,13 +3220,11 @@ PP(pp_index)
 	    /* At this point, pv is a malloc()ed string. So donate it to temp
 	       to ensure it will get free()d  */
 	    little = temp = newSV(0);
-	    sv_usepvn(temp, pv, little_len);
+	    sv_usepvn(temp, pv, llen);
+	    little_p = SvPVX(little);
 	} else {
-	    SV * const bytes = little_utf8 ? big : little;
-	    STRLEN len;
-	    const char * const p = SvPV_const(bytes, len);
-
-	    temp = newSVpvn(p, len);
+	    temp = little_utf8
+		? newSVpvn(big_p, biglen) : newSVpvn(little_p, llen);
 
 	    if (PL_encoding) {
 		sv_recode_to_utf8(temp, PL_encoding);
@@ -3235,34 +3234,58 @@ PP(pp_index)
 	    if (little_utf8) {
 		big = temp;
 		big_utf8 = TRUE;
+		big_p = SvPV_const(big, biglen);
 	    } else {
 		little = temp;
+		little_p = SvPV_const(little, llen);
 	    }
 	}
     }
-    /* Don't actually need the NULL initialisation, but it keeps gcc quiet.  */
-    tmps2 = is_index ? NULL : SvPV_const(little, llen);
-    tmps = SvPV_const(big, biglen);
+    if (SvGAMAGIC(big)) {
+	/* Life just becomes a lot easier if I use a temporary here.
+	   Otherwise I need to avoid calls to sv_pos_u2b(), which (dangerously)
+	   will trigger magic and overloading again, as will fbm_instr()
+	*/
+	big = sv_2mortal(newSVpvn(big_p, biglen));
+	if (big_utf8)
+	    SvUTF8_on(big);
+	big_p = SvPVX(big);
+    }
+    if (SvGAMAGIC(little) || index && !SvOK(little)) {
+	/* index && SvOK() is a hack. fbm_instr() calls SvPV_const, which will
+	   warn on undef, and we've already triggered a warning with the
+	   SvPV_const some lines above. We can't remove that, as we need to
+	   call some SvPV to trigger overloading early and find out if the
+	   string is UTF-8.
+	   This is all getting to messy. The API isn't quite clean enough,
+	   because data access has side effects.
+	*/
+	little = sv_2mortal(newSVpvn(little_p, llen));
+	if (little_utf8)
+	    SvUTF8_on(little);
+	little_p = SvPVX(little);
+    }
 
     if (MAXARG < 3)
 	offset = is_index ? 0 : biglen;
     else {
 	if (big_utf8 && offset > 0)
 	    sv_pos_u2b(big, &offset, 0);
-	offset += llen;
+	if (!is_index)
+	    offset += llen;
     }
     if (offset < 0)
 	offset = 0;
     else if (offset > (I32)biglen)
 	offset = biglen;
-    if (!(tmps2 = is_index
-	  ? fbm_instr((unsigned char*)tmps + offset,
-		      (unsigned char*)tmps + biglen, little, 0)
-	  : rninstr(tmps,  tmps  + offset,
-		    tmps2, tmps2 + llen)))
+    if (!(little_p = is_index
+	  ? fbm_instr((unsigned char*)big_p + offset,
+		      (unsigned char*)big_p + biglen, little, 0)
+	  : rninstr(big_p,  big_p  + offset,
+		    little_p, little_p + llen)))
 	retval = -1;
     else {
-	retval = tmps2 - tmps;
+	retval = little_p - big_p;
 	if (retval > 0 && big_utf8)
 	    sv_pos_b2u(big, &retval);
     }
diff --git a/t/lib/warnings/9uninit b/t/lib/warnings/9uninit
index 575161d2a1..fadcd1bd2a 100644
--- a/t/lib/warnings/9uninit
+++ b/t/lib/warnings/9uninit
@@ -873,11 +873,11 @@ Use of uninitialized value $m2 in index at - line 14.
 Use of uninitialized value $g1 in index at - line 15.
 Use of uninitialized value $m1 in index at - line 15.
 Use of uninitialized value $m2 in index at - line 15.
-Use of uninitialized value $m2 in rindex at - line 16.
 Use of uninitialized value $m1 in rindex at - line 16.
+Use of uninitialized value $m2 in rindex at - line 16.
 Use of uninitialized value $g1 in rindex at - line 17.
-Use of uninitialized value $m2 in rindex at - line 17.
 Use of uninitialized value $m1 in rindex at - line 17.
+Use of uninitialized value $m2 in rindex at - line 17.
 ########
 use warnings 'uninitialized';
 my ($m1, $v);
diff --git a/t/uni/overload.t b/t/uni/overload.t
index 5812425ca5..ef61667448 100644
--- a/t/uni/overload.t
+++ b/t/uni/overload.t
@@ -7,7 +7,7 @@ BEGIN {
     }
 }
 
-use Test::More tests => 116;
+use Test::More tests => 190;
 
 package UTF8Toggle;
 use strict;
@@ -212,6 +212,36 @@ foreach my $operator ('print', 'syswrite', 'syswrite len', 'syswrite off',
     }
 }
 
+my $little = "\243\243";
+my $big = " \243 $little ! $little ! $little \243 ";
+my $right = rindex $big, $little;
+my $right1 = rindex $big, $little, 11;
+my $left = index $big, $little;
+my $left1 = index $big, $little, 4;
+
+cmp_ok ($right, ">", $right1, "Sanity check our rindex tests");
+cmp_ok ($left, "<", $left1, "Sanity check our index tests");
+
+foreach my $b ($big, UTF8Toggle->new($big)) {
+    foreach my $l ($little, UTF8Toggle->new($little),
+		   UTF8Toggle->new($little, 1)) {
+	is (rindex ($b, $l), $right, "rindex");
+	is (rindex ($b, $l), $right, "rindex");
+	is (rindex ($b, $l), $right, "rindex");
+
+	is (rindex ($b, $l, 11), $right1, "rindex 11");
+	is (rindex ($b, $l, 11), $right1, "rindex 11");
+	is (rindex ($b, $l, 11), $right1, "rindex 11");
+
+	is (index ($b, $l), $left, "index");
+	is (index ($b, $l), $left, "index");
+	is (index ($b, $l), $left, "index");
+
+	is (index ($b, $l, 4), $left1, "index 4");
+	is (index ($b, $l, 4), $left1, "index 4");
+	is (index ($b, $l, 4), $left1, "index 4");
+    }
+}
 
 END {
     1 while -f $tmpfile and unlink $tmpfile || die "unlink '$tmpfile': $!";
author	Nicholas Clark <nick@ccl4.org>	2006-04-30 11:14:04 +0000
committer	Nicholas Clark <nick@ccl4.org>	2006-04-30 11:14:04 +0000
commit	73ee8be2712c500c98e5976864ba96726bf311e2 (patch)
tree	69af13d297713caa615426fcc1c9262ed09a8540
parent	d34786ba12ee5b96d9e34dd6fcdda158d7d2597b (diff)
download	perl-73ee8be2712c500c98e5976864ba96726bf311e2.tar.gz