summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Brine <ikegami@adaelis.com>2010-02-11 20:28:29 -0500
committerNicholas Clark <nick@ccl4.org>2010-02-14 16:32:01 +0000
commit777f7c561610dee641c77666e5a4a0d9ac1d4230 (patch)
treea5af4c59239052b2538c566a2b9dfecf437e9b08
parent6e3b7bfa2b063f4ce0c55f84474edb7d2c652387 (diff)
downloadperl-777f7c561610dee641c77666e5a4a0d9ac1d4230.tar.gz
Removes 32-bit limit on substr arguments. The full range of IV and UV is available for the pos and len arguments, with safe conversion to STRLEN where it's smaller than an IV.
-rw-r--r--embed.fnc1
-rw-r--r--embed.h2
-rw-r--r--global.sym1
-rw-r--r--mg.c19
-rw-r--r--pp.c143
-rw-r--r--proto.h5
-rw-r--r--sv.c47
-rw-r--r--t/re/substr.t42
8 files changed, 193 insertions, 67 deletions
diff --git a/embed.fnc b/embed.fnc
index 7463274fe2..7e450aa1fb 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1165,6 +1165,7 @@ ApdR |SV* |sv_newmortal
Apd |SV* |sv_newref |NULLOK SV *const sv
Ap |char* |sv_peek |NULLOK SV* sv
Apd |void |sv_pos_u2b |NULLOK SV *const sv|NN I32 *const offsetp|NULLOK I32 *const lenp
+Apd |void |sv_pos_u2b_proper|NULLOK SV *const sv|NN STRLEN *const offsetp|NULLOK STRLEN *const lenp
Apd |void |sv_pos_b2u |NULLOK SV *const sv|NN I32 *const offsetp
Amdb |char* |sv_pvn_force |NN SV* sv|NULLOK STRLEN* lp
Apd |char* |sv_pvutf8n_force|NN SV *const sv|NULLOK STRLEN *const lp
diff --git a/embed.h b/embed.h
index 246106bcf9..1281fccfa4 100644
--- a/embed.h
+++ b/embed.h
@@ -967,6 +967,7 @@
#define sv_newref Perl_sv_newref
#define sv_peek Perl_sv_peek
#define sv_pos_u2b Perl_sv_pos_u2b
+#define sv_pos_u2b_proper Perl_sv_pos_u2b_proper
#define sv_pos_b2u Perl_sv_pos_b2u
#define sv_pvutf8n_force Perl_sv_pvutf8n_force
#define sv_pvbyten_force Perl_sv_pvbyten_force
@@ -3371,6 +3372,7 @@
#define sv_newref(a) Perl_sv_newref(aTHX_ a)
#define sv_peek(a) Perl_sv_peek(aTHX_ a)
#define sv_pos_u2b(a,b,c) Perl_sv_pos_u2b(aTHX_ a,b,c)
+#define sv_pos_u2b_proper(a,b,c) Perl_sv_pos_u2b_proper(aTHX_ a,b,c)
#define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b)
#define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b)
#define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b)
diff --git a/global.sym b/global.sym
index f0361dfca5..f0e462ecd4 100644
--- a/global.sym
+++ b/global.sym
@@ -567,6 +567,7 @@ Perl_sv_newmortal
Perl_sv_newref
Perl_sv_peek
Perl_sv_pos_u2b
+Perl_sv_pos_u2b_proper
Perl_sv_pos_b2u
Perl_sv_pvn_force
Perl_sv_pvutf8n_force
diff --git a/mg.c b/mg.c
index b9a1464938..4f8207c1af 100644
--- a/mg.c
+++ b/mg.c
@@ -2008,17 +2008,17 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg)
STRLEN len;
SV * const lsv = LvTARG(sv);
const char * const tmps = SvPV_const(lsv,len);
- I32 offs = LvTARGOFF(sv);
- I32 rem = LvTARGLEN(sv);
+ STRLEN offs = LvTARGOFF(sv);
+ STRLEN rem = LvTARGLEN(sv);
PERL_ARGS_ASSERT_MAGIC_GETSUBSTR;
PERL_UNUSED_ARG(mg);
if (SvUTF8(lsv))
- sv_pos_u2b(lsv, &offs, &rem);
- if (offs > (I32)len)
+ sv_pos_u2b_proper(lsv, &offs, &rem);
+ if (offs > len)
offs = len;
- if (rem + offs > (I32)len)
+ if (rem > len - offs)
rem = len - offs;
sv_setpvn(sv, tmps + offs, (STRLEN)rem);
if (SvUTF8(lsv))
@@ -2033,22 +2033,22 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
STRLEN len;
const char * const tmps = SvPV_const(sv, len);
SV * const lsv = LvTARG(sv);
- I32 lvoff = LvTARGOFF(sv);
- I32 lvlen = LvTARGLEN(sv);
+ STRLEN lvoff = LvTARGOFF(sv);
+ STRLEN lvlen = LvTARGLEN(sv);
PERL_ARGS_ASSERT_MAGIC_SETSUBSTR;
PERL_UNUSED_ARG(mg);
if (DO_UTF8(sv)) {
sv_utf8_upgrade(lsv);
- sv_pos_u2b(lsv, &lvoff, &lvlen);
+ sv_pos_u2b_proper(lsv, &lvoff, &lvlen);
sv_insert(lsv, lvoff, lvlen, tmps, len);
LvTARGLEN(sv) = sv_len_utf8(sv);
SvUTF8_on(lsv);
}
else if (lsv && SvUTF8(lsv)) {
const char *utf8;
- sv_pos_u2b(lsv, &lvoff, &lvlen);
+ sv_pos_u2b_proper(lsv, &lvoff, &lvlen);
LvTARGLEN(sv) = len;
utf8 = (char*)bytes_to_utf8((U8*)tmps, &len);
sv_insert(lsv, lvoff, lvlen, utf8, len);
@@ -2059,7 +2059,6 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
LvTARGLEN(sv) = len;
}
-
return 0;
}
diff --git a/pp.c b/pp.c
index 2f4703bd20..95dc5fd15e 100644
--- a/pp.c
+++ b/pp.c
@@ -3079,15 +3079,19 @@ PP(pp_substr)
{
dVAR; dSP; dTARGET;
SV *sv;
- I32 len = 0;
STRLEN curlen;
STRLEN utf8_curlen;
- I32 pos;
- I32 rem;
- I32 fail;
+ SV * pos_sv;
+ IV pos1_iv;
+ int pos1_is_uv;
+ IV pos2_iv;
+ int pos2_is_uv;
+ SV * len_sv;
+ IV len_iv = 0;
+ int len_is_uv = 1;
const I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET;
const char *tmps;
- const I32 arybase = CopARYBASE_get(PL_curcop);
+ const IV arybase = CopARYBASE_get(PL_curcop);
SV *repl_sv = NULL;
const char *repl = NULL;
STRLEN repl_len;
@@ -3103,9 +3107,13 @@ PP(pp_substr)
repl = SvPV_const(repl_sv, repl_len);
repl_is_utf8 = DO_UTF8(repl_sv) && SvCUR(repl_sv);
}
- len = POPi;
+ len_sv = POPs;
+ len_iv = SvIV(len_sv);
+ len_is_uv = SvIOK_UV(len_sv);
}
- pos = POPi;
+ pos_sv = POPs;
+ pos1_iv = SvIV(pos_sv);
+ pos1_is_uv = SvIOK_UV(pos_sv);
sv = POPs;
PUTBACK;
if (repl_sv) {
@@ -3127,51 +3135,80 @@ PP(pp_substr)
else
utf8_curlen = 0;
- if (pos >= arybase) {
- pos -= arybase;
- rem = curlen-pos;
- fail = rem;
- if (num_args > 2) {
- if (len < 0) {
- rem += len;
- if (rem < 0)
- rem = 0;
- }
- else if (rem > len)
- rem = len;
+ if ( (pos1_is_uv && arybase < 0) || (pos1_iv >= arybase) ) { /* pos >= $[ */
+ UV pos1_uv = pos1_iv-arybase;
+ /* Overflow can occur when $[ < 0 */
+ if (arybase < 0 && pos1_uv < (UV)pos1_iv)
+ goto BOUND_FAIL;
+ pos1_iv = pos1_uv;
+ pos1_is_uv = 1;
+ }
+ else if (pos1_is_uv ? (UV)pos1_iv > 0 : pos1_iv > 0) {
+ goto BOUND_FAIL; /* $[=3; substr($_,2,...) */
+ }
+ else { /* pos < $[ */
+ if (pos1_iv == 0) { /* $[=1; substr($_,0,...) */
+ pos1_iv = curlen;
+ pos1_is_uv = 1;
+ } else {
+ if (curlen) {
+ pos1_is_uv = curlen-1 > ~(UV)pos1_iv;
+ pos1_iv += curlen;
+ }
}
}
- else {
- pos += curlen;
- if (num_args < 3)
- rem = curlen;
- else if (len >= 0) {
- rem = pos+len;
- if (rem > (I32)curlen)
- rem = curlen;
+ if (pos1_is_uv || pos1_iv > 0) {
+ if ((UV)pos1_iv > curlen)
+ goto BOUND_FAIL;
+ }
+
+ if (num_args > 2) {
+ if (!len_is_uv && len_iv < 0) {
+ pos2_iv = curlen + len_iv;
+ if (curlen)
+ pos2_is_uv = curlen-1 > ~(UV)len_iv;
+ else
+ pos2_is_uv = 0;
+ } else { /* len_iv >= 0 */
+ if (!pos1_is_uv && pos1_iv < 0) {
+ pos2_iv = pos1_iv + len_iv;
+ pos2_is_uv = (UV)len_iv > (UV)IV_MAX;
+ } else {
+ if ((UV)len_iv > curlen-(UV)pos1_iv)
+ pos2_iv = curlen;
+ else
+ pos2_iv = pos1_iv+len_iv;
+ pos2_is_uv = 1;
+ }
}
- else {
- rem = curlen+len;
- if (rem < pos)
- rem = pos;
- }
- if (pos < 0)
- pos = 0;
- fail = rem;
- rem -= pos;
- }
- if (fail < 0) {
- if (lvalue || repl)
- Perl_croak(aTHX_ "substr outside of string");
- Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string");
- RETPUSHUNDEF;
}
else {
- const I32 upos = pos;
- const I32 urem = rem;
+ pos2_iv = curlen;
+ pos2_is_uv = 1;
+ }
+
+ if (!pos2_is_uv && pos2_iv < 0) {
+ if (!pos1_is_uv && pos1_iv < 0)
+ goto BOUND_FAIL;
+ pos2_iv = 0;
+ }
+ else if (!pos1_is_uv && pos1_iv < 0)
+ pos1_iv = 0;
+
+ if ((UV)pos2_iv < (UV)pos1_iv)
+ pos2_iv = pos1_iv;
+ if ((UV)pos2_iv > curlen)
+ pos2_iv = curlen;
+
+ {
+ /* pos1_iv and pos2_iv both in 0..curlen, so the cast is safe */
+ const STRLEN pos = (STRLEN)( (UV)pos1_iv );
+ const STRLEN len = (STRLEN)( (UV)pos2_iv - (UV)pos1_iv );
+ STRLEN byte_pos = pos;
+ STRLEN byte_len = len;
if (utf8_curlen)
- sv_pos_u2b(sv, (I32 *)&pos, (I32 *)&rem);
- tmps += pos;
+ sv_pos_u2b_proper(sv, &byte_pos, &byte_len);
+ tmps += byte_pos;
/* we either return a PV or an LV. If the TARG hasn't been used
* before, or is of that type, reuse it; otherwise use a mortal
* instead. Note that LVs can have an extended lifetime, so also
@@ -3185,7 +3222,7 @@ PP(pp_substr)
}
}
- sv_setpvn(TARG, tmps, rem);
+ sv_setpvn(TARG, tmps, byte_len);
#ifdef USE_LOCALE_COLLATE
sv_unmagic(TARG, PERL_MAGIC_collxfrm);
#endif
@@ -3202,7 +3239,7 @@ PP(pp_substr)
}
if (!SvOK(sv))
sv_setpvs(sv, "");
- sv_insert_flags(sv, pos, rem, repl, repl_len, 0);
+ sv_insert_flags(sv, byte_pos, byte_len, repl, repl_len, 0);
if (repl_is_utf8)
SvUTF8_on(sv);
SvREFCNT_dec(repl_sv_copy);
@@ -3232,13 +3269,19 @@ PP(pp_substr)
SvREFCNT_dec(LvTARG(TARG));
LvTARG(TARG) = SvREFCNT_inc_simple(sv);
}
- LvTARGOFF(TARG) = upos;
- LvTARGLEN(TARG) = urem;
+ LvTARGOFF(TARG) = pos;
+ LvTARGLEN(TARG) = len;
}
}
SPAGAIN;
PUSHs(TARG); /* avoid SvSETMAGIC here */
RETURN;
+
+BOUND_FAIL:
+ if (lvalue || repl)
+ Perl_croak(aTHX_ "substr outside of string");
+ Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string");
+ RETPUSHUNDEF;
}
PP(pp_vec)
diff --git a/proto.h b/proto.h
index 4a343becdc..ae485976dd 100644
--- a/proto.h
+++ b/proto.h
@@ -3374,6 +3374,11 @@ PERL_CALLCONV void Perl_sv_pos_u2b(pTHX_ SV *const sv, I32 *const offsetp, I32 *
#define PERL_ARGS_ASSERT_SV_POS_U2B \
assert(offsetp)
+PERL_CALLCONV void Perl_sv_pos_u2b_proper(pTHX_ SV *const sv, STRLEN *const offsetp, STRLEN *const lenp)
+ __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_SV_POS_U2B_PROPER \
+ assert(offsetp)
+
PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_SV_POS_B2U \
diff --git a/sv.c b/sv.c
index 4ab41f63d2..02be580b75 100644
--- a/sv.c
+++ b/sv.c
@@ -6240,7 +6240,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start
/*
-=for apidoc sv_pos_u2b
+=for apidoc sv_pos_u2b_proper
Converts the value pointed to by offsetp from a count of UTF-8 chars from
the start of the string, to a count of the equivalent number of bytes; if
@@ -6252,14 +6252,14 @@ type coercion.
*/
/*
- * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
+ * sv_pos_u2b_proper() uses, like sv_pos_b2u(), the mg_ptr of the potential
* PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and
* byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
*
*/
void
-Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp)
+Perl_sv_pos_u2b_proper(pTHX_ register SV *const sv, STRLEN *const offsetp, STRLEN *const lenp)
{
const U8 *start;
STRLEN len;
@@ -6271,17 +6271,17 @@ Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp
start = (U8*)SvPV_const(sv, len);
if (len) {
- STRLEN uoffset = (STRLEN) *offsetp;
+ STRLEN uoffset = *offsetp;
const U8 * const send = start + len;
MAGIC *mg = NULL;
const STRLEN boffset = sv_pos_u2b_cached(sv, &mg, start, send,
uoffset, 0, 0);
- *offsetp = (I32) boffset;
+ *offsetp = boffset;
if (lenp) {
/* Convert the relative offset to absolute. */
- const STRLEN uoffset2 = uoffset + (STRLEN) *lenp;
+ const STRLEN uoffset2 = uoffset + *lenp;
const STRLEN boffset2
= sv_pos_u2b_cached(sv, &mg, start, send, uoffset2,
uoffset, boffset) - boffset;
@@ -6298,6 +6298,41 @@ Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp
return;
}
+/*
+=for apidoc sv_pos_u2b
+
+Converts the value pointed to by offsetp from a count of UTF-8 chars from
+the start of the string, to a count of the equivalent number of bytes; if
+lenp is non-zero, it does the same to lenp, but this time starting from
+the offset, rather than from the start of the string. Handles magic and
+type coercion.
+
+=cut
+*/
+
+/*
+ * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
+ * PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and
+ * byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
+ *
+ */
+
+/* This function is subject to size and sign problems */
+
+void
+Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp)
+{
+ STRLEN uoffset = (STRLEN)*offsetp;
+ if (lenp) {
+ STRLEN ulen = (STRLEN)*lenp;
+ sv_pos_u2b_proper(sv, &uoffset, &ulen);
+ *lenp = (I32)ulen;
+ } else {
+ sv_pos_u2b_proper(sv, &uoffset, NULL);
+ }
+ *offsetp = (I32)uoffset;
+}
+
/* Create and update the UTF8 magic offset cache, with the proffered utf8/
byte length pairing. The (byte) length of the total SV is passed in too,
as blen, because for some (more esoteric) SVs, the call to SvPV_const()
diff --git a/t/re/substr.t b/t/re/substr.t
index c3fa6e10e7..d0717ba8ff 100644
--- a/t/re/substr.t
+++ b/t/re/substr.t
@@ -24,7 +24,7 @@ $SIG{__WARN__} = sub {
require './test.pl';
-plan(334);
+plan(360);
run_tests() unless caller;
@@ -201,6 +201,11 @@ is($w--, 1);
eval{substr($a,1) = "" ; }; # P=R=S Q
like($@, $FATAL_MSG);
+$b = substr($a,-7,-6) ; # warn # Q R P S
+is($w--, 1);
+eval{substr($a,-7,-6) = "" ; }; # Q R P S
+like($@, $FATAL_MSG);
+
my $a = 'zxcvbnm';
substr($a,2,0) = '';
is($a, 'zxcvbnm');
@@ -682,4 +687,39 @@ is($x, "\x{100}\x{200}\xFFb");
is(substr($a,1,1), 'b');
}
+# [perl #62646] offsets exceeding 32 bits on 64-bit system
+SKIP: {
+ skip("32-bit system", 24) unless ~0 > 0xffffffff;
+ my $a = "abc";
+ my $s;
+ my $r;
+
+ utf8::downgrade($a);
+ for (1..2) {
+ $w = 0;
+ $r = substr($a, 0xffffffff, 1);
+ is($r, undef);
+ is($w, 1);
+
+ $w = 0;
+ $r = substr($a, 0xffffffff+1, 1);
+ is($r, undef);
+ is($w, 1);
+
+ $w = 0;
+ ok( !eval { $r = substr($s=$a, 0xffffffff, 1, "_"); 1 } );
+ is($r, undef);
+ is($s, $a);
+ is($w, 0);
+
+ $w = 0;
+ ok( !eval { $r = substr($s=$a, 0xffffffff+1, 1, "_"); 1 } );
+ is($r, undef);
+ is($s, $a);
+ is($w, 0);
+
+ utf8::upgrade($a);
+ }
+}
+
}