From 25fdce4a165b6305e760d4c8d94404ce055657a0 Mon Sep 17 00:00:00 2001 From: Father Chrysostomos Date: Tue, 23 Jul 2013 13:15:34 -0700 Subject: Stop pos() from being confused by changing utf8ness The value of pos() is stored as a byte offset. If it is stored on a tied variable or a reference (or glob), then the stringification could change, resulting in pos() now pointing to a different character offset or pointing to the middle of a character: $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, a; print pos $x' 2 $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, "\x{1000}"; print pos $x' Malformed UTF-8 character (unexpected end of string) in match position at -e line 1. 0 So pos() should be stored as a character offset. The regular expression engine expects byte offsets always, so allow it to store bytes when possible (a pure non-magical string) but use characters otherwise. This does result in more complexity than I should like, but the alternative (always storing a character offset) would slow down regular expressions, which is a big no-no. --- mg.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mg.c') diff --git a/mg.c b/mg.c index f18a98a33a..c2d2186e96 100644 --- a/mg.c +++ b/mg.c @@ -2098,7 +2098,7 @@ Perl_magic_getpos(pTHX_ SV *sv, MAGIC *mg) if (found && found->mg_len != -1) { STRLEN i = found->mg_len; - if (DO_UTF8(lsv)) + if (found->mg_flags & MGf_BYTES && DO_UTF8(lsv)) i = sv_pos_b2u_flags(lsv, i, SV_GMAGIC|SV_CONST_RETURN); sv_setuv(sv, i); return 0; @@ -2149,12 +2149,8 @@ Perl_magic_setpos(pTHX_ SV *sv, MAGIC *mg) else if (pos > (SSize_t)len) pos = len; - if (ulen) { - pos = sv_or_pv_pos_u2b(lsv, s, pos, 0); - } - found->mg_len = pos; - found->mg_flags &= ~MGf_MINMATCH; + found->mg_flags &= ~(MGf_MINMATCH|MGf_BYTES); return 0; } -- cgit v1.2.1