From 25fdce4a165b6305e760d4c8d94404ce055657a0 Mon Sep 17 00:00:00 2001 From: Father Chrysostomos Date: Tue, 23 Jul 2013 13:15:34 -0700 Subject: Stop pos() from being confused by changing utf8ness The value of pos() is stored as a byte offset. If it is stored on a tied variable or a reference (or glob), then the stringification could change, resulting in pos() now pointing to a different character offset or pointing to the middle of a character: $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, a; print pos $x' 2 $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, "\x{1000}"; print pos $x' Malformed UTF-8 character (unexpected end of string) in match position at -e line 1. 0 So pos() should be stored as a character offset. The regular expression engine expects byte offsets always, so allow it to store bytes when possible (a pure non-magical string) but use characters otherwise. This does result in more complexity than I should like, but the alternative (always storing a character offset) would slow down regular expressions, which is a big no-no. --- pp_hot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pp_hot.c') diff --git a/pp_hot.c b/pp_hot.c index 6068d21973..afecce8e11 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -1392,7 +1392,7 @@ PP(pp_match) if (global) { mg = mg_find_mglob(TARG); if (mg && mg->mg_len >= 0) { - curpos = mg->mg_len; + curpos = MgBYTEPOS(mg, TARG, truebase, len); /* last time pos() was set, it was zero-length match */ if (mg->mg_flags & MGf_MINMATCH) had_zerolen = 1; @@ -1448,7 +1448,7 @@ PP(pp_match) if (global && (gimme != G_ARRAY || (dynpm->op_pmflags & PMf_CONTINUE))) { if (!mg) mg = sv_magicext_mglob(TARG); - mg->mg_len = RX_OFFS(rx)[0].end; + MgBYTEPOS_set(mg, TARG, truebase, RX_OFFS(rx)[0].end); if (RX_ZERO_LEN(rx)) mg->mg_flags |= MGf_MINMATCH; else -- cgit v1.2.1