summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
author: Father Chrysostomos <sprout@cpan.org> 2013-07-23 13:15:34 -0700
committer: Father Chrysostomos <sprout@cpan.org> 2013-08-25 12:22:40 -0700
commit: 25fdce4a165b6305e760d4c8d94404ce055657a0 (patch)
tree: 7c3aa76b83b1518991bf23909ee072c55de29138 /regexec.c
parent: 428ccf1e2d78d72b07c5e959e967569a82ce07ba (diff)
download: perl-25fdce4a165b6305e760d4c8d94404ce055657a0.tar.gz
Stop pos() from being confused by changing utf8ness
The value of pos() is stored as a byte offset. If it is stored on a tied variable or a reference (or glob), then the stringification could change, resulting in pos() now pointing to a different character offset or pointing to the middle of a character:

    $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, a; print pos $x'
    2
    $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, "\x{1000}"; print pos $x'
    Malformed UTF-8 character (unexpected end of string) in match position at -e line 1.
    0

So pos() should be stored as a character offset.

The regular expression engine expects byte offsets always, so allow it to store bytes when possible (a pure non-magical string) but use characters otherwise.

This does result in more complexity than I should like, but the alternative (always storing a character offset) would slow down regular expressions, which is a big no-no.
Diffstat (limited to 'regexec.c')
-rw-r--r-- regexec.c 14
1 files changed, 11 insertions, 3 deletions
diff --git a/regexec.c b/regexec.c
index d207d0d951..44690b3280 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2256,7 +2256,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
(flags & REXEC_IGNOREPOS)
? stringarg /* use start pos rather than pos() */
: (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
- ? strbeg + mg->mg_len /* Defined pos() */
+ /* Defined pos(): */
+ ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
: strbeg; /* pos() not defined; use start of string */
DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
@@ -5027,8 +5028,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
rex->offs[0].end = locinput - reginfo->strbeg;
if (reginfo->info_aux_eval->pos_magic)
- reginfo->info_aux_eval->pos_magic->mg_len
- = locinput - reginfo->strbeg;
+ MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
+ reginfo->sv, reginfo->strbeg,
+ locinput - reginfo->strbeg);
if (sv_yes_mark) {
SV *sv_mrk = get_sv("REGMARK", 1);
@@ -7648,6 +7650,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
}
eval_state->pos_magic = mg;
eval_state->pos = mg->mg_len;
+ eval_state->pos_flags = mg->mg_flags;
}
else
eval_state->pos_magic = NULL;
@@ -7722,7 +7725,12 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg)
RXp_MATCH_COPIED_on(rex);
}
if (eval_state->pos_magic)
+ {
eval_state->pos_magic->mg_len = eval_state->pos;
+ eval_state->pos_magic->mg_flags =
+ (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
+ | (eval_state->pos_flags & MGf_BYTES);
+ }
PL_curpm = eval_state->curpm;
}