summaryrefslogtreecommitdiff
path: root/inline.h
diff options
context:
space:
mode:
authorFather Chrysostomos <sprout@cpan.org>2013-07-23 13:15:34 -0700
committerFather Chrysostomos <sprout@cpan.org>2013-08-25 12:22:40 -0700
commit25fdce4a165b6305e760d4c8d94404ce055657a0 (patch)
tree7c3aa76b83b1518991bf23909ee072c55de29138 /inline.h
parent428ccf1e2d78d72b07c5e959e967569a82ce07ba (diff)
downloadperl-25fdce4a165b6305e760d4c8d94404ce055657a0.tar.gz
Stop pos() from being confused by changing utf8ness
The value of pos() is stored as a byte offset. If it is stored on a tied variable or a reference (or glob), then the stringification could change, resulting in pos() now pointing to a different character offset or pointing to the middle of a character: $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, a; print pos $x' 2 $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, "\x{1000}"; print pos $x' Malformed UTF-8 character (unexpected end of string) in match position at -e line 1. 0 So pos() should be stored as a character offset. The regular expression engine expects byte offsets always, so allow it to store bytes when possible (a pure non-magical string) but use characters otherwise. This does result in more complexity than I should like, but the alternative (always storing a character offset) would slow down regular expressions, which is a big no-no.
Diffstat (limited to 'inline.h')
-rw-r--r--inline.h23
1 files changed, 22 insertions, 1 deletions
diff --git a/inline.h b/inline.h
index 48cc187e4a..b33cd3fd6f 100644
--- a/inline.h
+++ b/inline.h
@@ -62,6 +62,26 @@ S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
}
#endif
+/* ------------------------------- mg.h ------------------------------- */
+
+#if defined(PERL_CORE) || defined(PERL_EXT)
+/* Return the pos() held in mg as a byte offset into s; assumes get-magic and stringification have already occurred */
+PERL_STATIC_INLINE STRLEN
+S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
+{
+ assert(mg->mg_type == PERL_MAGIC_regex_global);
+ assert(mg->mg_len != -1); /* NOTE(review): presumably -1 marks an unset pos() -- confirm */
+ if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv)) /* mg_len is flagged as bytes, or sv is not treated as UTF-8 */
+ return (STRLEN)mg->mg_len;
+ else {
+ const STRLEN pos = (STRLEN)mg->mg_len; /* on this path mg_len is a character offset */
+ /* Without this check, we may read past the end of the buffer: */
+ if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
+ return sv_or_pv_pos_u2b(sv, s, pos, NULL); /* convert the character offset to a byte offset */
+ }
+}
+#endif
+
/* ----------------------------- regexp.h ----------------------------- */
PERL_STATIC_INLINE struct regexp *
@@ -151,10 +171,11 @@ S_SvPADSTALE_off(SV *sv)
assert(SvFLAGS(sv) & SVs_PADMY);
return SvFLAGS(sv) &= ~SVs_PADSTALE;
}
-#ifdef PERL_CORE
+#if defined(PERL_CORE) || defined (PERL_EXT)
PERL_STATIC_INLINE STRLEN
S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
{
+ PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
if (SvGAMAGIC(sv)) {
U8 *hopped = utf8_hop((U8 *)pv, pos);
if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);