summaryrefslogtreecommitdiff
path: root/mg.h
diff options
context:
space:
mode:
authorFather Chrysostomos <sprout@cpan.org>2013-07-23 13:15:34 -0700
committerFather Chrysostomos <sprout@cpan.org>2013-08-25 12:22:40 -0700
commit25fdce4a165b6305e760d4c8d94404ce055657a0 (patch)
tree7c3aa76b83b1518991bf23909ee072c55de29138 /mg.h
parent428ccf1e2d78d72b07c5e959e967569a82ce07ba (diff)
downloadperl-25fdce4a165b6305e760d4c8d94404ce055657a0.tar.gz
Stop pos() from being confused by changing utf8ness
The value of pos() is stored as a byte offset. If it is stored on a tied variable or a reference (or glob), then the stringification could change, resulting in pos() now pointing to a different character off- set or pointing to the middle of a character: $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, a; print pos $x' 2 $ ./perl -Ilib -le '$x = bless [], chr 256; pos $x=1; bless $x, "\x{1000}"; print pos $x' Malformed UTF-8 character (unexpected end of string) in match position at -e line 1. 0 So pos() should be stored as a character offset. The regular expression engine expects byte offsets always, so allow it to store bytes when possible (a pure non-magical string) but use char- acters otherwise. This does result in more complexity than I should like, but the alter- native (always storing a character offset) would slow down regular expressions, which is a big no-no.
Diffstat (limited to 'mg.h')
-rw-r--r--mg.h14
1 files changed, 14 insertions, 0 deletions
diff --git a/mg.h b/mg.h
index de673d4243..29e339f82c 100644
--- a/mg.h
+++ b/mg.h
@@ -38,6 +38,7 @@ struct magic {
#define MGf_COPY 8 /* has an svt_copy MGVTBL entry */
#define MGf_DUP 0x10 /* has an svt_dup MGVTBL entry */
#define MGf_LOCAL 0x20 /* has an svt_local MGVTBL entry */
+#define MGf_BYTES 0x40 /* PERL_MAGIC_regex_global only */
#define MgTAINTEDDIR(mg) (mg->mg_flags & MGf_TAINTEDDIR)
#define MgTAINTEDDIR_on(mg) (mg->mg_flags |= MGf_TAINTEDDIR)
@@ -57,6 +58,19 @@ struct magic {
#define SvTIED_obj(sv,mg) \
((mg)->mg_obj ? (mg)->mg_obj : sv_2mortal(newRV(sv)))
+#if defined(PERL_CORE) || defined(PERL_EXT)
+# define MgBYTEPOS(mg,sv,pv,len) S_MgBYTEPOS(aTHX_ mg,sv,pv,len)
+/* assumes get-magic and stringification have already occurred */
+# define MgBYTEPOS_set(mg,sv,pv,off) ( \
+ assert_((mg)->mg_type == PERL_MAGIC_regex_global) \
+ SvPOK(sv) && !SvGMAGICAL(sv) \
+ ? (mg)->mg_len = (off), (mg)->mg_flags |= MGf_BYTES \
+ : ((mg)->mg_len = DO_UTF8(sv) \
+ ? utf8_length((U8 *)(pv), (U8 *)(pv)+(off)) \
+ : (off), \
+ (mg)->mg_flags &= ~MGf_BYTES))
+#endif
+
#define whichsig(pv) whichsig_pv(pv)
/*