diff options
author | David Mitchell <davem@iabyn.com> | 2011-03-19 19:26:49 +0000 |
---|---|---|
committer | David Mitchell <davem@iabyn.com> | 2011-03-19 19:41:55 +0000 |
commit | 75da9d4c616bae3e6791af93d2ced52dc8080f06 (patch) | |
tree | 5c0043963158345e1bb1c1c21526aa604fa9f06f | |
parent | 912c63ed00375338703043928cac3c740d00cc9d (diff) | |
download | perl-75da9d4c616bae3e6791af93d2ced52dc8080f06.tar.gz |
reset pos and utf8 cache when de/encoding utf8 str
When using
utf8::upgrade
utf8::downgrade
utf8::encode
utf8::decode
or the underlying C-level functions
sv_utf8_upgrade_flags_grow
sv_utf8_downgrade
sv_utf8_encode
sv_utf8_decode
and
sv_recode_to_utf8
update the position of the pos magic, if any, and clear the utf8
length/position-mapping cache.
This fixes [perl #80190].
-rw-r--r-- | lib/utf8.t | 48 | ||||
-rw-r--r-- | sv.c | 70 |
2 files changed, 115 insertions, 3 deletions
diff --git a/lib/utf8.t b/lib/utf8.t index 715ca3e7e1..ae81ccdc46 100644 --- a/lib/utf8.t +++ b/lib/utf8.t @@ -484,4 +484,52 @@ SKIP: { } } +# #80190 update pos, and cached length/position-mapping after +# utf8 upgrade/downgrade, encode/decode + +for my $pos (0..5) { + + my $pos1 = ($pos >= 3) ? 2 : ($pos >= 1) ? 1 : 0; + my $pos2 = ($pos1 == 2) ? 3 : $pos1; + + my $p; + my $s = "A\xc8\x81\xe8\xab\x86\x{100}"; + chop($s); + + pos($s) = $pos; + # also sets cache + is(length($s), 6, "(pos $pos) len before utf8::downgrade"); + is(pos($s), $pos, "(pos $pos) pos before utf8::downgrade"); + utf8::downgrade($s); + is(length($s), 6, "(pos $pos) len after utf8::downgrade"); + is(pos($s), $pos, "(pos $pos) pos after utf8::downgrade"); + is($s, "A\xc8\x81\xe8\xab\x86","(pos $pos) str after utf8::downgrade"); + utf8::decode($s); + is(length($s), 3, "(pos $pos) len after D; utf8::decode"); + is(pos($s), $pos1, "(pos $pos) pos after D; utf8::decode"); + is($s, "A\x{201}\x{8ac6}", "(pos $pos) str after D; utf8::decode"); + utf8::encode($s); + is(length($s), 6, "(pos $pos) len after D; utf8::encode"); + is(pos($s), $pos2, "(pos $pos) pos after D; utf8::encode"); + is($s, "A\xc8\x81\xe8\xab\x86","(pos $pos) str after D; utf8::encode"); + + $s = "A\xc8\x81\xe8\xab\x86"; + + pos($s) = $pos; + is(length($s), 6, "(pos $pos) len before utf8::upgrade"); + is(pos($s), $pos, "(pos $pos) pos before utf8::upgrade"); + utf8::upgrade($s); + is(length($s), 6, "(pos $pos) len after utf8::upgrade"); + is(pos($s), $pos, "(pos $pos) pos after utf8::upgrade"); + is($s, "A\xc8\x81\xe8\xab\x86","(pos $pos) str after utf8::upgrade"); + utf8::decode($s); + is(length($s), 3, "(pos $pos) len after U; utf8::decode"); + is(pos($s), $pos1, "(pos $pos) pos after U; utf8::decode"); + is($s, "A\x{201}\x{8ac6}", "(pos $pos) str after U; utf8::decode"); + utf8::encode($s); + is(length($s), 6, "(pos $pos) len after U; utf8::encode"); + is(pos($s), $pos2, "(pos $pos) pos after U; utf8::encode"); + is($s, "A\xc8\x81\xe8\xab\x86","(pos $pos) str after U; utf8::encode"); +} + done_testing(); @@ -3433,6 +3433,29 @@ must_be_utf8: } } } + + if (SvTYPE(sv) >= SVt_PVMG && SvMAGIC(sv)) { + /* Update pos. We do it at the end rather than during + * the upgrade, to avoid slowing down the common case + * (upgrade without pos) */ + MAGIC * mg = mg_find(sv, PERL_MAGIC_regex_global); + if (mg) { + I32 pos = mg->mg_len; + if (pos > 0 && (U32)pos > invariant_head) { + U8 *d = (U8*) SvPVX(sv) + invariant_head; + STRLEN n = (U32)pos - invariant_head; + while (n > 0) { + if (UTF8_IS_START(*d)) + d++; + d++; + n--; + } + mg->mg_len = d - (U8*)SvPVX(sv); + } + } + if ((mg = mg_find(sv, PERL_MAGIC_utf8))) + magic_setutf8(sv,mg); /* clear UTF8 cache */ + } } } @@ -3467,11 +3490,28 @@ Perl_sv_utf8_downgrade(pTHX_ register SV *const sv, const bool fail_ok) if (SvCUR(sv)) { U8 *s; STRLEN len; + int mg_flags = SV_GMAGIC; if (SvIsCOW(sv)) { sv_force_normal_flags(sv, 0); } - s = (U8 *) SvPV(sv, len); + if (SvTYPE(sv) >= SVt_PVMG && SvMAGIC(sv)) { + /* update pos */ + MAGIC * mg = mg_find(sv, PERL_MAGIC_regex_global); + if (mg) { + I32 pos = mg->mg_len; + if (pos > 0) { + sv_pos_b2u(sv, &pos); + mg_flags = 0; /* sv_pos_b2u does get magic */ + mg->mg_len = pos; + } + } + if ((mg = mg_find(sv, PERL_MAGIC_utf8))) + magic_setutf8(sv,mg); /* clear UTF8 cache */ + + } + s = (U8 *) SvPV_flags(sv, len, mg_flags); + if (!utf8_to_bytes(s, &len)) { if (fail_ok) return FALSE; @@ -3532,7 +3572,7 @@ Perl_sv_utf8_decode(pTHX_ register SV *const sv) PERL_ARGS_ASSERT_SV_UTF8_DECODE; if (SvPOKp(sv)) { - const U8 *c; + const U8 *start, *c; const U8 *e; /* The octets may have got themselves encoded - get them back as @@ -3544,7 +3584,7 @@ Perl_sv_utf8_decode(pTHX_ register SV *const sv) /* it is actually just a matter of turning the utf8 flag on, but * we want to make sure everything inside is valid utf8 first. */ - c = (const U8 *) SvPVX_const(sv); + c = start = (const U8 *) SvPVX_const(sv); if (!is_utf8_string(c, SvCUR(sv)+1)) return FALSE; e = (const U8 *) SvEND(sv); @@ -3555,6 +3595,22 @@ Perl_sv_utf8_decode(pTHX_ register SV *const sv) break; } } + if (SvTYPE(sv) >= SVt_PVMG && SvMAGIC(sv)) { + /* adjust pos to the start of a UTF8 char sequence */ + MAGIC * mg = mg_find(sv, PERL_MAGIC_regex_global); + if (mg) { + I32 pos = mg->mg_len; + if (pos > 0) { + for (c = start + pos; c > start; c--) { + if (UTF8_IS_START(*c)) + break; + } + mg->mg_len = c - start; + } + } + if ((mg = mg_find(sv, PERL_MAGIC_utf8))) + magic_setutf8(sv,mg); /* clear UTF8 cache */ + } } return TRUE; } @@ -13535,6 +13591,14 @@ Perl_sv_recode_to_utf8(pTHX_ SV *sv, SV *encoding) } FREETMPS; LEAVE; + if (SvTYPE(sv) >= SVt_PVMG && SvMAGIC(sv)) { + /* clear pos and any utf8 cache */ + MAGIC * mg = mg_find(sv, PERL_MAGIC_regex_global); + if (mg) + mg->mg_len = -1; + if ((mg = mg_find(sv, PERL_MAGIC_utf8))) + magic_setutf8(sv,mg); /* clear UTF8 cache */ + } SvUTF8_on(sv); return SvPVX(sv); } |