diff options
author | Nicholas Clark <nick@ccl4.org> | 2006-04-29 13:43:26 +0000 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2006-04-29 13:43:26 +0000 |
commit | ec9af7d430b6660eff7240fa20757fa5feb233a8 (patch) | |
tree | 73503877ffc9eeabc96eb111fde7fab6b962fc92 /pp.c | |
parent | ca0270c4db0273594a19a3d140c7330546a185fe (diff) | |
download | perl-ec9af7d430b6660eff7240fa20757fa5feb233a8.tar.gz |
lc plus an 8 bit locale could mangle UTF-8 values returned by
overloaded stringification.
p4raw-id: //depot/perl@28011
Diffstat (limited to 'pp.c')
-rw-r--r-- | pp.c | 174 |
1 files changed, 91 insertions, 83 deletions
@@ -3570,107 +3570,115 @@ PP(pp_lc) { dVAR; dSP; - SV *sv = TOPs; + SV *source = TOPs; STRLEN len; + STRLEN min; + SV *dest; + const U8 *s; + U8 *d; - SvGETMAGIC(sv); - if (DO_UTF8(sv)) { + SvGETMAGIC(source); + + if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) + && !DO_UTF8(source)) { + /* We can convert in place. */ + + dest = source; + s = d = (U8*)SvPV_force_nomg(source, len); + min = len + 1; + } else { dTARGET; - const U8 *s; - STRLEN ulen; - register U8 *d; - const U8 *send; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - s = (const U8*)SvPV_nomg_const(sv,len); - if (!len) { - SvUTF8_off(TARG); /* decontaminate */ - sv_setpvn(TARG, "", 0); - sv = TARG; - SETs(sv); + dest = TARG; + + /* The old implementation would copy source into TARG at this point. + This had the side effect that if source was undef, TARG was now + an undefined SV with PADTMP set, and they don't warn inside + sv_2pv_flags(). However, we're now getting the PV direct from + source, which doesn't have PADTMP set, so it would warn. Hence the + little games. */ + + if (SvOK(source)) { + s = (const U8*)SvPV_nomg_const(source, len); + } else { + s = ""; + len = 0; } - else { - STRLEN min = len + 1; + min = len + 1; - SvUPGRADE(TARG, SVt_PV); - SvGROW(TARG, min); - (void)SvPOK_only(TARG); - d = (U8*)SvPVX(TARG); - send = s + len; - while (s < send) { - const STRLEN u = UTF8SKIP(s); - const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); + SvUPGRADE(dest, SVt_PV); + d = SvGROW(dest, min); + (void)SvPOK_only(dest); -#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */ - if (uv == GREEK_CAPITAL_LETTER_SIGMA) { - NOOP; - /* - * Now if the sigma is NOT followed by - * /$ignorable_sequence$cased_letter/; - * and it IS preceded by - * /$cased_letter$ignorable_sequence/; - * where $ignorable_sequence is - * [\x{2010}\x{AD}\p{Mn}]* - * and $cased_letter is - * [\p{Ll}\p{Lo}\p{Lt}] - * then it should be mapped to 0x03C2, - * (GREEK SMALL LETTER FINAL SIGMA), - * instead of staying 0x03A3. - * "should be": in other words, - * this is not implemented yet. - * See lib/unicore/SpecialCasing.txt. - */ - } - if (ulen > u && (SvLEN(TARG) < (min += ulen - u))) { - /* If the eventually required minimum size outgrows - * the available space, we need to grow. */ - const UV o = d - (U8*)SvPVX_const(TARG); + SETs(dest); + } - /* If someone lowercases one million U+0130s we - * SvGROW() one million times. Or we could try - * guessing how much to allocate without allocating. - * too much. Such is life. */ - SvGROW(TARG, min); - d = (U8*)SvPVX(TARG) + o; - } - Copy(tmpbuf, d, ulen, U8); - d += ulen; - s += u; + /* Overloaded values may have toggled the UTF-8 flag on source, so we need + to check DO_UTF8 again here. */ + + if (DO_UTF8(source)) { + const U8 *const send = s + len; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; + + while (s < send) { + const STRLEN u = UTF8SKIP(s); + STRLEN ulen; + const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); + +#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */ + if (uv == GREEK_CAPITAL_LETTER_SIGMA) { + NOOP; + /* + * Now if the sigma is NOT followed by + * /$ignorable_sequence$cased_letter/; + * and it IS preceded by /$cased_letter$ignorable_sequence/; + * where $ignorable_sequence is [\x{2010}\x{AD}\p{Mn}]* + * and $cased_letter is [\p{Ll}\p{Lo}\p{Lt}] + * then it should be mapped to 0x03C2, + * (GREEK SMALL LETTER FINAL SIGMA), + * instead of staying 0x03A3. + * "should be": in other words, this is not implemented yet. + * See lib/unicore/SpecialCasing.txt. + */ } - *d = '\0'; - SvUTF8_on(TARG); - SvCUR_set(TARG, d - (U8*)SvPVX_const(TARG)); - sv = TARG; - SETs(sv); - } - } - else { - U8 *s; - if (!SvPADTMP(sv) || SvREADONLY(sv)) { - dTARGET; - SvUTF8_off(TARG); /* decontaminate */ - sv_setsv_nomg(TARG, sv); - sv = TARG; - SETs(sv); + if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { + /* If the eventually required minimum size outgrows + * the available space, we need to grow. */ + const UV o = d - (U8*)SvPVX_const(dest); + + /* If someone lowercases one million U+0130s we SvGROW() one + * million times. Or we could try guessing how much to + allocate without allocating too much. Such is life. */ + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } + Copy(tmpbuf, d, ulen, U8); + d += ulen; + s += u; } - - s = (U8*)SvPV_force_nomg(sv, len); + SvUTF8_on(dest); + *d = '\0'; + SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + } else { if (len) { - register const U8 * const send = s + len; - + const U8 *const send = s + len; if (IN_LOCALE_RUNTIME) { TAINT; - SvTAINTED_on(sv); - for (; s < send; s++) - *s = toLOWER_LC(*s); + SvTAINTED_on(dest); + for (; s < send; d++, s++) + *d = toLOWER_LC(*s); } else { - for (; s < send; s++) - *s = toLOWER(*s); + for (; s < send; d++, s++) + *d = toLOWER(*s); } } + if (source != dest) { + *d = '\0'; + SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + } } - SvSETMAGIC(sv); + SvSETMAGIC(dest); RETURN; } |