diff options
author | Brian Fraser <fraserbn@gmail.com> | 2012-01-12 17:22:05 -0300 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-01-29 10:07:40 -0700 |
commit | 838f2281125c4e0f98e5d741f9058f09c8242d33 (patch) | |
tree | 6fa3458446be72105180d95d66cf87a1d3b7cf3b /pp.c | |
parent | 2a4315f8fb099a3fd3bbd5d9994af3919a6c5b05 (diff) | |
download | perl-838f2281125c4e0f98e5d741f9058f09c8242d33.tar.gz |
Implement the fc keyword and the \F string escape,
along with the simple_casefolding and full_casefolding features.
fc() stands for foldcase, a sort of pseudo case (like lowercase),
which is used to implement Unicode casefolding. It maps a string
to a form where all case differences are erased, so it's a
locale-independent way of checking if two strings are the same,
regardless of case.
This functionality was, and still is, available through the
regular expression engine -- /i matches would use casefolding
internally. The fc keyword merely exposes this for easier access.
Previously, one could attempt to case-insensitively test two strings
for equality by doing
lc($a) eq lc($b)
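But that can give wrong results, for example in the case of
\x{DF}, LATIN SMALL LETTER SHARP S: it casefolds to "ss", but lc() leaves it unchanged, so lc("\x{DF}") eq lc("SS") is false even though the two strings are equal under casefolding.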
Diffstat (limited to 'pp.c')
-rw-r--r-- | pp.c | 151 |
1 files changed, 151 insertions, 0 deletions
@@ -4117,6 +4117,157 @@ PP(pp_quotemeta) RETURN; } +PP(pp_fc) +{ + dVAR; + dTARGET; + dSP; + SV *source = TOPs; + STRLEN len; + STRLEN min; + SV *dest; + const U8 *s; + const U8 *send; + U8 *d; + U8 tmpbuf[UTF8_MAXBYTES * UTF8_MAX_FOLD_CHAR_EXPAND + 1]; + const bool full_folding = TRUE; + const U8 flags = ( full_folding ? FOLD_FLAGS_FULL : 0 ) + | ( IN_LOCALE_RUNTIME ? FOLD_FLAGS_LOCALE : 0 ); + + /* Implements fc(): takes the string on top of the stack and replaces it + * with its casefolded form, which is built in TARG. The original author + * describes this function as a facsimile of pp_lc. + */ + + SvGETMAGIC(source); + + dest = TARG; + + if (SvOK(source)) { + s = (const U8*)SvPV_nomg_const(source, len); + } else { /* Undefined source: warn if enabled, then fold the empty string */ + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit(source); + s = (const U8*)""; + len = 0; + } + + min = len + 1; /* one extra byte for the trailing NUL written at the end */ + + SvUPGRADE(dest, SVt_PV); + d = (U8*)SvGROW(dest, min); + (void)SvPOK_only(dest); + + SETs(dest); + + send = s + len; + if (DO_UTF8(source)) { /* UTF-8 flagged string. */ + bool tainted = FALSE; + while (s < send) { + const STRLEN u = UTF8SKIP(s); + STRLEN ulen; + + _to_utf8_fold_flags(s, tmpbuf, &ulen, flags, &tainted); + /* When the fold is longer than the source character, grow dest if it is now too small; d is saved as an offset and recomputed after SvGROW. */ + if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { + const UV o = d - (U8*)SvPVX_const(dest); + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } + + Copy(tmpbuf, d, ulen, U8); + d += ulen; + s += u; + } + SvUTF8_on(dest); + if (tainted) { + TAINT; + SvTAINTED_on(dest); + } + } /* Unflagged string */ + else { + /* For locale, bytes, and nothing, the behavior is supposed to be the + * same as lc(). 
+ */ + if ( IN_LOCALE_RUNTIME ) { /* Under locale */ + TAINT; + SvTAINTED_on(dest); + for (; s < send; d++, s++) + *d = toLOWER_LC(*s); + } + else if ( !IN_UNI_8_BIT ) { /* Under nothing, or bytes */ + for (; s < send; d++, s++) + *d = toLOWER(*s); + } + else { + /* For ASCII and the Latin-1 range, there are only two troublesome folds: + * \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full casefolding + * becomes 'ss', and \x{B5} (\N{MICRO SIGN}), which under any fold becomes + * \x{3BC} (\N{GREEK SMALL LETTER MU}). For every other character, the + * casefold is its lowercase. + */ + for (; s < send; d++, s++) { + if (*s == MICRO_SIGN) { + /* \N{MICRO SIGN}'s casefold is \N{GREEK SMALL LETTER MU}, which + * is outside of the Latin-1 range. There are a couple of ways to + * deal with this -- khw discusses them in pp_lc/uc, so see there. + * What we do here is upgrade what we have already casefolded to + * UTF-8, then enter an inner loop that appends the rest of the + * characters as UTF-8. + */ + len = d - (U8*)SvPVX_const(dest); + SvCUR_set(dest, len); + len = sv_utf8_upgrade_flags_grow(dest, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + (send -s) * UTF8_MAX_FOLD_CHAR_EXPAND + 1); + d = (U8*)SvPVX(dest) + len; + + CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_SMALL_LETTER_MU); + s++; + for (; s < send; s++) { + STRLEN ulen; + UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags); + if UNI_IS_INVARIANT(fc) { /* NOTE(review): the condition has no parentheses of its own; presumably the macro expansion supplies them -- confirm */ + if ( full_folding && *s == LATIN_SMALL_LETTER_SHARP_S) { + *d++ = 's'; + *d++ = 's'; + } + else + *d++ = (U8)fc; + } + else { + Copy(tmpbuf, d, ulen, U8); + d += ulen; + } + } + /* The inner loop has consumed the rest of the input, so leave the outer loop. */ break; + } + else if (full_folding && *s == LATIN_SMALL_LETTER_SHARP_S) { + /* Under full casefolding, LATIN SMALL LETTER SHARP S becomes "ss", + * which may require growing the SV. 
+ */ + if (SvLEN(dest) < ++min) { + const UV o = d - (U8*)SvPVX_const(dest); + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } + *(d)++ = 's'; + *d = 's'; /* the loop's own d++ steps past this second 's' */ + } + else { /* For any other character, the fold is its lowercase */ + *d = toLOWER_LATIN1(*s); + } + } + } + } + *d = '\0'; + SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + + if (SvTAINTED(source)) + SvTAINT(dest); + SvSETMAGIC(dest); + RETURN; +} + /* Arrays. */ PP(pp_aslice) |