Implement the fc keyword and the \F string escape.

Along with the simple_casefolding and full_casefolding features. fc() stands for foldcase, a sort of pseudo case (like lowercase), which is used to implement Unicode casefolding. It maps a string to a form where all case differences are erased, so it's a locale-independent way of checking if two strings are the same, regardless of case. This functionality was, and still is, available through the regular expression engine -- /i matches would use casefolding internally. The fc keyword merely exposes this for easier access. Previously, one could attempt to case-insensitively test two strings for equality by doing lc($a) eq lc($b) But that might get you wrong results, for example in the case of \x{DF}, LATIN SMALL LETTER SHARP S.
author: Brian Fraser <fraserbn@gmail.com> 2012-01-12 17:22:05 -0300
committer: Karl Williamson <public@khwilliamson.com> 2012-01-29 10:07:40 -0700
commit: 838f2281125c4e0f98e5d741f9058f09c8242d33 (patch)
tree: 6fa3458446be72105180d95d66cf87a1d3b7cf3b /pp.c
parent: 2a4315f8fb099a3fd3bbd5d9994af3919a6c5b05 (diff)
download: perl-838f2281125c4e0f98e5d741f9058f09c8242d33.tar.gz
1 files changed, 151 insertions, 0 deletions
diff --git a/pp.c b/pp.c
index 2bc1a1a34f..1dc04608e4 100644
--- a/pp.c
+++ b/pp.c
@@ -4117,6 +4117,157 @@ PP(pp_quotemeta)
     RETURN;
 }
 
+PP(pp_fc)
+{
+    dVAR;
+    dTARGET;
+    dSP;
+    SV *source = TOPs;
+    STRLEN len;
+    STRLEN min;
+    SV *dest;
+    const U8 *s;
+    const U8 *send;
+    U8 *d;
+    U8 tmpbuf[UTF8_MAXBYTES * UTF8_MAX_FOLD_CHAR_EXPAND + 1];
+    const bool full_folding = TRUE;
+    const U8 flags = ( full_folding      ? FOLD_FLAGS_FULL   : 0 )
+                   | ( IN_LOCALE_RUNTIME ? FOLD_FLAGS_LOCALE : 0 );
+
+    /* This is a facsimile of pp_lc, but with a thousand bugs thanks to me.
+     * You are welcome(?) -Hugmeir
+     */
+
+    SvGETMAGIC(source);
+
+    dest = TARG;
+
+    if (SvOK(source)) {
+        s = (const U8*)SvPV_nomg_const(source, len);
+    } else {
+        if (ckWARN(WARN_UNINITIALIZED))
+	    report_uninit(source);
+	s = (const U8*)"";
+	len = 0;
+    }
+
+    min = len + 1;
+
+    SvUPGRADE(dest, SVt_PV);
+    d = (U8*)SvGROW(dest, min);
+    (void)SvPOK_only(dest);
+
+    SETs(dest);
+
+    send = s + len;
+    if (DO_UTF8(source)) { /* UTF-8 flagged string. */
+        bool tainted = FALSE;
+        while (s < send) {
+            const STRLEN u = UTF8SKIP(s);
+            STRLEN ulen;
+
+            _to_utf8_fold_flags(s, tmpbuf, &ulen, flags, &tainted);
+
+            if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
+                const UV o = d - (U8*)SvPVX_const(dest);
+                SvGROW(dest, min);
+                d = (U8*)SvPVX(dest) + o;
+            }
+
+            Copy(tmpbuf, d, ulen, U8);
+            d += ulen;
+            s += u;
+        }
+        SvUTF8_on(dest);
+	if (tainted) {
+	    TAINT;
+	    SvTAINTED_on(dest);
+	}
+    } /* Unflagged string */
+    else {
+        /* For locale, bytes, and nothing, the behavior is supposed to be the
+         * same as lc().
+         */
+        if ( IN_LOCALE_RUNTIME ) { /* Under locale */
+            TAINT;
+            SvTAINTED_on(dest);
+            for (; s < send; d++, s++)
+                *d = toLOWER_LC(*s);
+        }
+        else if ( !IN_UNI_8_BIT ) { /* Under nothing, or bytes */
+            for (; s < send; d++, s++)
+                *d = toLOWER(*s);
+        }
+        else {
+            /* For ASCII and the Latin-1 range, there's only two troublesome folds,
+            * \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full casefolding
+            * becomes 'ss', and \x{B5} (\N{MICRO SIGN}), which under any fold becomes
+            * \x{3BC} (\N{GREEK SMALL LETTER MU}) -- For the rest, the casefold is
+            * their lowercase.
+            */
+            for (; s < send; d++, s++) {
+                if (*s == MICRO_SIGN) {
+                    /* \N{MICRO SIGN}'s casefold is \N{GREEK SMALL LETTER MU}, which
+                    * is outside of the latin-1 range. There's a couple of ways to
+                    * deal with this -- khw discusses them in pp_lc/uc, so go there :)
+                    * What we do here is upgrade what we had already casefolded,
+                    * then enter an inner loop that appends the rest of the characters
+                    * as UTF-8.
+                    */
+                    len = d - (U8*)SvPVX_const(dest);
+                    SvCUR_set(dest, len);
+                    len = sv_utf8_upgrade_flags_grow(dest,
+                                                SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                                (send -s) * UTF8_MAX_FOLD_CHAR_EXPAND + 1);
+                    d = (U8*)SvPVX(dest) + len;
+
+                    CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_SMALL_LETTER_MU);
+                    s++;
+                    for (; s < send; s++) {
+                        STRLEN ulen;
+                        UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags);
+                        if UNI_IS_INVARIANT(fc) {
+                            if ( full_folding && *s == LATIN_SMALL_LETTER_SHARP_S) {
+                                *d++ = 's';
+                                *d++ = 's';
+                            }
+                            else
+                                *d++ = (U8)fc;
+                        }
+                        else {
+                            Copy(tmpbuf, d, ulen, U8);
+                            d += ulen;
+                        }
+                    }
+                    break;
+                }
+                else if (full_folding && *s == LATIN_SMALL_LETTER_SHARP_S) {
+                    /* Under full casefolding, LATIN SMALL LETTER SHARP S becomes "ss",
+                    * which may require growing the SV.
+                    */
+                    if (SvLEN(dest) < ++min) {
+                        const UV o = d - (U8*)SvPVX_const(dest);
+                        SvGROW(dest, min);
+                        d = (U8*)SvPVX(dest) + o;
+                     }
+                    *(d)++ = 's';
+                    *d = 's';
+                }
+                else { /* If it's not one of those two, the fold is their lower case */
+                    *d = toLOWER_LATIN1(*s);
+                }
+             }
+        }
+    }
+    *d = '\0';
+    SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+
+    if (SvTAINTED(source))
+	SvTAINT(dest);
+    SvSETMAGIC(dest);
+    RETURN;
+}
+
 /* Arrays. */
 
 PP(pp_aslice)
author	Brian Fraser <fraserbn@gmail.com>	2012-01-12 17:22:05 -0300
committer	Karl Williamson <public@khwilliamson.com>	2012-01-29 10:07:40 -0700
commit	838f2281125c4e0f98e5d741f9058f09c8242d33 (patch)
tree	6fa3458446be72105180d95d66cf87a1d3b7cf3b /pp.c
parent	2a4315f8fb099a3fd3bbd5d9994af3919a6c5b05 (diff)
download	perl-838f2281125c4e0f98e5d741f9058f09c8242d33.tar.gz