summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2010-11-07 15:25:31 -0700
committer: Father Chrysostomos <sprout@cpan.org> 2010-11-07 21:42:42 -0800
commit: 2726813d9af5d50f1451663cd931317e7172da50 (patch)
tree: 12ffa4ce7951e688df59ceceb9a061ab67d606de /regexec.c
parent: a85c03da46d77cd5b9f4e0ba809245cf000962ad (diff)
download: perl-2726813d9af5d50f1451663cd931317e7172da50.tar.gz
regexec.c: Don't give up on fold matching early
As noted in the comments of the code, "a" =~ /[A]/i doesn't currently work in general. regcomp.c knows about the ASCII characters and corrects for them, but not always: for example, it misses cases like "a" =~ /\p{Upper}/i. This patch catches all of those. It works by computing, for each character that is folded to, the list of all characters that (singly) fold to it, and then checking each of those. The maximum length of such a list is 3 in the current Unicode standard. I believe that a better long-term solution is to do this at compile time rather than at execution time, by generating a closure of everything matched. But that can't be done now, because the data structure would need to be extensively revamped to list all non-byte characters, and user-defined \p{} matches are not known at compile time. This patch also doesn't handle the multi-char folds; there is a separate ticket for those.
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c45
1 files changed, 45 insertions, 0 deletions
diff --git a/regexec.c b/regexec.c
index 433bbeb8e5..a6da6ce0dc 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6343,6 +6343,51 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
if (swash_fetch(sw, folded, 1)) { /* 1 => is utf8 */
match = TRUE;
}
+ else {
+ SV** listp;
+
+ /* Consider "k" =~ /[K]/i. The line above would
+ * have just folded the 'k' to itself, and that
+ * isn't going to match 'K'. So we look through
+ * the closure of everything that folds to 'k'.
+ * That will find the 'K'. Initialize the list, if
+ * necessary */
+ if (! PL_utf8_foldclosures) {
+
+ /* If the folds haven't been read in, call a fold
+ * function to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES+1];
+ STRLEN dummy_len;
+ to_utf8_fold((U8*) "A", dummy, &dummy_len);
+ }
+ PL_utf8_foldclosures =
+ _swash_inversion_hash(PL_utf8_tofold);
+ }
+
+ /* The data structure is a hash with the keys every
+ * character that is folded to, like 'k', and the
+ * values each an array of everything that folds to
+ * its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) folded, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV i;
+ for (i = 0; i <= av_len(list); i++) {
+ SV** try_p = av_fetch(list, i, FALSE);
+ if (try_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+ }
+ /* Don't have to worry about embedded NULs
+ * since NUL isn't folded or foldable */
+ if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
+ match = TRUE;
+ break;
+ }
+ }
+ }
+ }
}
}