regcomp.c: More prep for bitmap/nonbitmap folds

This sets things up in preparation for a future commit that will move calculating all folds involving characters in the bit map.
author: Karl Williamson <public@khwilliamson.com> 2011-02-27 15:19:04 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-02-27 19:21:33 -0700
commit: 537429566e5149c6661a619b1f9a77e25ba30b8f (patch)
tree: a09e332f0990ab6d87b95a6be3ab02fae0cfdc99 /regcomp.c
parent: 5bfec14d8f541613f52ee87efb2cd875bad0cb37 (diff)
download: perl-537429566e5149c6661a619b1f9a77e25ba30b8f.tar.gz
1 files changed, 32 insertions, 1 deletions
diff --git a/regcomp.c b/regcomp.c
index 0feb9eb136..67e5d55553 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9319,8 +9319,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
 				      than just initialized.  */
     UV n;
+
+    /* code points this node matches that can't be stored in the bitmap */
     HV* nonbitmap = NULL;
+
+    /* The items that are to match that aren't stored in the bitmap, but are a
+     * result of things that are stored there.  This is the fold closure of
+     * such a character, either because it has DEPENDS semantics and shouldn't
+     * be matched unless the target string is utf8, or is a code point that is
+     * too large for the bit map, as for example, the fold of the MICRO SIGN is
+     * above 255.  This all is solely for performance reasons.  By having this
+     * code know the outside-the-bitmap folds that the bitmapped characters are
+     * involved with, we don't have to go out to disk to find the list of
+     * matches, unless the character class includes code points that aren't
+     * storable in the bit map.  That means that a character class with an 's'
+     * in it, for example, doesn't need to go out to disk to find everything
+     * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
+     * empty unless there is something whose fold we don't know about, and will
+     * have to go out to the disk to find. */
     HV* l1_fold_invlist = NULL;
+
+    /* List of multi-character folds that are matched by this node */
     AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
@@ -9936,6 +9955,7 @@ parseit:
 	else {
 	    nonbitmap = l1_fold_invlist;
 	}
+        l1_fold_invlist = NULL;
     }
     if (nonbitmap) {
 	UV i;
@@ -10106,6 +10126,16 @@ parseit:
 	} /* End of processing all the folds */
     }
 
+    /* Combine the two lists into one. */
+    if (l1_fold_invlist) {
+	if (nonbitmap) {
+	    nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
+	}
+	else {
+	    nonbitmap = l1_fold_invlist;
+	}
+    }
+
     /* Here, we have calculated what code points should be in the character
      * class.   Now we can see about various optimizations.  Fold calculation
      * needs to take place before inversion.  Otherwise /[^k]/i would invert to
@@ -10134,7 +10164,7 @@ parseit:
     /* Folding in the bitmap is taken care of above, but not for locale (for
      * which we have to wait to see what folding is in effect at runtime), and
      * for things not in the bitmap.  Set run-time fold flag for these */
-    if (FOLD && (LOC || nonbitmap)) {
+    if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
 	ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
     }
 
@@ -10153,6 +10183,7 @@ parseit:
      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
      * FI'. */
     if (! nonbitmap
+	&& ! unicode_alternate
 	&& SvCUR(listsv) == initial_listsv_len
 	&& ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
author	Karl Williamson <public@khwilliamson.com>	2011-02-27 15:19:04 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-02-27 19:21:33 -0700
commit	537429566e5149c6661a619b1f9a77e25ba30b8f (patch)
tree	a09e332f0990ab6d87b95a6be3ab02fae0cfdc99 /regcomp.c
parent	5bfec14d8f541613f52ee87efb2cd875bad0cb37 (diff)
download	perl-537429566e5149c6661a619b1f9a77e25ba30b8f.tar.gz