5 files changed, 70 insertions, 56 deletions
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 7b7af71396..03f122b6f5 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -15633,7 +15633,7 @@ sub make_property_test_script() {
 # others except DAge.txt (as data in an extracted file can be over-ridden by
 # the non-extracted.  Some other files depend on data derived from an earlier
 # file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode.  Mostly, it safest to order by first
+# folding requires data from Unicode.  Mostly, it is safest to order by first
 # version releases in (except the Jamo).  DAge.txt is read before the
 # extracted ones because of the rarely used feature $compare_versions.  In the
 # unlikely event that there were ever an extracted file that contained the Age
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index 84a81676f0..95758f7287 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -105,9 +105,8 @@ sub _loose_name ($) {
 
         if ($type)
         {
-
             # Verify that this isn't a recursive call for this property.
-            # Can't use croak, as it may try to recurse here itself.
+            # Can't use croak, as it may try to recurse to here itself.
             my $class_type = $class . "::$type";
             if (grep { $_ eq $class_type } @recursed) {
                 CORE::die "panic: Infinite recursion in SWASHNEW for '$type'\n";
diff --git a/regcomp.c b/regcomp.c
index 4669c27a0b..6e27f1ef97 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5840,8 +5840,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
  * The 1th element is the first element beyond that not in the list.  In other
  * words, the first range is
  *  invlist[0]..(invlist[1]-1)
- * The other ranges follow.  Thus every element that is divisible by two marks
- * the beginning of a range that is in the list, and every element not
+ * The other ranges follow.  Thus every element whose index is divisible by two
+ * marks the beginning of a range that is in the list, and every element not
  * divisible by two marks the beginning of a range not in the list.  A single
  * element inversion list that contains the single code point N generally
  * consists of two elements
@@ -5922,7 +5922,8 @@ S_invlist_array(pTHX_ SV* const invlist)
 
     PERL_ARGS_ASSERT_INVLIST_ARRAY;
 
-    /* Must not be empty */
+    /* Must not be empty.  If these fail, you probably didn't check for <len>
+     * being non-zero before trying to get the array */
     assert(*get_invlist_len_addr(invlist));
     assert(*get_invlist_zero_addr(invlist) == 0
 	   || *get_invlist_zero_addr(invlist) == 1);
@@ -5948,7 +5949,8 @@ S_get_invlist_len_addr(pTHX_ SV* invlist)
 PERL_STATIC_INLINE UV
 S_invlist_len(pTHX_ SV* const invlist)
 {
-    /* Returns the current number of elements in the inversion list's array */
+    /* Returns the current number of elements stored in the inversion list's
+     * array */
 
     PERL_ARGS_ASSERT_INVLIST_LEN;
 
@@ -6059,7 +6061,6 @@ S_invlist_trim(pTHX_ SV* const invlist)
 
 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
  * etc */
-
 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
 #define PREV_ELEMENT_IN_INVLIST_SET(i) (! ELEMENT_IN_INVLIST_SET(i))
 
@@ -6105,7 +6106,7 @@ Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV
 	    }
 	    else {
 		/* But if the end is the maximum representable on the machine,
-		 * just let the range that this would extend have no end */
+		 * just let the range that this would extend to have no end */
 		invlist_set_len(invlist, len - 1);
 	    }
 	    return;
@@ -6145,7 +6146,7 @@ void
 Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
 {
     /* Take the union of two inversion lists and point 'result' to it.  If
-     * 'result' on input points to one of the two lists, the reference count to
+     * 'output' on input points to one of the two lists, the reference count to
      * that list will be decremented.
      * The basis for this comes from "Unicode Demystified" Chapter 13 by
      * Richard Gillam, published by Addison-Wesley, and explained at some
@@ -6191,8 +6192,7 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
 	}
 	else if (output != &b) {
 	    *output = invlist_clone(b);
-	}
-	/* else *output already = b; */
+	} /* else *output already = b; */
 	return;
     }
     else if ((len_b = invlist_len(b)) == 0) {
@@ -6636,8 +6636,8 @@ S_invlist_clone(pTHX_ SV* const invlist)
 void
 Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
 {
-    /* Point result to an inversion list which consists of all elements in 'a'
-     * that aren't also in 'b' */
+    /* Point <result> to an inversion list which consists of all elements in
+     * <a> that aren't also in <b> */
 
     PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
 
@@ -6687,6 +6687,13 @@ S_invlist_iterinit(pTHX_ SV* invlist)	/* Initialize iterator for invlist */
 STATIC bool
 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
 {
+    /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
+     * This call sets in <*start> and <*end>, the next range in <invlist>.
+     * Returns <TRUE> if successful and the next call will return the next
+     * range; <FALSE> if was already at the end of the list.  If the latter,
+     * <*start> and <*end> are unchanged, and the next call to this function
+     * will start over at the beginning of the list */
+
     UV* pos = get_invlist_iter_addr(invlist);
     UV len = invlist_len(invlist);
     UV *array;
@@ -10469,10 +10476,10 @@ parseit:
 	    }
 	}
 
-	/* Only the characters in this class that participate in folds need
-	    * be checked.  Get the intersection of this class and all the
-	    * possible characters that are foldable.  This can quickly narrow
-	    * down a large class */
+	/* Only the characters in this class that participate in folds need be
+	 * checked.  Get the intersection of this class and all the possible
+	 * characters that are foldable.  This can quickly narrow down a large
+	 * class */
 	_invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
 
 	/* Now look at the foldable characters in this class individually */
@@ -10491,23 +10498,22 @@ parseit:
 
 		if (foldlen > (STRLEN)UNISKIP(f)) {
 
-		    /* Any multicharacter foldings (disallowed in
-			* lookbehind patterns) require the following
-			* transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
-			* E folds into "pq" and F folds into "rst", all other
-			* characters fold to single characters.  We save away
-			* these multicharacter foldings, to be later saved as
-			* part of the additional "s" data. */
+		    /* Any multicharacter foldings (disallowed in lookbehind
+		     * patterns) require the following transform: [ABCDEF] ->
+		     * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
+		     * folds into "rst", all other characters fold to single
+		     * characters.  We save away these multicharacter foldings,
+		     * to be later saved as part of the additional "s" data. */
 		    if (! RExC_in_lookbehind) {
 			U8* loc = foldbuf;
 			U8* e = foldbuf + foldlen;
 
-			/* If any of the folded characters of this are in
-			    * the Latin1 range, tell the regex engine that
-			    * this can match a non-utf8 target string.  The
-			    * only multi-byte fold whose source is in the
-			    * Latin1 range (U+00DF) applies only when the
-			    * target string is utf8, or under unicode rules */
+			/* If any of the folded characters of this are in the
+			 * Latin1 range, tell the regex engine that this can
+			 * match a non-utf8 target string.  The only multi-byte
+			 * fold whose source is in the Latin1 range (U+00DF)
+			 * applies only when the target string is utf8, or
+			 * under unicode rules */
 			if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
 			    while (loc < e) {
 
@@ -10520,8 +10526,8 @@ parseit:
 				if (UTF8_IS_INVARIANT(*loc)
 				    || UTF8_IS_DOWNGRADEABLE_START(*loc))
 				{
-				    /* Can't mix above and below 256 under
-					* LOC */
+                                    /* Can't mix above and below 256 under LOC
+                                     */
 				    if (LOC) {
 					goto end_multi_fold;
 				    }
@@ -10551,13 +10557,13 @@ parseit:
 		}
 		else {
 		    /* Single character fold.  Add everything in its fold
-			* closure to the list that this node should match */
+		     * closure to the list that this node should match */
 		    SV** listp;
 
-		    /* The fold closures data structure is a hash with the
-			* keys being every character that is folded to, like
-			* 'k', and the values each an array of everything that
-			* folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+		    /* The fold closures data structure is a hash with the keys
+		     * being every character that is folded to, like 'k', and
+		     * the values each an array of everything that folds to its
+		     * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
 		    if ((listp = hv_fetch(PL_utf8_foldclosures,
 				    (char *) foldbuf, foldlen, FALSE)))
 		    {
@@ -10571,9 +10577,9 @@ parseit:
 			    }
 			    c = SvUV(*c_p);
 
-			    /* /aa doesn't allow folds between ASCII and
-				* non-; /l doesn't allow them between above
-				* and below 256 */
+			    /* /aa doesn't allow folds between ASCII and non-;
+			     * /l doesn't allow them between above and below
+			     * 256 */
 			    if ((MORE_ASCII_RESTRICTED
 				 && (isASCII(c) != isASCII(j)))
 				    || (LOC && ((c < 256) != (j < 256))))
@@ -10587,9 +10593,9 @@ parseit:
 					(U8) c,
 					&l1_fold_invlist, &unicode_alternate);
 			    }
-				/* It may be that the code point is already
-				    * in this range or already in the bitmap,
-				    * in which case we need do nothing */
+				/* It may be that the code point is already in
+				 * this range or already in the bitmap, in
+				 * which case we need do nothing */
 			    else if ((c < start || c > end)
 					&& (c > 255
 					    || ! ANYOF_BITMAP_TEST(ret, c)))
@@ -10616,21 +10622,25 @@ parseit:
     }
 
     /* Here, we have calculated what code points should be in the character
-     * class.   Now we can see about various optimizations.  Fold calculation
-     * needs to take place before inversion.  Otherwise /[^k]/i would invert to
-     * include K, which under /i would match k. */
+     * class.
+     *
+     * Now we can see about various optimizations.  Fold calculation (which we
+     * did above) needs to take place before inversion.  Otherwise /[^k]/i
+     * would invert to include K, which under /i would match k, which it
+     * shouldn't. */
 
     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
-     * set the FOLD flag yet, so this this does optimize those.  It doesn't
+     * set the FOLD flag yet, so this does optimize those.  It doesn't
      * optimize locale.  Doing so perhaps could be done as long as there is
      * nothing like \w in it; some thought also would have to be given to the
      * interaction with above 0x100 chars */
-    if (! LOC
-	&& (ANYOF_FLAGS(ret) & ANYOF_INVERT)
+    if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
+        && ! LOC
 	&& ! unicode_alternate
 	/* In case of /d, there are some things that should match only when in
 	 * not in the bitmap, i.e., they require UTF8 to match.  These are
-	 * listed in nonbitmap. */
+	 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
+	 * case, they don't require UTF8, so can invert here */
 	&& (! nonbitmap
 	    || ! DEPENDS_SEMANTICS
 	    || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
@@ -10657,6 +10667,8 @@ parseit:
 		    ANYOF_BITMAP_SET(ret, value);
 		}
 	    }
+
+	    /* And do the removal */
 	    _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
 	    SvREFCNT_dec(remove_list);
 	}
@@ -10794,11 +10806,11 @@ parseit:
 	/* The 0th element stores the character class description
 	 * in its textual form: used later (regexec.c:Perl_regclass_swash())
 	 * to initialize the appropriate swash (which gets stored in
-	 * the 1st element), and also useful for dumping the regnode.
-	 * The 2nd element stores the multicharacter foldings,
+	 * element [1]), and also useful for dumping the regnode.
+	 * Element [2] stores the multicharacter foldings,
 	 * used later (regexec.c:S_reginclass()). */
 	av_store(av, 0, listsv);
-	av_store(av, 1, NULL);
+	av_store(av, 1, NULL);	/* Placeholder for generated swash */
 
         /* Store any computed multi-char folds only if we are allowing
          * them */
@@ -11610,14 +11622,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 	    sv_catpvs(sv, "{outside bitmap}");
 
 	if (ANYOF_NONBITMAP(o)) {
-	    SV *lv;
+	    SV *lv; /* Set if there is something outside the bit map */
 	    SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
 	
 	    if (lv) {
 		if (sw) {
 		    U8 s[UTF8_MAXBYTES_CASE+1];
 
-		    for (i = 0; i <= 256; i++) { /* just the first 256 */
+		    for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
 			uvchr_to_utf8(s, i);
 			
 			if (i < 256 && swash_fetch(sw, s, TRUE)) {
diff --git a/regcomp.h b/regcomp.h
index 81c8a5ddd7..0540c637c0 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -311,6 +311,8 @@ struct regnode_charclass_class {
  * are done to share them, as described below.  If necessary, the ANYOF_LOCALE
  * and ANYOF_CLASS bits could be shared with a space penalty for locale nodes,
  * but this isn't quite so easy, as the optimizer also uses ANYOF_CLASS.
+ * Another option would be to push them into new nodes.  E.g. there could be an
+ * ANYOF_LOCALE node that would be in place of the flag of the same name.
  * Once the planned change to compile all the above-latin1 code points is done,
  * then the UNICODE_ALL bit can be freed up, with a small performance penalty.
  * If flags need to be added that are applicable to the synthetic start class
diff --git a/utf8.c b/utf8.c
index 23308a368d..61c6c23e03 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2586,6 +2586,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
 
     PERL_ARGS_ASSERT_SWASH_FETCH;
 
+    /* Convert to utf8 if not already */
     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
 	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
 	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);