regcomp.c: Comments, white-space, rmv extra parens

This commit re-indents things after the previous commit added a block, fixes typos in comments, and removes obsolete references in them to the sizing pass, and adds some comments, and does some white-space changes. In one case it removes extraneous parentheses
author: Karl Williamson <khw@cpan.org> 2018-11-03 09:41:52 -0600
committer: Karl Williamson <khw@cpan.org> 2018-11-16 10:48:18 -0700
commit: d63d75f59284a29a35b3e7255d956475704ae54d (patch)
tree: 685224505b24d7f300a26c8ad98b4eeabb71f505
parent: b05a732d9d25b71bdbe97128041e14bbae45ff51 (diff)
download: perl-d63d75f59284a29a35b3e7255d956475704ae54d.tar.gz
1 files changed, 269 insertions, 274 deletions
diff --git a/regcomp.c b/regcomp.c
index d0bff113ff..3af555042c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -352,10 +352,8 @@ struct RExC_state_t {
                                      }                                     \
                              } STMT_END
 
-/* Change from /d into /u rules, and restart the parse if we've already seen
- * something whose size would increase as a result, by setting *flagp and
- * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
- * we've changed to /u during the parse.  */
+/* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
+ * a flag that indicates we've changed to /u during the parse.  */
 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
     STMT_START {                                                            \
             if (DEPENDS_SEMANTICS) {                                        \
@@ -7080,14 +7078,25 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
  * pm_flags field of the related PMOP. Currently we're only interested in
  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
  *
- * We can't allocate space until we know how big the compiled form will be,
- * but we can't compile it (and thus know how big it is) until we've got a
- * place to put the code.  So we cheat:  we compile it twice, once with code
- * generation turned off and size counting turned on, and once "for real".
- * This also means that we don't allocate space until we are sure that the
- * thing really will compile successfully, and we never have to move the
- * code and thus invalidate pointers into it.  (Note that it has to be in
- * one piece because free() must be able to free it all.) [NB: not true in perl]
+ * For many years this code had an initial sizing pass that calculated
+ * (sometimes incorrectly, leading to security holes) the size needed for the
+ * compiled pattern.  That was changed by commit
+ * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
+ * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
+ * references to this sizing pass.
+ *
+ * Now, an initial crude guess as to the size needed is made, based on the
+ * length of the pattern.  Patches welcome to improve that guess.  That amount
+ * of space is malloc'd and then immediately freed, and then clawed back node
+ * by node.  This design is to minimze, to the extent possible, memory churn
+ * when doing the the reallocs.
+ *
+ * A separate parentheses counting pass may be needed in some cases.
+ * (Previously the sizing pass did this.)  Patches welcome to reduce the number
+ * of these cases.
+ *
+ * The existence of a sizing pass necessitated design decisions that are no
+ * longer needed.  There are potential areas of simplification.
  *
  * Beware that the optimization-preparation code in here knows about some
  * of the structure of the compiled regexp.  [I'll say.]
@@ -7335,9 +7344,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     rx_flags = orig_rx_flags;
 
-    if (   initial_charset == REGEX_DEPENDS_CHARSET
-        && (RExC_uni_semantics))
-    {
+    if (initial_charset == REGEX_DEPENDS_CHARSET && RExC_uni_semantics) {
 
 	/* Set to use unicode semantics if the pattern is in utf8 and has the
 	 * 'depends' charset specified, as it means unicode when utf8  */
@@ -10829,7 +10836,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
  *
  * Returns 0 otherwise, with *flagp set to indicate why:
  *  TRYAGAIN        at the end of (?) that only sets flags.
- *  RESTART_PARSE   if the sizing scan needs to be restarted, or'd with
+ *  RESTART_PARSE   if the parse needs to be restarted, or'd with
  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
  *  happen.  */
@@ -12107,7 +12114,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  * On success, returns the offset at which any next node should be placed into
  * the regex engine program being compiled.
  *
- * Returns 0 otherwise, setting flagp to RESTART_PARSE if the sizing scan needs
+ * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
  * UTF-8
  */
@@ -12160,6 +12167,8 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
             if (     chain > (SSize_t) BRANCH_MAX_OFFSET
                 && ! RExC_use_BRANCHJ)
             {
+                /* XXX We could just redo this branch, but figuring out what
+                 * bookkeeping needs to be reset is a pain */
                 REQUIRE_BRANCHJ(flagp, 0);
             }
             REGTAIL(pRExC_state, chain, latest);
@@ -12193,7 +12202,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
  *
  * Returns 0 otherwise, with *flagp set to indicate why:
  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
- *  RESTART_PARSE   if the sizing scan needs to be restarted, or'd with
+ *  RESTART_PARSE   if the parse needs to be restarted, or'd with
  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
  */
 STATIC regnode_offset
@@ -12481,7 +12490,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
   * function calling S_reg().
   *
   * The final possibility is that it is premature to be calling this function;
-  * that pass1 needs to be restarted.  This can happen when this changes from
+  * the parse needs to be restarted.  This can happen when this changes from
   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
   * latter occurs only when the fourth possibility would otherwise be in
   * effect, and is because one of those code points requires the pattern to be
@@ -12814,8 +12823,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
      *
      * If <len> is zero, the function assumes that the node is to contain only
      * the single character given by <code_point> and calculates what <len>
-     * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
-     * additionally will populate the node's STRING with <code_point> or its
+     * should be.  It populates the node's STRING with <code_point> or its
      * fold if folding.
      *
      * In both cases <*flagp> is appropriately set
@@ -12857,7 +12865,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
                 }
                 len = 1;
             }
-            else if (FOLD && (! LOC
+            else if (FOLD && (   ! LOC
                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
             {   /* Folding, and ok to do so now */
                 UV folded = _to_uni_fold_flags(
@@ -13038,7 +13046,7 @@ S_backref_value(char *p, char *e)
    at which any  next regnode should be placed.
 
    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
-   Returns 0, setting *flagp to RESTART_PARSE if the sizing scan needs to be
+   Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
    Otherwise does not return 0.
 
@@ -13201,7 +13209,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 	/* Special Escapes
 
 	   This switch handles escape sequences that resolve to some kind
-	   of special regop and not to literal text. Escape sequnces that
+	   of special regop and not to literal text. Escape sequences that
 	   resolve to literal text are handled below in the switch marked
 	   "Literal Escapes".
 
@@ -13217,8 +13225,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 	    RExC_seen_zerolen++;
 	    ret = reg_node(pRExC_state, SBOL);
             /* SBOL is shared with /^/ so we set the flags so we can tell
-             * /\A/ from /^/ in split. We check ret because first pass we
-             * have no regop struct to set the flags on. */
+             * /\A/ from /^/ in split. */
             FLAGS(REGNODE_p(ret)) = 1;
 	    *flagp |= SIMPLE;
 	    goto finish_meta_pat;
@@ -13698,15 +13705,17 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             /* We start out as an EXACT node, even if under /i, until we find a
              * character which is in a fold.  The algorithm now segregates into
              * separate nodes, characters that fold from those that don't under
-             * /i.  (This hopefull will create nodes that are fixed strings
-             * even under /i, giving the optimizer something to grab onto to.)
+             * /i.  (This hopefully will create nodes that are fixed strings
+             * even under /i, giving the optimizer something to grab on to.)
              * So, if a node has something in it and the next character is in
              * the opposite category, that node is closed up, and the function
              * returns.  Then regatom is called again, and a new node is
              * created for the new category. */
             U8 node_type = EXACT;
 
-            /* Assume node will be fully used; the excess is given back at the end */
+            /* Assume the node will be fully used; the excess is given back at
+             * the end.  We can't make any other length assumptions, as a byte
+             * input sequence could shrink down. */
             Ptrdiff_t initial_size = STR_SZ(256);
 
             bool next_is_quantifier;
@@ -13720,8 +13729,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
              * contain only above-Latin1 characters (hence must be in UTF8),
              * which don't participate in folds with Latin1-range characters,
-             * as the latter's folds aren't known until runtime.  (We don't
-             * need to figure this out until pass 2) */
+             * as the latter's folds aren't known until runtime. */
             bool maybe_exactfu = TRUE;
 
             /* To see if RExC_uni_semantics changes during parsing of the node.
@@ -14118,19 +14126,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     /* This code point means we can't simplify things */
                     maybe_exactfu = FALSE;
 
-                    /* A problematic code point in this context means that its
-                     * fold isn't known until runtime, so we can't fold it now.
-                     * (The non-problematic code points are the above-Latin1
-                     * ones that fold to also all above-Latin1.  Their folds
-                     * don't vary no matter what the locale is.) But here we
-                     * have characters whose fold depends on the locale.
-                     * Unlike the non-folding case above, we have to keep track
-                     * of these in the sizing pass, so that we can make sure we
-                     * don't split too-long nodes in the middle of a potential
-                     * multi-char fold.  And unlike the regular fold case
-                     * handled in the else clauses below, we don't actually
-                     * fold and don't have special cases to consider.  What we
-                     * do for both passes is the PASS2 code for non-folding */
+                    /* Here, we are adding a problematic fold character.
+                     * "Problematic" in this context means that its fold isn't
+                     * known until runtime.  (The non-problematic code points
+                     * are the above-Latin1 ones that fold to also all
+                     * above-Latin1.  Their folds don't vary no matter what the
+                     * locale is.) But here we have characters whose fold
+                     * depends on the locale.  We just add in the unfolded
+                     * character, and wait until runtime to fold it */
                     goto not_fold_common;
                 }
                 else                /* A regular FOLD code point */
@@ -14303,7 +14306,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             added_len = foldlen;
                         }
                     }
-		}
+		} /* End of adding current character to the node */
 
                 len += added_len;
 
@@ -14477,6 +14480,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
           loopdone:   /* Jumped to when encounters something that shouldn't be
                          in the node */
 
+            /* Free up any over-allocated space */
             change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
 
             /* I (khw) don't know if you can get here with zero length, but the
@@ -14697,17 +14701,7 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
      * In b) there may be errors or warnings generated.  If 'check_only' is
      * TRUE, then any errors are discarded.  Warnings are returned to the
      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
-     * instead it is NULL, warnings are suppressed.  This is done in all
-     * passes.  The reason for this is that the rest of the parsing is heavily
-     * dependent on whether this routine found a valid posix class or not.  If
-     * it did, the closing ']' is absorbed as part of the class.  If no class,
-     * or an invalid one is found, any ']' will be considered the terminator of
-     * the outer bracketed character class, leading to very different results.
-     * In particular, a '(?[ ])' construct will likely have a syntax error if
-     * the class is parsed other than intended, and this will happen in pass1,
-     * before the warnings would normally be output.  This mechanism allows the
-     * caller to output those warnings in pass1 just before dieing, giving a
-     * much better clue as to what is wrong.
+     * instead it is NULL, warnings are suppressed.
      *
      * The reason for this function, and its complexity is that a bracketed
      * character class can contain just about anything.  But it's easy to
@@ -16475,7 +16469,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * On success, returns the offset at which any next node should be placed
      * into the regex engine program being compiled.
      *
-     * Returns 0 otherwise, setting flagp to RESTART_PARSE if the sizing scan needs
+     * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
      * UTF-8
      */
@@ -17272,11 +17266,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 POSIXL_SET(posixl, namedclass);
 
                 /* The above-Latin1 characters are not subject to locale rules.
-                 * Just add them, in the second pass, to the
-                 * unconditionally-matched list */
+                 * Just add them to the unconditionally-matched list */
 
-                /* Get the list of the above-Latin1 code points this
-                    * matches */
+                /* Get the list of the above-Latin1 code points this matches */
                 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
                                         PL_XPosix_ptrs[classnum],
 
@@ -17284,10 +17276,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                         * NDIGIT, NASCII, ... */
                                         namedclass % 2 != 0,
                                         &scratch_list);
-                /* Checking if 'cp_list' is NULL first saves an extra
-                    * clone.  Its reference count will be decremented at the
-                    * next union, etc, or if this is the only instance, at the
-                    * end of the routine */
+                /* Checking if 'cp_list' is NULL first saves an extra clone.
+                 * Its reference count will be decremented at the next union,
+                 * etc, or if this is the only instance, at the end of the
+                 * routine */
                 if (! cp_list) {
                     cp_list = scratch_list;
                 }
@@ -17299,10 +17291,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             }
             else {
 
-                /* Here, not in pass1 (in that pass we skip calculating the
-                 * contents of this class), and is not /l, or is a POSIX class
-                 * for which /l doesn't matter (or is a Unicode property, which
-                 * is skipped here). */
+                /* Here, is not /l, or is a POSIX class for which /l doesn't
+                 * matter (or is a Unicode property, which is skipped here). */
                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
 
@@ -17446,8 +17436,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          * <prevvalue> is the beginning of the range, if any; or <value> if
          * not. */
 
-	/* non-Latin1 code point implies unicode semantics.  Must be set in
-	 * pass1 so is there for the whole of pass 2 */
+	/* non-Latin1 code point implies unicode semantics. */
 	if (value > 255) {
             REQUIRE_UNI_RULES(flagp, 0);
 	}
@@ -18198,245 +18187,252 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         }
         else if (cp_list && ! invert) {
 
-        invlist_iterinit(cp_list);
-        if (! invlist_iternext(cp_list, &start, &end)) {
-
-            /* Here, the list is empty.  This happens, for example, when a
-             * Unicode property that doesn't match anything is the only element
-             * in the character class (perluniprops.pod notes such properties).
-             * */
-            op = OPFAIL;
-            *flagp |= HASWIDTH|SIMPLE;
-        }
-        else if (start == end) {    /* The range is a single code point */
-            if (! invlist_iternext(cp_list, &start, &end)
+            invlist_iterinit(cp_list);
+            if (! invlist_iternext(cp_list, &start, &end)) {
 
-                    /* Don't do this optimization if it would require changing
-                     * the pattern to UTF-8 */
-                && (start < 256 || UTF))
-            {
-                /* Here, the list contains a single code point.  Can optimize
-                 * into an EXACTish node */
+                /* Here, the list is empty.  This happens, for example, when a
+                 * Unicode property that doesn't match anything is the only
+                 * element in the character class (perluniprops.pod notes such
+                 * properties).  */
+                op = OPFAIL;
+                *flagp |= HASWIDTH|SIMPLE;
+            }
+            else if (start == end) {    /* The range is a single code point */
+                if (! invlist_iternext(cp_list, &start, &end)
 
-                value = start;
+                        /* Don't do this optimization if it would require
+                         * changing the pattern to UTF-8 */
+                    && (start < 256 || UTF))
+                {
+                    /* Here, the list contains a single code point.  Can
+                     * optimize into an EXACTish node */
 
-                if (! FOLD) {
-                    op = (LOC)
-                         ? EXACTL
-                         : EXACT;
-                }
-                else if (LOC) {
+                    value = start;
 
-                    /* A locale node under folding with one code point can be
-                     * an EXACTFL, as its fold won't be calculated until
-                     * runtime */
-                    op = EXACTFL;
-                }
-                else {
+                    if (! FOLD) {
+                        op = (LOC)
+                             ? EXACTL
+                             : EXACT;
+                    }
+                    else if (LOC) {
 
-                    /* Here, we are generally folding, but there is only one
-                     * code point to match.  If we have to, we use an EXACT
-                     * node, but it would be better for joining with adjacent
-                     * nodes in the optimization pass if we used the same
-                     * EXACTFish node that any such are likely to be.  We can
-                     * do this iff the code point doesn't participate in any
-                     * folds.  For example, an EXACTF of a colon is the same as
-                     * an EXACT one, since nothing folds to or from a colon. */
-                    if (value < 256) {
-                        if (IS_IN_SOME_FOLD_L1(value)) {
-                            op = EXACT;
-                        }
+                        /* A locale node under folding with one code point can
+                         * be an EXACTFL, as its fold won't be calculated until
+                         * runtime */
+                        op = EXACTFL;
                     }
                     else {
-                        if (_invlist_contains_cp(PL_utf8_foldable, value)) {
-                            op = EXACT;
+
+                        /* Here, we are generally folding, but there is only
+                         * one code point to match.  If we have to, we use an
+                         * EXACT node, but it would be better for joining with
+                         * adjacent nodes in the optimization phase if we used
+                         * the same EXACTFish node that any such are likely to
+                         * be.  We can do this iff the code point doesn't
+                         * participate in any folds.  For example, an EXACTF of
+                         * a colon is the same as an EXACT one, since nothing
+                         * folds to or from a colon. */
+                        if (value < 256) {
+                            if (IS_IN_SOME_FOLD_L1(value)) {
+                                op = EXACT;
+                            }
+                        }
+                        else {
+                            if (_invlist_contains_cp(PL_utf8_foldable, value)) {
+                                op = EXACT;
+                            }
                         }
-                    }
 
-                    /* If we haven't found the node type, above, it means we
-                     * can use the prevailing one */
-                    if (op == END) {
-                        op = compute_EXACTish(pRExC_state);
+                        /* If we haven't found the node type, above, it means
+                         * we can use the prevailing one */
+                        if (op == END) {
+                            op = compute_EXACTish(pRExC_state);
+                        }
                     }
                 }
+            }   /* End of first range contains just a single code point */
+            else if (start == 0) {
+                if (end == UV_MAX) {
+                    op = SANY;
+                    *flagp |= HASWIDTH|SIMPLE;
+                    MARK_NAUGHTY(1);
+                }
+                else if (end == '\n' - 1
+                        && invlist_iternext(cp_list, &start, &end)
+                        && start == '\n' + 1 && end == UV_MAX)
+                {
+                    op = REG_ANY;
+                    *flagp |= HASWIDTH|SIMPLE;
+                    MARK_NAUGHTY(1);
+                }
             }
-        }   /* End of first range contains just a single code point */
-        else if (start == 0) {
-            if (end == UV_MAX) {
-                op = SANY;
-                *flagp |= HASWIDTH|SIMPLE;
-                MARK_NAUGHTY(1);
-            }
-            else if (end == '\n' - 1
-                    && invlist_iternext(cp_list, &start, &end)
-                    && start == '\n' + 1 && end == UV_MAX)
-            {
-                op = REG_ANY;
-                *flagp |= HASWIDTH|SIMPLE;
-                MARK_NAUGHTY(1);
-            }
-        }
-        invlist_iterfinish(cp_list);
+            invlist_iterfinish(cp_list);
 
-        if (op == END) {
+            if (op == END) {
 
-            /* Here, didn't find an optimization.  See if this matches any of
-             * the POSIX classes.  First try ASCII */
+                /* Here, didn't find an optimization.  See if this matches any
+                 * of the POSIX classes.  First try ASCII */
 
-            if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
-                op = ASCII;
-                *flagp |= HASWIDTH|SIMPLE;
-            }
-            else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
-                op = NASCII;
-                *flagp |= HASWIDTH|SIMPLE;
-            }
-            else {
+                if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
+                    op = ASCII;
+                    *flagp |= HASWIDTH|SIMPLE;
+                }
+                else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
+                    op = NASCII;
+                    *flagp |= HASWIDTH|SIMPLE;
+                }
+                else {
 
-                /* Then try the other POSIX classes.  The POSIXA ones are about
-                 * the same speed as ANYOF ops, but take less room; the ones
-                 * that have above-Latin1 code point matches are somewhat
-                 * faster than ANYOF. */
+                    /* Then try the other POSIX classes.  The POSIXA ones are
+                     * about the same speed as ANYOF ops, but take less room;
+                     * the ones that have above-Latin1 code point matches are
+                     * somewhat faster than ANYOF. */
 
-                for (posix_class = 0;
-                     posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
-                     posix_class++)
-                {
-                    int try_inverted;
+                    for (posix_class = 0;
+                         posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+                         posix_class++)
+                    {
+                        int try_inverted;
 
-                    for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+                        for (try_inverted = 0; try_inverted < 2; try_inverted++)
+                        {
 
-                        /* Check if matches POSIXA, normal or inverted */
-                        if (PL_Posix_ptrs[posix_class]) {
+                            /* Check if matches POSIXA, normal or inverted */
+                            if (PL_Posix_ptrs[posix_class]) {
+                                if (_invlistEQ(cp_list,
+                                               PL_Posix_ptrs[posix_class],
+                                               try_inverted))
+                                {
+                                    op = (try_inverted)
+                                        ? NPOSIXA
+                                        : POSIXA;
+                                    *flagp |= HASWIDTH|SIMPLE;
+                                    goto found_posix;
+                                }
+                            }
+
+                            /* Check if matches POSIXU, normal or inverted */
                             if (_invlistEQ(cp_list,
-                                           PL_Posix_ptrs[posix_class],
+                                           PL_XPosix_ptrs[posix_class],
                                            try_inverted))
                             {
                                 op = (try_inverted)
-                                    ? NPOSIXA
-                                    : POSIXA;
+                                     ? NPOSIXU
+                                     : POSIXU;
                                 *flagp |= HASWIDTH|SIMPLE;
                                 goto found_posix;
                             }
                         }
-
-                        /* Check if matches POSIXU, normal or inverted */
-                        if (_invlistEQ(cp_list,
-                                       PL_XPosix_ptrs[posix_class],
-                                       try_inverted))
-                        {
-                            op = (try_inverted)
-                                 ? NPOSIXU
-                                 : POSIXU;
-                            *flagp |= HASWIDTH|SIMPLE;
-                            goto found_posix;
-                        }
                     }
-                }
-              found_posix: ;
-            }
-
-            /* If it didn't match a POSIX class, it might be able to be turned
-             * into an ANYOFM node.  Compare two different bytes, bit-by-bit.
-             * In some positions, the bits in each will be 1; and in other
-             * positions both will be 0; and in some positions the bit will be
-             * 1 in one byte, and 0 in the other.  Let 'n' be the number of
-             * positions where the bits differ.  We create a mask which has
-             * exactly 'n' 0 bits, each in a position where the two bytes
-             * differ.  Now take the set of all bytes that when ANDed with the
-             * mask yield the same result.  That set has 2**n elements, and is
-             * representable by just two 8 bit numbers: the result and the
-             * mask.  Importantly, matching the set can be vectorized by
-             * creating a word full of the result bytes, and a word full of the
-             * mask bytes, yielding a significant speed up.  Here, see if this
-             * node matches such a set.  As a concrete example consider [01],
-             * and the byte representing '0' which is 0x30 on ASCII machines.
-             * It has the bits 0011 0000.  Take the mask 1111 1110.  If we AND
-             * 0x31 and 0x30 with that mask we get 0x30.  Any other bytes ANDed
-             * yield something else.  So [01], which is a common usage, is
-             * optimizable into ANYOFM, and can benefit from the speed up.  We
-             * can only do this on UTF-8 invariant bytes, because the variance
-             * would throw this off.  */
-            if (   op == END
-                && invlist_highest(cp_list) <=
+                  found_posix: ;
+                }
+
+                /* If it didn't match a POSIX class, it might be able to be
+                 * turned into an ANYOFM node.  Compare two different bytes,
+                 * bit-by-bit.  In some positions, the bits in each will be 1;
+                 * and in other positions both will be 0; and in some positions
+                 * the bit will be 1 in one byte, and 0 in the other.  Let 'n'
+                 * be the number of positions where the bits differ.  We create
+                 * a mask which has exactly 'n' 0 bits, each in a position
+                 * where the two bytes differ.  Now take the set of all bytes
+                 * that when ANDed with the mask yield the same result.  That
+                 * set has 2**n elements, and is representable by just two 8
+                 * bit numbers: the result and the mask.  Importantly, matching
+                 * the set can be vectorized by creating a word full of the
+                 * result bytes, and a word full of the mask bytes, yielding a
+                 * significant speed up.  Here, see if this node matches such a
+                 * set.  As a concrete example consider [01], and the byte
+                 * representing '0' which is 0x30 on ASCII machines.  It has
+                 * the bits 0011 0000.  Take the mask 1111 1110.  If we AND
+                 * 0x31 and 0x30 with that mask we get 0x30.  Any other bytes
+                 * ANDed yield something else.  So [01], which is a common
+                 * usage, is optimizable into ANYOFM, and can benefit from the
+                 * speed up.  We can only do this on UTF-8 invariant bytes,
+                 * because the variance would throw this off.  */
+                if (   op == END
+                    && invlist_highest(cp_list) <=
 #ifdef EBCDIC
-                                               0xFF
+                                                   0xFF
 #else
-                                               0x7F
+                                                   0x7F
 #endif
-            ) {
-                Size_t cp_count = 0;
-                bool first_time = TRUE;
-                unsigned int lowest_cp = 0xFF;
-                U8 bits_differing = 0;
-
-                /* Only needed on EBCDIC, as there, variants and non- are mixed
-                 * together.  Could #ifdef it out on ASCII, but probably the
-                 * compiler will optimize it out */
-                bool has_variant = FALSE;
-
-                /* Go through the bytes and find the bit positions that differ */
-                invlist_iterinit(cp_list);
-                while (invlist_iternext(cp_list, &start, &end)) {
-                    unsigned int i = start;
-
-                    cp_count += end - start + 1;
-
-                    if (first_time) {
-                        if (! UVCHR_IS_INVARIANT(i)) {
-                            has_variant = TRUE;
-                            continue;
-                        }
-
-                        first_time = FALSE;
-                        lowest_cp = start;
+                ) {
+                    Size_t cp_count = 0;
+                    bool first_time = TRUE;
+                    unsigned int lowest_cp = 0xFF;
+                    U8 bits_differing = 0;
+
+                    /* Only needed on EBCDIC, as there, variants and non- are
+                     * mixed together.  Could #ifdef it out on ASCII, but
+                     * probably the compiler will optimize it out */
+                    bool has_variant = FALSE;
+
+                    /* Go through the bytes and find the bit positions that
+                     * differ */
+                    invlist_iterinit(cp_list);
+                    while (invlist_iternext(cp_list, &start, &end)) {
+                        unsigned int i = start;
+
+                        cp_count += end - start + 1;
+
+                        if (first_time) {
+                            if (! UVCHR_IS_INVARIANT(i)) {
+                                has_variant = TRUE;
+                                continue;
+                            }
 
-                        i++;
-                    }
+                            first_time = FALSE;
+                            lowest_cp = start;
 
-                    /* Find the bit positions that differ from the lowest code
-                     * point in the node.  Keep track of all such positions by
-                     * OR'ing */
-                    for (; i <= end; i++) {
-                        if (! UVCHR_IS_INVARIANT(i)) {
-                            has_variant = TRUE;
-                            continue;
+                            i++;
                         }
 
-                        bits_differing  |= i ^ lowest_cp;
+                        /* Find the bit positions that differ from the lowest
+                         * code point in the node.  Keep track of all such
+                         * positions by OR'ing */
+                        for (; i <= end; i++) {
+                            if (! UVCHR_IS_INVARIANT(i)) {
+                                has_variant = TRUE;
+                                continue;
+                            }
+
+                            bits_differing  |= i ^ lowest_cp;
+                        }
                     }
-                }
-                invlist_iterfinish(cp_list);
-
-                /* At the end of the loop, we count how many bits differ from
-                 * the bits in lowest code point, call the count 'd'.  If the
-                 * set we found contains 2**d elements, it is the closure of
-                 * all code points that differ only in those bit positions.  To
-                 * convince yourself of that, first note that the number in the
-                 * closure must be a power of 2, which we test for.  The only
-                 * way we could have that count and it be some differing set,
-                 * is if we got some code points that don't differ from the
-                 * lowest code point in any position, but do differ from each
-                 * other in some other position.  That means one code point has
-                 * a 1 in that position, and another has a 0.  But that would
-                 * mean that one of them differs from the lowest code point in
-                 * that position, which possibility we've already excluded. */
-                if ( ! has_variant
-                    && cp_count == 1U << PL_bitcount[bits_differing])
-                {
-                    assert(cp_count > 1);
-                    op = ANYOFM;
+                    invlist_iterfinish(cp_list);
+
+                    /* At the end of the loop, we count how many bits differ
+                     * from the bits in lowest code point, call the count 'd'.
+                     * If the set we found contains 2**d elements, it is the
+                     * closure of all code points that differ only in those bit
+                     * positions.  To convince yourself of that, first note
+                     * that the number in the closure must be a power of 2,
+                     * which we test for.  The only way we could have that
+                     * count and it be some differing set, is if we got some
+                     * code points that don't differ from the lowest code point
+                     * in any position, but do differ from each other in some
+                     * other position.  That means one code point has a 1 in
+                     * that position, and another has a 0.  But that would mean
+                     * that one of them differs from the lowest code point in
+                     * that position, which possibility we've already excluded.
+                     * */
+                    if ( ! has_variant
+                        && cp_count == 1U << PL_bitcount[bits_differing])
+                    {
+                        assert(cp_count > 1);
+                        op = ANYOFM;
 
-                    /* We need to make the bits that differ be 0's */
-                    ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
+                        /* We need to make the bits that differ be 0's */
+                        ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS
+                                                         */
 
-                    /* The argument is the lowest code point */
-                    anode_arg = lowest_cp;
-                    *flagp |= HASWIDTH|SIMPLE;
+                        /* The argument is the lowest code point */
+                        anode_arg = lowest_cp;
+                        *flagp |= HASWIDTH|SIMPLE;
+                    }
                 }
             }
         }
-        }
 
         if (op != END) {
             RExC_parse = (char *)orig_parse;
@@ -18465,7 +18461,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             SvREFCNT_dec_NN(cp_list);
             return ret;
         }
-    }
+    }   /* End of seeing if can optimize it into a different node */
 
     /* It's going to be an ANYOF node. */
     op = (use_anyofd)
@@ -19004,11 +19000,10 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
 STATIC regnode_offset
 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
 {
-    /* Allocate a regnode for 'op', with 'extra_size' extra space.  In pass1,
-     * it aligns and increments RExC_size; in pass2, RExC_emit
+    /* Allocate a regnode for 'op', with 'extra_size' extra space.  It aligns
+     * and increments RExC_size and RExC_emit
      *
-     * It returns the renode's offset into the regex engine program (meaningful
-     * only in pass2 */
+     * It returns the regnode's offset into the regex engine program */
 
     const regnode_offset ret = RExC_emit;
author	Karl Williamson <khw@cpan.org>	2018-11-03 09:41:52 -0600
committer	Karl Williamson <khw@cpan.org>	2018-11-16 10:48:18 -0700
commit	d63d75f59284a29a35b3e7255d956475704ae54d (patch)
tree	685224505b24d7f300a26c8ad98b4eeabb71f505
parent	b05a732d9d25b71bdbe97128041e14bbae45ff51 (diff)
download	perl-d63d75f59284a29a35b3e7255d956475704ae54d.tar.gz