regcomp.c: white-space; comments

author: Karl Williamson <khw@cpan.org> 2020-05-02 06:58:57 -0600
committer: Karl Williamson <khw@cpan.org> 2021-05-31 12:55:19 -0600
commit: 403d7eb3e4320188571cf61b9dab62ff10799f49 (patch)
tree: 092639203afcb911449ca5179ac30fcd965166a8 /regcomp.c
parent: 1f4fbd3b4b26604673abca2a5f911744e826b1f3 (diff)
download: perl-403d7eb3e4320188571cf61b9dab62ff10799f49.tar.gz
1 files changed, 239 insertions, 268 deletions
diff --git a/regcomp.c b/regcomp.c
index e28ff3d61e..962f4cb941 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -19295,7 +19295,8 @@ S_optimize_regclass(pTHX_
      * any created regnode.  If the new op is sufficiently like plain ANYOF, it
      * leaves *ret unchanged for allocation in S_regclass.
      *
-     * Certain of the parameters may be updated as a result of the changes herein */
+     * Certain of the parameters may be updated as a result of the changes
+     * herein */
 
         U8 op = ANYOF; /* The returned node-type, initialized to the unoptimized
                         one. */
@@ -19308,9 +19309,9 @@ S_optimize_regclass(pTHX_
 
         PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
 
-        if (cp_list) { /* Count the code points in enough ranges that we would
-                          see all the ones possible in any fold in this version
-                          of Unicode */
+    if (cp_list) { /* Count the code points in enough ranges that we would see
+                      all the ones possible in any fold in this version of
+                      Unicode */
 
             invlist_iterinit(cp_list);
             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
@@ -19326,8 +19327,8 @@ S_optimize_regclass(pTHX_
             invlist_iterfinish(cp_list);
         }
 
-        /* If we know at compile time that this matches every possible code
-         * point, any run-time dependencies don't matter */
+    /* If we know at compile time that this matches every possible code point,
+     * any run-time dependencies don't matter */
         if (start[0] == 0 && end[0] == UV_MAX) {
             if (*invert) {
                 op = OPFAIL;
@@ -19341,13 +19342,11 @@ S_optimize_regclass(pTHX_
             return op;
         }
 
-        /* Similarly, for /l posix classes, if both a class and its
-         * complement match, any run-time dependencies don't matter */
+    /* Similarly, for /l posix classes, if both a class and its complement
+     * match, any run-time dependencies don't matter */
         if (posixl) {
             int namedclass;
-            for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
-                                                        namedclass += 2)
-            {
+        for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
                 {
@@ -19364,20 +19363,20 @@ S_optimize_regclass(pTHX_
                 }
             }
 
-            /* For well-behaved locales, some classes are subsets of others,
-             * so complementing the subset and including the non-complemented
-             * superset should match everything, like [\D[:alnum:]], and
+        /* For well-behaved locales, some classes are subsets of others, so
+         * complementing the subset and including the non-complemented superset
+         * should match everything, like [\D[:alnum:]], and
              * [[:^alpha:][:alnum:]], but some implementations of locales are
              * buggy, and khw thinks its a bad idea to have optimization change
              * behavior, even if it avoids an OS bug in a given case */
 
 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
 
-            /* If is a single posix /l class, can optimize to just that op.
-             * Such a node will not match anything in the Latin1 range, as that
-             * is not determinable until runtime, but will match whatever the
-             * class does outside that range.  (Note that some classes won't
-             * match anything outside the range, like [:ascii:]) */
+        /* If is a single posix /l class, can optimize to just that op.  Such a
+         * node will not match anything in the Latin1 range, as that is not
+         * determinable until runtime, but will match whatever the class does
+         * outside that range.  (Note that some classes won't match anything
+         * outside the range, like [:ascii:]) */
             if (    isSINGLE_BIT_SET(posixl)
                 && (partial_cp_count == 0 || start[0] > 255))
             {
@@ -19390,8 +19389,7 @@ S_optimize_regclass(pTHX_
                  * ANYOF_CNTRL.  From
                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
                  * */
-                static const int MultiplyDeBruijnBitPosition2[32] =
-                    {
+            static const int MultiplyDeBruijnBitPosition2[32] = {
                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
                     };
@@ -19402,20 +19400,18 @@ S_optimize_regclass(pTHX_
 
                 /* The named classes are such that the inverted number is one
                  * larger than the non-inverted one */
-                already_inverted = namedclass
-                                 - classnum_to_namedclass(classnum);
+            already_inverted = namedclass - classnum_to_namedclass(classnum);
 
-                /* Create an inversion list of the official property, inverted
-                 * if the constructed node list is inverted, and restricted to
-                 * only the above latin1 code points, which are the only ones
-                 * known at compile time */
+            /* Create an inversion list of the official property, inverted if
+             * the constructed node list is inverted, and restricted to only
+             * the above latin1 code points, which are the only ones known at
+             * compile time */
                 _invlist_intersection_maybe_complement_2nd(
                                                     PL_AboveLatin1,
                                                     PL_XPosix_ptrs[classnum],
                                                     already_inverted,
                                                     &class_above_latin1);
-                are_equivalent = _invlistEQ(class_above_latin1, cp_list,
-                                                                        FALSE);
+            are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
                 SvREFCNT_dec_NN(class_above_latin1);
 
                 if (are_equivalent) {
@@ -19432,18 +19428,17 @@ S_optimize_regclass(pTHX_
             }
         }
 
-        /* khw can't think of any other possible transformation involving
-         * these. */
+    /* khw can't think of any other possible transformation involving these. */
         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
             return op;
         }
 
         if (! has_runtime_dependency) {
 
-            /* If the list is empty, nothing matches.  This happens, for
-             * example, when a Unicode property that doesn't match anything is
-             * the only element in the character class (perluniprops.pod notes
-             * such properties). */
+        /* If the list is empty, nothing matches.  This happens, for example,
+         * when a Unicode property that doesn't match anything is the only
+         * element in the character class (perluniprops.pod notes such
+         * properties). */
             if (partial_cp_count == 0) {
                 if (*invert) {
                     op = SANY;
@@ -19470,47 +19465,42 @@ S_optimize_regclass(pTHX_
         }
 
         /* Next see if can optimize classes that contain just a few code points
-         * into an EXACTish node.  The reason to do this is to let the
-         * optimizer join this node with adjacent EXACTish ones, and ANYOF
-         * nodes require conversion to code point from UTF-8.
+     * into an EXACTish node.  The reason to do this is to let the optimizer
+     * join this node with adjacent EXACTish ones, and ANYOF nodes require
+     * runtime conversion to code point from UTF-8.
          *
-         * An EXACTFish node can be generated even if not under /i, and vice
-         * versa.  But care must be taken.  An EXACTFish node has to be such
-         * that it only matches precisely the code points in the class, but we
-         * want to generate the least restrictive one that does that, to
-         * increase the odds of being able to join with an adjacent node.  For
-         * example, if the class contains [kK], we have to make it an EXACTFAA
-         * node to prevent the KELVIN SIGN from matching.  Whether we are under
-         * /i or not is irrelevant in this case.  Less obvious is the pattern
-         * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
-         * supposed to match the single character U+0149 LATIN SMALL LETTER N
-         * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
-         * that includes \X{02BC}, there is a multi-char fold that does, and so
-         * the node generated for it must be an EXACTFish one.  On the other
-         * hand qr/:/i should generate a plain EXACT node since the colon
-         * participates in no fold whatsoever, and having it EXACT tells the
-         * optimizer the target string cannot match unless it has a colon in
-         * it.
+     * An EXACTFish node can be generated even if not under /i, and vice versa.
+     * But care must be taken.  An EXACTFish node has to be such that it only
+     * matches precisely the code points in the class, but we want to generate
+     * the least restrictive one that does that, to increase the odds of being
+     * able to join with an adjacent node.  For example, if the class contains
+     * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
+     * from matching.  Whether we are under /i or not is irrelevant in this
+     * case.  Less obvious is the pattern qr/[\x{02BC}]n/i.  U+02BC is MODIFIER
+     * LETTER APOSTROPHE. That is supposed to match the single character U+0149
+     * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE.  And so even though there
+     * is no simple fold that includes \x{02BC}, there is a multi-char fold
+     * that does, and so the node generated for it must be an EXACTFish one.
+     * On the other hand qr/:/i should generate a plain EXACT node since the
+     * colon participates in no fold whatsoever, and having it EXACT tells the
+     * optimizer the target string cannot match unless it has a colon in it.
          */
         if (   ! posixl
             && ! *invert
 
-                /* Only try if there are no more code points in the class than
-                 * in the max possible fold */
+        /* Only try if there are no more code points in the class than in
+         * the max possible fold */
             &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
         {
-            if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
-            {
-                /* We can always make a single code point class into an
-                 * EXACTish node. */
-
+        /* We can always make a single code point class into an EXACTish node.
+         * */
+        if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
                 if (LOC) {
 
-                    /* Here is /l:  Use EXACTL, except if there is a fold not
-                     * known until runtime so shows as only a single code point
-                     * here.  For code points above 255, we know which can
-                     * cause problems by having a potential fold to the Latin1
-                     * range. */
+            /* Here is /l:  Use EXACTL, except if there is a fold not known
+             * until runtime so shows as only a single code point here.
+             * For code points above 255, we know which can cause problems
+             * by having a potential fold to the Latin1 range. */
                     if (  ! FOLD
                         || (     start[0] > 255
                             && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
@@ -19528,30 +19518,28 @@ S_optimize_regclass(pTHX_
                                               small */
 
                     /* Under /i, it gets a little tricky.  A code point that
-                     * doesn't participate in a fold should be an EXACT node.
-                     * We know this one isn't the result of a simple fold, or
-                     * there'd be more than one code point in the list, but it
-                     * could be part of a multi- character fold.  In that case
-                     * we better not create an EXACT node, as we would wrongly
-                     * be telling the optimizer that this code point must be in
-                     * the target string, and that is wrong.  This is because
-                     * if the sequence around this code point forms a
-                     * multi-char fold, what needs to be in the string could be
-                     * the code point that folds to the sequence.
+                 * doesn't participate in a fold should be an EXACT node.  We
+                 * know this one isn't the result of a simple fold, or there'd
+                 * be more than one code point in the list, but it could be
+                 * part of a multi- character fold.  In that case we better not
+                 * create an EXACT node, as we would wrongly be telling the
+                 * optimizer that this code point must be in the target string,
+                 * and that is wrong.  This is because if the sequence around
+                 * this code point forms a multi-char fold, what needs to be in
+                 * the string could be the code point that folds to the
+                 * sequence.
                      *
-                     * This handles the case of below-255 code points, as we
-                     * have an easy look up for those.  The next clause handles
-                     * the above-256 one */
+                 * This handles the case of below-255 code points, as we have
+                 * an easy look up for those.  The next clause handles the
+                 * above-256 one */
                     op = IS_IN_SOME_FOLD_L1(start[0])
                          ? EXACTFU
                          : EXACT;
                 }
-                else {  /* /i, larger code point.  Since we are under /i, and
-                           have just this code point, we know that it can't
-                           fold to something else, so PL_InMultiCharFold
-                           applies to it */
-                    op = _invlist_contains_cp(PL_InMultiCharFold,
-                                              start[0])
+            else {  /* /i, larger code point.  Since we are under /i, and have
+                       just this code point, we know that it can't fold to
+                       something else, so PL_InMultiCharFold applies to it */
+                op = (_invlist_contains_cp(PL_InMultiCharFold, start[0]))
                          ? EXACTFU_REQ8
                          : EXACT_REQ8;
                 }
@@ -19561,17 +19549,17 @@ S_optimize_regclass(pTHX_
             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
             {
-                /* Here, the only runtime dependency, if any, is from /d, and
-                 * the class matches more than one code point, and the lowest
-                 * code point participates in some fold.  It might be that the
-                 * other code points are /i equivalent to this one, and hence
-                 * they would representable by an EXACTFish node.  Above, we
-                 * eliminated classes that contain too many code points to be
-                 * EXACTFish, with the test for MAX_FOLD_FROMS
+            /* Here, the only runtime dependency, if any, is from /d, and the
+             * class matches more than one code point, and the lowest code
+             * point participates in some fold.  It might be that the other
+             * code points are /i equivalent to this one, and hence they would
+             * be representable by an EXACTFish node.  Above, we eliminated
+             * classes that contain too many code points to be EXACTFish, with
+             * the test for MAX_FOLD_FROMS
                  *
-                 * First, special case the ASCII fold pairs, like 'B' and 'b'.
-                 * We do this because we have EXACTFAA at our disposal for the
-                 * ASCII range */
+             * First, special case the ASCII fold pairs, like 'B' and 'b'.  We
+             * do this because we have EXACTFAA at our disposal for the ASCII
+             * range */
                 if (partial_cp_count == 2 && isASCII(start[0])) {
 
                     /* The only ASCII characters that participate in folds are
@@ -19587,21 +19575,19 @@ S_optimize_regclass(pTHX_
                         if (   ASCII_FOLD_RESTRICTED
                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
                         {
-                            /* If the second clause just above was true, it
-                             * means we can't be under /i, or else the list
-                             * would have included more than this fold pair.
-                             * Therefore we have to exclude the possibility of
-                             * whatever else it is that folds to these, by
-                             * using EXACTFAA */
+                    /* If the second clause just above was true, it means
+                     * we can't be under /i, or else the list would have
+                     * included more than this fold pair.  Therefore we
+                     * have to exclude the possibility of whatever else it
+                     * is that folds to these, by using EXACTFAA */
                             op = EXACTFAA;
                         }
                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
 
                             /* Here, there's no simple fold that start[0] is part
-                             * of, but there is a multi-character one.  If we
-                             * are not under /i, we want to exclude that
-                             * possibility; if under /i, we want to include it
-                             * */
+                         * of, but there is a multi-character one.  If we are
+                         * not under /i, we want to exclude that possibility;
+                         * if under /i, we want to include it */
                             op = (FOLD) ? EXACTFU : EXACTFAA;
                         }
                         else {
@@ -19615,46 +19601,44 @@ S_optimize_regclass(pTHX_
                     }
                 }
                 else if (  ! upper_latin1_only_utf8_matches
-                         || (   _invlist_len(upper_latin1_only_utf8_matches)
-                                                                          == 2
+                     || (   _invlist_len(upper_latin1_only_utf8_matches) == 2
                              && PL_fold_latin1[
                                invlist_highest(upper_latin1_only_utf8_matches)]
                              == start[0]))
                 {
-                    /* Here, the smallest character is non-ascii or there are
-                     * more than 2 code points matched by this node.  Also, we
-                     * either don't have /d UTF-8 dependent matches, or if we
-                     * do, they look like they could be a single character that
-                     * is the fold of the lowest one in the always-match list.
-                     * This test quickly excludes most of the false positives
-                     * when there are /d UTF-8 depdendent matches.  These are
-                     * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
-                     * SMALL LETTER A WITH GRAVE iff the target string is
-                     * UTF-8.  (We don't have to worry above about exceeding
-                     * the array bounds of PL_fold_latin1[] because any code
-                     * point in 'upper_latin1_only_utf8_matches' is below 256.)
+                /* Here, the smallest character is non-ascii or there are more
+                 * than 2 code points matched by this node.  Also, we either
+                 * don't have /d UTF-8 dependent matches, or if we do, they
+                 * look like they could be a single character that is the fold
+                 * of the lowest one in the always-match list.  This test
+                 * quickly excludes most of the false positives when there are
+                 * /d UTF-8 dependent matches.  These are like LATIN CAPITAL
+                 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
+                 * iff the target string is UTF-8.  (We don't have to worry
+                 * above about exceeding the array bounds of PL_fold_latin1[]
+                 * because any code point in 'upper_latin1_only_utf8_matches'
+                 * is below 256.)
                      *
                      * EXACTFAA would apply only to pairs (hence exactly 2 code
                      * points) in the ASCII range, so we can't use it here to
-                     * artificially restrict the fold domain, so we check if
-                     * the class does or does not match some EXACTFish node.
-                     * Further, if we aren't under /i, and the folded-to
-                     * character is part of a multi-character fold, we can't do
-                     * this optimization, as the sequence around it could be
-                     * that multi-character fold, and we don't here know the
-                     * context, so we have to assume it is that multi-char
-                     * fold, to prevent potential bugs.
+                 * artificially restrict the fold domain, so we check if the
+                 * class does or does not match some EXACTFish node.  Further,
+                 * if we aren't under /i, and the folded-to character is
+                 * part of a multi-character fold, we can't do this
+                 * optimization, as the sequence around it could be that
+                 * multi-character fold, and we don't here know the context, so
+                 * we have to assume it is that multi-char fold, to prevent
+                 * potential bugs.
                      *
-                     * To do the general case, we first find the fold of the
-                     * lowest code point (which may be higher than the lowest
-                     * one), then find everything that folds to it.  (The data
-                     * structure we have only maps from the folded code points,
-                     * so we have to do the earlier step.) */
+                 * To do the general case, we first find the fold of the lowest
+                 * code point (which may be higher than the lowest one), then
+                 * find everything that folds to it.  (The data structure we
+                 * have only maps from the folded code points, so we have to do
+                 * the earlier step.) */
 
                     Size_t foldlen;
                     U8 foldbuf[UTF8_MAXBYTES_CASE];
-                    UV folded = _to_uni_fold_flags(start[0],
-                                                        foldbuf, &foldlen, 0);
+                UV folded = _to_uni_fold_flags(start[0], foldbuf, &foldlen, 0);
                     U32 first_fold;
                     const U32 * remaining_folds;
                     Size_t folds_to_this_cp_count = _inverse_folds(
@@ -19691,24 +19675,22 @@ S_optimize_regclass(pTHX_
                         }
                     }
 
-                    /* If the fold list is identical to what's in this ANYOF
-                     * node, the node can be represented by an EXACTFish one
-                     * instead */
+                /* If the fold list is identical to what's in this ANYOF node,
+                 * the node can be represented by an EXACTFish one instead */
                     if (_invlistEQ(*use_this_list, fold_list,
                                    0 /* Don't complement */ )
                     ) {
 
-                        /* But, we have to be careful, as mentioned above.
-                         * Just the right sequence of characters could match
-                         * this if it is part of a multi-character fold.  That
-                         * IS what we want if we are under /i.  But it ISN'T
-                         * what we want if not under /i, as it could match when
-                         * it shouldn't.  So, when we aren't under /i and this
-                         * character participates in a multi-char fold, we
-                         * don't optimize into an EXACTFish node.  So, for each
-                         * case below we have to check if we are folding
-                         * and if not, if it is not part of a multi-char fold.
-                         * */
+                    /* But, we have to be careful, as mentioned above.  Just
+                     * the right sequence of characters could match this if it
+                     * is part of a multi-character fold.  That IS what we want
+                     * if we are under /i.  But it ISN'T what we want if not
+                     * under /i, as it could match when it shouldn't.  So, when
+                     * we aren't under /i and this character participates in a
+                     * multi-char fold, we don't optimize into an EXACTFish
+                     * node.  So, for each case below we have to check if we
+                     * are folding and if not, if it is not part of a
+                     * multi-char fold.  */
                         if (start[0] > 255) {    /* Highish code point */
                             if (FOLD || ! _invlist_contains_cp(
                                             PL_InMultiCharFold, folded))
@@ -19724,9 +19706,9 @@ S_optimize_regclass(pTHX_
                         else if (    FOLD
                                  &&  folded == 's'
                                  &&  DEPENDS_SEMANTICS)
-                        {   /* An EXACTF node containing a single character
-                                's', can be an EXACTFU if it doesn't get
-                                joined with an adjacent 's' */
+                    {   /* An EXACTF node containing a single character 's',
+                           can be an EXACTFU if it doesn't get joined with an
+                           adjacent 's' */
                             op = EXACTFU_S_EDGE;
                             value = folded;
                         }
@@ -19742,8 +19724,7 @@ S_optimize_regclass(pTHX_
                             }
                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
                                      && ! UTF)
-                            {   /* EXACTFUP is a special node for this
-                                   character */
+                    {   /* EXACTFUP is a special node for this character */
                                 op = (ASCII_FOLD_RESTRICTED)
                                      ? EXACTFAA
                                      : EXACTFUP;
@@ -19751,8 +19732,8 @@ S_optimize_regclass(pTHX_
                             }
                             else if (     ASCII_FOLD_RESTRICTED
                                      && ! isASCII(start[0]))
-                            {   /* For ASCII under /iaa, we can use EXACTFU
-                                   below */
+                        {   /* For ASCII under /iaa, we can use EXACTFU below
+                             */
                                 op = EXACTFAA;
                                 value = folded;
                             }
@@ -19780,12 +19761,11 @@ S_optimize_regclass(pTHX_
                     }
 
                     /* This is a kludge to the special casing issues with this
-                     * ligature under /aa.  FB05 should fold to FB06, but the
-                     * call above to _to_uni_fold_flags() didn't find this, as
-                     * it didn't use the /aa restriction in order to not miss
-                     * other folds that would be affected.  This is the only
-                     * instance likely to ever be a problem in all of Unicode.
-                     * So special case it. */
+                 * ligature under /aa.  FB05 should fold to FB06, but the call
+                 * above to _to_uni_fold_flags() didn't find this, as it didn't
+                 * use the /aa restriction in order to not miss other folds
+                 * that would be affected.  This is the only instance likely to
+                 * ever be a problem in all of Unicode.  So special case it. */
                     if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
                         && ASCII_FOLD_RESTRICTED)
                     {
@@ -19811,36 +19791,35 @@ S_optimize_regclass(pTHX_
 
         if (! has_runtime_dependency) {
 
-            /* See if this can be turned into an ANYOFM node.  Think about the
-             * bit patterns in two different bytes.  In some positions, the
-             * bits in each will be 1; and in other positions both will be 0;
-             * and in some positions the bit will be 1 in one byte, and 0 in
-             * the other.  Let 'n' be the number of positions where the bits
-             * differ.  We create a mask which has exactly 'n' 0 bits, each in
-             * a position where the two bytes differ.  Now take the set of all
-             * bytes that when ANDed with the mask yield the same result.  That
-             * set has 2**n elements, and is representable by just two 8 bit
-             * numbers: the result and the mask.  Importantly, matching the set
-             * can be vectorized by creating a word full of the result bytes,
-             * and a word full of the mask bytes, yielding a significant speed
-             * up.  Here, see if this node matches such a set.  As a concrete
-             * example consider [01], and the byte representing '0' which is
-             * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
-             * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
-             * 0x30.  Any other bytes ANDed yield something else.  So [01],
-             * which is a common usage, is optimizable into ANYOFM, and can
-             * benefit from the speed up.  We can only do this on UTF-8
-             * invariant bytes, because they have the same bit patterns under
-             * UTF-8 as not. */
+        /* See if this can be turned into an ANYOFM node.  Think about the bit
+         * patterns in two different bytes.  In some positions, the bits in
+         * each will be 1; and in other positions both will be 0; and in some
+         * positions the bit will be 1 in one byte, and 0 in the other.  Let
+         * 'n' be the number of positions where the bits differ.  We create a
+         * mask which has exactly 'n' 0 bits, each in a position where the two
+         * bytes differ.  Now take the set of all bytes that when ANDed with
+         * the mask yield the same result.  That set has 2**n elements, and is
+         * representable by just two 8 bit numbers: the result and the mask.
+         * Importantly, matching the set can be vectorized by creating a word
+         * full of the result bytes, and a word full of the mask bytes,
+         * yielding a significant speed up.  Here, see if this node matches
+         * such a set.  As a concrete example consider [01], and the byte
+         * representing '0' which is 0x30 on ASCII machines.  It has the bits
+         * 0011 0000.  Take the mask 1111 1110.  If we AND 0x31 and 0x30 with
+         * that mask we get 0x30.  Any other bytes ANDed yield something else.
+         * So [01], which is a common usage, is optimizable into ANYOFM, and
+         * can benefit from the speed up.  We can only do this on UTF-8
+         * invariant bytes, because they have the same bit patterns under UTF-8
+         * as not. */
             PERL_UINT_FAST8_T inverted = 0;
 #ifdef EBCDIC
             const PERL_UINT_FAST8_T max_permissible = 0xFF;
 #else
             const PERL_UINT_FAST8_T max_permissible = 0x7F;
 #endif
-            /* If doesn't fit the criteria for ANYOFM, invert and try again.
-             * If that works we will instead later generate an NANYOFM, and
-             * invert back when through */
+        /* If doesn't fit the criteria for ANYOFM, invert and try again.  If
+         * that works we will instead later generate an NANYOFM, and invert
+         * back when through */
             if (invlist_highest(cp_list) > max_permissible) {
                 _invlist_invert(cp_list);
                 inverted = 1;
@@ -19853,8 +19832,7 @@ S_optimize_regclass(pTHX_
                 Size_t full_cp_count = 0;
                 bool first_time = TRUE;
 
-                /* Go through the bytes and find the bit positions that differ
-                 * */
+            /* Go through the bytes and find the bit positions that differ */
                 invlist_iterinit(cp_list);
                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
                     unsigned int i = this_start;
@@ -19867,8 +19845,8 @@ S_optimize_regclass(pTHX_
                         first_time = FALSE;
                         lowest_cp = this_start;
 
-                        /* We have set up the code point to compare with.
-                         * Don't compare it with itself */
+                    /* We have set up the code point to compare with.  Don't
+                     * compare it with itself */
                         i++;
                     }
 
@@ -19886,19 +19864,19 @@ S_optimize_regclass(pTHX_
                     full_cp_count += this_end - this_start + 1;
                 }
 
-                /* At the end of the loop, we count how many bits differ from
-                 * the bits in lowest code point, call the count 'd'.  If the
-                 * set we found contains 2**d elements, it is the closure of
-                 * all code points that differ only in those bit positions.  To
-                 * convince yourself of that, first note that the number in the
-                 * closure must be a power of 2, which we test for.  The only
-                 * way we could have that count and it be some differing set,
-                 * is if we got some code points that don't differ from the
-                 * lowest code point in any position, but do differ from each
-                 * other in some other position.  That means one code point has
-                 * a 1 in that position, and another has a 0.  But that would
-                 * mean that one of them differs from the lowest code point in
-                 * that position, which possibility we've already excluded.  */
+            /* At the end of the loop, we count how many bits differ from the
+             * bits in lowest code point, call the count 'd'.  If the set we
+             * found contains 2**d elements, it is the closure of all code
+             * points that differ only in those bit positions.  To convince
+             * yourself of that, first note that the number in the closure must
+             * be a power of 2, which we test for.  The only way we could have
+             * that count and it be some differing set, is if we got some code
+             * points that don't differ from the lowest code point in any
+             * position, but do differ from each other in some other position.
+             * That means one code point has a 1 in that position, and another
+             * has a 0.  But that would mean that one of them differs from the
+             * lowest code point in that position, which possibility we've
+             * already excluded.  */
                 if (  (inverted || full_cp_count > 1)
                     && full_cp_count == 1U << PL_bitcount[bits_differing])
                 {
@@ -19926,12 +19904,12 @@ S_optimize_regclass(pTHX_
                 return op;
             }
 
-            /* XXX We could create an ANYOFR_LOW node here if we saved above if
-             * all were invariants, it wasn't inverted, and there is a single
-             * range.  This would be faster than some of the posix nodes we
-             * create below like /\d/a, but would be twice the size.  Without
-             * having actually measured the gain, khw doesn't think the
-             * tradeoff is really worth it */
+        /* XXX We could create an ANYOFR_LOW node here if we saved above if all
+         * were invariants, it wasn't inverted, and there is a single range.
+         * This would be faster than some of the posix nodes we create below
+         * like /\d/a, but would be twice the size.  Without having actually
+         * measured the gain, khw doesn't think the tradeoff is really worth it
+         * */
         }
 
         if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
@@ -19939,10 +19917,10 @@ S_optimize_regclass(pTHX_
             SV * intersection = NULL;
             SV* d_invlist = NULL;
 
-            /* See if this matches any of the POSIX classes.  The POSIXA and
-             * POSIXD ones are about the same speed as ANYOF ops, but take less
-             * room; the ones that have above-Latin1 code point matches are
-             * somewhat faster than ANYOF.  */
+        /* See if this matches any of the POSIX classes.  The POSIXA and POSIXD
+         * ones are about the same speed as ANYOF ops, but take less room; the
+         * ones that have above-Latin1 code point matches are somewhat faster
+         * than ANYOF.  */
 
             for (type = POSIXA; type >= POSIXD; type--) {
                 int posix_class;
@@ -19966,8 +19944,8 @@ S_optimize_regclass(pTHX_
                         official_code_points = &PL_XPosix_ptrs[posix_class];
                     }
 
-                    /* Skip non-existent classes of this type.  e.g. \v only
-                     * has an entry in PL_XPosix_ptrs */
+                /* Skip non-existent classes of this type.  e.g. \v only has an
+                 * entry in PL_XPosix_ptrs */
                     if (! *official_code_points) {
                         continue;
                     }
@@ -19978,8 +19956,8 @@ S_optimize_regclass(pTHX_
 
                         if (type != POSIXD) {
 
-                            /* This class that isn't /d can't match if we have
-                             * /d dependencies */
+                        /* This class that isn't /d can't match if we have /d
+                         * dependencies */
                             if (has_runtime_dependency
                                                     & HAS_D_RUNTIME_DEPENDENCY)
                             {
@@ -19988,8 +19966,8 @@ S_optimize_regclass(pTHX_
                         }
                         else /* is /d */ if (! this_inverted) {
 
-                            /* /d classes don't match anything non-ASCII below
-                             * 256 unconditionally (which cp_list contains) */
+                        /* /d classes don't match anything non-ASCII below 256
+                         * unconditionally (which cp_list contains) */
                             _invlist_intersection(cp_list, PL_UpperLatin1,
                                                            &intersection);
                             if (_invlist_len(intersection) != 0) {
@@ -19999,10 +19977,10 @@ S_optimize_regclass(pTHX_
                             SvREFCNT_dec(d_invlist);
                             d_invlist = invlist_clone(cp_list, NULL);
 
-                            /* But under UTF-8 it turns into using /u rules.
-                             * Add the things it matches under these conditions
-                             * so that we check below that these are identical
-                             * to what the tested class should match */
+                        /* But under UTF-8 it turns into using /u rules.  Add
+                         * the things it matches under these conditions so that
+                         * we check below that these are identical to what the
+                         * tested class should match */
                             if (upper_latin1_only_utf8_matches) {
                                 _invlist_union(
                                             d_invlist,
@@ -20020,8 +19998,8 @@ S_optimize_regclass(pTHX_
                             our_code_points = &cp_list;
                         }
 
-                        /* Here, have weeded out some things.  We want to see
-                         * if the list of characters this node contains
+                    /* Here, have weeded out some things.  We want to see if
+                     * the list of characters this node contains
                          * ('*our_code_points') precisely matches those of the
                          * class we are currently checking against
                          * ('*official_code_points'). */
@@ -20030,8 +20008,8 @@ S_optimize_regclass(pTHX_
                                        try_inverted))
                         {
                             /* Here, they precisely match.  Optimize this ANYOF
-                             * node into its equivalent POSIX one of the
-                             * correct type, possibly inverted */
+                         * node into its equivalent POSIX one of the correct
+                         * type, possibly inverted */
                             op = (try_inverted)
                                 ? type + NPOSIXA - POSIXA
                                 : type;
@@ -20048,13 +20026,12 @@ S_optimize_regclass(pTHX_
             SvREFCNT_dec(intersection);
         }
 
-        /* If it is a single contiguous range, ANYOFR is an efficient regnode,
-         * both in size and speed.  Currently, a 20 bit range base (smallest
-         * code point in the range), and a 12 bit maximum delta are packed into
-         * a 32 bit word.  This allows for using it on all of the Unicode code
-         * points except for the highest plane, which is only for private use
-         * code points.  khw doubts that a bigger delta is likely in real world
-         * applications */
+    /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
+     * in size and speed.  Currently, a 20 bit range base (smallest code point
+     * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
+     * This allows for using it on all of the Unicode code points except for
+     * the highest plane, which is only for private use code points.  khw
+     * doubts that a bigger delta is likely in real world applications */
         if (     single_range
             && ! has_runtime_dependency
             &&   *anyof_flags == 0
@@ -20071,26 +20048,23 @@ S_optimize_regclass(pTHX_
             *ret = reganode(pRExC_state, op,
                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
 
-            /* Place the lowest UTF-8 start byte in the flags field, so as to
-             * allow efficient ruling out at run time of many possible inputs.
-             * */
+        /* Place the lowest UTF-8 start byte in the flags field, so as to allow
+         * efficient ruling out at run time of many possible inputs.  */
             (void) uvchr_to_utf8(low_utf8, start[0]);
             (void) uvchr_to_utf8(high_utf8, end[0]);
 
             /* If all code points share the same first byte, this can be an
              * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
-             * quickly rule out many inputs at run-time without having to
-             * compute the code point from UTF-8.  For EBCDIC, we use I8, as
-             * not doing that transformation would not rule out nearly so many
-             * things */
+         * quickly rule out many inputs at run-time without having to compute
+         * the code point from UTF-8.  For EBCDIC, we use I8, as not doing that
+         * transformation would not rule out nearly so many things */
             if (low_utf8[0] == high_utf8[0]) {
                 op = ANYOFRb;
                 OP(REGNODE_p(*ret)) = op;
                 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
             }
             else {
-                ANYOF_FLAGS(REGNODE_p(*ret))
-                                    = NATIVE_UTF8_TO_I8(low_utf8[0]);
+            ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
             }
 
             return op;
@@ -20106,19 +20080,19 @@ S_optimize_regclass(pTHX_
             U8 low_utf8[UTF8_MAXBYTES+1];
             UV highest_cp = invlist_highest(cp_list);
 
-            /* Currently the maximum allowed code point by the system is
-             * IV_MAX.  Higher ones are reserved for future internal use.  This
-             * particular regnode can be used for higher ones, but we can't
-             * calculate the code point of those.  IV_MAX suffices though, as
-             * it will be a large first byte */
+        /* Currently the maximum allowed code point by the system is IV_MAX.
+         * Higher ones are reserved for future internal use.  This particular
+         * regnode can be used for higher ones, but we can't calculate the code
+         * point of those.  IV_MAX suffices though, as it will be a large first
+         * byte */
             Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
                            - low_utf8;
 
-            /* We store the lowest possible first byte of the UTF-8
-             * representation, using the flags field.  This allows for quick
-             * ruling out of some inputs without having to convert from UTF-8
-             * to code point.  For EBCDIC, we use I8, as not doing that
-             * transformation would not rule out nearly so many things */
+        /* We store the lowest possible first byte of the UTF-8 representation,
+         * using the flags field.  This allows for quick ruling out of some
+         * inputs without having to convert from UTF-8 to code point.  For
+         * EBCDIC, we use I8, as not doing that transformation would not rule
+         * out nearly so many things */
             *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
 
             op = ANYOFH;
@@ -20128,13 +20102,11 @@ S_optimize_regclass(pTHX_
              * well */
             if (highest_cp <= IV_MAX) {
                 U8 high_utf8[UTF8_MAXBYTES+1];
-                Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
-                                - high_utf8;
+            Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
 
                 /* If the lowest and highest are the same, we can get an exact
-                 * first byte instead of a just minimum or even a sequence of
-                 * exact leading bytes.  We signal these with different
-                 * regnodes */
+             * first byte instead of just a minimum or even a sequence of exact
+             * leading bytes.  We signal these with different regnodes */
                 if (low_utf8[0] == high_utf8[0]) {
                     Size_t len = find_first_differing_byte_pos(low_utf8,
                                                                high_utf8,
@@ -20142,8 +20114,8 @@ S_optimize_regclass(pTHX_
 
                     if (len == 1) {
 
-                        /* No need to convert to I8 for EBCDIC as this is an
-                         * exact match */
+                    /* No need to convert to I8 for EBCDIC as this is an exact
+                     * match */
                         *anyof_flags = low_utf8[0];
                         op = ANYOFHb;
                     }
@@ -20164,15 +20136,14 @@ S_optimize_regclass(pTHX_
                         return op;
                     }
                 }
-                else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
-                {
+            else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
 
-                    /* Here, the high byte is not the same as the low, but is
-                     * small enough that its reasonable to have a loose upper
-                     * bound, which is packed in with the strict lower bound.
-                     * See comments at the definition of MAX_ANYOF_HRx_BYTE.
-                     * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
-                     * is the same thing as UTF-8 */
+                /* Here, the high byte is not the same as the low, but is small
+                 * enough that it's reasonable to have a loose upper bound,
+                 * which is packed in with the strict lower bound.  See
+                 * comments at the definition of MAX_ANYOF_HRx_BYTE.  On EBCDIC
+                 * platforms, I8 is used.  On ASCII platforms I8 is the same
+                 * thing as UTF-8 */
 
                     U8 bits = 0;
                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
author	Karl Williamson <khw@cpan.org>	2020-05-02 06:58:57 -0600
committer	Karl Williamson <khw@cpan.org>	2021-05-31 12:55:19 -0600
commit	403d7eb3e4320188571cf61b9dab62ff10799f49 (patch)
tree	092639203afcb911449ca5179ac30fcd965166a8 /regcomp.c
parent	1f4fbd3b4b26604673abca2a5f911744e826b1f3 (diff)
download	perl-403d7eb3e4320188571cf61b9dab62ff10799f49.tar.gz