summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-05-02 06:58:57 -0600
committerKarl Williamson <khw@cpan.org>2021-05-31 12:55:19 -0600
commit403d7eb3e4320188571cf61b9dab62ff10799f49 (patch)
tree092639203afcb911449ca5179ac30fcd965166a8 /regcomp.c
parent1f4fbd3b4b26604673abca2a5f911744e826b1f3 (diff)
downloadperl-403d7eb3e4320188571cf61b9dab62ff10799f49.tar.gz
regcomp.c: white-space; comments
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c507
1 files changed, 239 insertions, 268 deletions
diff --git a/regcomp.c b/regcomp.c
index e28ff3d61e..962f4cb941 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -19295,7 +19295,8 @@ S_optimize_regclass(pTHX_
* any created regnode. If the new op is sufficiently like plain ANYOF, it
* leaves *ret unchanged for allocation in S_regclass.
*
- * Certain of the parameters may be updated as a result of the changes herein */
+ * Certain of the parameters may be updated as a result of the changes
+ * herein */
U8 op = ANYOF; /* The returned node-type, initialized to the unoptimized
one. */
@@ -19308,9 +19309,9 @@ S_optimize_regclass(pTHX_
PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
- if (cp_list) { /* Count the code points in enough ranges that we would
- see all the ones possible in any fold in this version
- of Unicode */
+ if (cp_list) { /* Count the code points in enough ranges that we would see
+ all the ones possible in any fold in this version of
+ Unicode */
invlist_iterinit(cp_list);
for (i = 0; i <= MAX_FOLD_FROMS; i++) {
@@ -19326,8 +19327,8 @@ S_optimize_regclass(pTHX_
invlist_iterfinish(cp_list);
}
- /* If we know at compile time that this matches every possible code
- * point, any run-time dependencies don't matter */
+ /* If we know at compile time that this matches every possible code point,
+ * any run-time dependencies don't matter */
if (start[0] == 0 && end[0] == UV_MAX) {
if (*invert) {
op = OPFAIL;
@@ -19341,13 +19342,11 @@ S_optimize_regclass(pTHX_
return op;
}
- /* Similarly, for /l posix classes, if both a class and its
- * complement match, any run-time dependencies don't matter */
+ /* Similarly, for /l posix classes, if both a class and its complement
+ * match, any run-time dependencies don't matter */
if (posixl) {
int namedclass;
- for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
- namedclass += 2)
- {
+ for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
if ( POSIXL_TEST(posixl, namedclass) /* class */
&& POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
{
@@ -19364,20 +19363,20 @@ S_optimize_regclass(pTHX_
}
}
- /* For well-behaved locales, some classes are subsets of others,
- * so complementing the subset and including the non-complemented
- * superset should match everything, like [\D[:alnum:]], and
+ /* For well-behaved locales, some classes are subsets of others, so
+ * complementing the subset and including the non-complemented superset
+ * should match everything, like [\D[:alnum:]], and
* [[:^alpha:][:alnum:]], but some implementations of locales are
* buggy, and khw thinks its a bad idea to have optimization change
* behavior, even if it avoids an OS bug in a given case */
#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
- /* If is a single posix /l class, can optimize to just that op.
- * Such a node will not match anything in the Latin1 range, as that
- * is not determinable until runtime, but will match whatever the
- * class does outside that range. (Note that some classes won't
- * match anything outside the range, like [:ascii:]) */
+ /* If is a single posix /l class, can optimize to just that op. Such a
+ * node will not match anything in the Latin1 range, as that is not
+ * determinable until runtime, but will match whatever the class does
+ * outside that range. (Note that some classes won't match anything
+ * outside the range, like [:ascii:]) */
if ( isSINGLE_BIT_SET(posixl)
&& (partial_cp_count == 0 || start[0] > 255))
{
@@ -19390,8 +19389,7 @@ S_optimize_regclass(pTHX_
* ANYOF_CNTRL. From
* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
* */
- static const int MultiplyDeBruijnBitPosition2[32] =
- {
+ static const int MultiplyDeBruijnBitPosition2[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
@@ -19402,20 +19400,18 @@ S_optimize_regclass(pTHX_
/* The named classes are such that the inverted number is one
* larger than the non-inverted one */
- already_inverted = namedclass
- - classnum_to_namedclass(classnum);
+ already_inverted = namedclass - classnum_to_namedclass(classnum);
- /* Create an inversion list of the official property, inverted
- * if the constructed node list is inverted, and restricted to
- * only the above latin1 code points, which are the only ones
- * known at compile time */
+ /* Create an inversion list of the official property, inverted if
+ * the constructed node list is inverted, and restricted to only
+ * the above latin1 code points, which are the only ones known at
+ * compile time */
_invlist_intersection_maybe_complement_2nd(
PL_AboveLatin1,
PL_XPosix_ptrs[classnum],
already_inverted,
&class_above_latin1);
- are_equivalent = _invlistEQ(class_above_latin1, cp_list,
- FALSE);
+ are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
SvREFCNT_dec_NN(class_above_latin1);
if (are_equivalent) {
@@ -19432,18 +19428,17 @@ S_optimize_regclass(pTHX_
}
}
- /* khw can't think of any other possible transformation involving
- * these. */
+ /* khw can't think of any other possible transformation involving these. */
if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
return op;
}
if (! has_runtime_dependency) {
- /* If the list is empty, nothing matches. This happens, for
- * example, when a Unicode property that doesn't match anything is
- * the only element in the character class (perluniprops.pod notes
- * such properties). */
+ /* If the list is empty, nothing matches. This happens, for example,
+ * when a Unicode property that doesn't match anything is the only
+ * element in the character class (perluniprops.pod notes such
+ * properties). */
if (partial_cp_count == 0) {
if (*invert) {
op = SANY;
@@ -19470,47 +19465,42 @@ S_optimize_regclass(pTHX_
}
/* Next see if can optimize classes that contain just a few code points
- * into an EXACTish node. The reason to do this is to let the
- * optimizer join this node with adjacent EXACTish ones, and ANYOF
- * nodes require conversion to code point from UTF-8.
+ * into an EXACTish node. The reason to do this is to let the optimizer
+ * join this node with adjacent EXACTish ones, and ANYOF nodes require
+ * runtime conversion to code point from UTF-8.
*
- * An EXACTFish node can be generated even if not under /i, and vice
- * versa. But care must be taken. An EXACTFish node has to be such
- * that it only matches precisely the code points in the class, but we
- * want to generate the least restrictive one that does that, to
- * increase the odds of being able to join with an adjacent node. For
- * example, if the class contains [kK], we have to make it an EXACTFAA
- * node to prevent the KELVIN SIGN from matching. Whether we are under
- * /i or not is irrelevant in this case. Less obvious is the pattern
- * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
- * supposed to match the single character U+0149 LATIN SMALL LETTER N
- * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
- * that includes \X{02BC}, there is a multi-char fold that does, and so
- * the node generated for it must be an EXACTFish one. On the other
- * hand qr/:/i should generate a plain EXACT node since the colon
- * participates in no fold whatsoever, and having it EXACT tells the
- * optimizer the target string cannot match unless it has a colon in
- * it.
+ * An EXACTFish node can be generated even if not under /i, and vice versa.
+ * But care must be taken. An EXACTFish node has to be such that it only
+ * matches precisely the code points in the class, but we want to generate
+ * the least restrictive one that does that, to increase the odds of being
+ * able to join with an adjacent node. For example, if the class contains
+ * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
+ * from matching. Whether we are under /i or not is irrelevant in this
+ * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER
+ * LETTER APOSTROPHE. That is supposed to match the single character U+0149
+ * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there
+ * is no simple fold that includes \x{02BC}, there is a multi-char fold
+ * that does, and so the node generated for it must be an EXACTFish one.
+ * On the other hand qr/:/i should generate a plain EXACT node since the
+ * colon participates in no fold whatsoever, and having it EXACT tells the
+ * optimizer the target string cannot match unless it has a colon in it.
*/
if ( ! posixl
&& ! *invert
- /* Only try if there are no more code points in the class than
- * in the max possible fold */
+ /* Only try if there are no more code points in the class than in
+ * the max possible fold */
&& inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
{
- if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
- {
- /* We can always make a single code point class into an
- * EXACTish node. */
-
+ /* We can always make a single code point class into an EXACTish node.
+ * */
+ if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
if (LOC) {
- /* Here is /l: Use EXACTL, except if there is a fold not
- * known until runtime so shows as only a single code point
- * here. For code points above 255, we know which can
- * cause problems by having a potential fold to the Latin1
- * range. */
+ /* Here is /l: Use EXACTL, except if there is a fold not known
+ * until runtime so shows as only a single code point here.
+ * For code points above 255, we know which can cause problems
+ * by having a potential fold to the Latin1 range. */
if ( ! FOLD
|| ( start[0] > 255
&& ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
@@ -19528,30 +19518,28 @@ S_optimize_regclass(pTHX_
small */
/* Under /i, it gets a little tricky. A code point that
- * doesn't participate in a fold should be an EXACT node.
- * We know this one isn't the result of a simple fold, or
- * there'd be more than one code point in the list, but it
- * could be part of a multi- character fold. In that case
- * we better not create an EXACT node, as we would wrongly
- * be telling the optimizer that this code point must be in
- * the target string, and that is wrong. This is because
- * if the sequence around this code point forms a
- * multi-char fold, what needs to be in the string could be
- * the code point that folds to the sequence.
+ * doesn't participate in a fold should be an EXACT node. We
+ * know this one isn't the result of a simple fold, or there'd
+ * be more than one code point in the list, but it could be
+ * part of a multi- character fold. In that case we better not
+ * create an EXACT node, as we would wrongly be telling the
+ * optimizer that this code point must be in the target string,
+ * and that is wrong. This is because if the sequence around
+ * this code point forms a multi-char fold, what needs to be in
+ * the string could be the code point that folds to the
+ * sequence.
*
- * This handles the case of below-255 code points, as we
- * have an easy look up for those. The next clause handles
- * the above-256 one */
+ * This handles the case of below-255 code points, as we have
+ * an easy look up for those. The next clause handles the
+ * above-256 one */
op = IS_IN_SOME_FOLD_L1(start[0])
? EXACTFU
: EXACT;
}
- else { /* /i, larger code point. Since we are under /i, and
- have just this code point, we know that it can't
- fold to something else, so PL_InMultiCharFold
- applies to it */
- op = _invlist_contains_cp(PL_InMultiCharFold,
- start[0])
+ else { /* /i, larger code point. Since we are under /i, and have
+ just this code point, we know that it can't fold to
+ something else, so PL_InMultiCharFold applies to it */
+ op = (_invlist_contains_cp(PL_InMultiCharFold, start[0]))
? EXACTFU_REQ8
: EXACT_REQ8;
}
@@ -19561,17 +19549,17 @@ S_optimize_regclass(pTHX_
else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
&& _invlist_contains_cp(PL_in_some_fold, start[0]))
{
- /* Here, the only runtime dependency, if any, is from /d, and
- * the class matches more than one code point, and the lowest
- * code point participates in some fold. It might be that the
- * other code points are /i equivalent to this one, and hence
- * they would representable by an EXACTFish node. Above, we
- * eliminated classes that contain too many code points to be
- * EXACTFish, with the test for MAX_FOLD_FROMS
+ /* Here, the only runtime dependency, if any, is from /d, and the
+ * class matches more than one code point, and the lowest code
+ * point participates in some fold. It might be that the other
+ * code points are /i equivalent to this one, and hence they would
+ * be representable by an EXACTFish node. Above, we eliminated
+ * classes that contain too many code points to be EXACTFish, with
+ * the test for MAX_FOLD_FROMS
*
- * First, special case the ASCII fold pairs, like 'B' and 'b'.
- * We do this because we have EXACTFAA at our disposal for the
- * ASCII range */
+ * First, special case the ASCII fold pairs, like 'B' and 'b'. We
+ * do this because we have EXACTFAA at our disposal for the ASCII
+ * range */
if (partial_cp_count == 2 && isASCII(start[0])) {
/* The only ASCII characters that participate in folds are
@@ -19587,21 +19575,19 @@ S_optimize_regclass(pTHX_
if ( ASCII_FOLD_RESTRICTED
|| HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
{
- /* If the second clause just above was true, it
- * means we can't be under /i, or else the list
- * would have included more than this fold pair.
- * Therefore we have to exclude the possibility of
- * whatever else it is that folds to these, by
- * using EXACTFAA */
+ /* If the second clause just above was true, it means
+ * we can't be under /i, or else the list would have
+ * included more than this fold pair. Therefore we
+ * have to exclude the possibility of whatever else it
+ * is that folds to these, by using EXACTFAA */
op = EXACTFAA;
}
else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
/* Here, there's no simple fold that start[0] is part
- * of, but there is a multi-character one. If we
- * are not under /i, we want to exclude that
- * possibility; if under /i, we want to include it
- * */
+ * of, but there is a multi-character one. If we are
+ * not under /i, we want to exclude that possibility;
+ * if under /i, we want to include it */
op = (FOLD) ? EXACTFU : EXACTFAA;
}
else {
@@ -19615,46 +19601,44 @@ S_optimize_regclass(pTHX_
}
}
else if ( ! upper_latin1_only_utf8_matches
- || ( _invlist_len(upper_latin1_only_utf8_matches)
- == 2
+ || ( _invlist_len(upper_latin1_only_utf8_matches) == 2
&& PL_fold_latin1[
invlist_highest(upper_latin1_only_utf8_matches)]
== start[0]))
{
- /* Here, the smallest character is non-ascii or there are
- * more than 2 code points matched by this node. Also, we
- * either don't have /d UTF-8 dependent matches, or if we
- * do, they look like they could be a single character that
- * is the fold of the lowest one in the always-match list.
- * This test quickly excludes most of the false positives
- * when there are /d UTF-8 depdendent matches. These are
- * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
- * SMALL LETTER A WITH GRAVE iff the target string is
- * UTF-8. (We don't have to worry above about exceeding
- * the array bounds of PL_fold_latin1[] because any code
- * point in 'upper_latin1_only_utf8_matches' is below 256.)
+ /* Here, the smallest character is non-ascii or there are more
+ * than 2 code points matched by this node. Also, we either
+ * don't have /d UTF-8 dependent matches, or if we do, they
+ * look like they could be a single character that is the fold
+ * of the lowest one in the always-match list. This test
+ * quickly excludes most of the false positives when there are
+ * /d UTF-8 dependent matches. These are like LATIN CAPITAL
+ * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
+ * iff the target string is UTF-8. (We don't have to worry
+ * above about exceeding the array bounds of PL_fold_latin1[]
+ * because any code point in 'upper_latin1_only_utf8_matches'
+ * is below 256.)
*
* EXACTFAA would apply only to pairs (hence exactly 2 code
* points) in the ASCII range, so we can't use it here to
- * artificially restrict the fold domain, so we check if
- * the class does or does not match some EXACTFish node.
- * Further, if we aren't under /i, and the folded-to
- * character is part of a multi-character fold, we can't do
- * this optimization, as the sequence around it could be
- * that multi-character fold, and we don't here know the
- * context, so we have to assume it is that multi-char
- * fold, to prevent potential bugs.
+ * artificially restrict the fold domain, so we check if the
+ * class does or does not match some EXACTFish node. Further,
+ * if we aren't under /i, and the folded-to character is
+ * part of a multi-character fold, we can't do this
+ * optimization, as the sequence around it could be that
+ * multi-character fold, and we don't here know the context, so
+ * we have to assume it is that multi-char fold, to prevent
+ * potential bugs.
*
- * To do the general case, we first find the fold of the
- * lowest code point (which may be higher than the lowest
- * one), then find everything that folds to it. (The data
- * structure we have only maps from the folded code points,
- * so we have to do the earlier step.) */
+ * To do the general case, we first find the fold of the lowest
+ * code point (which may be higher than the lowest one), then
+ * find everything that folds to it. (The data structure we
+ * have only maps from the folded code points, so we have to do
+ * the earlier step.) */
Size_t foldlen;
U8 foldbuf[UTF8_MAXBYTES_CASE];
- UV folded = _to_uni_fold_flags(start[0],
- foldbuf, &foldlen, 0);
+ UV folded = _to_uni_fold_flags(start[0], foldbuf, &foldlen, 0);
U32 first_fold;
const U32 * remaining_folds;
Size_t folds_to_this_cp_count = _inverse_folds(
@@ -19691,24 +19675,22 @@ S_optimize_regclass(pTHX_
}
}
- /* If the fold list is identical to what's in this ANYOF
- * node, the node can be represented by an EXACTFish one
- * instead */
+ /* If the fold list is identical to what's in this ANYOF node,
+ * the node can be represented by an EXACTFish one instead */
if (_invlistEQ(*use_this_list, fold_list,
0 /* Don't complement */ )
) {
- /* But, we have to be careful, as mentioned above.
- * Just the right sequence of characters could match
- * this if it is part of a multi-character fold. That
- * IS what we want if we are under /i. But it ISN'T
- * what we want if not under /i, as it could match when
- * it shouldn't. So, when we aren't under /i and this
- * character participates in a multi-char fold, we
- * don't optimize into an EXACTFish node. So, for each
- * case below we have to check if we are folding
- * and if not, if it is not part of a multi-char fold.
- * */
+ /* But, we have to be careful, as mentioned above. Just
+ * the right sequence of characters could match this if it
+ * is part of a multi-character fold. That IS what we want
+ * if we are under /i. But it ISN'T what we want if not
+ * under /i, as it could match when it shouldn't. So, when
+ * we aren't under /i and this character participates in a
+ * multi-char fold, we don't optimize into an EXACTFish
+ * node. So, for each case below we have to check if we
+ * are folding and if not, if it is not part of a
+ * multi-char fold. */
if (start[0] > 255) { /* Highish code point */
if (FOLD || ! _invlist_contains_cp(
PL_InMultiCharFold, folded))
@@ -19724,9 +19706,9 @@ S_optimize_regclass(pTHX_
else if ( FOLD
&& folded == 's'
&& DEPENDS_SEMANTICS)
- { /* An EXACTF node containing a single character
- 's', can be an EXACTFU if it doesn't get
- joined with an adjacent 's' */
+ { /* An EXACTF node containing a single character 's',
+ can be an EXACTFU if it doesn't get joined with an
+ adjacent 's' */
op = EXACTFU_S_EDGE;
value = folded;
}
@@ -19742,8 +19724,7 @@ S_optimize_regclass(pTHX_
}
else if ( UNLIKELY(start[0] == MICRO_SIGN)
&& ! UTF)
- { /* EXACTFUP is a special node for this
- character */
+ { /* EXACTFUP is a special node for this character */
op = (ASCII_FOLD_RESTRICTED)
? EXACTFAA
: EXACTFUP;
@@ -19751,8 +19732,8 @@ S_optimize_regclass(pTHX_
}
else if ( ASCII_FOLD_RESTRICTED
&& ! isASCII(start[0]))
- { /* For ASCII under /iaa, we can use EXACTFU
- below */
+ { /* For ASCII under /iaa, we can use EXACTFU below
+ */
op = EXACTFAA;
value = folded;
}
@@ -19780,12 +19761,11 @@ S_optimize_regclass(pTHX_
}
/* This is a kludge to the special casing issues with this
- * ligature under /aa. FB05 should fold to FB06, but the
- * call above to _to_uni_fold_flags() didn't find this, as
- * it didn't use the /aa restriction in order to not miss
- * other folds that would be affected. This is the only
- * instance likely to ever be a problem in all of Unicode.
- * So special case it. */
+ * ligature under /aa. FB05 should fold to FB06, but the call
+ * above to _to_uni_fold_flags() didn't find this, as it didn't
+ * use the /aa restriction in order to not miss other folds
+ * that would be affected. This is the only instance likely to
+ * ever be a problem in all of Unicode. So special case it. */
if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
&& ASCII_FOLD_RESTRICTED)
{
@@ -19811,36 +19791,35 @@ S_optimize_regclass(pTHX_
if (! has_runtime_dependency) {
- /* See if this can be turned into an ANYOFM node. Think about the
- * bit patterns in two different bytes. In some positions, the
- * bits in each will be 1; and in other positions both will be 0;
- * and in some positions the bit will be 1 in one byte, and 0 in
- * the other. Let 'n' be the number of positions where the bits
- * differ. We create a mask which has exactly 'n' 0 bits, each in
- * a position where the two bytes differ. Now take the set of all
- * bytes that when ANDed with the mask yield the same result. That
- * set has 2**n elements, and is representable by just two 8 bit
- * numbers: the result and the mask. Importantly, matching the set
- * can be vectorized by creating a word full of the result bytes,
- * and a word full of the mask bytes, yielding a significant speed
- * up. Here, see if this node matches such a set. As a concrete
- * example consider [01], and the byte representing '0' which is
- * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
- * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
- * 0x30. Any other bytes ANDed yield something else. So [01],
- * which is a common usage, is optimizable into ANYOFM, and can
- * benefit from the speed up. We can only do this on UTF-8
- * invariant bytes, because they have the same bit patterns under
- * UTF-8 as not. */
+ /* See if this can be turned into an ANYOFM node. Think about the bit
+ * patterns in two different bytes. In some positions, the bits in
+ * each will be 1; and in other positions both will be 0; and in some
+ * positions the bit will be 1 in one byte, and 0 in the other. Let
+ * 'n' be the number of positions where the bits differ. We create a
+ * mask which has exactly 'n' 0 bits, each in a position where the two
+ * bytes differ. Now take the set of all bytes that when ANDed with
+ * the mask yield the same result. That set has 2**n elements, and is
+ * representable by just two 8 bit numbers: the result and the mask.
+ * Importantly, matching the set can be vectorized by creating a word
+ * full of the result bytes, and a word full of the mask bytes,
+ * yielding a significant speed up. Here, see if this node matches
+ * such a set. As a concrete example consider [01], and the byte
+ * representing '0' which is 0x30 on ASCII machines. It has the bits
+ * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with
+ * that mask we get 0x30. Any other bytes ANDed yield something else.
+ * So [01], which is a common usage, is optimizable into ANYOFM, and
+ * can benefit from the speed up. We can only do this on UTF-8
+ * invariant bytes, because they have the same bit patterns under UTF-8
+ * as not. */
PERL_UINT_FAST8_T inverted = 0;
#ifdef EBCDIC
const PERL_UINT_FAST8_T max_permissible = 0xFF;
#else
const PERL_UINT_FAST8_T max_permissible = 0x7F;
#endif
- /* If doesn't fit the criteria for ANYOFM, invert and try again.
- * If that works we will instead later generate an NANYOFM, and
- * invert back when through */
+ /* If doesn't fit the criteria for ANYOFM, invert and try again. If
+ * that works we will instead later generate an NANYOFM, and invert
+ * back when through */
if (invlist_highest(cp_list) > max_permissible) {
_invlist_invert(cp_list);
inverted = 1;
@@ -19853,8 +19832,7 @@ S_optimize_regclass(pTHX_
Size_t full_cp_count = 0;
bool first_time = TRUE;
- /* Go through the bytes and find the bit positions that differ
- * */
+ /* Go through the bytes and find the bit positions that differ */
invlist_iterinit(cp_list);
while (invlist_iternext(cp_list, &this_start, &this_end)) {
unsigned int i = this_start;
@@ -19867,8 +19845,8 @@ S_optimize_regclass(pTHX_
first_time = FALSE;
lowest_cp = this_start;
- /* We have set up the code point to compare with.
- * Don't compare it with itself */
+ /* We have set up the code point to compare with. Don't
+ * compare it with itself */
i++;
}
@@ -19886,19 +19864,19 @@ S_optimize_regclass(pTHX_
full_cp_count += this_end - this_start + 1;
}
- /* At the end of the loop, we count how many bits differ from
- * the bits in lowest code point, call the count 'd'. If the
- * set we found contains 2**d elements, it is the closure of
- * all code points that differ only in those bit positions. To
- * convince yourself of that, first note that the number in the
- * closure must be a power of 2, which we test for. The only
- * way we could have that count and it be some differing set,
- * is if we got some code points that don't differ from the
- * lowest code point in any position, but do differ from each
- * other in some other position. That means one code point has
- * a 1 in that position, and another has a 0. But that would
- * mean that one of them differs from the lowest code point in
- * that position, which possibility we've already excluded. */
+ /* At the end of the loop, we count how many bits differ from the
+ * bits in lowest code point, call the count 'd'. If the set we
+ * found contains 2**d elements, it is the closure of all code
+ * points that differ only in those bit positions. To convince
+ * yourself of that, first note that the number in the closure must
+ * be a power of 2, which we test for. The only way we could have
+ * that count and it be some differing set, is if we got some code
+ * points that don't differ from the lowest code point in any
+ * position, but do differ from each other in some other position.
+ * That means one code point has a 1 in that position, and another
+ * has a 0. But that would mean that one of them differs from the
+ * lowest code point in that position, which possibility we've
+ * already excluded. */
if ( (inverted || full_cp_count > 1)
&& full_cp_count == 1U << PL_bitcount[bits_differing])
{
@@ -19926,12 +19904,12 @@ S_optimize_regclass(pTHX_
return op;
}
- /* XXX We could create an ANYOFR_LOW node here if we saved above if
- * all were invariants, it wasn't inverted, and there is a single
- * range. This would be faster than some of the posix nodes we
- * create below like /\d/a, but would be twice the size. Without
- * having actually measured the gain, khw doesn't think the
- * tradeoff is really worth it */
+ /* XXX We could create an ANYOFR_LOW node here if we saved above if all
+ * were invariants, it wasn't inverted, and there is a single range.
+ * This would be faster than some of the posix nodes we create below
+ * like /\d/a, but would be twice the size. Without having actually
+ * measured the gain, khw doesn't think the tradeoff is really worth it
+ * */
}
if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
@@ -19939,10 +19917,10 @@ S_optimize_regclass(pTHX_
SV * intersection = NULL;
SV* d_invlist = NULL;
- /* See if this matches any of the POSIX classes. The POSIXA and
- * POSIXD ones are about the same speed as ANYOF ops, but take less
- * room; the ones that have above-Latin1 code point matches are
- * somewhat faster than ANYOF. */
+ /* See if this matches any of the POSIX classes. The POSIXA and POSIXD
+ * ones are about the same speed as ANYOF ops, but take less room; the
+ * ones that have above-Latin1 code point matches are somewhat faster
+ * than ANYOF. */
for (type = POSIXA; type >= POSIXD; type--) {
int posix_class;
@@ -19966,8 +19944,8 @@ S_optimize_regclass(pTHX_
official_code_points = &PL_XPosix_ptrs[posix_class];
}
- /* Skip non-existent classes of this type. e.g. \v only
- * has an entry in PL_XPosix_ptrs */
+ /* Skip non-existent classes of this type. e.g. \v only has an
+ * entry in PL_XPosix_ptrs */
if (! *official_code_points) {
continue;
}
@@ -19978,8 +19956,8 @@ S_optimize_regclass(pTHX_
if (type != POSIXD) {
- /* This class that isn't /d can't match if we have
- * /d dependencies */
+ /* This class that isn't /d can't match if we have /d
+ * dependencies */
if (has_runtime_dependency
& HAS_D_RUNTIME_DEPENDENCY)
{
@@ -19988,8 +19966,8 @@ S_optimize_regclass(pTHX_
}
else /* is /d */ if (! this_inverted) {
- /* /d classes don't match anything non-ASCII below
- * 256 unconditionally (which cp_list contains) */
+ /* /d classes don't match anything non-ASCII below 256
+ * unconditionally (which cp_list contains) */
_invlist_intersection(cp_list, PL_UpperLatin1,
&intersection);
if (_invlist_len(intersection) != 0) {
@@ -19999,10 +19977,10 @@ S_optimize_regclass(pTHX_
SvREFCNT_dec(d_invlist);
d_invlist = invlist_clone(cp_list, NULL);
- /* But under UTF-8 it turns into using /u rules.
- * Add the things it matches under these conditions
- * so that we check below that these are identical
- * to what the tested class should match */
+ /* But under UTF-8 it turns into using /u rules. Add
+ * the things it matches under these conditions so that
+ * we check below that these are identical to what the
+ * tested class should match */
if (upper_latin1_only_utf8_matches) {
_invlist_union(
d_invlist,
@@ -20020,8 +19998,8 @@ S_optimize_regclass(pTHX_
our_code_points = &cp_list;
}
- /* Here, have weeded out some things. We want to see
- * if the list of characters this node contains
+ /* Here, have weeded out some things. We want to see if
+ * the list of characters this node contains
* ('*our_code_points') precisely matches those of the
* class we are currently checking against
* ('*official_code_points'). */
@@ -20030,8 +20008,8 @@ S_optimize_regclass(pTHX_
try_inverted))
{
/* Here, they precisely match. Optimize this ANYOF
- * node into its equivalent POSIX one of the
- * correct type, possibly inverted */
+ * node into its equivalent POSIX one of the correct
+ * type, possibly inverted */
op = (try_inverted)
? type + NPOSIXA - POSIXA
: type;
@@ -20048,13 +20026,12 @@ S_optimize_regclass(pTHX_
SvREFCNT_dec(intersection);
}
- /* If it is a single contiguous range, ANYOFR is an efficient regnode,
- * both in size and speed. Currently, a 20 bit range base (smallest
- * code point in the range), and a 12 bit maximum delta are packed into
- * a 32 bit word. This allows for using it on all of the Unicode code
- * points except for the highest plane, which is only for private use
- * code points. khw doubts that a bigger delta is likely in real world
- * applications */
+ /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
+ * in size and speed. Currently, a 20 bit range base (smallest code point
+ * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
+ * This allows for using it on all of the Unicode code points except for
+ * the highest plane, which is only for private use code points. khw
+ * doubts that a bigger delta is likely in real world applications */
if ( single_range
&& ! has_runtime_dependency
&& *anyof_flags == 0
@@ -20071,26 +20048,23 @@ S_optimize_regclass(pTHX_
*ret = reganode(pRExC_state, op,
(start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
- /* Place the lowest UTF-8 start byte in the flags field, so as to
- * allow efficient ruling out at run time of many possible inputs.
- * */
+ /* Place the lowest UTF-8 start byte in the flags field, so as to allow
+ * efficient ruling out at run time of many possible inputs. */
(void) uvchr_to_utf8(low_utf8, start[0]);
(void) uvchr_to_utf8(high_utf8, end[0]);
/* If all code points share the same first byte, this can be an
* ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
- * quickly rule out many inputs at run-time without having to
- * compute the code point from UTF-8. For EBCDIC, we use I8, as
- * not doing that transformation would not rule out nearly so many
- * things */
+ * quickly rule out many inputs at run-time without having to compute
+ * the code point from UTF-8. For EBCDIC, we use I8, as not doing that
+ * transformation would not rule out nearly so many things */
if (low_utf8[0] == high_utf8[0]) {
op = ANYOFRb;
OP(REGNODE_p(*ret)) = op;
ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
}
else {
- ANYOF_FLAGS(REGNODE_p(*ret))
- = NATIVE_UTF8_TO_I8(low_utf8[0]);
+ ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
}
return op;
@@ -20106,19 +20080,19 @@ S_optimize_regclass(pTHX_
U8 low_utf8[UTF8_MAXBYTES+1];
UV highest_cp = invlist_highest(cp_list);
- /* Currently the maximum allowed code point by the system is
- * IV_MAX. Higher ones are reserved for future internal use. This
- * particular regnode can be used for higher ones, but we can't
- * calculate the code point of those. IV_MAX suffices though, as
- * it will be a large first byte */
+ /* Currently the maximum allowed code point by the system is IV_MAX.
+ * Higher ones are reserved for future internal use. This particular
+ * regnode can be used for higher ones, but we can't calculate the code
+ * point of those. IV_MAX suffices though, as it will be a large first
+ * byte */
Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
- low_utf8;
- /* We store the lowest possible first byte of the UTF-8
- * representation, using the flags field. This allows for quick
- * ruling out of some inputs without having to convert from UTF-8
- * to code point. For EBCDIC, we use I8, as not doing that
- * transformation would not rule out nearly so many things */
+ /* We store the lowest possible first byte of the UTF-8 representation,
+ * using the flags field. This allows for quick ruling out of some
+ * inputs without having to convert from UTF-8 to code point. For
+ * EBCDIC, we use I8, as not doing that transformation would not rule
+ * out nearly so many things */
*anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
op = ANYOFH;
@@ -20128,13 +20102,11 @@ S_optimize_regclass(pTHX_
* well */
if (highest_cp <= IV_MAX) {
U8 high_utf8[UTF8_MAXBYTES+1];
- Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
- - high_utf8;
+ Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
/* If the lowest and highest are the same, we can get an exact
- * first byte instead of a just minimum or even a sequence of
- * exact leading bytes. We signal these with different
- * regnodes */
+ * first byte instead of a just minimum or even a sequence of exact
+ * leading bytes. We signal these with different regnodes */
if (low_utf8[0] == high_utf8[0]) {
Size_t len = find_first_differing_byte_pos(low_utf8,
high_utf8,
@@ -20142,8 +20114,8 @@ S_optimize_regclass(pTHX_
if (len == 1) {
- /* No need to convert to I8 for EBCDIC as this is an
- * exact match */
+ /* No need to convert to I8 for EBCDIC as this is an exact
+ * match */
*anyof_flags = low_utf8[0];
op = ANYOFHb;
}
@@ -20164,15 +20136,14 @@ S_optimize_regclass(pTHX_
return op;
}
}
- else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
- {
+ else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
- /* Here, the high byte is not the same as the low, but is
- * small enough that its reasonable to have a loose upper
- * bound, which is packed in with the strict lower bound.
- * See comments at the definition of MAX_ANYOF_HRx_BYTE.
- * On EBCDIC platforms, I8 is used. On ASCII platforms I8
- * is the same thing as UTF-8 */
+ /* Here, the high byte is not the same as the low, but is small
+ * enough that its reasonable to have a loose upper bound,
+ * which is packed in with the strict lower bound. See
+ * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC
+ * platforms, I8 is used. On ASCII platforms I8 is the same
+ * thing as UTF-8 */
U8 bits = 0;
U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;