regcomp.c: Simplify some node calculations

For the node types that have differing versions depending on the character set regex modifiers, /d, /l, /u, /a, and /aa, we can use the enum values as offsets from the base node number to derive the correct one. This eliminates a number of tests. Because there is no DIGITU node type, I added placeholders for it (and NDIGITU) to avoid some special casing of it (more important in future commits). We currently have many available node types, so can afford to waste these two.
author: Karl Williamson <public@khwilliamson.com> 2012-06-27 13:48:16 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-06-29 22:22:42 -0600
commit: 693fefec6759ebf0a9ec40a0f59346d86831349c (patch)
tree: d12041ac3714d0ae4c14ce2391f280380f038183 /regcomp.sym
parent: 8c1182fda8158a86281b1ea6464176d1c68f2f18 (diff)
download: perl-693fefec6759ebf0a9ec40a0f59346d86831349c.tar.gz
1 files changed, 16 insertions, 1 deletions
diff --git a/regcomp.sym b/regcomp.sym
index 13d3787965..c36a7fc2cd 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -31,11 +31,17 @@ EOS         EOL,        no        ; Match "" at end of string.
 EOL         EOL,        no        ; Match "" at end of line.
 MEOL        EOL,        no        ; Same, assuming multiline.
 SEOL        EOL,        no        ; Same, assuming singleline.
+# The regops that have varieties that vary depending on the character set regex
+# modifiers have to be ordered thusly: /d, /l, /u, /a, /aa.  This is because code
+# in regcomp.c uses the enum value of the modifier as an offset from the /d
+# version.  The complements must come after the non-complements.
+# BOUND, ALNUM, SPACE, DIGIT, and their complements are affected, as well as
+# EXACTF.
 BOUND       BOUND,      no        ; Match "" at any word boundary using native charset semantics for non-utf8
 BOUNDL      BOUND,      no        ; Match "" at any locale word boundary
 BOUNDU      BOUND,      no        ; Match "" at any word boundary using Unicode semantics
 BOUNDA      BOUND,      no         ; Match "" at any word boundary using ASCII semantics
-# All NBOUND nodes are required by a line regexec.c to be greater than all BOUND ones
+# All NBOUND nodes are required by code in regexec.c to be greater than all BOUND ones
 NBOUND      NBOUND,     no        ; Match "" at any word non-boundary using native charset semantics for non-utf8
 NBOUNDL     NBOUND,     no        ; Match "" at any locale word non-boundary
 NBOUNDU     NBOUND,     no        ; Match "" at any word non-boundary using Unicode semantics
@@ -49,6 +55,11 @@ SANY        REG_ANY,    no 0 S    ; Match any one character.
 CANY        REG_ANY,    no 0 S    ; Match any one byte.
 ANYOF       ANYOF,      sv 0 S    ; Match character in (or not in) this class, single char match only
 ANYOFV      ANYOF,      sv 0 V    ; Match character in (or not in) this class, can match-multiple chars
+
+# Order (within each group) of the below is important.  See ordering comment
+# above.  The PLACEHOLDERn ones are wasting a value.  Right now, we have plenty
+# to spare, but these would be obvious candidates if ever we ran out of node
+# types in a U8.
 ALNUM       ALNUM,      no 0 S    ; Match any alphanumeric character using native charset semantics for non-utf8
 ALNUML      ALNUM,      no 0 S    ; Match any alphanumeric char in locale
 ALNUMU      ALNUM,      no 0 S    ; Match any alphanumeric char using Unicode semantics
@@ -67,10 +78,14 @@ NSPACEU     NSPACE,     no 0 S    ; Match any non-whitespace char using Unicode
 NSPACEA     NSPACE,     no 0 S    ; Match [^ \t\n\f\r]
 DIGIT       DIGIT,      no 0 S    ; Match any numeric character using native charset semantics for non-utf8
 DIGITL      DIGIT,      no 0 S    ; Match any numeric character in locale
+PLACEHOLDER1 NOTHING,   no        ; placeholder for missing DIGITU
 DIGITA      DIGIT,      no 0 S    ; Match [0-9]
 NDIGIT      NDIGIT,     no 0 S    ; Match any non-numeric character using native charset semantics for non-utf8
 NDIGITL     NDIGIT,     no 0 S    ; Match any non-numeric character in locale
+PLACEHOLDER2 NOTHING,   no        ; placeholder for missing NDIGITU
 NDIGITA     NDIGIT,     no 0 S    ; Match [^0-9]
+# End of the section in which order is important (within groups)
+
 CLUMP       CLUMP,      no 0 V    ; Match any extended grapheme cluster sequence
 
 #* Alternation
author	Karl Williamson <public@khwilliamson.com>	2012-06-27 13:48:16 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-06-29 22:22:42 -0600
commit	693fefec6759ebf0a9ec40a0f59346d86831349c (patch)
tree	d12041ac3714d0ae4c14ce2391f280380f038183 /regcomp.sym
parent	8c1182fda8158a86281b1ea6464176d1c68f2f18 (diff)
download	perl-693fefec6759ebf0a9ec40a0f59346d86831349c.tar.gz