diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-06-27 13:48:16 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-06-29 22:22:42 -0600 |
commit | 693fefec6759ebf0a9ec40a0f59346d86831349c (patch) | |
tree | d12041ac3714d0ae4c14ce2391f280380f038183 | |
parent | 8c1182fda8158a86281b1ea6464176d1c68f2f18 (diff) | |
download | perl-693fefec6759ebf0a9ec40a0f59346d86831349c.tar.gz |
regcomp.c: Simplify some node calculations
For the node types that have differing versions depending on the
character set regex modifiers, /d, /l, /u, /a, and /aa, we can use the
enum values as offsets from the base node number to derive the correct
one. This eliminates a number of tests.
Because there is no DIGITU node type, I added placeholders for it (and
NDIGITU) to avoid some special casing of it (more important in future
commits). We currently have many available node types, so can afford to
waste these two.
-rw-r--r-- | op_reg_common.h | 4 | ||||
-rw-r--r-- | regcomp.c | 186 | ||||
-rw-r--r-- | regcomp.sym | 17 | ||||
-rw-r--r-- | regnodes.h | 306 |
4 files changed, 221 insertions, 292 deletions
diff --git a/op_reg_common.h b/op_reg_common.h index f35cb7d233..8a45b200b4 100644 --- a/op_reg_common.h +++ b/op_reg_common.h @@ -36,7 +36,9 @@ /* The character set for the regex is stored in a field of more than one bit * using an enum, for reasons of compactness and to ensure that the options are * mutually exclusive */ -/* Make sure to update ext/re/re.pm when changing this! */ +/* Make sure to update ext/re/re.pm and regcomp.sym (as these are used as + * offsets for various node types, like SPACE vs SPACEL, etc) when changing + * this! */ typedef enum { REGEX_DEPENDS_CHARSET = 0, REGEX_LOCALE_CHARSET, @@ -9912,43 +9912,17 @@ tryagain: *flagp |= HASWIDTH; goto finish_meta_pat; case 'w': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = ALNUML; - break; - case REGEX_UNICODE_CHARSET: - op = ALNUMU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = ALNUMA; - break; - case REGEX_DEPENDS_CHARSET: - op = ALNUM; - break; - default: - goto bad_charset; + op = ALNUM + get_regex_charset(RExC_flags); + if (op > ALNUMA) { /* /aa is same as /a */ + op = ALNUMA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'W': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NALNUML; - break; - case REGEX_UNICODE_CHARSET: - op = NALNUMU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NALNUMA; - break; - case REGEX_DEPENDS_CHARSET: - op = NALNUM; - break; - default: - goto bad_charset; + op = NALNUM + get_regex_charset(RExC_flags); + if (op > NALNUMA) { /* /aa is same as /a */ + op = NALNUMA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; @@ -9956,22 +9930,9 @@ tryagain: case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = BOUNDL; - break; - case REGEX_UNICODE_CHARSET: - op = BOUNDU; - 
break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = BOUNDA; - break; - case REGEX_DEPENDS_CHARSET: - op = BOUND; - break; - default: - goto bad_charset; + op = BOUND + get_regex_charset(RExC_flags); + if (op > BOUNDA) { /* /aa is same as /a */ + op = BOUNDA; } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); @@ -9980,103 +9941,45 @@ tryagain: case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NBOUNDL; - break; - case REGEX_UNICODE_CHARSET: - op = NBOUNDU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NBOUNDA; - break; - case REGEX_DEPENDS_CHARSET: - op = NBOUND; - break; - default: - goto bad_charset; + op = NBOUND + get_regex_charset(RExC_flags); + if (op > NBOUNDA) { /* /aa is same as /a */ + op = NBOUNDA; } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; goto finish_meta_pat; case 's': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = SPACEL; - break; - case REGEX_UNICODE_CHARSET: - op = SPACEU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = SPACEA; - break; - case REGEX_DEPENDS_CHARSET: - op = SPACE; - break; - default: - goto bad_charset; + op = SPACE + get_regex_charset(RExC_flags); + if (op > SPACEA) { /* /aa is same as /a */ + op = SPACEA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'S': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NSPACEL; - break; - case REGEX_UNICODE_CHARSET: - op = NSPACEU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NSPACEA; - break; - case REGEX_DEPENDS_CHARSET: - op = NSPACE; - break; - default: - goto bad_charset; - } - ret = reg_node(pRExC_state, op); - *flagp 
|= HASWIDTH|SIMPLE; - goto finish_meta_pat; - case 'd': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = DIGITL; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = DIGITA; - break; - case REGEX_DEPENDS_CHARSET: /* No difference between these */ - case REGEX_UNICODE_CHARSET: - op = DIGIT; - break; - default: - goto bad_charset; + op = NSPACE + get_regex_charset(RExC_flags); + if (op > NSPACEA) { /* /aa is same as /a */ + op = NSPACEA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'D': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NDIGITL; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NDIGITA; - break; - case REGEX_DEPENDS_CHARSET: /* No difference between these */ - case REGEX_UNICODE_CHARSET: - op = NDIGIT; - break; - default: - goto bad_charset; + op = NDIGIT; + goto join_D_and_d; + case 'd': + op = DIGIT; + join_D_and_d: + { + U8 offset = get_regex_charset(RExC_flags); + if (offset == REGEX_UNICODE_CHARSET) { + offset = REGEX_DEPENDS_CHARSET; + } + else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { + offset = REGEX_ASCII_RESTRICTED_CHARSET; + } + op += offset; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; @@ -10305,14 +10208,18 @@ tryagain: bool is_exactfu_sharp_s; ender = 0; - node_type = ((! FOLD) ? EXACT - : (LOC) - ? EXACTFL - : (MORE_ASCII_RESTRICTED) - ? EXACTFA - : (AT_LEAST_UNI_SEMANTICS) - ? EXACTFU - : EXACTF); + if (! 
FOLD) { + node_type = EXACT; + } + else { + node_type = get_regex_charset(RExC_flags); + if (node_type >= REGEX_ASCII_RESTRICTED_CHARSET) { + node_type--; /* /a is same as /u, and map /aa's offset to + what /a's would have been, so there is no + hole */ + } + node_type += EXACTF; + } ret = reg_node(pRExC_state, node_type); s = STRING(ret); @@ -10706,11 +10613,6 @@ tryagain: } return(ret); - -/* Jumped to when an unrecognized character set is encountered */ -bad_charset: - Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags)); - return(NULL); } STATIC char * diff --git a/regcomp.sym b/regcomp.sym index 13d3787965..c36a7fc2cd 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -31,11 +31,17 @@ EOS EOL, no ; Match "" at end of string. EOL EOL, no ; Match "" at end of line. MEOL EOL, no ; Same, assuming multiline. SEOL EOL, no ; Same, assuming singleline. +# The regops that have varieties that vary depending on the character set regex +# modifiers have to ordered thusly: /d, /l, /u, /a, /aa. This is because code +# in regcomp.c uses the enum value of the modifier as an offset from the /d +# version. The complements must come after the non-complements. +# BOUND, ALNUM, SPACE, DIGIT, and their complements are affected, as well as +# EXACTF. 
BOUND BOUND, no ; Match "" at any word boundary using native charset semantics for non-utf8 BOUNDL BOUND, no ; Match "" at any locale word boundary BOUNDU BOUND, no ; Match "" at any word boundary using Unicode semantics BOUNDA BOUND, no ; Match "" at any word boundary using ASCII semantics -# All NBOUND nodes are required by a line regexec.c to be greater than all BOUND ones +# All NBOUND nodes are required by code in regexec.c to be greater than all BOUND ones NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset semantics for non-utf8 NBOUNDL NBOUND, no ; Match "" at any locale word non-boundary NBOUNDU NBOUND, no ; Match "" at any word non-boundary using Unicode semantics @@ -49,6 +55,11 @@ SANY REG_ANY, no 0 S ; Match any one character. CANY REG_ANY, no 0 S ; Match any one byte. ANYOF ANYOF, sv 0 S ; Match character in (or not in) this class, single char match only ANYOFV ANYOF, sv 0 V ; Match character in (or not in) this class, can match-multiple chars + +# Order (within each group) of the below is important. See ordering comment +# above. The PLACEHOLDERn ones are wasting a value. Right now, we have plenty +# to spare, but these would be obvious candidates if ever we ran out of node +# types in a U8. 
ALNUM ALNUM, no 0 S ; Match any alphanumeric character using native charset semantics for non-utf8 ALNUML ALNUM, no 0 S ; Match any alphanumeric char in locale ALNUMU ALNUM, no 0 S ; Match any alphanumeric char using Unicode semantics @@ -67,10 +78,14 @@ NSPACEU NSPACE, no 0 S ; Match any non-whitespace char using Unicode NSPACEA NSPACE, no 0 S ; Match [^ \t\n\f\r] DIGIT DIGIT, no 0 S ; Match any numeric character using native charset semantics for non-utf8 DIGITL DIGIT, no 0 S ; Match any numeric character in locale +PLACEHOLDER1 NOTHING, no ; placeholder for missing DIGITU DIGITA DIGIT, no 0 S ; Match [0-9] NDIGIT NDIGIT, no 0 S ; Match any non-numeric character using native charset semantics for non-utf8 NDIGITL NDIGIT, no 0 S ; Match any non-numeric character in locale +PLACEHOLDER2 NOTHING, no ; placeholder for missing NDIGITU NDIGITA NDIGIT, no 0 S ; Match [^0-9] +# End of order is important (within groups) + CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence #* Alternation diff --git a/regnodes.h b/regnodes.h index ff3ba3f3ef..84096d6c8e 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 112 -#define REGMATCH_STATE_MAX 152 +#define REGNODE_MAX 114 +#define REGMATCH_STATE_MAX 154 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. 
*/ @@ -50,78 +50,80 @@ #define NSPACEA 38 /* 0x26 Match [^ \t\n\f\r] */ #define DIGIT 39 /* 0x27 Match any numeric character using native charset semantics for non-utf8 */ #define DIGITL 40 /* 0x28 Match any numeric character in locale */ -#define DIGITA 41 /* 0x29 Match [0-9] */ -#define NDIGIT 42 /* 0x2a Match any non-numeric character using native charset semantics for non-utf8 */ -#define NDIGITL 43 /* 0x2b Match any non-numeric character in locale */ -#define NDIGITA 44 /* 0x2c Match [^0-9] */ -#define CLUMP 45 /* 0x2d Match any extended grapheme cluster sequence */ -#define BRANCH 46 /* 0x2e Match this alternative, or the next... */ -#define BACK 47 /* 0x2f Match "", "next" ptr points backward. */ -#define EXACT 48 /* 0x30 Match this string (preceded by length). */ -#define EXACTF 49 /* 0x31 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ -#define EXACTFL 50 /* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */ -#define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFA 52 /* 0x34 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ -#define EXACTFU_SS 53 /* 0x35 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFU_TRICKYFOLD 54 /* 0x36 Match this folded UTF-8 string using /iu rules */ -#define NOTHING 55 /* 0x37 Match empty string. */ -#define TAIL 56 /* 0x38 Match empty string. Can jump here from outside. */ -#define STAR 57 /* 0x39 Match this (simple) thing 0 or more times. */ -#define PLUS 58 /* 0x3a Match this (simple) thing 1 or more times. */ -#define CURLY 59 /* 0x3b Match this simple thing {n,m} times. */ -#define CURLYN 60 /* 0x3c Capture next-after-this simple thing */ -#define CURLYM 61 /* 0x3d Capture this medium-complex thing {n,m} times. 
*/ -#define CURLYX 62 /* 0x3e Match this complex thing {n,m} times. */ -#define WHILEM 63 /* 0x3f Do curly processing and see if rest matches. */ -#define OPEN 64 /* 0x40 Mark this point in input as start of */ -#define CLOSE 65 /* 0x41 Analogous to OPEN. */ -#define REF 66 /* 0x42 Match some already matched string */ -#define REFF 67 /* 0x43 Match already matched string, folded using native charset semantics for non-utf8 */ -#define REFFL 68 /* 0x44 Match already matched string, folded in loc. */ -#define REFFU 69 /* 0x45 Match already matched string, folded using unicode semantics for non-utf8 */ -#define REFFA 70 /* 0x46 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ -#define NREF 71 /* 0x47 Match some already matched string */ -#define NREFF 72 /* 0x48 Match already matched string, folded using native charset semantics for non-utf8 */ -#define NREFFL 73 /* 0x49 Match already matched string, folded in loc. */ -#define NREFFU 74 /* 0x4a Match already matched string, folded using unicode semantics for non-utf8 */ -#define NREFFA 75 /* 0x4b Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ -#define IFMATCH 76 /* 0x4c Succeeds if the following matches. */ -#define UNLESSM 77 /* 0x4d Fails if the following matches. */ -#define SUSPEND 78 /* 0x4e "Independent" sub-RE. */ -#define IFTHEN 79 /* 0x4f Switch, should be preceded by switcher . */ -#define GROUPP 80 /* 0x50 Whether the group matched. */ -#define LONGJMP 81 /* 0x51 Jump far away. */ -#define BRANCHJ 82 /* 0x52 BRANCH with long offset. */ -#define EVAL 83 /* 0x53 Execute some Perl code. */ -#define MINMOD 84 /* 0x54 Next operator is not greedy. */ -#define LOGICAL 85 /* 0x55 Next opcode should set the flag only. */ -#define RENUM 86 /* 0x56 Group with independently numbered parens. */ -#define TRIE 87 /* 0x57 Match many EXACT(F[ALU]?)? at once. 
flags==type */ -#define TRIEC 88 /* 0x58 Same as TRIE, but with embedded charclass data */ -#define AHOCORASICK 89 /* 0x59 Aho Corasick stclass. flags==type */ -#define AHOCORASICKC 90 /* 0x5a Same as AHOCORASICK, but with embedded charclass data */ -#define GOSUB 91 /* 0x5b recurse to paren arg1 at (signed) ofs arg2 */ -#define GOSTART 92 /* 0x5c recurse to start of pattern */ -#define NGROUPP 93 /* 0x5d Whether the group matched. */ -#define INSUBP 94 /* 0x5e Whether we are in a specific recurse. */ -#define DEFINEP 95 /* 0x5f Never execute directly. */ -#define ENDLIKE 96 /* 0x60 Used only for the type field of verbs */ -#define OPFAIL 97 /* 0x61 Same as (?!) */ -#define ACCEPT 98 /* 0x62 Accepts the current matched string. */ -#define VERB 99 /* 0x63 Used only for the type field of verbs */ -#define PRUNE 100 /* 0x64 Pattern fails at this startpoint if no-backtracking through this */ -#define MARKPOINT 101 /* 0x65 Push the current location for rollback by cut. */ -#define SKIP 102 /* 0x66 On failure skip forward (to the mark) before retrying */ -#define COMMIT 103 /* 0x67 Pattern fails outright if backtracking through this */ -#define CUTGROUP 104 /* 0x68 On failure go to the next alternation in the group */ -#define KEEPS 105 /* 0x69 $& begins here. */ -#define LNBREAK 106 /* 0x6a generic newline pattern */ -#define VERTWS 107 /* 0x6b vertical whitespace (Perl 6) */ -#define NVERTWS 108 /* 0x6c not vertical whitespace (Perl 6) */ -#define HORIZWS 109 /* 0x6d horizontal whitespace (Perl 6) */ -#define NHORIZWS 110 /* 0x6e not horizontal whitespace (Perl 6) */ -#define OPTIMIZED 111 /* 0x6f Placeholder for dump. */ -#define PSEUDO 112 /* 0x70 Pseudo opcode for internal use. 
*/ +#define PLACEHOLDER1 41 /* 0x29 placeholder for missing DIGITU */ +#define DIGITA 42 /* 0x2a Match [0-9] */ +#define NDIGIT 43 /* 0x2b Match any non-numeric character using native charset semantics for non-utf8 */ +#define NDIGITL 44 /* 0x2c Match any non-numeric character in locale */ +#define PLACEHOLDER2 45 /* 0x2d placeholder for missing NDIGITU */ +#define NDIGITA 46 /* 0x2e Match [^0-9] */ +#define CLUMP 47 /* 0x2f Match any extended grapheme cluster sequence */ +#define BRANCH 48 /* 0x30 Match this alternative, or the next... */ +#define BACK 49 /* 0x31 Match "", "next" ptr points backward. */ +#define EXACT 50 /* 0x32 Match this string (preceded by length). */ +#define EXACTF 51 /* 0x33 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ +#define EXACTFL 52 /* 0x34 Match this string (not guaranteed to be folded) using /il rules (w/len). */ +#define EXACTFU 53 /* 0x35 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ +#define EXACTFA 54 /* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ +#define EXACTFU_SS 55 /* 0x37 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ +#define EXACTFU_TRICKYFOLD 56 /* 0x38 Match this folded UTF-8 string using /iu rules */ +#define NOTHING 57 /* 0x39 Match empty string. */ +#define TAIL 58 /* 0x3a Match empty string. Can jump here from outside. */ +#define STAR 59 /* 0x3b Match this (simple) thing 0 or more times. */ +#define PLUS 60 /* 0x3c Match this (simple) thing 1 or more times. */ +#define CURLY 61 /* 0x3d Match this simple thing {n,m} times. */ +#define CURLYN 62 /* 0x3e Capture next-after-this simple thing */ +#define CURLYM 63 /* 0x3f Capture this medium-complex thing {n,m} times. */ +#define CURLYX 64 /* 0x40 Match this complex thing {n,m} times. 
*/ +#define WHILEM 65 /* 0x41 Do curly processing and see if rest matches. */ +#define OPEN 66 /* 0x42 Mark this point in input as start of */ +#define CLOSE 67 /* 0x43 Analogous to OPEN. */ +#define REF 68 /* 0x44 Match some already matched string */ +#define REFF 69 /* 0x45 Match already matched string, folded using native charset semantics for non-utf8 */ +#define REFFL 70 /* 0x46 Match already matched string, folded in loc. */ +#define REFFU 71 /* 0x47 Match already matched string, folded using unicode semantics for non-utf8 */ +#define REFFA 72 /* 0x48 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ +#define NREF 73 /* 0x49 Match some already matched string */ +#define NREFF 74 /* 0x4a Match already matched string, folded using native charset semantics for non-utf8 */ +#define NREFFL 75 /* 0x4b Match already matched string, folded in loc. */ +#define NREFFU 76 /* 0x4c Match already matched string, folded using unicode semantics for non-utf8 */ +#define NREFFA 77 /* 0x4d Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ +#define IFMATCH 78 /* 0x4e Succeeds if the following matches. */ +#define UNLESSM 79 /* 0x4f Fails if the following matches. */ +#define SUSPEND 80 /* 0x50 "Independent" sub-RE. */ +#define IFTHEN 81 /* 0x51 Switch, should be preceded by switcher . */ +#define GROUPP 82 /* 0x52 Whether the group matched. */ +#define LONGJMP 83 /* 0x53 Jump far away. */ +#define BRANCHJ 84 /* 0x54 BRANCH with long offset. */ +#define EVAL 85 /* 0x55 Execute some Perl code. */ +#define MINMOD 86 /* 0x56 Next operator is not greedy. */ +#define LOGICAL 87 /* 0x57 Next opcode should set the flag only. */ +#define RENUM 88 /* 0x58 Group with independently numbered parens. */ +#define TRIE 89 /* 0x59 Match many EXACT(F[ALU]?)? at once. 
flags==type */ +#define TRIEC 90 /* 0x5a Same as TRIE, but with embedded charclass data */ +#define AHOCORASICK 91 /* 0x5b Aho Corasick stclass. flags==type */ +#define AHOCORASICKC 92 /* 0x5c Same as AHOCORASICK, but with embedded charclass data */ +#define GOSUB 93 /* 0x5d recurse to paren arg1 at (signed) ofs arg2 */ +#define GOSTART 94 /* 0x5e recurse to start of pattern */ +#define NGROUPP 95 /* 0x5f Whether the group matched. */ +#define INSUBP 96 /* 0x60 Whether we are in a specific recurse. */ +#define DEFINEP 97 /* 0x61 Never execute directly. */ +#define ENDLIKE 98 /* 0x62 Used only for the type field of verbs */ +#define OPFAIL 99 /* 0x63 Same as (?!) */ +#define ACCEPT 100 /* 0x64 Accepts the current matched string. */ +#define VERB 101 /* 0x65 Used only for the type field of verbs */ +#define PRUNE 102 /* 0x66 Pattern fails at this startpoint if no-backtracking through this */ +#define MARKPOINT 103 /* 0x67 Push the current location for rollback by cut. */ +#define SKIP 104 /* 0x68 On failure skip forward (to the mark) before retrying */ +#define COMMIT 105 /* 0x69 Pattern fails outright if backtracking through this */ +#define CUTGROUP 106 /* 0x6a On failure go to the next alternation in the group */ +#define KEEPS 107 /* 0x6b $& begins here. */ +#define LNBREAK 108 /* 0x6c generic newline pattern */ +#define VERTWS 109 /* 0x6d vertical whitespace (Perl 6) */ +#define NVERTWS 110 /* 0x6e not vertical whitespace (Perl 6) */ +#define HORIZWS 111 /* 0x6f horizontal whitespace (Perl 6) */ +#define NHORIZWS 112 /* 0x70 not horizontal whitespace (Perl 6) */ +#define OPTIMIZED 113 /* 0x71 Placeholder for dump. */ +#define PSEUDO 114 /* 0x72 Pseudo opcode for internal use. 
*/ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -211,9 +213,11 @@ EXTCONST U8 PL_regkind[] = { NSPACE, /* NSPACEA */ DIGIT, /* DIGIT */ DIGIT, /* DIGITL */ + NOTHING, /* PLACEHOLDER1 */ DIGIT, /* DIGITA */ NDIGIT, /* NDIGIT */ NDIGIT, /* NDIGITL */ + NOTHING, /* PLACEHOLDER2 */ NDIGIT, /* NDIGITA */ CLUMP, /* CLUMP */ BRANCH, /* BRANCH */ @@ -372,9 +376,11 @@ static const U8 regarglen[] = { 0, /* NSPACEA */ 0, /* DIGIT */ 0, /* DIGITL */ + 0, /* PLACEHOLDER1 */ 0, /* DIGITA */ 0, /* NDIGIT */ 0, /* NDIGITL */ + 0, /* PLACEHOLDER2 */ 0, /* NDIGITA */ 0, /* CLUMP */ 0, /* BRANCH */ @@ -490,9 +496,11 @@ static const char reg_off_by_arg[] = { 0, /* NSPACEA */ 0, /* DIGIT */ 0, /* DIGITL */ + 0, /* PLACEHOLDER1 */ 0, /* DIGITA */ 0, /* NDIGIT */ 0, /* NDIGITL */ + 0, /* PLACEHOLDER2 */ 0, /* NDIGITA */ 0, /* CLUMP */ 0, /* BRANCH */ @@ -613,78 +621,80 @@ EXTCONST char * const PL_reg_name[] = { "NSPACEA", /* 0x26 */ "DIGIT", /* 0x27 */ "DIGITL", /* 0x28 */ - "DIGITA", /* 0x29 */ - "NDIGIT", /* 0x2a */ - "NDIGITL", /* 0x2b */ - "NDIGITA", /* 0x2c */ - "CLUMP", /* 0x2d */ - "BRANCH", /* 0x2e */ - "BACK", /* 0x2f */ - "EXACT", /* 0x30 */ - "EXACTF", /* 0x31 */ - "EXACTFL", /* 0x32 */ - "EXACTFU", /* 0x33 */ - "EXACTFA", /* 0x34 */ - "EXACTFU_SS", /* 0x35 */ - "EXACTFU_TRICKYFOLD", /* 0x36 */ - "NOTHING", /* 0x37 */ - "TAIL", /* 0x38 */ - "STAR", /* 0x39 */ - "PLUS", /* 0x3a */ - "CURLY", /* 0x3b */ - "CURLYN", /* 0x3c */ - "CURLYM", /* 0x3d */ - "CURLYX", /* 0x3e */ - "WHILEM", /* 0x3f */ - "OPEN", /* 0x40 */ - "CLOSE", /* 0x41 */ - "REF", /* 0x42 */ - "REFF", /* 0x43 */ - "REFFL", /* 0x44 */ - "REFFU", /* 0x45 */ - "REFFA", /* 0x46 */ - "NREF", /* 0x47 */ - "NREFF", /* 0x48 */ - "NREFFL", /* 0x49 */ - "NREFFU", /* 0x4a */ - "NREFFA", /* 0x4b */ - "IFMATCH", /* 0x4c */ - "UNLESSM", /* 0x4d */ - "SUSPEND", /* 0x4e */ - "IFTHEN", /* 0x4f */ - "GROUPP", /* 0x50 
*/ - "LONGJMP", /* 0x51 */ - "BRANCHJ", /* 0x52 */ - "EVAL", /* 0x53 */ - "MINMOD", /* 0x54 */ - "LOGICAL", /* 0x55 */ - "RENUM", /* 0x56 */ - "TRIE", /* 0x57 */ - "TRIEC", /* 0x58 */ - "AHOCORASICK", /* 0x59 */ - "AHOCORASICKC", /* 0x5a */ - "GOSUB", /* 0x5b */ - "GOSTART", /* 0x5c */ - "NGROUPP", /* 0x5d */ - "INSUBP", /* 0x5e */ - "DEFINEP", /* 0x5f */ - "ENDLIKE", /* 0x60 */ - "OPFAIL", /* 0x61 */ - "ACCEPT", /* 0x62 */ - "VERB", /* 0x63 */ - "PRUNE", /* 0x64 */ - "MARKPOINT", /* 0x65 */ - "SKIP", /* 0x66 */ - "COMMIT", /* 0x67 */ - "CUTGROUP", /* 0x68 */ - "KEEPS", /* 0x69 */ - "LNBREAK", /* 0x6a */ - "VERTWS", /* 0x6b */ - "NVERTWS", /* 0x6c */ - "HORIZWS", /* 0x6d */ - "NHORIZWS", /* 0x6e */ - "OPTIMIZED", /* 0x6f */ - "PSEUDO", /* 0x70 */ + "PLACEHOLDER1", /* 0x29 */ + "DIGITA", /* 0x2a */ + "NDIGIT", /* 0x2b */ + "NDIGITL", /* 0x2c */ + "PLACEHOLDER2", /* 0x2d */ + "NDIGITA", /* 0x2e */ + "CLUMP", /* 0x2f */ + "BRANCH", /* 0x30 */ + "BACK", /* 0x31 */ + "EXACT", /* 0x32 */ + "EXACTF", /* 0x33 */ + "EXACTFL", /* 0x34 */ + "EXACTFU", /* 0x35 */ + "EXACTFA", /* 0x36 */ + "EXACTFU_SS", /* 0x37 */ + "EXACTFU_TRICKYFOLD", /* 0x38 */ + "NOTHING", /* 0x39 */ + "TAIL", /* 0x3a */ + "STAR", /* 0x3b */ + "PLUS", /* 0x3c */ + "CURLY", /* 0x3d */ + "CURLYN", /* 0x3e */ + "CURLYM", /* 0x3f */ + "CURLYX", /* 0x40 */ + "WHILEM", /* 0x41 */ + "OPEN", /* 0x42 */ + "CLOSE", /* 0x43 */ + "REF", /* 0x44 */ + "REFF", /* 0x45 */ + "REFFL", /* 0x46 */ + "REFFU", /* 0x47 */ + "REFFA", /* 0x48 */ + "NREF", /* 0x49 */ + "NREFF", /* 0x4a */ + "NREFFL", /* 0x4b */ + "NREFFU", /* 0x4c */ + "NREFFA", /* 0x4d */ + "IFMATCH", /* 0x4e */ + "UNLESSM", /* 0x4f */ + "SUSPEND", /* 0x50 */ + "IFTHEN", /* 0x51 */ + "GROUPP", /* 0x52 */ + "LONGJMP", /* 0x53 */ + "BRANCHJ", /* 0x54 */ + "EVAL", /* 0x55 */ + "MINMOD", /* 0x56 */ + "LOGICAL", /* 0x57 */ + "RENUM", /* 0x58 */ + "TRIE", /* 0x59 */ + "TRIEC", /* 0x5a */ + "AHOCORASICK", /* 0x5b */ + "AHOCORASICKC", /* 0x5c */ + "GOSUB", /* 0x5d */ + 
"GOSTART", /* 0x5e */ + "NGROUPP", /* 0x5f */ + "INSUBP", /* 0x60 */ + "DEFINEP", /* 0x61 */ + "ENDLIKE", /* 0x62 */ + "OPFAIL", /* 0x63 */ + "ACCEPT", /* 0x64 */ + "VERB", /* 0x65 */ + "PRUNE", /* 0x66 */ + "MARKPOINT", /* 0x67 */ + "SKIP", /* 0x68 */ + "COMMIT", /* 0x69 */ + "CUTGROUP", /* 0x6a */ + "KEEPS", /* 0x6b */ + "LNBREAK", /* 0x6c */ + "VERTWS", /* 0x6d */ + "NVERTWS", /* 0x6e */ + "HORIZWS", /* 0x6f */ + "NHORIZWS", /* 0x70 */ + "OPTIMIZED", /* 0x71 */ + "PSEUDO", /* 0x72 */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ @@ -789,7 +799,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = { EXTCONST U8 PL_varies_bitmask[]; #else EXTCONST U8 PL_varies_bitmask[] = { - 0x00, 0x00, 0x40, 0x00, 0x00, 0xE0, 0x00, 0xFE, 0xFC, 0xCF, 0x04, 0x00, 0x00, 0x00, 0x00 + 0x00, 0x00, 0x40, 0x00, 0x00, 0x80, 0x03, 0xF8, 0xF3, 0x3F, 0x13, 0x00, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ @@ -813,7 +823,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = { EXTCONST U8 PL_simple_bitmask[]; #else EXTCONST U8 PL_simple_bitmask[] = { - 0x00, 0x00, 0xBC, 0xFF, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x00 + 0x00, 0x00, 0xBC, 0xFF, 0xFF, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x01 }; #endif /* DOINIT */ |