regcomp.sym


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256

# regcomp.sym
#
# File has two sections, divided by a line of dashes '-'. 
#
# Empty rows after #-comment are removed from input are ignored
#
# First section is for regops, second section is for regmatch-states
#
# Note that the order in this file is important.
#
# Format for first section: 
# NAME \s+ TYPE, arg-description [flags] [num-args] [longjump-len] ; DESCRIPTION
#
#
# run perl regen.pl after editing this file


#* Exit points

END         END,        no        ; End of program.
SUCCEED     END,        no        ; Return from a subroutine, basically.

#* Anchors:

BOL         BOL,        no        ; Match "" at beginning of line.
MBOL        BOL,        no        ; Same, assuming multiline.
SBOL        BOL,        no        ; Same, assuming singleline.
EOS         EOL,        no        ; Match "" at end of string.
EOL         EOL,        no        ; Match "" at end of line.
MEOL        EOL,        no        ; Same, assuming multiline.
SEOL        EOL,        no        ; Same, assuming singleline.
BOUND       BOUND,      no        ; Match "" at any word boundary using native charset semantics for non-utf8
BOUNDL      BOUND,      no        ; Match "" at any locale word boundary
BOUNDU      BOUND,      no        ; Match "" at any word boundary using Unicode semantics
BOUNDA      BOUND,      no         ; Match "" at any word boundary using ASCII semantics
# All NBOUND nodes are required by a line regexec.c to be greater than all BOUND ones
NBOUND      NBOUND,     no        ; Match "" at any word non-boundary using native charset semantics for non-utf8
NBOUNDL     NBOUND,     no        ; Match "" at any locale word non-boundary
NBOUNDU     NBOUND,     no        ; Match "" at any word non-boundary using Unicode semantics
NBOUNDA     NBOUND,     no        ; Match "" at any word non-boundary using ASCII semantics
GPOS        GPOS,       no        ; Matches where last m//g left off.

#* [Special] alternatives:

REG_ANY     REG_ANY,    no 0 S    ; Match any one character (except newline).
SANY        REG_ANY,    no 0 S    ; Match any one character.
CANY        REG_ANY,    no 0 S    ; Match any one byte.
ANYOF       ANYOF,      sv 0 S    ; Match character in (or not in) this class, single char match only
ANYOFV      ANYOF,      sv 0 V    ; Match character in (or not in) this class, can match-multiple chars
ALNUM       ALNUM,      no 0 S    ; Match any alphanumeric character using native charset semantics for non-utf8
ALNUML      ALNUM,      no 0 S    ; Match any alphanumeric char in locale
ALNUMU      ALNUM,      no 0 S    ; Match any alphanumeric char using Unicode semantics
ALNUMA      ALNUM,      no 0 S    ; Match [A-Za-z_0-9]
NALNUM      NALNUM,     no 0 S    ; Match any non-alphanumeric character using native charset semantics for non-utf8
NALNUML     NALNUM,     no 0 S    ; Match any non-alphanumeric char in locale
NALNUMU     NALNUM,     no 0 S    ; Match any non-alphanumeric char using Unicode semantics
NALNUMA     NALNUM,     no 0 S    ; Match [^A-Za-z_0-9]
SPACE       SPACE,      no 0 S    ; Match any whitespace character using native charset semantics for non-utf8
SPACEL      SPACE,      no 0 S    ; Match any whitespace char in locale
SPACEU      SPACE,      no 0 S    ; Match any whitespace char using Unicode semantics
SPACEA      SPACE,      no 0 S    ; Match [ \t\n\f\r]
NSPACE      NSPACE,     no 0 S    ; Match any non-whitespace character using native charset semantics for non-utf8
NSPACEL     NSPACE,     no 0 S    ; Match any non-whitespace char in locale
NSPACEU     NSPACE,     no 0 S    ; Match any non-whitespace char using Unicode semantics
NSPACEA     NSPACE,     no 0 S    ; Match [^ \t\n\f\r]
DIGIT       DIGIT,      no 0 S    ; Match any numeric character using native charset semantics for non-utf8
DIGITL      DIGIT,      no 0 S    ; Match any numeric character in locale
DIGITA      DIGIT,      no 0 S    ; Match [0-9]
NDIGIT      NDIGIT,     no 0 S    ; Match any non-numeric character using native charset semantics for non-utf8
NDIGITL     NDIGIT,     no 0 S    ; Match any non-numeric character in locale
NDIGITA     NDIGIT,     no 0 S    ; Match [^0-9]
CLUMP       CLUMP,      no 0 V    ; Match any extended grapheme cluster sequence

#* Alternation

# BRANCH        The set of branches constituting a single choice are hooked
#               together with their "next" pointers, since precedence prevents
#               anything being concatenated to any individual branch.  The
#               "next" pointer of the last BRANCH in a choice points to the
#               thing following the whole choice.  This is also where the
#               final "next" pointer of each individual branch points; each
#               branch starts with the operand node of a BRANCH node.
#
BRANCH      BRANCH,     node 0 V  ; Match this alternative, or the next...

#*Back pointer

# BACK          Normal "next" pointers all implicitly point forward; BACK
#               exists to make loop structures possible.
# not used
BACK        BACK,       no 0 V    ; Match "", "next" ptr points backward.

#*Literals

EXACT       EXACT,      str       ; Match this string (preceded by length).
EXACTF      EXACT,      str       ; Match this string, folded, native charset semantics for non-utf8 (prec. by length).
EXACTFL     EXACT,      str       ; Match this string, folded in locale (w/len).
EXACTFU     EXACT,      str	  ; Match this string, folded, Unicode semantics for non-utf8 (prec. by length).

#*Do nothing types

NOTHING     NOTHING,    no        ; Match empty string.
# A variant of above which delimits a group, thus stops optimizations
TAIL        NOTHING,    no        ; Match empty string. Can jump here from outside.

#*Loops

# STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
#               BRANCH structures using BACK.  Simple cases (one character
#               per match) are implemented with STAR and PLUS for speed
#               and to minimize recursive plunges.
#
STAR        STAR,       node 0 V  ; Match this (simple) thing 0 or more times.
PLUS        PLUS,       node 0 V  ; Match this (simple) thing 1 or more times.

CURLY       CURLY,      sv 2 V    ; Match this simple thing {n,m} times.
CURLYN      CURLY,      no 2 V    ; Capture next-after-this simple thing 
CURLYM      CURLY,      no 2 V    ; Capture this medium-complex thing {n,m} times. 
CURLYX      CURLY,      sv 2 V    ; Match this complex thing {n,m} times.

# This terminator creates a loop structure for CURLYX
WHILEM      WHILEM,     no 0 V    ; Do curly processing and see if rest matches.

#*Buffer related

# OPEN,CLOSE,GROUPP     ...are numbered at compile time.
OPEN        OPEN,       num 1     ; Mark this point in input as start of #n.
CLOSE       CLOSE,      num 1     ; Analogous to OPEN.

REF         REF,        num 1 V   ; Match some already matched string
REFF        REF,        num 1 V   ; Match already matched string, folded using native charset semantics for non-utf8
REFFL       REF,        num 1 V   ; Match already matched string, folded in loc.
# REFFU and NREFFU could have been implemented using the FLAGS field of the
# regnode, but by having a separate node type, we can use the existing switch
# statement to avoid some tests
REFFU       REF,        num 1 V   ; Match already matched string, folded using unicode semantics for non-utf8

#*Named references.  Code in regcomp.c assumes that these all are after the numbered references
NREF        REF,        no-sv 1 V ; Match some already matched string
NREFF       REF,        no-sv 1 V ; Match already matched string, folded using native charset semantics for non-utf8
NREFFL      REF,        no-sv 1 V ; Match already matched string, folded in loc.
NREFFU      REF,        num   1 V ; Match already matched string, folded using unicode semantics for non-utf8

IFMATCH     BRANCHJ,    off 1 . 2 ; Succeeds if the following matches.
UNLESSM     BRANCHJ,    off 1 . 2 ; Fails if the following matches.
SUSPEND     BRANCHJ,    off 1 V 1 ; "Independent" sub-RE.
IFTHEN      BRANCHJ,    off 1 V 1 ; Switch, should be preceded by switcher .
GROUPP      GROUPP,     num 1     ; Whether the group matched.

#*Support for long RE

LONGJMP     LONGJMP,    off 1 . 1 ; Jump far away.
BRANCHJ     BRANCHJ,    off 1 V 1 ; BRANCH with long offset.

#*The heavy worker

EVAL        EVAL,       evl 1     ; Execute some Perl code.

#*Modifiers

MINMOD      MINMOD,     no        ; Next operator is not greedy.
LOGICAL     LOGICAL,    no        ; Next opcode should set the flag only.

# This is not used yet
RENUM       BRANCHJ,    off 1 . 1 ; Group with independently numbered parens.

#*Trie Related

# Behave the same as A|LIST|OF|WORDS would. The '..C' variants have  
# inline charclass data (ascii only), the 'C' store it in the structure.
# NOTE: the relative order of the TRIE-like regops  is significant

TRIE        TRIE,       trie 1    ; Match many EXACT(F[LU]?)? at once. flags==type
TRIEC       TRIE,trie charclass   ; Same as TRIE, but with embedded charclass data

# For start classes, contains an added fail table.
AHOCORASICK     TRIE,   trie 1    ; Aho Corasick stclass. flags==type
AHOCORASICKC    TRIE,trie charclass   ; Same as AHOCORASICK, but with embedded charclass data

#*Regex Subroutines
GOSUB       GOSUB,      num/ofs 2L    ; recurse to paren arg1 at (signed) ofs arg2
GOSTART     GOSTART,    no        ; recurse to start of pattern

#*Special conditionals
NGROUPP     NGROUPP,    no-sv 1   ; Whether the group matched.            
INSUBP      INSUBP,     num 1     ; Whether we are in a specific recurse.  
DEFINEP     DEFINEP,    none 1    ; Never execute directly.               

#*Backtracking Verbs
ENDLIKE     ENDLIKE,    none      ; Used only for the type field of verbs
OPFAIL      ENDLIKE,    none      ; Same as (?!)
ACCEPT      ENDLIKE,    parno 1   ; Accepts the current matched string.


#*Verbs With Arguments
VERB        VERB,       no-sv 1   ; Used only for the type field of verbs
PRUNE       VERB,       no-sv 1   ; Pattern fails at this startpoint if no-backtracking through this 
MARKPOINT   VERB,       no-sv 1   ; Push the current location for rollback by cut.
SKIP        VERB,       no-sv 1   ; On failure skip forward (to the mark) before retrying
COMMIT      VERB,       no-sv 1   ; Pattern fails outright if backtracking through this
CUTGROUP    VERB,       no-sv 1   ; On failure go to the next alternation in the group

#*Control what to keep in $&.
KEEPS       KEEPS,      no        ; $& begins here.

#*New charclass like patterns
LNBREAK     LNBREAK,    none      ; generic newline pattern
VERTWS      VERTWS,     none 0 S  ; vertical whitespace         (Perl 6)
NVERTWS     NVERTWS,    none 0 S  ; not vertical whitespace     (Perl 6)
HORIZWS     HORIZWS,    none 0 S  ; horizontal whitespace       (Perl 6)
NHORIZWS    NHORIZWS,   none 0 S  ; not horizontal whitespace   (Perl 6)

FOLDCHAR    FOLDCHAR,   codepoint 1 ; codepoint with tricky case folding properties.


# NEW STUFF SOMEWHERE ABOVE THIS LINE

################################################################################

#*SPECIAL  REGOPS

# This is not really a node, but an optimized away piece of a "long" node.
# To simplify debugging output, we mark it as if it were a node
OPTIMIZED   NOTHING,    off       ; Placeholder for dump.

# Special opcode with the property that no opcode in a compiled program
# will ever be of this type. Thus it can be used as a flag value that
# no other opcode has been seen. END is used similarly, in that an END
# node cant be optimized. So END implies "unoptimizable" and PSEUDO mean
# "not seen anything to optimize yet".
PSEUDO      PSEUDO,     off       ; Pseudo opcode for internal use.

-------------------------------------------------------------------------------
# Format for second section:
# REGOP \t typelist [ \t typelist] [# Comment]
# typelist= namelist
#         = namelist:FAIL
#         = name:count

# Anything below is a state
#
#
TRIE            next:FAIL
EVAL            AB:FAIL
CURLYX          end:FAIL
WHILEM          A_pre,A_min,A_max,B_min,B_max:FAIL
BRANCH          next:FAIL
CURLYM          A,B:FAIL
IFMATCH         A:FAIL
CURLY           B_min_known,B_min,B_max:FAIL
COMMIT          next:FAIL
MARKPOINT       next:FAIL
SKIP            next:FAIL
CUTGROUP        next:FAIL
KEEPS           next:FAIL