summaryrefslogtreecommitdiff
path: root/regcomp.sym
blob: 63e66e0725daf802f86756218dfd1afbdff2b454 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# regcomp.sym
#
# File has two sections, divided by a line of dashes '-'. 
#
# Rows that are empty after #-comments have been removed from the input are ignored
#
# First section is for regops, second section is for regmatch-states
#
# Note that the order in this file is important.
#
# Format for first section: 
# NAME \s+ TYPE, arg-description [num-args] [flags] [longjump-len] ; DESCRIPTION
#
#
# run perl regen.pl after editing this file



#* Exit points (0,1)

END         END,        no        ; End of program.
SUCCEED     END,        no        ; Return from a subroutine, basically.

#* Anchors: (2..13)

BOL         BOL,        no        ; Match "" at beginning of line.
MBOL        BOL,        no        ; Same, assuming multiline.
SBOL        BOL,        no        ; Same, assuming singleline.
EOS         EOL,        no        ; Match "" at end of string.
EOL         EOL,        no        ; Match "" at end of line.
MEOL        EOL,        no        ; Same, assuming multiline.
SEOL        EOL,        no        ; Same, assuming singleline.
BOUND       BOUND,      no        ; Match "" at any word boundary
BOUNDL      BOUND,      no        ; Match "" at any word boundary
NBOUND      NBOUND,     no        ; Match "" at any word non-boundary
NBOUNDL     NBOUND,     no        ; Match "" at any word non-boundary
GPOS        GPOS,       no        ; Matches where last m//g left off.

#* [Special] alternatives: (14..30)

REG_ANY     REG_ANY,    no 0 S    ; Match any one character (except newline).
SANY        REG_ANY,    no 0 S    ; Match any one character.
CANY        REG_ANY,    no 0 S    ; Match any one byte.
ANYOF       ANYOF,      sv 0 S    ; Match character in (or not in) this class, folding is native charset for non-utf8.
ALNUM       ALNUM,      no 0 S    ; Match any alphanumeric character
ALNUML      ALNUM,      no 0 S    ; Match any alphanumeric char in locale
NALNUM      NALNUM,     no 0 S    ; Match any non-alphanumeric character
NALNUML     NALNUM,     no 0 S    ; Match any non-alphanumeric char in locale
SPACE       SPACE,      no 0 S    ; Match any whitespace character
SPACEL      SPACE,      no 0 S    ; Match any whitespace char in locale
NSPACE      NSPACE,     no 0 S    ; Match any non-whitespace character
NSPACEL     NSPACE,     no 0 S    ; Match any non-whitespace char in locale
DIGIT       DIGIT,      no 0 S    ; Match any numeric character
DIGITL      DIGIT,      no        ; Match any numeric character in locale
NDIGIT      NDIGIT,     no 0 S    ; Match any non-numeric character
NDIGITL     NDIGIT,     no        ; Match any non-numeric character in locale
CLUMP       CLUMP,      no 0 V    ; Match any extended grapheme cluster sequence

#* Alternation (31)

# BRANCH        The set of branches constituting a single choice are hooked
#               together with their "next" pointers, since precedence prevents
#               anything being concatenated to any individual branch.  The
#               "next" pointer of the last BRANCH in a choice points to the
#               thing following the whole choice.  This is also where the
#               final "next" pointer of each individual branch points; each
#               branch starts with the operand node of a BRANCH node.
#
BRANCH      BRANCH,     node 0 V  ; Match this alternative, or the next...

#*Back pointer (32)

# BACK          Normal "next" pointers all implicitly point forward; BACK
#               exists to make loop structures possible.
# not used
BACK        BACK,       no 0 V    ; Match "", "next" ptr points backward.

#*Literals (33..35)

EXACT       EXACT,      str       ; Match this string (preceded by length).
EXACTF      EXACT,      str       ; Match this string, folded, native charset semantics for non-utf8 (prec. by length).
EXACTFL     EXACT,      str       ; Match this string, folded in locale (w/len).

#*Do nothing types (36..37)

NOTHING     NOTHING,    no        ; Match empty string.
# A variant of above which delimits a group, thus stops optimizations
TAIL        NOTHING,    no        ; Match empty string. Can jump here from outside.

#*Loops (38..44)

# STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
#               BRANCH structures using BACK.  Simple cases (one character
#               per match) are implemented with STAR and PLUS for speed
#               and to minimize recursive plunges.
#
STAR        STAR,       node 0 V  ; Match this (simple) thing 0 or more times.
PLUS        PLUS,       node 0 V  ; Match this (simple) thing 1 or more times.

CURLY       CURLY,      sv 2 V    ; Match this simple thing {n,m} times.
CURLYN      CURLY,      no 2 V    ; Capture next-after-this simple thing 
CURLYM      CURLY,      no 2 V    ; Capture this medium-complex thing {n,m} times. 
CURLYX      CURLY,      sv 2 V    ; Match this complex thing {n,m} times.

# This terminator creates a loop structure for CURLYX
WHILEM      WHILEM,     no 0 V    ; Do curly processing and see if rest matches.

#*Buffer related (45..49)

# OPEN,CLOSE,GROUPP     ...are numbered at compile time.
OPEN        OPEN,       num 1     ; Mark this point in input as start of #n.
CLOSE       CLOSE,      num 1     ; Analogous to OPEN.

REF         REF,        num 1 V   ; Match some already matched string
REFF        REF,        num 1 V   ; Match already matched string, folded using native charset semantics for non-utf8
REFFL       REF,        num 1 V   ; Match already matched string, folded in loc.


IFMATCH     BRANCHJ,    off 1 . 2 ; Succeeds if the following matches.
UNLESSM     BRANCHJ,    off 1 . 2 ; Fails if the following matches.
SUSPEND     BRANCHJ,    off 1 V 1 ; "Independent" sub-RE.
IFTHEN      BRANCHJ,    off 1 V 1 ; Switch, should be preceeded by switcher .
GROUPP      GROUPP,     num 1     ; Whether the group matched.

#*Support for long RE (55..56)

LONGJMP     LONGJMP,    off 1 . 1 ; Jump far away.
BRANCHJ     BRANCHJ,    off 1 V 1 ; BRANCH with long offset.

#*The heavy worker (57)

EVAL        EVAL,       evl 1     ; Execute some Perl code.

#*Modifiers (58..59)

MINMOD      MINMOD,     no        ; Next operator is not greedy.
LOGICAL     LOGICAL,    no        ; Next opcode should set the flag only.

# This is not used yet (60)
RENUM       BRANCHJ,    off 1 . 1 ; Group with independently numbered parens.

#*Trie Related (61..64)

# Behave the same as A|LIST|OF|WORDS would. The '..C' variants have  
# inline charclass data (ascii only), the 'C' store it in the structure.
# NOTE: the relative order of the TRIE-like regops is significant

TRIE        TRIE,       trie 1    ; Match many EXACT(FL?)? at once. flags==type
TRIEC       TRIE,trie charclass   ; Same as TRIE, but with embedded charclass data

# For start classes, contains an added fail table.
AHOCORASICK     TRIE,   trie 1    ; Aho Corasick stclass. flags==type
AHOCORASICKC    TRIE,trie charclass   ; Same as AHOCORASICK, but with embedded charclass data

#*Regex Subroutines (65..66) 
GOSUB       GOSUB,      num/ofs 2L    ; recurse to paren arg1 at (signed) ofs arg2
GOSTART     GOSTART,    no        ; recurse to start of pattern

#*Named references (67..69)
NREF        REF,        no-sv 1 V ; Match some already matched string
NREFF       REF,        no-sv 1 V ; Match already matched string, folded using native charset semantics for non-utf8
NREFFL      REF,        no-sv 1 V ; Match already matched string, folded in loc.


#*Special conditionals  (70..72)
NGROUPP     NGROUPP,    no-sv 1   ; Whether the group matched.            
INSUBP      INSUBP,     num 1     ; Whether we are in a specific recurse.  
DEFINEP     DEFINEP,    none 1    ; Never execute directly.               

#*Backtracking Verbs
ENDLIKE     ENDLIKE,    none      ; Used only for the type field of verbs
OPFAIL      ENDLIKE,    none      ; Same as (?!)
ACCEPT      ENDLIKE,    parno 1   ; Accepts the current matched string.


#*Verbs With Arguments
VERB        VERB,       no-sv 1   ; Used only for the type field of verbs
PRUNE       VERB,       no-sv 1   ; Pattern fails at this startpoint if no-backtracking through this 
MARKPOINT   VERB,       no-sv 1   ; Push the current location for rollback by cut.
SKIP        VERB,       no-sv 1   ; On failure skip forward (to the mark) before retrying
COMMIT      VERB,       no-sv 1   ; Pattern fails outright if backtracking through this
CUTGROUP    VERB,       no-sv 1   ; On failure go to the next alternation in the group

#*Control what to keep in $&.
KEEPS       KEEPS,      no        ; $& begins here.

#*New charclass like patterns
LNBREAK     LNBREAK,    none      ; generic newline pattern
VERTWS      VERTWS,     none 0 S  ; vertical whitespace         (Perl 6)
NVERTWS     NVERTWS,    none 0 S  ; not vertical whitespace     (Perl 6)
HORIZWS     HORIZWS,    none 0 S  ; horizontal whitespace       (Perl 6)
NHORIZWS    NHORIZWS,   none 0 S  ; not horizontal whitespace   (Perl 6)

FOLDCHAR    FOLDCHAR,   codepoint 1 ; codepoint with tricky case folding properties.

# NEW STUFF ABOVE THIS LINE  

################################################################################

#*SPECIAL  REGOPS

# This is not really a node, but an optimized away piece of a "long" node.
# To simplify debugging output, we mark it as if it were a node
OPTIMIZED   NOTHING,    off       ; Placeholder for dump.

# Special opcode with the property that no opcode in a compiled program
# will ever be of this type. Thus it can be used as a flag value that
# no other opcode has been seen. END is used similarly, in that an END
# node can't be optimized. So END implies "unoptimizable" and PSEUDO means
# "not seen anything to optimize yet".
PSEUDO      PSEUDO,     off       ; Pseudo opcode for internal use.

-------------------------------------------------------------------------------
# Format for second section:
# REGOP \t typelist [ \t typelist] [# Comment]
# typelist= namelist
#         = namelist:FAIL
#         = name:count

# Anything below is a state
#
#
TRIE            next:FAIL
EVAL            AB:FAIL
CURLYX          end:FAIL
WHILEM          A_pre,A_min,A_max,B_min,B_max:FAIL
BRANCH          next:FAIL
CURLYM          A,B:FAIL
IFMATCH         A:FAIL
CURLY           B_min_known,B_min,B_max:FAIL
COMMIT          next:FAIL
MARKPOINT       next:FAIL
SKIP            next:FAIL
CUTGROUP        next:FAIL
KEEPS           next:FAIL