# path: root/regcomp.sym
# blob: 4e5c1c1ab2213b78c4213ad068c514783be059dc
# Format (fields are separated by tabs; bracketed items are optional):
# NAME \t TYPE, arg-description [num-args] [longjump-len] \t DESCRIPTION

# Empty rows and #-comment rows are ignored.

# Exit points
END		END,    no	End of program.
SUCCEED		END,    no	Return from a subroutine, basically.

# Anchors:
BOL		BOL,    no	Match "" at beginning of line.
MBOL		BOL,    no	Same, assuming multiline.
SBOL		BOL,    no	Same, assuming singleline.
EOS		EOL,    no	Match "" at end of string.
EOL		EOL,    no	Match "" at end of line.
MEOL		EOL,    no	Same, assuming multiline.
SEOL		EOL,    no	Same, assuming singleline.
BOUND		BOUND,  no	Match "" at any word boundary
BOUNDUTF8	BOUND,  no	Match "" at any word boundary
BOUNDL		BOUND,  no	Match "" at any word boundary in locale
BOUNDLUTF8	BOUND,  no	Match "" at any word boundary in locale
NBOUND		NBOUND, no	Match "" at any word non-boundary
NBOUNDUTF8	NBOUND, no	Match "" at any word non-boundary
NBOUNDL		NBOUND, no	Match "" at any word non-boundary in locale
NBOUNDLUTF8	NBOUND, no	Match "" at any word non-boundary in locale
GPOS		GPOS,   no	Matches where last m//g left off.

# [Special] alternatives
REG_ANY		REG_ANY,    no	Match any one character (except newline).
ANYUTF8		REG_ANY,    no	Match any one Unicode character (except newline).
SANY		REG_ANY,    no	Match any one character.
SANYUTF8	REG_ANY,    no	Match any one Unicode character.
ANYOF		ANYOF,  sv	Match character in (or not in) this class.
ANYOFUTF8	ANYOF,  sv 1	Match character in (or not in) this class.
ALNUM		ALNUM,  no	Match any alphanumeric character
ALNUMUTF8	ALNUM,  no	Match any alphanumeric character
ALNUML		ALNUM,  no	Match any alphanumeric char in locale
ALNUMLUTF8	ALNUM,  no	Match any alphanumeric char in locale
NALNUM		NALNUM, no	Match any non-alphanumeric character
NALNUMUTF8	NALNUM, no	Match any non-alphanumeric character
NALNUML		NALNUM, no	Match any non-alphanumeric char in locale
NALNUMLUTF8	NALNUM, no	Match any non-alphanumeric char in locale
SPACE		SPACE,  no	Match any whitespace character
SPACEUTF8	SPACE,  no	Match any whitespace character
SPACEL		SPACE,  no	Match any whitespace char in locale
SPACELUTF8	SPACE,  no	Match any whitespace char in locale
NSPACE		NSPACE, no	Match any non-whitespace character
NSPACEUTF8	NSPACE, no	Match any non-whitespace character
NSPACEL		NSPACE, no	Match any non-whitespace char in locale
NSPACELUTF8	NSPACE, no	Match any non-whitespace char in locale
DIGIT		DIGIT,  no	Match any numeric character
DIGITUTF8	DIGIT,  no	Match any numeric character
DIGITL		DIGIT,  no	Match any numeric character in locale
DIGITLUTF8	DIGIT,  no	Match any numeric character in locale
NDIGIT		NDIGIT, no	Match any non-numeric character
NDIGITUTF8	NDIGIT, no	Match any non-numeric character
NDIGITL		NDIGIT, no	Match any non-numeric character in locale
NDIGITLUTF8	NDIGIT, no	Match any non-numeric character in locale
ALNUMC		ALNUMC,  no	Match any alphanumeric character
ALNUMCUTF8	ALNUMC,  no	Match any alphanumeric character
ALNUMCL		ALNUMC,  no	Match any alphanumeric character in locale
ALNUMCLUTF8	ALNUMC,  no	Match any alphanumeric character in locale
NALNUMC		NALNUMC, no	Match any non-alphanumeric character
NALNUMCUTF8	NALNUMC, no	Match any non-alphanumeric character
NALNUMCL	NALNUMC, no	Match any non-alphanumeric character in locale
NALNUMCLUTF8	NALNUMC, no	Match any non-alphanumeric character in locale
ALPHA		ALPHA,  no	Match any alphabetic character
ALPHAUTF8	ALPHA,  no	Match any alphabetic character
ALPHAL		ALPHA,  no	Match any alphabetic character in locale
ALPHALUTF8	ALPHA,  no	Match any alphabetic character in locale
NALPHA		NALPHA, no	Match any non-alphabetic character
NALPHAUTF8	NALPHA, no	Match any non-alphabetic character
NALPHAL		NALPHA, no	Match any non-alphabetic character in locale
NALPHALUTF8	NALPHA, no	Match any non-alphabetic character in locale
ASCII		ASCII,  no	Match any ASCII character
NASCII		NASCII, no	Match any non-ASCII character
CNTRL		CNTRL,  no	Match any control character
CNTRLUTF8	CNTRL,  no	Match any control character
CNTRLL		CNTRL,  no	Match any control character in locale
CNTRLLUTF8	CNTRL,  no	Match any control character in locale
NCNTRL		NCNTRL, no	Match any non-control character
NCNTRLUTF8	NCNTRL, no	Match any non-control character
NCNTRLL		NCNTRL, no	Match any non-control character in locale
NCNTRLLUTF8	NCNTRL, no	Match any non-control character in locale
GRAPH		GRAPH,  no	Match any graphical character
GRAPHUTF8	GRAPH,  no	Match any graphical character
GRAPHL		GRAPH,  no	Match any graphical character in locale
GRAPHLUTF8	GRAPH,  no	Match any graphical character in locale
NGRAPH		NGRAPH, no	Match any non-graphical character
NGRAPHUTF8	NGRAPH, no	Match any non-graphical character
NGRAPHL		NGRAPH, no	Match any non-graphical character in locale
NGRAPHLUTF8	NGRAPH, no	Match any non-graphical character in locale
LOWER		LOWER,  no	Match any lowercase character
LOWERUTF8	LOWER,  no	Match any lowercase character
LOWERL		LOWER,  no	Match any lowercase character in locale
LOWERLUTF8	LOWER,  no	Match any lowercase character in locale
NLOWER		NLOWER, no	Match any non-lowercase character
NLOWERUTF8	NLOWER, no	Match any non-lowercase character
NLOWERL		NLOWER, no	Match any non-lowercase character in locale
NLOWERLUTF8	NLOWER, no	Match any non-lowercase character in locale
PRINT		PRINT,  no	Match any printable character
PRINTUTF8	PRINT,  no	Match any printable character
PRINTL		PRINT,  no	Match any printable character in locale
PRINTLUTF8	PRINT,  no	Match any printable character in locale
NPRINT		NPRINT, no	Match any non-printable character
NPRINTUTF8	NPRINT, no	Match any non-printable character
NPRINTL		NPRINT, no	Match any non-printable character in locale
NPRINTLUTF8	NPRINT, no	Match any non-printable character in locale
PUNCT		PUNCT,  no	Match any punctuation character
PUNCTUTF8	PUNCT,  no	Match any punctuation character
PUNCTL		PUNCT,  no	Match any punctuation character in locale
PUNCTLUTF8	PUNCT,  no	Match any punctuation character in locale
NPUNCT		NPUNCT, no	Match any non-punctuation character
NPUNCTUTF8	NPUNCT, no	Match any non-punctuation character
NPUNCTL		NPUNCT, no	Match any non-punctuation character in locale
NPUNCTLUTF8	NPUNCT, no	Match any non-punctuation character in locale
UPPER		UPPER,  no	Match any uppercase character
UPPERUTF8	UPPER,  no	Match any uppercase character
UPPERL		UPPER,  no	Match any uppercase character in locale
UPPERLUTF8	UPPER,  no	Match any uppercase character in locale
NUPPER		NUPPER, no	Match any non-uppercase character
NUPPERUTF8	NUPPER, no	Match any non-uppercase character
NUPPERL		NUPPER, no	Match any non-uppercase character in locale
NUPPERLUTF8	NUPPER, no	Match any non-uppercase character in locale
XDIGIT		XDIGIT,  no	Match any hexdigit character
NXDIGIT		NXDIGIT, no	Match any non-hexdigit character
CLUMP		CLUMP,  no	Match any combining character sequence

# BRANCH	The set of branches constituting a single choice are hooked
#		together with their "next" pointers, since precedence prevents
#		anything being concatenated to any individual branch.  The
#		"next" pointer of the last BRANCH in a choice points to the
#		thing following the whole choice.  This is also where the
#		final "next" pointer of each individual branch points; each
#		branch starts with the operand node of a BRANCH node.
#
BRANCH		BRANCH, node	Match this alternative, or the next...

# BACK		Normal "next" pointers all implicitly point forward; BACK
#		exists to make loop structures possible.
# (BACK is currently not used.)
BACK		BACK,   no	Match "", "next" ptr points backward.

# Literals
EXACT		EXACT,  sv	Match this string (preceded by length).
EXACTF		EXACT,  sv	Match this string, folded (prec. by length).
EXACTFL		EXACT,  sv	Match this string, folded in locale (w/len).

# Do nothing
NOTHING		NOTHING,no	Match empty string.
# A variant of the above that delimits a group and thus stops optimizations.
TAIL		NOTHING,no	Match empty string. Can jump here from outside.

# STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
#		BRANCH structures using BACK.  Simple cases (one character
#		per match) are implemented with STAR and PLUS for speed
#		and to minimize recursive plunges.
#
STAR		STAR,   node	Match this (simple) thing 0 or more times.
PLUS		PLUS,   node	Match this (simple) thing 1 or more times.

CURLY		CURLY,  sv 2	Match this simple thing {n,m} times.
CURLYN		CURLY,  no 2	Match next-after-this simple thing 
#				{n,m} times, set parenths.
CURLYM		CURLY,  no 2	Match this medium-complex thing {n,m} times.
CURLYX		CURLY,  sv 2	Match this complex thing {n,m} times.

# This terminator creates a loop structure for CURLYX
WHILEM		WHILEM, no	Do curly processing and see if rest matches.

# OPEN,CLOSE,GROUPP	...are numbered at compile time.
OPEN		OPEN,   num 1	Mark this point in input as start of #n.
CLOSE		CLOSE,  num 1	Analogous to OPEN.

REF		REF,    num 1	Match some already matched string
REFF		REF,    num 1	Match already matched string, folded
REFFL		REF,    num 1	Match already matched string, folded in loc.

# grouping assertions
IFMATCH		BRANCHJ,off 1 2	Succeeds if the following matches.
UNLESSM		BRANCHJ,off 1 2	Fails if the following matches.
SUSPEND		BRANCHJ,off 1 1	"Independent" sub-RE.
IFTHEN		BRANCHJ,off 1 1	Switch, should be preceded by switcher.
GROUPP		GROUPP, num 1	Whether the group matched.

# Support for long RE
LONGJMP		LONGJMP,off 1 1	Jump far away.
BRANCHJ		BRANCHJ,off 1 1	BRANCH with long offset.

# The heavy worker
EVAL		EVAL,   evl 1	Execute some Perl code.

# Modifiers
MINMOD		MINMOD, no	Next operator is not greedy.
LOGICAL		LOGICAL,no	Next opcode should set the flag only.

# This is not used yet
RENUM		BRANCHJ,off 1 1	Group with independently numbered parens.

# This is not really a node, but an optimized-away piece of a "long" node.
# To simplify debugging output, we mark it as if it were a node.
OPTIMIZED	NOTHING,off	Placeholder for dump.