summaryrefslogtreecommitdiff
path: root/regcomp.sym
blob: 1a2bd3101b2078228e98ea0ca083b4b87325ca0f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Format:
# NAME \t TYPE, arg-description [num-args] [longjump-len] \t DESCRIPTION

# Empty rows and #-comment rows are ignored.

# Note that the order in this file is important.
#
# Add new regops to the end, and do not re-order the existing ops.
#

# Exit points
END		END,    no	End of program.
SUCCEED		END,    no	Return from a subroutine, basically.

# Anchors:
BOL		BOL,    no	Match "" at beginning of line.
MBOL		BOL,    no	Same, assuming multiline.
SBOL		BOL,    no	Same, assuming singleline.
EOS		EOL,    no	Match "" at end of string.
EOL		EOL,    no	Match "" at end of line.
MEOL		EOL,    no	Same, assuming multiline.
SEOL		EOL,    no	Same, assuming singleline.
BOUND		BOUND,  no	Match "" at any word boundary
BOUNDL		BOUND,  no	Match "" at any word boundary
NBOUND		NBOUND, no	Match "" at any word non-boundary
NBOUNDL		NBOUND, no	Match "" at any word non-boundary
GPOS		GPOS,   no	Matches where last m//g left off.

# [Special] alternatives
REG_ANY		REG_ANY,    no	Match any one character (except newline).
SANY		REG_ANY,    no	Match any one character.
CANY		REG_ANY,    no	Match any one byte.
ANYOF		ANYOF,  sv	Match character in (or not in) this class.
ALNUM		ALNUM,  no	Match any alphanumeric character
ALNUML		ALNUM,  no	Match any alphanumeric char in locale
NALNUM		NALNUM, no	Match any non-alphanumeric character
NALNUML		NALNUM, no	Match any non-alphanumeric char in locale
SPACE		SPACE,  no	Match any whitespace character
SPACEL		SPACE,  no	Match any whitespace char in locale
NSPACE		NSPACE, no	Match any non-whitespace character
NSPACEL		NSPACE, no	Match any non-whitespace char in locale
DIGIT		DIGIT,  no	Match any numeric character
DIGITL		DIGIT,  no	Match any numeric character in locale
NDIGIT		NDIGIT, no	Match any non-numeric character
NDIGITL		NDIGIT, no	Match any non-numeric character in locale
CLUMP		CLUMP,  no	Match any combining character sequence

# BRANCH	The set of branches constituting a single choice are hooked
#		together with their "next" pointers, since precedence prevents
#		anything being concatenated to any individual branch.  The
#		"next" pointer of the last BRANCH in a choice points to the
#		thing following the whole choice.  This is also where the
#		final "next" pointer of each individual branch points; each
#		branch starts with the operand node of a BRANCH node.
#
BRANCH		BRANCH, node	Match this alternative, or the next...

# BACK		Normal "next" pointers all implicitly point forward; BACK
#		exists to make loop structures possible.
# not used
BACK		BACK,   no	Match "", "next" ptr points backward.

# Literals
EXACT		EXACT,  sv	Match this string (preceded by length).
EXACTF		EXACT,  sv	Match this string, folded (prec. by length).
EXACTFL		EXACT,  sv	Match this string, folded in locale (w/len).

# Do nothing
NOTHING		NOTHING,no	Match empty string.
# A variant of above which delimits a group, thus stops optimizations
TAIL		NOTHING,no	Match empty string. Can jump here from outside.

# STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
#		BRANCH structures using BACK.  Simple cases (one character
#		per match) are implemented with STAR and PLUS for speed
#		and to minimize recursive plunges.
#
STAR		STAR,   node	Match this (simple) thing 0 or more times.
PLUS		PLUS,   node	Match this (simple) thing 1 or more times.

CURLY		CURLY,  sv 2	Match this simple thing {n,m} times.
CURLYN		CURLY,  no 2	Match next-after-this simple thing 
#				{n,m} times, set parenths.
CURLYM		CURLY,  no 2	Match this medium-complex thing {n,m} times.
CURLYX		CURLY,  sv 2	Match this complex thing {n,m} times.

# This terminator creates a loop structure for CURLYX
WHILEM		WHILEM, no	Do curly processing and see if rest matches.

# OPEN,CLOSE,GROUPP	...are numbered at compile time.
OPEN		OPEN,   num 1	Mark this point in input as start of #n.
CLOSE		CLOSE,  num 1	Analogous to OPEN.

REF		REF,    num 1	Match some already matched string
REFF		REF,    num 1	Match already matched string, folded
REFFL		REF,    num 1	Match already matched string, folded in loc.

# grouping assertions
IFMATCH		BRANCHJ,off 1 2	Succeeds if the following matches.
UNLESSM		BRANCHJ,off 1 2	Fails if the following matches.
SUSPEND		BRANCHJ,off 1 1	"Independent" sub-RE.
IFTHEN		BRANCHJ,off 1 1	Switch, should be preceeded by switcher .
GROUPP		GROUPP, num 1	Whether the group matched.

# Support for long RE
LONGJMP		LONGJMP,off 1 1	Jump far away.
BRANCHJ		BRANCHJ,off 1 1	BRANCH with long offset.

# The heavy worker
EVAL		EVAL,   evl 1	Execute some Perl code.

# Modifiers
MINMOD		MINMOD, no	Next operator is not greedy.
LOGICAL		LOGICAL,no	Next opcode should set the flag only.

# This is not used yet
RENUM		BRANCHJ,off 1 1	Group with independently numbered parens.

# This is not really a node, but an optimized away piece of a "long" node.
# To simplify debugging output, we mark it as if it were a node
OPTIMIZED	NOTHING,off	Placeholder for dump.

# Trie Related (behave the same as A|LIST|OF|WORDS would)
TRIE		TRIE,   trie 1	Match many EXACT(FL?)? at once. flags==type
TRIEC		TRIE,   trie 1	Trie + charclass. (unused at present)

# Special opcode with the property that no opcode in a compiled program
# will ever be of this type. Thus it can be used as a flag value that
# no other opcode has been seen. END is used similarly, in that an END
# node cant be optimized. So END implies "unoptimizable" and PSEUDO mean
# "not seen anything to optimize yet".
PSEUDO		PSEUDO,off	Pseudo opcode for internal use.