summaryrefslogtreecommitdiff
path: root/grammar/pcre.lm
blob: 691b130747821745dcfac2088955b2d0a62fbb29 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
# Running count of capture groups reduced so far. Set back to 0 by `init`,
# incremented by the reduction action of `capture`, and read by is_backref()
# when deciding between a back reference and an octal escape.
global Backrefs: int = 0

# A '=' token declared in a lexical region of its own.
# NOTE(review): no production visible in this file refers to pre_equals --
# confirm whether it is still needed.
lex
	token pre_equals /'='/
end

# A single ASCII letter.
token alpha_char
	/ [a-zA-Z] /

# A single ASCII decimal digit.
token digit_char
	/ [0-9] /

# Identifier shape: a letter or underscore followed by any number of letters,
# underscores or digits. Used as the pattern of the `name` token.
rl alpha_nums
	/ (alpha_char | '_' ) (alpha_char | '_' | digit_char)* /

# One ASCII letter or digit.
rl alpha_numeric
	/ 'a'..'z' | 'A'..'Z' | '0'..'9' /

# One or more ASCII letters or digits.
rl alpha_numerics
	/ alpha_numeric+ /

# One hexadecimal digit, either case.
rl hex_digit
	/ '0'..'9' | 'a'..'f' | 'A'..'F' /

# Literal tokens for the regex operators: alternation, caret, dot, the three
# single-character quantifiers, and the braces used by {n,m} repetition.
literal `| `^
literal `. `? `+ `*
literal `{ `}

# The four ways a character class can open: "[", "[^", "[^]" and "[]".
#
# It is important that these all go into the same lexical region, so that we
# get a longest match with no backtracking among these lexical options. We
# probably need to separate the mainline regex lexical region from the
# character class one, but for now they are the same regions.
lex
	literal `[
	token cc_open_caret /"[^"/
	token cc_open_caret_close /"[^]"/
	token cc_open_close /"[]"/
end

# Literal tokens for the remaining punctuation used by the grammar: class
# close, grouping parentheses, angle brackets, and assorted single characters
# that appear in group syntax or as plain literals.
literal `] 
literal `( `)
literal `< `>
literal `, `: `- `_ `= `!
literal `# `& `$

# End of an input line: a line feed, optionally preceded by a carriage return.
token NL
	/ '\r' ? '\n' /

# A run of one or more decimal digits.
token number
	/[0-9]+/

# Optional suffix after a quantifier: `+` (possessive), `?` (lazy), or nothing
# (greedy, the default).
#
# With greedy (default) or lazy (?), we are always attempting all matches. But
# possessive (+) prunes paths, so it must force the pattern to become a
# prefilter.
def quantifier_type
	[`+]
|	[`?]
|	[]

# Brace repetition in its three forms: {n}, {n,} and {n,m}.
def general_repetition
	[`{ number `} ]
|	[`{ number comma `} ]
|	[`{ number comma number `} ]

# Quantifier following an atom: `?`, `*`, `+` or a brace repetition, each with
# an optional quantifier_type suffix. The empty alternative (Base) means the
# atom has no quantifier.
def quantifier
	[`? quantifier_type] :Question
|	[`* quantifier_type] :Star
|	[`+ quantifier_type] :Plus
|	[general_repetition quantifier_type] :General
|	[] :Base

# The letters R and P as they appear inside (?R) and (?P>name).
token sr_R /'R'/
token sr_P /'P'/

# Subroutine calls. The parenthesised forms are (?R), (?n), (?+n), (?-n),
# (?&name) and (?P>name). The remaining forms start with the br_g token
# (backslash-g) and wrap a name or a possibly signed number in either angle
# brackets or single quotes.
def subroutine_reference
	[`( `? sr_R `)]
|	[`( `? number `)]
|	[`( `? `+ number `)]
|	[`( `? `- number `)]
|	[`( `? `& name `)]
|	[`( `? sr_P `> name `)]
|	[br_g `< name `>]
|	[br_g `< number `>]
|	[br_g `< `+ number `>]
|	[br_g `< `- number `>]
|	[br_g single_quote name single_quote ]
|	[br_g single_quote number single_quote]
|	[br_g single_quote `+ number single_quote]
|	[br_g single_quote `- number single_quote]

# Opening delimiter of a named set, matched as the single token "[[:".
token ns_open /'[[:'/

# Tokens for the inside of a named set: optional negating caret, the set name
# (letters and digits), and the closing ":]]".
lex
	token ns_caret /'^'/
	token ns_word  /alpha_numerics/
	token ns_close /':]]'/
end

# A named set written as [[:word:]] or, negated, [[:^word:]].
def posix_named_set
	[ns_open ns_caret? ns_word ns_close]

# The backslash-K escape.
token reset_start_match
	/ '\\K' /

# Atoms that are accepted both in the main regex (via `atom`) and inside a
# character class (via `cc_atom`): the class-shorthand escapes, POSIX named
# sets, property escapes and control characters.
# NOTE(review): the label HorizonalWhiteSpace looks like a misspelling of
# HorizontalWhiteSpace; it is left as-is because code outside this file may
# refer to it.
def shared_atom
	[decimal_digit]               :DecimalDigit
|	[not_decimal_digit]           :NotDecimalDigit
|	[horizontal_white_space]      :HorizonalWhiteSpace
|	[not_horizontal_white_space]  :NotHorizontalWhiteSpace
|	[not_new_line]                :NotNewLine
|	[new_line_sequence]           :NewLineSequence
|	[white_space]                 :WhiteSpace
|	[not_white_space]             :NotWhiteSpace
|	[vertical_white_space]        :VerticalWhiteSpace
|	[not_vertical_white_space]    :NotVerticalWhiteSpace
|	[word_char]                   :WordChar
|	[not_word_char]               :NotWordChar
|	[posix_named_set]             :PosixNamedSet
|	[char_with_property]          :CharWithProperty
|	[char_without_property]       :CharWithoutProperty
|	[control_char]                :ControlChar

# Literal characters that are accepted both in the main regex (via `atom`)
# and inside a character class (via `cc_literal`): escapes that denote one
# character, quoted text, and punctuation that is only special in certain
# positions.
# NOTE(review): the labels Hypen, Excalmation and OhterCharNonPrintable look
# like misspellings; they are left as-is because code outside this file may
# refer to them.
def shared_literal
	[octal]                       :Octal
|	[alpha_char]                  :AlphaChar
|	[digit_char]                  :DigitChar
|	[bell_char]                   :BellChar
|	[escape_char]                 :EscapeChar
|	[form_feed]                   :FormFeed
|	[new_line]                    :NewLine
|	[carriage_ret]                :CarriageRet
|	[tab]                         :Tab
|	[hex_char_fixed]              :HexCharFixed
|	[hex_char_var]                :HexCharVar
|	[quoted]                      :Quoted
|	[block_quoted]                :BlockQuoted
|	[open_brace]                  :OpenBrace
|	[close_brace]                 :CloseBrace
|	[comma]                       :Comma
|	[hyphen]                      :Hypen
|	[less_than]                   :LessThan
|	[greater_than]                :GreaterThan
|	[single_quote]                :SingleQuote
|	[underscore]                  :Underscore
|	[colon]                       :Colon
|	[hash]                        :Hash
|	[equals]                      :Equals
|	[exclamation]                 :Excalmation
|	[ampersand]                   :Ampersand
|	[other_char_printable]        :OtherCharPrintable
|	[other_char_non_printable]    :OhterCharNonPrintable

# A group name: an identifier as described by alpha_nums.
token name
	/ alpha_nums /

# Two-character escapes: a backslash followed by a, e, f, n, r or t.
token bell_char    / '\\a' /
token escape_char  / '\\e' /
token form_feed    / '\\f' /
token new_line     / '\\n' /
token carriage_ret / '\\r' /
token tab          / '\\t' /
# Control-character escape: backslash, 'c', then any one ASCII character.
# The range runs through 0x7f so that every 7-bit value is accepted; ending
# it at 0x7c would reject '}' (0x7d), '~' (0x7e) and DEL (0x7f).
token control_char
	/ '\\c' ( 0x00 .. 0x7f ) /

# One or more letters, digits or underscores. Used for the property name in
# char_with_property and char_without_property.
token underscore_alpha_numerics
	/ ('_' | alpha_numeric)+ /

# Any single character that is not an ASCII letter or digit.
rl non_alpha_numeric
	/ ^alpha_numeric /

# A backslash followed by one non-alphanumeric character.
token quoted
	/'\\' non_alpha_numeric/

# Opening delimiter of a quoted block: backslash-Q.
token bs_Q
	/'\\Q'/

# Pieces of a quoted block, in their own lexical region.
lex
	# String of non-backslash chars. Or a single backslash.
	token block_data / ( [^\\]+ ) | '\\' /
	# Closing delimiter: backslash-E.
	token block_end /'\\E'/
end

# A whole quoted block as a single token: bs_Q, any number of block_data
# pieces, then block_end.
# NOTE(review): block_data can itself match a lone backslash, so the pattern
# does not obviously stop at the first backslash-E -- confirm how a line with
# more than one backslash-E is meant to be tokenised.
token block_quoted
	/bs_Q block_data* block_end/

# Non-terminal wrappers around single punctuation literals, so that the
# characters can be named as alternatives in shared_literal and cc_literal.
def hyphen        [ `- ]
def less_than     [ `< ]
def greater_than  [ `> ]
def underscore    [ `_ ]
def colon         [ `: ]
def equals        [ `= ]
def exclamation   [ `! ]
def ampersand     [ `& ]
def hash          [ `# ]
def dollar        [ `$ ]

# The single-quote character.
token single_quote
	/ "'" /

# Printable characters with no other token of their own: space, tilde,
# semicolon, at sign, percent, backtick, double quote and slash.
token other_char_printable
	/ ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/' /

# Any character outside the range 0..127.
token other_char_non_printable
	/ ^( 0 .. 127 ) /

# The letter P as used in (?P<name>...) and (?P=name).
token P / 'P' /

# Contents of a capturing group, between the parentheses: one of the three
# named forms (?<name>...), (?'name'...), (?P<name>...), or a plain regex for
# an unnamed group.
# NOTE(review): the label Unamed looks like a misspelling of Unnamed; it is
# left as-is because code outside this file may refer to it.
def capture_form
	[`? `< name `>  regex] :NamedPerl
|	[`? single_quote name single_quote regex] :NamedQuoted
|	[`? P `< name `> regex] :NamedPython
|	[regex] :Unamed
	
# A capturing group. The reduction action bumps the global Backrefs counter,
# so the count reflects groups that have been completely parsed.
def capture
	# This ID is for the ragel implementation. We use the nfa repetition
	# operator, which needs an id.
	# NOTE(review): the action below only counts captures; the "ID" mentioned
	# above is not visible in this file -- confirm the comment is still current.
	[`( capture_form `)] :Capture
	{
		Backrefs = Backrefs + 1
	}

# Inline option letters: flags to add, flags to remove after a `-`, or both
# (add first, then `-`, then remove).
def option_spec
	[Add: option_flags `-  Remove: option_flags]
|	[Add: option_flags]
|	[`- Remove: option_flags]

# Groups that do not capture: (?:...), (?flags:...), (?|...) and (?>...).
def non_capture
	[`( `? `: regex `)]
|	[`( `? option_spec `: regex `)]
|	[`( `? `| regex `)]
|	[`( `? `> regex `)]

# One or more characters, none of which is a closing parenthesis.
token non_close_parens
	/ [^)]+ /

# An inline comment: (?# followed by optional text and a closing parenthesis.
def comment
	[ `( `? `# non_close_parens? `) ]

# Option settings: either (?flags) using option_spec, or one of the
# (*KEYWORD) forms built from the keyword tokens declared below.
def option
	[`( `? option_spec `)]
|	[`( `* no_start_opt  `)]
|	[`( `* utf8 `)]
|	[`( `* utf16 `)]
|	[`( `* ucp `)]

# One or more option letters.
def option_flags
	[option_flag+]

# A single option letter: i, J, m, s, U or x.
token option_flag / 'i' | 'J' | 'm' | 's' | 'U' | 'x' /

# Keywords accepted after "(*" by `option`.
token no_start_opt / 'NO_START_OPT' /
token utf8  / 'UTF8' /
token utf16 / 'UTF16' /
token ucp   / 'UCP' /

# Look-ahead groups: (?=...) and (?!...).
def look_ahead
	[`( `? `= regex `)]
|	[`( `? `! regex `)]

# Look-behind groups: (?<=...) and (?<!...).
def look_behind
	[`( `? `< `= regex `)]
|	[`( `? `< `! regex `)]

# Either kind of look-around group.
def look_around
	[look_ahead]
|	[look_behind]

# Backslash-g and backslash-k, the prefixes of the explicit reference forms
# used in `backref` and `subroutine_reference`.
token br_g / '\\g' /
token br_k / '\\k' /

# A backslash followed by a decimal number that does not start with 0. Whether
# it really is a back reference is decided by the action in `backref`.
token maybe_backref / '\\' [1-9] [0-9]* /

# Backslash-digit sequences that can be octal escapes, in one lexical region.
lex
	# Two or three octal digits not starting with 0 (three-digit form limited
	# to a first digit of 1-3). May instead be a back reference; the action in
	# `octal` decides.
	token maybe_octal /
	   '\\' (
			[1-3] [0-7] [0-7] |
			[1-7] [0-7]
	   )
	/

	# One to three digits starting with 0 (the digits after the 0 are octal).
	# `octal` accepts this form without any check.
	token def_octal /
		'\\' (
			[0] [0-7] [0-7] |
			[0] [0-7] |
			[0]
		)
	/
end

# A backslash followed by any run of decimal digits. Used by literal_digits.
token else_digits / '\\' [0-9]+ /

# Decide whether the matched text of a backslash-number token should be read
# as a back reference. Num is the token text; its first character is dropped
# before conversion (presumably the leading backslash -- both callers pass
# tokens whose pattern begins with one). The result is true when the number is
# below 8 or does not exceed the current value of Backrefs.
bool is_backref( Num: str )
{
	Digits: str = suffix( Num, 1 )
	Ref: int = atoi( Digits )

	# Too large to be unconditional and more than the captures counted so
	# far: not a back reference.
	if ( Ref >= 8 && Ref > Backrefs )
		return false
	return true
}

# Simple disambiguation between octals and backrefs. Reject octals that can be
# a backref, as determined by counting the number of captures.
#
# Maybe: a maybe_octal token, rejected when is_backref() says the text is a
# back reference. Def: a def_octal token, always accepted.
def octal
	[maybe_octal] :Maybe
	{
		if ( is_backref( $lhs.maybe_octal ) )
			reject
	}
|	[def_octal] :Def

# Back references. The first alternative is the plain backslash-number form
# and is rejected unless is_backref() accepts the number. The remaining
# alternatives are the explicit forms: br_g (backslash-g) followed by a
# number, {number}, {-number} or {name}; br_k (backslash-k) followed by a
# name in angle brackets, single quotes or braces; and (?P=name).
def backref
	[maybe_backref]
	{
		if ( !is_backref( $lhs.maybe_backref ) )
			reject
	}
|	[br_g number]
|	[br_g `{ number `}]
|	[br_g `{ `- number `}]
|	[br_k `< name `>]
|	[br_k single_quote name single_quote]
|	[br_g `{ name `}]
|	[br_k `{ name `}]
|	[`( `? P `= name `)]

# A backslash-digits sequence taken as is. In `atom` this alternative is
# listed after `backref`.
def literal_digits
	[else_digits]

# The condition of a conditional group, i.e. what appears between the inner
# parentheses of (?(cond)...): a possibly signed group number, a name in angle
# brackets or single quotes, one of the R forms (R, R followed by a number,
# R&name), the keywords DEFINE or assert, or a bare name.
def cond_ref
	[number]
|	[`+ number]
|	[`- number]
|	[`< name `>]
|	[single_quote name single_quote]
|	[cond_ref_R number]
|	[cond_ref_R]
|	[cond_ref_R `& name]
|	[cond_ref_DEFINE]
|	[cond_ref_assert]
|	[name]

# Keywords recognised inside a condition.
token cond_ref_DEFINE   / 'DEFINE' /
token cond_ref_assert   / 'assert' /
token cond_ref_R        / 'R' /

# The optional "no" branch of a conditional group: `|` followed by a regex.
def cond_false
	[`| regex ]

# A conditional group: (?(cond)yes) or (?(cond)yes|no).
def conditional
	[`( `? `( cond_ref `) regex cond_false? `)]

# Keywords that may follow "(*" in a backtracking-control verb. The closing
# parenthesis is NOT part of any of these tokens: it is matched separately by
# backtrack_control, so a keyword pattern must never include it.
# NOTE(review): the ":NAME" variants match the literal text NAME rather than
# an arbitrary name -- confirm whether that is intended.
token btc_accept      / 'ACCEPT' /
token btc_fail        / 'F' ( 'AIL' )? /
token btc_mark_name   /  ('MARK')? ':NAME' /
token btc_commit      / 'COMMIT' /
token btc_prune       / 'PRUNE' /
token btc_prune_name  / 'PRUNE:NAME' /
token btc_skip        / 'SKIP' /
token btc_skip_name   / 'SKIP:NAME' /
token btc_then        / 'THEN' /
token btc_then_name   / 'THEN:NAME' /

# Any one of the backtracking-control keywords.
def btc_type
	[btc_accept]
|	[btc_fail]
|	[btc_mark_name]
|	[btc_commit]
|	[btc_prune]
|	[btc_prune_name]
|	[btc_skip]
|	[btc_skip_name]
|	[btc_then]
|	[btc_then_name]

# A backtracking-control verb: (* keyword ).
def backtrack_control
	[ `( `* btc_type `) ]

# Keywords that may follow "(*" in a newline-convention setting.
# NOTE(review): the token name nlc_bsr_unicodo looks like a misspelling of
# nlc_bsr_unicode (its pattern is 'BSR_UNICODE'); it is left as-is because
# code outside this file may refer to it.
token nlc_cr           / 'CR' /
token nlc_lf           / 'LF' /
token nlc_crlf         / 'CRLF' /
token nlc_anycrlf      / 'ANYCRLF' /
token nlc_any          / 'ANY' /
token nlc_bsr_anycrlf  / 'BSR_ANYCRLF' /
token nlc_bsr_unicodo  / 'BSR_UNICODE' /

# Any one of the newline-convention keywords.
def nlc_type
	[nlc_cr]
|	[nlc_lf]
|	[nlc_crlf]
|	[nlc_anycrlf]
|	[nlc_any]
|	[nlc_bsr_anycrlf]
|	[nlc_bsr_unicodo]

# A newline-convention setting: (* keyword ).
def newline_convention
	[ `( `* nlc_type `) ]

# The letter C as used in a callout.
token callout_C / 'C' /

# A callout: (?C) or (?C followed by a number and a closing parenthesis.
def callout
	[ `( `? callout_C `) ]
|	[ `( `? callout_C number `) ]

# Non-terminal wrappers around the metacharacter literals, so that each one
# can be named as an alternative where it stands for itself (for example in
# cc_literal, shared_literal and general_repetition).
def char_class_start [ `[ ]
def char_class_end   [ `] ]
def dot              [ `. ]
def caret            [ `^ ]
def question_mark    [ `? ]
def plus             [ `+ ]
def star             [ `* ]
def open_brace       [ `{ ]
def close_brace      [ `} ]
def comma            [ `, ]
def pipe             [ `| ]
def open_paren       [ `( ]
def close_paren      [ `) ]

# Hexadecimal character escapes, kept in one lexical region.
lex
	# Fixed form: backslash-x followed by exactly two hex digits.
	token hex_char_fixed
		/ '\\x' hex_digit hex_digit /

	# Braced form: backslash-x, '{', one or more hex digits, '}'. Any number
	# of digits is accepted so that short code points such as x{41} or x{A}
	# are recognised, not only those written with three or more digits.
	token hex_char_var
		/ '\\x' '{' hex_digit+ '}' /
end

#
# Anchors
#

# Backslash-b and backslash-B.
token word_boundary       / '\\b' /
token non_word_boundary   / '\\B' /

# Backslash-A.
token sos_A
	/ '\\A' /

# Start of subject: `^` or backslash-A.
def start_of_subject
	[`^]
|	[sos_A]

# Backslash-z and backslash-Z.
token eos_z / '\\z' /
token eos_Z / '\\Z' /

# End of subject: `$`, backslash-Z or backslash-z.
def end_of_subject
	[`$]
|	[eos_Z]
|	[eos_z]

# Backslash-G.
token first_matching_pos
	/ '\\G' /

# Any of the anchor forms declared above.
def anchor
	[word_boundary]
|	[non_word_boundary]
|	[start_of_subject]
|	[end_of_subject]
|	[first_matching_pos]

#
# Character classes
#

# A non-empty sequence of class atoms.
def cc_atom_list
	[cc_atom cc_atom*]

# A bracketed character class. The first two alternatives open with "[" or
# "[^" and require at least one atom. The next two open with "[^]" or "[]",
# where the `]` is already part of the opening token, so the atoms that follow
# are optional. The last two handle the same openers followed directly by a
# hyphen and one atom (cc_atom_end_range) before the remaining atoms.
def character_class
	[`[ cc_atom_list `]]
|	[cc_open_caret       cc_atom_list `]]
|	[cc_open_caret_close cc_atom* `]]
|	[cc_open_close       cc_atom* `]]
|	[cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]]
|	[cc_open_close       hyphen cc_atom_end_range cc_atom* `]]
 
# The atom that follows the hyphen in the last two character_class forms.
def cc_atom_end_range
	[cc_atom]

# One item inside a character class: a literal-hyphen-literal range, a shared
# atom, a single literal, or an octal escape. The range alternative is listed
# first.
# NOTE(review): `octal` is also reachable through cc_literal -> shared_literal;
# confirm the separate alternative here is intentional.
def cc_atom
	[cc_literal hyphen cc_literal]
|	[shared_atom]
|	[cc_literal]
|	[octal]

# A single literal inside a character class: any shared_literal, plus the
# metacharacters and the two word-boundary escapes listed here.
def cc_literal
	[shared_literal]
|	[dot]
|	[char_class_start]
|	[caret]
|	[question_mark]
|	[plus]
|	[star]
|	[word_boundary]
|	[non_word_boundary]
|	[dollar]
|	[pipe]
|	[open_paren]
|	[close_paren]

# Two-character class escapes: a backslash followed by one of
# d D h H N R s S v V w W. These are the alternatives of shared_atom.
token decimal_digit              / '\\d' /
token not_decimal_digit          / '\\D' /
token horizontal_white_space     / '\\h' /
token not_horizontal_white_space / '\\H' /
token not_new_line               / '\\N' /
token new_line_sequence          / '\\R' /
token white_space                / '\\s' /
token not_white_space            / '\\S' /
token vertical_white_space       / '\\v' /
token not_vertical_white_space   / '\\V' /
token word_char                  / '\\w' /
token not_word_char              / '\\W' /

# Backslash-C and backslash-X. Accepted only by `atom`, not by shared_atom.
token one_data_unit              / '\\C' /
token extended_unicode_char      / '\\X' /

# Backslash-p and backslash-P, the prefixes of the property escapes below.
token with_property_open         / '\\p' /
token without_property_open      / '\\P' /

# Property escapes: the prefix followed by a braced property name made of
# letters, digits and underscores.
def char_with_property
	[with_property_open `{ underscore_alpha_numerics `}]
def char_without_property
	[without_property_open `{ underscore_alpha_numerics `}]

# One regex atom, i.e. the thing a quantifier applies to. The alternatives
# cover class-like atoms, literals (including a bare `]` via char_class_end),
# character classes, every kind of group, anchors, option and newline
# settings, callouts, backtracking verbs, back references, subroutine calls,
# conditionals and comments. `backref` is listed before `literal_digits`.
def atom
	[shared_atom]           :SharedAtom
|	[shared_literal]        :SharedLiteral
|	[char_class_end]        :CharClassEnd
|	[dot]                   :Dot
|	[character_class]       :CharacterClass
|	[capture]               :Capture
|	[non_capture]           :NonCapture
|	[anchor]                :Anchor
|	[look_around]           :LookAround
|	[option]                :Option
|	[newline_convention]    :NewlineConvention
|	[callout]               :Callout
|	[reset_start_match]     :ResetStartMatch
|	[one_data_unit]         :OneDataUnit
|	[extended_unicode_char] :ExtendedUnicodeChar
|	[backtrack_control]     :BacktrackControl
|	[backref]               :Backref
|	[literal_digits]        :LiteralDigits
|	[subroutine_reference]  :SubroutineReference
|	[conditional]           :Conditional
|	[comment]               :Comment

# An atom together with its (possibly empty) quantifier.
def element
	[atom quantifier] :Atom

# A possibly empty sequence of elements, built right-recursively.
def term
	[element term] :Element
|	[] :Base

# One or more terms separated by `|`, built left-recursively.
def expr
	[expr `| term] :Union
|	[term] :Base

# A complete regular expression. Also used for the body of every group.
def regex
	[expr] :Expr

# Empty production placed at the start of each `line`. Its action resets the
# capture counter so that every input line starts counting from zero.
def init
	[]
	{
		Backrefs = 0
	}

# Everything up to, but not including, the next line feed (may be empty).
token unparseable /[^\n]*/

# One input line. Regex is listed first: reset the counter, parse a regex,
# then the line end. Unparseable is the second alternative and takes the raw
# text of the line. Both alternatives are marked `commit`.
def line
	[init regex NL] :Regex commit
|	[unparseable NL] :Unparseable commit

# The whole input: any number of lines.
def file
	[line*]


# Driver: parse standard input as a `file`.
parse F: file [stdin]

# On failure print the parser's error. Otherwise walk the tree and print every
# line that fell back to `unparseable`, followed by every back reference that
# was recognised.
if !F
	print "parse error: [error]
else {
	for U: unparseable in F
		print "unparseable: [U]
	for B: backref in F
		print "backref: [B]
}