summaryrefslogtreecommitdiff
path: root/grammar
diff options
context:
space:
mode:
authorAdrian Thurston <thurston@colm.net>2019-10-27 10:14:29 -0300
committerAdrian Thurston <thurston@colm.net>2019-10-27 10:21:51 -0300
commit85735638c6dbf04bc666e86afbc0839f57e8a8c3 (patch)
tree2c45b4cf30a848aa73e452a36b9cbd9b85bb20f4 /grammar
parenta8ff0c82a3a26adf1547e8bef9919f29341849b1 (diff)
downloadcolm-85735638c6dbf04bc666e86afbc0839f57e8a8c3.tar.gz
PCRE grammar in colm
Diffstat (limited to 'grammar')
-rw-r--r--grammar/.gitignore2
-rw-r--r--grammar/Makefile16
-rw-r--r--grammar/pcre.lm583
3 files changed, 596 insertions, 5 deletions
diff --git a/grammar/.gitignore b/grammar/.gitignore
index adcb030d..1edf212e 100644
--- a/grammar/.gitignore
+++ b/grammar/.gitignore
@@ -2,3 +2,5 @@
/rust
/pcre.c
/pcre
+/pcre-colm.c
+/pcre-colm
diff --git a/grammar/Makefile b/grammar/Makefile
index f269524f..840c2599 100644
--- a/grammar/Makefile
+++ b/grammar/Makefile
@@ -1,8 +1,14 @@
-all: rust pcre
+all: rust pcre pcre-colm
-rust: rust.lm parserust.lm
- ../colm/colm -o rust parserust.lm
+RAGEL = ../ragel/ragel
+COLM = ../colm/colm
-pcre: pcre.rl
- ../ragel/ragel -G2 pcre.rl
+rust: rust.lm parserust.lm $(COLM) # rebuilt when either grammar source or the colm binary changes
+ $(COLM) -o rust parserust.lm
+
+pcre: pcre.rl $(RAGEL) # two steps: ragel on pcre.rl, then gcc on pcre.c (presumably ragel's output)
+ $(RAGEL) -G2 pcre.rl
 gcc -g -Wall -o pcre pcre.c
+
+pcre-colm: pcre.lm $(COLM) # depends on the colm binary too, matching the rust target
+ $(COLM) -o pcre-colm pcre.lm
diff --git a/grammar/pcre.lm b/grammar/pcre.lm
new file mode 100644
index 00000000..691b1307
--- /dev/null
+++ b/grammar/pcre.lm
@@ -0,0 +1,583 @@
+global Backrefs: int = 0 # running capture count: incremented by capture, reset by init, read by is_backref
+
+lex # pre_equals gets a lexical region of its own; no production in this file references it
+ token pre_equals /'='/
+end
+
+token alpha_char # one ASCII letter
+ / [a-zA-Z] /
+
+token digit_char # one decimal digit
+ / [0-9] /
+
+rl alpha_nums # identifier shape: letter or '_' first, then letters, '_' or digits (used by token name)
+ / (alpha_char | '_' ) (alpha_char | '_' | digit_char)* /
+
+rl alpha_numeric # one ASCII letter or digit
+ / 'a'..'z' | 'A'..'Z' | '0'..'9' /
+
+rl alpha_numerics # one or more alpha_numeric characters (used by ns_word)
+ / alpha_numeric+ /
+
+rl hex_digit # one hexadecimal digit, either case (used by the \x tokens)
+ / '0'..'9' | 'a'..'f' | 'A'..'F' /
+
+literal `| `^
+literal `. `? `+ `*
+literal `{ `}
+
+# It is important that these all go into the same lexical region, so that we
+# get a longest match with no backtracking among these lexical options. We
+# probably need to separate the mainline regex lexical region from the
+# character class one, but for now they share the same region.
+lex # the four spellings that can open a character class
+ literal `[
+ token cc_open_caret /"[^"/ # opener of a negated class
+ token cc_open_caret_close /"[^]"/ # negated class whose first member is a literal ']'
+ token cc_open_close /"[]"/ # class whose first member is a literal ']'
+end
+
+literal `]
+literal `( `)
+literal `< `>
+literal `, `: `- `_ `= `!
+literal `# `& `$
+
+token NL # line terminator: LF with an optional CR in front
+ / '\r' ? '\n' /
+
+token number # unsigned run of decimal digits
+ /[0-9]+/
+
+# With greedy (default) or lazy (?), we are always attempting all matches. But
+# possessive (+) prunes paths, so it must force the pattern to become a
+# prefilter.
+def quantifier_type # optional modifier written directly after a quantifier
+ [`+] # possessive
+| [`?] # lazy
+| [] # no modifier: greedy
+
+def general_repetition # brace-counted repetition
+ [`{ number `} ] # {n}
+| [`{ number comma `} ] # {n,}
+| [`{ number comma number `} ] # {n,m}
+
+def quantifier # repetition attached to an atom by element
+ [`? quantifier_type] :Question
+| [`* quantifier_type] :Star
+| [`+ quantifier_type] :Plus
+| [general_repetition quantifier_type] :General
+| [] :Base # atom with no quantifier at all
+
+token sr_R /'R'/ # the R in (?R)
+token sr_P /'P'/ # the P in (?P>name)
+
+def subroutine_reference # every accepted spelling of a subroutine call
+ [`( `? sr_R `)] # (?R)
+| [`( `? number `)] # (?n)
+| [`( `? `+ number `)] # (?+n)
+| [`( `? `- number `)] # (?-n)
+| [`( `? `& name `)] # (?&name)
+| [`( `? sr_P `> name `)] # (?P>name)
+| [br_g `< name `>] # \g<name>
+| [br_g `< number `>] # \g<n>
+| [br_g `< `+ number `>] # \g<+n>
+| [br_g `< `- number `>] # \g<-n>
+| [br_g single_quote name single_quote ] # \g'name'
+| [br_g single_quote number single_quote] # \g'n'
+| [br_g single_quote `+ number single_quote] # \g'+n'
+| [br_g single_quote `- number single_quote] # \g'-n'
+
+token ns_open /'[[:'/ # opening of a POSIX named set, including the enclosing class bracket
+
+lex # the pieces that may follow ns_open share one lexical region
+ token ns_caret /'^'/ # optional negation
+ token ns_word /alpha_numerics/ # the set name
+ token ns_close /':]]'/ # closing of the named set and of the enclosing class
+end
+
+def posix_named_set # [[:word:]] or [[:^word:]]
+ [ns_open ns_caret? ns_word ns_close]
+
+token reset_start_match # the \K escape
+ / '\\K' /
+
+def shared_atom # escapes and sets accepted both by atom (main regex) and by cc_atom (inside a class)
+ [decimal_digit] :DecimalDigit
+| [not_decimal_digit] :NotDecimalDigit
+| [horizontal_white_space] :HorizonalWhiteSpace # NOTE(review): label is spelled "Horizonal"; check for consumers before renaming
+| [not_horizontal_white_space] :NotHorizontalWhiteSpace
+| [not_new_line] :NotNewLine
+| [new_line_sequence] :NewLineSequence
+| [white_space] :WhiteSpace
+| [not_white_space] :NotWhiteSpace
+| [vertical_white_space] :VerticalWhiteSpace
+| [not_vertical_white_space] :NotVerticalWhiteSpace
+| [word_char] :WordChar
+| [not_word_char] :NotWordChar
+| [posix_named_set] :PosixNamedSet
+| [char_with_property] :CharWithProperty
+| [char_without_property] :CharWithoutProperty
+| [control_char] :ControlChar
+
+def shared_literal # single literal characters accepted both by atom and by cc_literal
+ [octal] :Octal
+| [alpha_char] :AlphaChar
+| [digit_char] :DigitChar
+| [bell_char] :BellChar
+| [escape_char] :EscapeChar
+| [form_feed] :FormFeed
+| [new_line] :NewLine
+| [carriage_ret] :CarriageRet
+| [tab] :Tab
+| [hex_char_fixed] :HexCharFixed
+| [hex_char_var] :HexCharVar
+| [quoted] :Quoted
+| [block_quoted] :BlockQuoted
+| [open_brace] :OpenBrace
+| [close_brace] :CloseBrace
+| [comma] :Comma
+| [hyphen] :Hypen # NOTE(review): label is spelled "Hypen"; check for consumers before renaming
+| [less_than] :LessThan
+| [greater_than] :GreaterThan
+| [single_quote] :SingleQuote
+| [underscore] :Underscore
+| [colon] :Colon
+| [hash] :Hash
+| [equals] :Equals
+| [exclamation] :Excalmation # NOTE(review): label is spelled "Excalmation"
+| [ampersand] :Ampersand
+| [other_char_printable] :OtherCharPrintable
+| [other_char_non_printable] :OhterCharNonPrintable # NOTE(review): label is spelled "Ohter"
+
+token name # group or reference name, shaped by the alpha_nums fragment
+ / alpha_nums /
+
+token bell_char / '\\a' / # each token below is a backslash plus one fixed letter
+token escape_char / '\\e' /
+token form_feed / '\\f' /
+token new_line / '\\n' /
+token carriage_ret / '\\r' /
+token tab / '\\t' /
+token control_char # \cx where x is any one 7-bit ASCII character (0x00 through 0x7f)
+ / '\\c' ( 0x00 .. 0x7f ) /
+
+token underscore_alpha_numerics # run of '_' and alphanumerics; used inside \p{...} and \P{...}
+ / ('_' | alpha_numeric)+ /
+
+rl non_alpha_numeric # any single character that is not an ASCII letter or digit
+ / ^alpha_numeric /
+
+token quoted # backslash followed by one non-alphanumeric character
+ /'\\' non_alpha_numeric/
+
+token bs_Q # the \Q that opens a block quote
+ /'\\Q'/
+
+lex
+ # A run of non-backslash characters, or a single backslash.
+ token block_data / ( [^\\]+ ) | '\\' /
+ token block_end /'\\E'/ # the \E that closes a block quote
+end
+
+token block_quoted # \Q, then any number of block_data pieces, then \E
+ /bs_Q block_data* block_end/
+
+def hyphen [ `- ] # this and the one-token defs below wrap a punctuation literal so productions can name it
+def less_than [ `< ]
+def greater_than [ `> ]
+def underscore [ `_ ]
+def colon [ `: ]
+def equals [ `= ]
+def exclamation [ `! ]
+def ampersand [ `& ]
+def hash [ `# ]
+def dollar [ `$ ]
+
+token single_quote # a lone apostrophe
+ / "'" /
+
+token other_char_printable # printable characters that have no dedicated literal or token in this grammar
+ / ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/' /
+
+token other_char_non_printable # any character whose value lies outside 0..127
+ / ^( 0 .. 127 ) /
+
+token P / 'P' / # the P in (?P<name>...) and (?P=name)
+
+def capture_form # what may appear between the parentheses of a capturing group
+ [`? `< name `> regex] :NamedPerl
+| [`? single_quote name single_quote regex] :NamedQuoted
+| [`? P `< name `> regex] :NamedPython
+| [regex] :Unamed
+
+def capture # a capturing group; reducing it bumps the global Backrefs counter
+ # NOTE(review): the "ID" below is not present in this production; the remark
+ # appears to describe the ragel implementation only.
+ [`( capture_form `)] :Capture
+ { # runs when this production is reduced
+ Backrefs = Backrefs + 1
+ }
+
+def option_spec # flags to add, flags to remove after '-', or both
+ [Add: option_flags `- Remove: option_flags]
+| [Add: option_flags]
+| [`- Remove: option_flags]
+
+def non_capture # groups that do not create a capture
+ [`( `? `: regex `)] # (?:...)
+| [`( `? option_spec `: regex `)] # (?flags:...)
+| [`( `? `| regex `)] # (?|...)
+| [`( `? `> regex `)] # (?>...)
+
+token non_close_parens # run of characters up to, but not including, the next ')'
+ / [^)]+ /
+
+def comment # inline comment group; its text is optional
+ [ `( `? `# non_close_parens? `) ]
+
+def option # option settings that stand alone as an atom
+ [`( `? option_spec `)] # (?flags)
+| [`( `* no_start_opt `)] # (*NO_START_OPT)
+| [`( `* utf8 `)] # (*UTF8)
+| [`( `* utf16 `)] # (*UTF16)
+| [`( `* ucp `)] # (*UCP)
+
+def option_flags # one or more single-letter flags
+ [option_flag+]
+
+token option_flag / 'i' | 'J' | 'm' | 's' | 'U' | 'x' / # the flag letters this grammar accepts
+
+token no_start_opt / 'NO_START_OPT' /
+token utf8 / 'UTF8' /
+token utf16 / 'UTF16' /
+token ucp / 'UCP' /
+
+def look_ahead
+ [`( `? `= regex `)] # (?=...)
+| [`( `? `! regex `)] # (?!...)
+
+def look_behind
+ [`( `? `< `= regex `)] # (?<=...)
+| [`( `? `< `! regex `)] # (?<!...)
+
+def look_around # either direction
+ [look_ahead]
+| [look_behind]
+
+token br_g / '\\g' / # prefix shared by backref and subroutine_reference
+token br_k / '\\k' / # prefix used by the named forms in backref
+
+token maybe_backref / '\\' [1-9] [0-9]* / # backslash plus a decimal number with no leading zero; def backref filters it through is_backref
+
+lex # maybe_octal: 2-3 octal digits with a non-zero first digit (def octal may reject it); def_octal: 1-3 digits starting with 0
+ token maybe_octal /
+ '\\' (
+ [1-3] [0-7] [0-7] |
+ [1-7] [0-7]
+ )
+ /
+
+ token def_octal /
+ '\\' (
+ [0] [0-7] [0-7] |
+ [0] [0-7] |
+ [0]
+ )
+ /
+end
+
+token else_digits / '\\' [0-9]+ / # any backslash-digits run; surfaces in the grammar as literal_digits
+
+bool is_backref( Num: str ) # true when backslash-digits text Num is to be read as a back reference
+{
+ Digits: str = suffix( Num, 1 ) # presumably the text after the leading backslash
+ Ref: int = atoi( Digits )
+ if ( Ref >= 8 && Ref > Backrefs ) # 8 or more, and not that many captures counted so far
+  return false
+ return true
+}
+
+# Simple disambiguation between octals and backrefs. Reject octals that can be
+# a backref, as determined by counting the number of captures.
+def octal
+ [maybe_octal] :Maybe
+ { # reduction action: give up this alternative when the text qualifies as a backref
+ if ( is_backref( $lhs.maybe_octal ) )
+ reject
+ }
+| [def_octal] :Def # leading-zero forms are accepted without the is_backref check
+
+def backref
+ [maybe_backref]
+ { # mirror image of octal: keep this alternative only when is_backref agrees
+ if ( !is_backref( $lhs.maybe_backref ) )
+ reject
+ }
+| [br_g number] # \gn
+| [br_g `{ number `}] # \g{n}
+| [br_g `{ `- number `}] # \g{-n}
+| [br_k `< name `>] # \k<name>
+| [br_k single_quote name single_quote] # \k'name'
+| [br_g `{ name `}] # \g{name}
+| [br_k `{ name `}] # \k{name}
+| [`( `? P `= name `)] # (?P=name)
+
+def literal_digits # backslash-digits text taken through the else_digits token
+ [else_digits]
+
+def cond_ref # the condition written inside the inner parentheses of a conditional group
+ [number]
+| [`+ number]
+| [`- number]
+| [`< name `>]
+| [single_quote name single_quote]
+| [cond_ref_R number]
+| [cond_ref_R]
+| [cond_ref_R `& name]
+| [cond_ref_DEFINE]
+| [cond_ref_assert]
+| [name]
+
+token cond_ref_DEFINE / 'DEFINE' /
+token cond_ref_assert / 'assert' /
+token cond_ref_R / 'R' /
+
+def cond_false # optional second branch, introduced by '|'
+ [`| regex ]
+
+def conditional # (?(cond)regex) or (?(cond)regex|regex)
+ [`( `? `( cond_ref `) regex cond_false? `)]
+
+token btc_accept / 'ACCEPT' / # verb names only; backtrack_control supplies the surrounding "(*" and ")"
+token btc_fail / 'F' ( 'AIL' )? / # F or FAIL
+token btc_mark_name / ('MARK')? ':NAME' / # MARK:NAME or :NAME; NAME is matched as that literal text
+token btc_commit / 'COMMIT' /
+token btc_prune / 'PRUNE' /
+token btc_prune_name / 'PRUNE:NAME' / # no ')' inside the token, consistent with SKIP:NAME and THEN:NAME
+token btc_skip / 'SKIP' /
+token btc_skip_name / 'SKIP:NAME' /
+token btc_then / 'THEN' /
+token btc_then_name / 'THEN:NAME' /
+
+def btc_type # any one of the btc_* verb tokens
+ [btc_accept]
+| [btc_fail]
+| [btc_mark_name]
+| [btc_commit]
+| [btc_prune]
+| [btc_prune_name]
+| [btc_skip]
+| [btc_skip_name]
+| [btc_then]
+| [btc_then_name]
+
+def backtrack_control # "(*" verb ")"
+ [ `( `* btc_type `) ]
+
+token nlc_cr / 'CR' / # keyword tokens for the "(*" keyword ")" form parsed by newline_convention
+token nlc_lf / 'LF' /
+token nlc_crlf / 'CRLF' /
+token nlc_anycrlf / 'ANYCRLF' /
+token nlc_any / 'ANY' /
+token nlc_bsr_anycrlf / 'BSR_ANYCRLF' /
+token nlc_bsr_unicodo / 'BSR_UNICODE' / # NOTE(review): token name ends in "unicodo" while the text is BSR_UNICODE; rename together with its use in nlc_type
+
+def nlc_type # any one of the nlc_* keyword tokens
+ [nlc_cr]
+| [nlc_lf]
+| [nlc_crlf]
+| [nlc_anycrlf]
+| [nlc_any]
+| [nlc_bsr_anycrlf]
+| [nlc_bsr_unicodo]
+
+def newline_convention # "(*" keyword ")"
+ [ `( `* nlc_type `) ]
+
+token callout_C / 'C' / # the C in (?C) and (?Cn)
+
+def callout
+ [ `( `? callout_C `) ] # (?C)
+| [ `( `? callout_C number `) ] # (?Cn)
+
+def char_class_start [ `[ ] # this and the one-token defs below give metacharacter literals a name for use in cc_literal, atom and shared_literal
+def char_class_end [ `] ]
+def dot [ `. ]
+def caret [ `^ ]
+def question_mark [ `? ]
+def plus [ `+ ]
+def star [ `* ]
+def open_brace [ `{ ]
+def close_brace [ `} ]
+def comma [ `, ]
+def pipe [ `| ]
+def open_paren [ `( ]
+def close_paren [ `) ]
+
+lex # both \x spellings share one lexical region
+ token hex_char_fixed # \x followed by exactly two hex digits
+ / '\\x' hex_digit hex_digit /
+
+ token hex_char_var # \x{...} with three or more hex digits; NOTE(review): \x{h} and \x{hh} are not matched, confirm that is intended
+ / '\\x' '{' hex_digit hex_digit hex_digit+ '}' /
+end
+
+#
+# Anchors
+#
+
+token word_boundary / '\\b' /
+token non_word_boundary / '\\B' /
+
+token sos_A # the \A escape
+ / '\\A' /
+
+def start_of_subject # '^' or \A
+ [`^]
+| [sos_A]
+
+token eos_z / '\\z' /
+token eos_Z / '\\Z' /
+
+def end_of_subject # '$', \Z or \z
+ [`$]
+| [eos_Z]
+| [eos_z]
+
+token first_matching_pos # the \G escape
+ / '\\G' /
+
+def anchor # every anchor form; used by atom
+ [word_boundary]
+| [non_word_boundary]
+| [start_of_subject]
+| [end_of_subject]
+| [first_matching_pos]
+
+#
+# Character classes
+#
+
+def cc_atom_list # one or more class members
+ [cc_atom cc_atom*]
+
+def character_class # the openers come from the shared lex region holding `[ and the cc_open_* tokens
+ [`[ cc_atom_list `]]
+| [cc_open_caret cc_atom_list `]]
+| [cc_open_caret_close cc_atom* `]] # the opener token already contains a literal ']' member
+| [cc_open_close cc_atom* `]] # same, for the non-negated opener
+| [cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]] # the opener's ']' followed by '-' and a range end
+| [cc_open_close hyphen cc_atom_end_range cc_atom* `]]
+
+def cc_atom_end_range # the member that follows the '-' in the two forms above
+ [cc_atom]
+
+def cc_atom # one class member
+ [cc_literal hyphen cc_literal] # range such as a-z
+| [shared_atom]
+| [cc_literal]
+| [octal]
+
+def cc_literal # a shared_literal, or a metacharacter or \b/\B escape accepted as a plain class member
+ [shared_literal]
+| [dot]
+| [char_class_start]
+| [caret]
+| [question_mark]
+| [plus]
+| [star]
+| [word_boundary]
+| [non_word_boundary]
+| [dollar]
+| [pipe]
+| [open_paren]
+| [close_paren]
+
+token decimal_digit / '\\d' / # each token below is a backslash plus one fixed letter; most feed shared_atom
+token not_decimal_digit / '\\D' /
+token horizontal_white_space / '\\h' /
+token not_horizontal_white_space / '\\H' /
+token not_new_line / '\\N' /
+token new_line_sequence / '\\R' /
+token white_space / '\\s' /
+token not_white_space / '\\S' /
+token vertical_white_space / '\\v' /
+token not_vertical_white_space / '\\V' /
+token word_char / '\\w' /
+token not_word_char / '\\W' /
+
+token one_data_unit / '\\C' / # used directly by atom
+token extended_unicode_char / '\\X' / # used directly by atom
+
+token with_property_open / '\\p' /
+token without_property_open / '\\P' /
+
+def char_with_property # \p{name}
+ [with_property_open `{ underscore_alpha_numerics `}]
+def char_without_property # \P{name}
+ [without_property_open `{ underscore_alpha_numerics `}]
+
+def atom # one unit of the main regex, before any quantifier is attached
+ [shared_atom] :SharedAtom
+| [shared_literal] :SharedLiteral
+| [char_class_end] :CharClassEnd # a ']' that appears outside any class
+| [dot] :Dot
+| [character_class] :CharacterClass
+| [capture] :Capture
+| [non_capture] :NonCapture
+| [anchor] :Anchor
+| [look_around] :LookAround
+| [option] :Option
+| [newline_convention] :NewlineConvention
+| [callout] :Callout
+| [reset_start_match] :ResetStartMatch
+| [one_data_unit] :OneDataUnit
+| [extended_unicode_char] :ExtendedUnicodeChar
+| [backtrack_control] :BacktrackControl
+| [backref] :Backref
+| [literal_digits] :LiteralDigits
+| [subroutine_reference] :SubroutineReference
+| [conditional] :Conditional
+| [comment] :Comment
+
+def element # an atom with its (possibly empty) quantifier
+ [atom quantifier] :Atom
+
+def term # right-recursive sequence of elements; may be empty
+ [element term] :Element
+| [] :Base
+
+def expr # left-recursive alternation of terms separated by '|'
+ [expr `| term] :Union
+| [term] :Base
+
+def regex # entry point referenced by groups, look-arounds, conditionals and line
+ [expr] :Expr
+
+def init # matches nothing; exists so its action can reset the capture count before each regex
+ []
+ {
+ Backrefs = 0
+ }
+
+token unparseable /[^\n]*/ # all remaining characters up to the end of the line
+
+def line # one input line; `commit` presumably fixes the parse of a finished line (confirm against the Colm docs)
+ [init regex NL] :Regex commit
+| [unparseable NL] :Unparseable commit
+
+def file # the whole input: zero or more lines
+ [line*]
+
+
+parse F: file [stdin] # parse standard input into the tree F
+
+if !F
+ print "parse error: [error]
+else { # report each unparseable line, then each backref found in the tree
+ for U: unparseable in F
+ print "unparseable: [U]
+ for B: backref in F
+ print "backref: [B]
+}