From 85735638c6dbf04bc666e86afbc0839f57e8a8c3 Mon Sep 17 00:00:00 2001
From: Adrian Thurston
Date: Sun, 27 Oct 2019 10:14:29 -0300
Subject: PCRE grammar in colm

---
 grammar/.gitignore |   2 +
 grammar/Makefile   |  16 +-
 grammar/pcre.lm    | 583 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 596 insertions(+), 5 deletions(-)
 create mode 100644 grammar/pcre.lm

(limited to 'grammar')

diff --git a/grammar/.gitignore b/grammar/.gitignore
index adcb030d..1edf212e 100644
--- a/grammar/.gitignore
+++ b/grammar/.gitignore
@@ -2,3 +2,5 @@
 /rust
 /pcre.c
 /pcre
+/pcre-colm.c
+/pcre-colm
diff --git a/grammar/Makefile b/grammar/Makefile
index f269524f..840c2599 100644
--- a/grammar/Makefile
+++ b/grammar/Makefile
@@ -1,8 +1,14 @@
-all: rust pcre
+all: rust pcre pcre-colm
 
-rust: rust.lm parserust.lm
-	../colm/colm -o rust parserust.lm
+RAGEL = ../ragel/ragel
+COLM = ../colm/colm
 
-pcre: pcre.rl
-	../ragel/ragel -G2 pcre.rl
+rust: rust.lm parserust.lm $(COLM)
+	$(COLM) -o rust parserust.lm
+
+pcre: pcre.rl $(RAGEL)
+	$(RAGEL) -G2 pcre.rl
 	gcc -g -Wall -o pcre pcre.c
+
+pcre-colm: pcre.lm $(COLM)
+	$(COLM) -o pcre-colm pcre.lm
diff --git a/grammar/pcre.lm b/grammar/pcre.lm
new file mode 100644
index 00000000..691b1307
--- /dev/null
+++ b/grammar/pcre.lm
@@ -0,0 +1,583 @@
+global Backrefs: int = 0
+
+lex
+	token pre_equals /'='/
+end
+
+token alpha_char
+	/ [a-zA-Z] /
+
+token digit_char
+	/ [0-9] /
+
+rl alpha_nums
+	/ (alpha_char | '_' ) (alpha_char | '_' | digit_char)* /
+
+rl alpha_numeric
+	/ 'a'..'z' | 'A'..'Z' | '0'..'9' /
+
+rl alpha_numerics
+	/ alpha_numeric+ /
+
+rl hex_digit
+	/ '0'..'9' | 'a'..'f' | 'A'..'F' /
+
+literal `| `^
+literal `. `? `+ `*
+literal `{ `}
+
+# It is important that these all go into the same lexical region, so we get a
+# longest match with no backtracking among these lexical options. We probably
+# need to separate mainline regex lexing from character class lexing, but for
+# now they share the same regions.
+lex
+	literal `[
+	token cc_open_caret /"[^"/
+	token cc_open_caret_close /"[^]"/
+	token cc_open_close /"[]"/
+end
+
+literal `]
+literal `( `)
+literal `< `>
+literal `, `: `- `_ `= `!
+literal `# `& `$
+
+token NL
+	/ '\r' ? '\n' /
+
+token number
+	/[0-9]+/
+
+# With greedy (default) or lazy (?), we are always attempting all matches. But
+# possessive (+) prunes paths, so it must force the pattern to become a
+# prefilter.
+def quantifier_type
+	[`+]
+|	[`?]
+|	[]
+
+def general_repetition
+	[`{ number `} ]
+|	[`{ number comma `} ]
+|	[`{ number comma number `} ]
+
+def quantifier
+	[`? quantifier_type] :Question
+|	[`* quantifier_type] :Star
+|	[`+ quantifier_type] :Plus
+|	[general_repetition quantifier_type] :General
+|	[] :Base
+
+token sr_R /'R'/
+token sr_P /'P'/
+
+def subroutine_reference
+	[`( `? sr_R `)]
+|	[`( `? number `)]
+|	[`( `? `+ number `)]
+|	[`( `? `- number `)]
+|	[`( `? `& name `)]
+|	[`( `? sr_P `> name `)]
+|	[br_g `< name `>]
+|	[br_g `< number `>]
+|	[br_g `< `+ number `>]
+|	[br_g `< `- number `>]
+|	[br_g single_quote name single_quote ]
+|	[br_g single_quote number single_quote]
+|	[br_g single_quote `+ number single_quote]
+|	[br_g single_quote `- number single_quote]
+
+token ns_open /'[[:'/
+
+lex
+	token ns_caret /'^'/
+	token ns_word /alpha_numerics/
+	token ns_close /':]]'/
+end
+
+def posix_named_set
+	[ns_open ns_caret? ns_word ns_close]
+
+token reset_start_match
+	/ '\\K' /
+
+def shared_atom
+	[decimal_digit] :DecimalDigit
+|	[not_decimal_digit] :NotDecimalDigit
+|	[horizontal_white_space] :HorizonalWhiteSpace
+|	[not_horizontal_white_space] :NotHorizontalWhiteSpace
+|	[not_new_line] :NotNewLine
+|	[new_line_sequence] :NewLineSequence
+|	[white_space] :WhiteSpace
+|	[not_white_space] :NotWhiteSpace
+|	[vertical_white_space] :VerticalWhiteSpace
+|	[not_vertical_white_space] :NotVerticalWhiteSpace
+|	[word_char] :WordChar
+|	[not_word_char] :NotWordChar
+|	[posix_named_set] :PosixNamedSet
+|	[char_with_property] :CharWithProperty
+|	[char_without_property] :CharWithoutProperty
+|	[control_char] :ControlChar
+
+def shared_literal
+	[octal] :Octal
+|	[alpha_char] :AlphaChar
+|	[digit_char] :DigitChar
+|	[bell_char] :BellChar
+|	[escape_char] :EscapeChar
+|	[form_feed] :FormFeed
+|	[new_line] :NewLine
+|	[carriage_ret] :CarriageRet
+|	[tab] :Tab
+|	[hex_char_fixed] :HexCharFixed
+|	[hex_char_var] :HexCharVar
+|	[quoted] :Quoted
+|	[block_quoted] :BlockQuoted
+|	[open_brace] :OpenBrace
+|	[close_brace] :CloseBrace
+|	[comma] :Comma
+|	[hyphen] :Hypen
+|	[less_than] :LessThan
+|	[greater_than] :GreaterThan
+|	[single_quote] :SingleQuote
+|	[underscore] :Underscore
+|	[colon] :Colon
+|	[hash] :Hash
+|	[equals] :Equals
+|	[exclamation] :Excalmation
+|	[ampersand] :Ampersand
+|	[other_char_printable] :OtherCharPrintable
+|	[other_char_non_printable] :OhterCharNonPrintable
+
+token name
+	/ alpha_nums /
+
+token bell_char / '\\a' /
+token escape_char / '\\e' /
+token form_feed / '\\f' /
+token new_line / '\\n' /
+token carriage_ret / '\\r' /
+token tab / '\\t' /
+token control_char
+	/ '\\c' ( 0x00 .. 0x7c ) /
+
+token underscore_alpha_numerics
+	/ ('_' | alpha_numeric)+ /
+
+rl non_alpha_numeric
+	/ ^alpha_numeric /
+
+token quoted
+	/'\\' non_alpha_numeric/
+
+token bs_Q
+	/'\\Q'/
+
+lex
+	# String of non-backslash chars. Or a single backslash.
+	token block_data / ( [^\\]+ ) | '\\' /
+	token block_end /'\\E'/
+end
+
+token block_quoted
+	/bs_Q block_data* block_end/
+
+def hyphen [ `- ]
+def less_than [ `< ]
+def greater_than [ `> ]
+def underscore [ `_ ]
+def colon [ `: ]
+def equals [ `= ]
+def exclamation [ `! ]
+def ampersand [ `& ]
+def hash [ `# ]
+def dollar [ `$ ]
+
+token single_quote
+	/ "'" /
+
+token other_char_printable
+	/ ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/' /
+
+token other_char_non_printable
+	/ ^( 0 .. 127 ) /
+
+token P / 'P' /
+
+def capture_form
+	[`? `< name `> regex] :NamedPerl
+|	[`? single_quote name single_quote regex] :NamedQuoted
+|	[`? P `< name `> regex] :NamedPython
+|	[regex] :Unamed
+
+def capture
+	# Each reduced capture group bumps the global Backrefs counter, which
+	# is_backref() reads to tell back references apart from octal escapes.
+	[`( capture_form `)] :Capture
+	{
+		Backrefs = Backrefs + 1
+	}
+
+def option_spec
+	[Add: option_flags `- Remove: option_flags]
+|	[Add: option_flags]
+|	[`- Remove: option_flags]
+
+def non_capture
+	[`( `? `: regex `)]
+|	[`( `? option_spec `: regex `)]
+|	[`( `? `| regex `)]
+|	[`( `? `> regex `)]
+
+token non_close_parens
+	/ [^)]+ /
+
+def comment
+	[ `( `? `# non_close_parens? `) ]
+
+def option
+	[`( `? option_spec `)]
+|	[`( `* no_start_opt `)]
+|	[`( `* utf8 `)]
+|	[`( `* utf16 `)]
+|	[`( `* ucp `)]
+
+def option_flags
+	[option_flag+]
+
+token option_flag / 'i' | 'J' | 'm' | 's' | 'U' | 'x' /
+
+token no_start_opt / 'NO_START_OPT' /
+token utf8 / 'UTF8' /
+token utf16 / 'UTF16' /
+token ucp / 'UCP' /
+
+def look_ahead
+	[`( `? `= regex `)]
+|	[`( `? `! regex `)]
+
+def look_behind
+	[`( `? `< `= regex `)]
+|	[`( `? `< `! regex `)]
+
+def look_around
+	[look_ahead]
+|	[look_behind]
+
+token br_g / '\\g' /
+token br_k / '\\k' /
+
+token maybe_backref / '\\' [1-9] [0-9]* /
+
+lex
+	token maybe_octal /
+		'\\' (
+			[1-3] [0-7] [0-7] |
+			[1-7] [0-7]
+		)
+	/
+
+	token def_octal /
+		'\\' (
+			[0] [0-7] [0-7] |
+			[0] [0-7] |
+			[0]
+		)
+	/
+end
+
+token else_digits / '\\' [0-9]+ /
+
+bool is_backref( Num: str )
+{
+	Num = suffix( Num, 1 )
+	Ref: int = atoi( Num )
+	if ( Ref < 8 || Ref <= Backrefs )
+		return true
+	return false
+}
+
+# Disambiguate octal escapes from back references: is_backref() accepts numbers
+# below 8 or not above the capture count so far; such octals are rejected here.
+def octal
+	[maybe_octal] :Maybe
+	{
+		if ( is_backref( $lhs.maybe_octal ) )
+			reject
+	}
+|	[def_octal] :Def
+
+def backref
+	[maybe_backref]
+	{
+		if ( !is_backref( $lhs.maybe_backref ) )
+			reject
+	}
+|	[br_g number]
+|	[br_g `{ number `}]
+|	[br_g `{ `- number `}]
+|	[br_k `< name `>]
+|	[br_k single_quote name single_quote]
+|	[br_g `{ name `}]
+|	[br_k `{ name `}]
+|	[`( `? P `= name `)]
+
+def literal_digits
+	[else_digits]
+
+def cond_ref
+	[number]
+|	[`+ number]
+|	[`- number]
+|	[`< name `>]
+|	[single_quote name single_quote]
+|	[cond_ref_R number]
+|	[cond_ref_R]
+|	[cond_ref_R `& name]
+|	[cond_ref_DEFINE]
+|	[cond_ref_assert]
+|	[name]
+
+token cond_ref_DEFINE / 'DEFINE' /
+token cond_ref_assert / 'assert' /
+token cond_ref_R / 'R' /
+
+def cond_false
+	[`| regex ]
+
+def conditional
+	[`( `? `( cond_ref `) regex cond_false? `)]
+
+token btc_accept / 'ACCEPT' /
+token btc_fail / 'F' ( 'AIL' )? /
+token btc_mark_name / ('MARK')? ':NAME' /
+token btc_commit / 'COMMIT' /
+token btc_prune / 'PRUNE' /
+token btc_prune_name / 'PRUNE:NAME' /
+token btc_skip / 'SKIP' /
+token btc_skip_name / 'SKIP:NAME' /
+token btc_then / 'THEN' /
+token btc_then_name / 'THEN:NAME' /
+
+def btc_type
+	[btc_accept]
+|	[btc_fail]
+|	[btc_mark_name]
+|	[btc_commit]
+|	[btc_prune]
+|	[btc_prune_name]
+|	[btc_skip]
+|	[btc_skip_name]
+|	[btc_then]
+|	[btc_then_name]
+
+def backtrack_control
+	[ `( `* btc_type `) ]
+
+token nlc_cr / 'CR' /
+token nlc_lf / 'LF' /
+token nlc_crlf / 'CRLF' /
+token nlc_anycrlf / 'ANYCRLF' /
+token nlc_any / 'ANY' /
+token nlc_bsr_anycrlf / 'BSR_ANYCRLF' /
+token nlc_bsr_unicodo / 'BSR_UNICODE' /
+
+def nlc_type
+	[nlc_cr]
+|	[nlc_lf]
+|	[nlc_crlf]
+|	[nlc_anycrlf]
+|	[nlc_any]
+|	[nlc_bsr_anycrlf]
+|	[nlc_bsr_unicodo]
+
+def newline_convention
+	[ `( `* nlc_type `) ]
+
+token callout_C / 'C' /
+
+def callout
+	[ `( `? callout_C `) ]
+|	[ `( `? callout_C number `) ]
+
+def char_class_start [ `[ ]
+def char_class_end [ `] ]
+def dot [ `. ]
+def caret [ `^ ]
+def question_mark [ `? ]
+def plus [ `+ ]
+def star [ `* ]
+def open_brace [ `{ ]
+def close_brace [ `} ]
+def comma [ `, ]
+def pipe [ `| ]
+def open_paren [ `( ]
+def close_paren [ `) ]
+
+lex
+	token hex_char_fixed
+		/ '\\x' hex_digit hex_digit /
+
+	token hex_char_var
+		/ '\\x' '{' hex_digit hex_digit hex_digit+ '}' /
+end
+
+#
+# Anchors
+#
+
+token word_boundary / '\\b' /
+token non_word_boundary / '\\B' /
+
+token sos_A
+	/ '\\A' /
+
+def start_of_subject
+	[`^]
+|	[sos_A]
+
+token eos_z / '\\z' /
+token eos_Z / '\\Z' /
+
+def end_of_subject
+	[`$]
+|	[eos_Z]
+|	[eos_z]
+
+token first_matching_pos
+	/ '\\G' /
+
+def anchor
+	[word_boundary]
+|	[non_word_boundary]
+|	[start_of_subject]
+|	[end_of_subject]
+|	[first_matching_pos]
+
+#
+# Character classes
+#
+
+def cc_atom_list
+	[cc_atom cc_atom*]
+
+def character_class
+	[`[ cc_atom_list `]]
+|	[cc_open_caret cc_atom_list `]]
+|	[cc_open_caret_close cc_atom* `]]
+|	[cc_open_close cc_atom* `]]
+|	[cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]]
+|	[cc_open_close hyphen cc_atom_end_range cc_atom* `]]
+
+def cc_atom_end_range
+	[cc_atom]
+
+def cc_atom
+	[cc_literal hyphen cc_literal]
+|	[shared_atom]
+|	[cc_literal]
+|	[octal]
+
+def cc_literal
+	[shared_literal]
+|	[dot]
+|	[char_class_start]
+|	[caret]
+|	[question_mark]
+|	[plus]
+|	[star]
+|	[word_boundary]
+|	[non_word_boundary]
+|	[dollar]
+|	[pipe]
+|	[open_paren]
+|	[close_paren]
+
+token decimal_digit / '\\d' /
+token not_decimal_digit / '\\D' /
+token horizontal_white_space / '\\h' /
+token not_horizontal_white_space / '\\H' /
+token not_new_line / '\\N' /
+token new_line_sequence / '\\R' /
+token white_space / '\\s' /
+token not_white_space / '\\S' /
+token vertical_white_space / '\\v' /
+token not_vertical_white_space / '\\V' /
+token word_char / '\\w' /
+token not_word_char / '\\W' /
+
+token one_data_unit / '\\C' /
+token extended_unicode_char / '\\X' /
+
+token with_property_open / '\\p' /
+token without_property_open / '\\P' /
+
+def char_with_property
+	[with_property_open `{ underscore_alpha_numerics `}]
+def char_without_property
+	[without_property_open `{ underscore_alpha_numerics `}]
+
+def atom
+	[shared_atom] :SharedAtom
+|	[shared_literal] :SharedLiteral
+|	[char_class_end] :CharClassEnd
+|	[dot] :Dot
+|	[character_class] :CharacterClass
+|	[capture] :Capture
+|	[non_capture] :NonCapture
+|	[anchor] :Anchor
+|	[look_around] :LookAround
+|	[option] :Option
+|	[newline_convention] :NewlineConvention
+|	[callout] :Callout
+|	[reset_start_match] :ResetStartMatch
+|	[one_data_unit] :OneDataUnit
+|	[extended_unicode_char] :ExtendedUnicodeChar
+|	[backtrack_control] :BacktrackControl
+|	[backref] :Backref
+|	[literal_digits] :LiteralDigits
+|	[subroutine_reference] :SubroutineReference
+|	[conditional] :Conditional
+|	[comment] :Comment
+
+def element
+	[atom quantifier] :Atom
+
+def term
+	[element term] :Element
+|	[] :Base
+
+def expr
+	[expr `| term] :Union
+|	[term] :Base
+
+def regex
+	[expr] :Expr
+
+def init
+	[]
+	{
+		Backrefs = 0
+	}
+
+token unparseable /[^\n]*/
+
+def line
+	[init regex NL] :Regex commit
+|	[unparseable NL] :Unparseable commit
+
+def file
+	[line*]
+
+
+parse F: file [stdin]
+
+if !F
+	print "parse error: [error]
+else {
+	for U: unparseable in F
+		print "unparseable: [U]
+	for B: backref in F
+		print "backref: [B]
+}
-- 
cgit v1.2.1