# Running count of capture groups seen on the current line. It is reset to 0
# by init, incremented by the action on capture, and read by is_backref.
global Backrefs: int = 0

# Lexical region holding a single token that matches '='.
# NOTE(review): pre_equals is not referenced by any production visible in this
# file -- confirm whether it is still needed.
lex
	token pre_equals /'='/
end

# One ASCII letter. Also usable as a grammar terminal (see shared_literal).
token alpha_char
	/ 'a'..'z' | 'A'..'Z' /

# One ASCII decimal digit. Also usable as a grammar terminal.
token digit_char
	/ '0'..'9' /

# Identifier shape: a letter or underscore, then any number of letters,
# underscores or digits.
rl alpha_nums
	/ ( '_' | alpha_char ) ( '_' | alpha_char | digit_char )* /

# One ASCII letter or digit.
rl alpha_numeric
	/ [a-zA-Z0-9] /

# One or more ASCII letters or digits.
rl alpha_numerics
	/ alpha_numeric alpha_numeric* /

# One hexadecimal digit, either case.
rl hex_digit
	/ [0-9a-fA-F] /

# Literal tokens for the regex operators: alternation and caret,
literal `| `^
# dot and the three quantifier characters,
literal `. `? `+ `*
# and the braces used by general_repetition and the brace-delimited escapes.
literal `{ `}

# It is important that these all go into the same lexical region, so we get a
# longest-match with no backtracking among these lexical options. We probably
# need to separate the mainline regex lexical region from the character class
# lexical region, but for now they are the same region.
lex
	# Plain character class opener.
	literal `[
	# The two characters "[^".
	token cc_open_caret /"[^"/
	# The three characters "[^]". character_class follows this opener with
	# further atoms and a separate closing bracket.
	token cc_open_caret_close /"[^]"/
	# The two characters "[]". As above, character_class still requires a
	# closing bracket after it.
	token cc_open_close /"[]"/
end

# Remaining punctuation literals referenced by the productions in this file:
# the class closer, parentheses, angle brackets and assorted single characters.
literal `] 
literal `( `)
literal `< `>
literal `, `: `- `_ `= `!
literal `# `& `$

# Line terminator: a line feed, optionally preceded by a carriage return.
token NL
	/ '\n' | '\r' '\n' /

# Unsigned decimal integer: one or more digits.
token number
	/ [0-9] [0-9]* /

# With greedy (default) or lazy (?), we are always attempting all matches. But
# possessive (+) prunes paths, so it must force the pattern to become a
# prefilter.
#
# Optional modifier that may follow a quantifier: a plus, a question mark, or
# nothing at all.
def quantifier_type
	[`+]
|	[`?]
|	[]

# Brace repetition in its three shapes: {n}, {n,} and {n,m}. The comma here is
# the single-literal production named comma, not the raw literal.
def general_repetition
	[`{ number `} ]
|	[`{ number comma `} ]
|	[`{ number comma number `} ]

# Quantifier attached to an atom (see element): ?, *, + or a brace repetition,
# each followed by a quantifier_type. The empty Base alternative means the atom
# carries no quantifier.
def quantifier
	[`? quantifier_type] :Question
|	[`* quantifier_type] :Star
|	[`+ quantifier_type] :Plus
|	[general_repetition quantifier_type] :General
|	[] :Base

# Single-letter tokens used only inside the parenthesised subroutine forms.
token sr_R /'R'/
token sr_P /'P'/

# Subroutine references. The alternatives, in order:
#   (?R)  (?n)  (?+n)  (?-n)  (?&name)  (?P>name)
#   \g<name>  \g<n>  \g<+n>  \g<-n>
#   \g'name'  \g'n'  \g'+n'  \g'-n'
# where br_g is the \g token.
def subroutine_reference
	[`( `? sr_R `)]
|	[`( `? number `)]
|	[`( `? `+ number `)]
|	[`( `? `- number `)]
|	[`( `? `& name `)]
|	[`( `? sr_P `> name `)]
|	[br_g `< name `>]
|	[br_g `< number `>]
|	[br_g `< `+ number `>]
|	[br_g `< `- number `>]
|	[br_g single_quote name single_quote ]
|	[br_g single_quote number single_quote]
|	[br_g single_quote `+ number single_quote]
|	[br_g single_quote `- number single_quote]

# Opening delimiter of a POSIX named set, the three characters "[[:".
token ns_open /'[[:'/

# Tokens that can appear between the delimiters, kept in one lexical region:
# an optional caret, the set name (letters/digits), and the closing ":]]".
lex
	token ns_caret /'^'/
	token ns_word  /alpha_numerics/
	token ns_close /':]]'/
end

# A whole named set such as [[:alpha:]] or [[:^alpha:]], parsed as one unit
# including both the outer and inner brackets.
def posix_named_set
	[ns_open ns_caret? ns_word ns_close]

# The two-character escape: backslash followed by K. Used as an alternative of
# atom (ResetStartMatch).
token reset_start_match
	/ '\\K' /

# Atoms that are accepted both at the top level of a regex (atom) and inside a
# character class (cc_atom): the backslash class escapes, POSIX named sets,
# property escapes and control characters.
# NOTE(review): the label HorizonalWhiteSpace is misspelled (missing 't'). It
# is left as is because labels may be referenced by code outside this file.
def shared_atom
	[decimal_digit]               :DecimalDigit
|	[not_decimal_digit]           :NotDecimalDigit
|	[horizontal_white_space]      :HorizonalWhiteSpace
|	[not_horizontal_white_space]  :NotHorizontalWhiteSpace
|	[not_new_line]                :NotNewLine
|	[new_line_sequence]           :NewLineSequence
|	[white_space]                 :WhiteSpace
|	[not_white_space]             :NotWhiteSpace
|	[vertical_white_space]        :VerticalWhiteSpace
|	[not_vertical_white_space]    :NotVerticalWhiteSpace
|	[word_char]                   :WordChar
|	[not_word_char]               :NotWordChar
|	[posix_named_set]             :PosixNamedSet
|	[char_with_property]          :CharWithProperty
|	[char_without_property]       :CharWithoutProperty
|	[control_char]                :ControlChar

# Items that stand for a single literal character. Used directly by atom and,
# through cc_literal, inside character classes.
# NOTE(review): the labels Hypen, Excalmation and OhterCharNonPrintable are
# misspelled. They are left as is because labels may be referenced by code
# outside this file.
def shared_literal
	[octal]                       :Octal
|	[alpha_char]                  :AlphaChar
|	[digit_char]                  :DigitChar
|	[bell_char]                   :BellChar
|	[escape_char]                 :EscapeChar
|	[form_feed]                   :FormFeed
|	[new_line]                    :NewLine
|	[carriage_ret]                :CarriageRet
|	[tab]                         :Tab
|	[hex_char_fixed]              :HexCharFixed
|	[hex_char_var]                :HexCharVar
|	[quoted]                      :Quoted
|	[block_quoted]                :BlockQuoted
|	[open_brace]                  :OpenBrace
|	[close_brace]                 :CloseBrace
|	[comma]                       :Comma
|	[hyphen]                      :Hypen
|	[less_than]                   :LessThan
|	[greater_than]                :GreaterThan
|	[single_quote]                :SingleQuote
|	[underscore]                  :Underscore
|	[colon]                       :Colon
|	[hash]                        :Hash
|	[equals]                      :Equals
|	[exclamation]                 :Excalmation
|	[ampersand]                   :Ampersand
|	[other_char_printable]        :OtherCharPrintable
|	[other_char_non_printable]    :OhterCharNonPrintable

# A group or subroutine name: the alpha_nums identifier shape (letter or
# underscore first, then letters, underscores or digits).
token name
	/ alpha_nums /

# Two-character escapes, each a backslash followed by one letter. All six are
# alternatives of shared_literal.
token bell_char    / '\\a' /
token escape_char  / '\\e' /
token form_feed    / '\\f' /
token new_line     / '\\n' /
token carriage_ret / '\\r' /
token tab          / '\\t' /
# Backslash, 'c', then one operand character: the control-character escape.
# PCRE accepts any 7-bit ASCII character as the operand, so the range runs
# through 0x7f. It previously stopped at 0x7c, which left out '}', '~' and DEL.
token control_char
	/ '\\c' ( 0x00 .. 0x7f ) /

# One or more characters, each a letter, digit or underscore. Used as the
# property name inside the braces of the property escapes.
token underscore_alpha_numerics
	/ ( alpha_numeric | '_' ) ( alpha_numeric | '_' )* /

# Any single character that is not an ASCII letter or digit.
rl non_alpha_numeric
	/ ^( alpha_numeric ) /

# A backslash followed by one non-alphanumeric character.
token quoted
	/ '\\' ( ^alpha_numeric ) /

# Opening delimiter of a quoted block: backslash followed by Q.
token bs_Q
	/'\\Q'/

# Pieces of a quoted block's body and its terminator. In the visible source
# these are referenced only from the block_quoted pattern below.
lex
	# String of non-backslash chars. Or a single backslash.
	token block_data / ( [^\\]+ ) | '\\' /
	# Closing delimiter: backslash followed by E.
	token block_end /'\\E'/
end

# A whole quoted block matched as a single token: the opener, any amount of
# block data, then the closer.
# NOTE(review): block_data can itself match a backslash and the letter E (and
# newlines), so under longest-match this pattern looks like it runs to the
# LAST closer available rather than the first -- confirm against the scanner.
token block_quoted
	/bs_Q block_data* block_end/

# Single-literal productions that give punctuation characters a nonterminal
# name. All except dollar are alternatives of shared_literal; dollar is an
# alternative of cc_literal.
def hyphen        [ `- ]
def less_than     [ `< ]
def greater_than  [ `> ]
def underscore    [ `_ ]
def colon         [ `: ]
def equals        [ `= ]
def exclamation   [ `! ]
def ampersand     [ `& ]
def hash          [ `# ]
def dollar        [ `$ ]

# A single apostrophe. Delimits names and numbers in several quoted forms
# and is also a shared_literal alternative.
token single_quote
	/ "'" /

# Printable ASCII characters that have no token or literal of their own
# elsewhere in this file.
token other_char_printable /
	' ' | '"' | '%' | '/' |
	';' | '@' | '`' | '~'
/

# Any single character outside the 7-bit range 0..127.
token other_char_non_printable
	/ ^( 0x00 .. 0x7f ) /

# The letter P, used by the (?P<name>...) capture form and by the (?P=name)
# backref form.
token P / 'P' /

# What may sit between the parentheses of a capturing group:
#   ?<name>regex    NamedPerl
#   ?'name'regex    NamedQuoted
#   ?P<name>regex   NamedPython
#   regex           plain, unnamed group
# NOTE(review): the label Unamed is misspelled. It is left as is because labels
# may be referenced by code outside this file.
def capture_form
	[`? `< name `>  regex] :NamedPerl
|	[`? single_quote name single_quote regex] :NamedQuoted
|	[`? P `< name `> regex] :NamedPython
|	[regex] :Unamed
	
# A parenthesised capturing group. The attached action bumps the global
# Backrefs counter, which is_backref later compares against.
def capture
	# This ID is for the ragel implementation. We use the nfa repetition
	# operator, which needs an id.
	# NOTE(review): no id is assigned in the visible code; only the Backrefs
	# counter is updated -- confirm the comment above is still current.
	[`( capture_form `)] :Capture
	{
		Backrefs = Backrefs + 1
	}

# Inline option letters: flags to add, flags to remove after a hyphen, or
# both. The captures Add and Remove name the two flag lists.
def option_spec
	[Add: option_flags `-  Remove: option_flags]
|	[Add: option_flags]
|	[`- Remove: option_flags]

# Non-capturing group forms, in order:
#   (?:regex)   (?flags:regex)   (?|regex)   (?>regex)
# None of these carries an action, so the Backrefs counter is untouched here.
def non_capture
	[`( `? `: regex `)]
|	[`( `? option_spec `: regex `)]
|	[`( `? `| regex `)]
|	[`( `? `> regex `)]

# One or more characters, none of which is a closing parenthesis.
token non_close_parens
	/ [^)]+ /

# An inline comment group: open paren, question mark, hash, optional text
# containing no closing paren, then the closing paren.
def comment
	[ `( `? `# non_close_parens? `) ]

# Option-setting groups: (?flags) using option_spec, or one of the starred
# keyword forms (*NO_START_OPT), (*UTF8), (*UTF16), (*UCP).
def option
	[`( `? option_spec `)]
|	[`( `* no_start_opt  `)]
|	[`( `* utf8 `)]
|	[`( `* utf16 `)]
|	[`( `* ucp `)]

# A run of one or more option letters.
def option_flags
	[option_flag+]

# One option letter: i, J, m, s, U or x.
token option_flag
	/ [iJmsUx] /

# Keywords accepted after the star in the starred option groups (see option).
token no_start_opt / 'NO_START_OPT' /
token utf8  / 'UTF8' /
token utf16 / 'UTF16' /
token ucp   / 'UCP' /

# Lookahead groups: (?=regex) and (?!regex).
def look_ahead
	[`( `? `= regex `)]
|	[`( `? `! regex `)]

# Lookbehind groups: (?<=regex) and (?<!regex).
def look_behind
	[`( `? `< `= regex `)]
|	[`( `? `< `! regex `)]

# Either kind of lookaround; this is the form referenced by atom.
def look_around
	[look_ahead]
|	[look_behind]

# Backslash followed by g: introduces the g-style backrefs and subroutine
# references.
token br_g
	/ '\\g' /

# Backslash followed by k: introduces the named backrefs.
token br_k
	/ '\\k' /

# Backslash followed by a decimal number with no leading zero. Whether it is
# really a backref is decided by the action in backref.
token maybe_backref
	/ '\\' ( '1'..'9' ) ( '0'..'9' )* /

# Octal escape candidates, kept together in one lexical region.
lex
	# Backslash plus two octal digits not starting with 0, or three octal
	# digits starting with 1..3. The octal production may still reject these
	# in favour of a backref.
	token maybe_octal /
		'\\' (
			[1-7] [0-7] |
			[1-3] [0-7] [0-7]
		)
	/

	# Backslash, a zero, then up to two further octal digits. The leading
	# zero makes these octal unconditionally (see the Def alternative of octal).
	token def_octal /
		'\\' '0' ( [0-7] [0-7]? )?
	/
end

# Backslash followed by one or more decimal digits. Referenced by
# literal_digits, which atom lists after backref.
token else_digits / '\\' [0-9]+ /

# Decide whether a backslash-number token should be read as a backreference.
#
# Num is the matched token text. The callers in this file pass maybe_octal or
# maybe_backref text, both of which begin with a backslash; suffix( Num, 1 ) is
# presumably what strips it -- confirm against the Colm runtime.
#
# Returns true when the decimal value is below 8, or when it does not exceed
# the number of capture groups counted so far in Backrefs. Otherwise false.
bool is_backref( Num: str )
{
	Digits: str = suffix( Num, 1 )
	Ref: int = atoi( Digits )

	# Too large to be unconditional, and no such group has been counted yet.
	if ( Ref >= 8 && Ref > Backrefs )
		return false

	return true
}

# Simple disambig between octals and backrefs. Reject octals that can be a
# backref, as determined by counting the number of captures.
#
# Maybe: a maybe_octal token, accepted only if is_backref says it is not a
#        backref.
# Def:   a def_octal token (leading zero), accepted unconditionally.
def octal
	[maybe_octal] :Maybe
	{
		if ( is_backref( $lhs.maybe_octal ) )
			reject
	}
|	[def_octal] :Def

# Backreferences. The first alternative takes a backslash-number token and
# rejects it unless is_backref accepts the number. The remaining alternatives
# are the explicit forms, in order:
#   \gn  \g{n}  \g{-n}  \k<name>  \k'name'  \g{name}  \k{name}  (?P=name)
def backref
	[maybe_backref]
	{
		if ( !is_backref( $lhs.maybe_backref ) )
			reject
	}
|	[br_g number]
|	[br_g `{ number `}]
|	[br_g `{ `- number `}]
|	[br_k `< name `>]
|	[br_k single_quote name single_quote]
|	[br_g `{ name `}]
|	[br_k `{ name `}]
|	[`( `? P `= name `)]

# A backslash-digits sequence taken as-is. atom lists this after backref, so
# it is presumably the fallback when a sequence is neither octal nor backref.
def literal_digits
	[else_digits]

# The condition inside the inner parentheses of a conditional group: a group
# number (absolute or signed), a name in angle brackets or single quotes, one
# of the R forms, the DEFINE or assert keyword, or a bare name.
def cond_ref
	[number]
|	[`+ number]
|	[`- number]
|	[`< name `>]
|	[single_quote name single_quote]
|	[cond_ref_R number]
|	[cond_ref_R]
|	[cond_ref_R `& name]
|	[cond_ref_DEFINE]
|	[cond_ref_assert]
|	[name]

# Keywords recognised inside a condition.
token cond_ref_DEFINE   / 'DEFINE' /
token cond_ref_assert   / 'assert' /
token cond_ref_R        / 'R' /

# The optional "no" branch of a conditional: a pipe followed by a regex.
def cond_false
	[`| regex ]

# Conditional group: (?(condition)yes-regex) with an optional |no-regex part.
def conditional
	[`( `? `( cond_ref `) regex cond_false? `)]

# Verb names accepted between "(*" and ")" by backtrack_control.
# NOTE(review): the :NAME forms match the literal text NAME, not an arbitrary
# identifier -- confirm that is intended.
token btc_accept      / 'ACCEPT' /
token btc_fail        / 'F' ( 'AIL' )? /
token btc_mark_name   /  ('MARK')? ':NAME' /
token btc_commit      / 'COMMIT' /
token btc_prune       / 'PRUNE' /
# No closing paren in this pattern: backtrack_control supplies the closing
# paren itself. Including it here made the PRUNE:NAME form impossible to
# parse and was inconsistent with the SKIP and THEN variants below.
token btc_prune_name  / 'PRUNE:NAME' /
token btc_skip        / 'SKIP' /
token btc_skip_name   / 'SKIP:NAME' /
token btc_then        / 'THEN' /
token btc_then_name   / 'THEN:NAME' /

# Any one of the backtracking-control verb tokens.
def btc_type
	[btc_accept]
|	[btc_fail]
|	[btc_mark_name]
|	[btc_commit]
|	[btc_prune]
|	[btc_prune_name]
|	[btc_skip]
|	[btc_skip_name]
|	[btc_then]
|	[btc_then_name]

# A backtracking-control group: open paren, star, a verb, close paren.
def backtrack_control
	[ `( `* btc_type `) ]

# Keywords accepted between "(*" and ")" by newline_convention.
# NOTE(review): the token name nlc_bsr_unicodo is misspelled (its pattern is
# BSR_UNICODE). It is left as is because the name is referenced by nlc_type
# and possibly by code outside this file.
token nlc_cr           / 'CR' /
token nlc_lf           / 'LF' /
token nlc_crlf         / 'CRLF' /
token nlc_anycrlf      / 'ANYCRLF' /
token nlc_any          / 'ANY' /
token nlc_bsr_anycrlf  / 'BSR_ANYCRLF' /
token nlc_bsr_unicodo  / 'BSR_UNICODE' /

# Any one of the newline-convention keyword tokens.
def nlc_type
	[nlc_cr]
|	[nlc_lf]
|	[nlc_crlf]
|	[nlc_anycrlf]
|	[nlc_any]
|	[nlc_bsr_anycrlf]
|	[nlc_bsr_unicodo]

# A newline-convention group: open paren, star, a keyword, close paren.
def newline_convention
	[ `( `* nlc_type `) ]

# The letter C that introduces a callout.
token callout_C / 'C' /

# Callout group: (?C) or (?Cn) with a decimal number.
def callout
	[ `( `? callout_C `) ]
|	[ `( `? callout_C number `) ]

# Single-literal productions that give the regex metacharacters a nonterminal
# name, so they can appear as ordinary characters where the grammar allows it
# (see cc_literal, shared_literal, atom and general_repetition).
def char_class_start [ `[ ]
def char_class_end   [ `] ]
def dot              [ `. ]
def caret            [ `^ ]
def question_mark    [ `? ]
def plus             [ `+ ]
def star             [ `* ]
def open_brace       [ `{ ]
def close_brace      [ `} ]
def comma            [ `, ]
def pipe             [ `| ]
def open_paren       [ `( ]
def close_paren      [ `) ]

# Hexadecimal character escapes, kept together in one lexical region.
lex
	# Backslash, x, then exactly two hex digits.
	token hex_char_fixed
		/ '\\x' hex_digit hex_digit /

	# Backslash, x, then one or more hex digits in braces. PCRE puts no
	# minimum on the digit count, so short forms such as \x{41} must be
	# accepted; the previous pattern demanded at least three digits.
	token hex_char_var
		/ '\\x' '{' hex_digit+ '}' /
end

#
# Anchors
#

# Backslash-b and backslash-B. Both are alternatives of anchor and are also
# accepted as literals inside a character class (see cc_literal).
token word_boundary       / '\\b' /
token non_word_boundary   / '\\B' /

# Backslash followed by A.
token sos_A
	/ '\\A' /

# Start anchors: the caret literal or the backslash-A token.
def start_of_subject
	[`^]
|	[sos_A]

# Backslash followed by lower-case z, and by upper-case Z.
token eos_z / '\\z' /
token eos_Z / '\\Z' /

# End anchors: the dollar literal or either of the backslash-z tokens.
def end_of_subject
	[`$]
|	[eos_Z]
|	[eos_z]

# Backslash followed by G.
token first_matching_pos
	/ '\\G' /

# Every zero-width anchor accepted as an atom: word boundaries, start and end
# of subject, and the backslash-G position anchor.
def anchor
	[word_boundary]
|	[non_word_boundary]
|	[start_of_subject]
|	[end_of_subject]
|	[first_matching_pos]

#
# Character classes
#

# A non-empty sequence of class atoms: one required atom followed by a
# repetition of zero or more.
def cc_atom_list
	[cc_atom cc_atom*]

# A bracketed character class. The first two alternatives use the plain and
# negated openers and require at least one atom. The remaining four start with
# an opener token that already contains a closing bracket character ("[^]" or
# "[]"); they allow zero or more further atoms, or a hyphen and a range end
# followed by further atoms, before the final closing bracket.
def character_class
	[`[ cc_atom_list `]]
|	[cc_open_caret       cc_atom_list `]]
|	[cc_open_caret_close cc_atom* `]]
|	[cc_open_close       cc_atom* `]]
|	[cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]]
|	[cc_open_close       hyphen cc_atom_end_range cc_atom* `]]
 
# The atom that follows the hyphen in the two range-style alternatives of
# character_class.
def cc_atom_end_range
	[cc_atom]

# One member of a character class: a literal-hyphen-literal range, a shared
# atom, a single literal, or an octal escape.
# NOTE(review): octal is also reachable through cc_literal -> shared_literal,
# so the last alternative looks redundant -- confirm before removing.
def cc_atom
	[cc_literal hyphen cc_literal]
|	[shared_atom]
|	[cc_literal]
|	[octal]

# A single literal character inside a character class: anything from
# shared_literal, plus the metacharacters and the two word-boundary tokens,
# all of which are accepted here as class members.
def cc_literal
	[shared_literal]
|	[dot]
|	[char_class_start]
|	[caret]
|	[question_mark]
|	[plus]
|	[star]
|	[word_boundary]
|	[non_word_boundary]
|	[dollar]
|	[pipe]
|	[open_paren]
|	[close_paren]

# Two-character class escapes, each a backslash followed by one letter. These
# twelve are alternatives of shared_atom.
token decimal_digit              / '\\d' /
token not_decimal_digit          / '\\D' /
token horizontal_white_space     / '\\h' /
token not_horizontal_white_space / '\\H' /
token not_new_line               / '\\N' /
token new_line_sequence          / '\\R' /
token white_space                / '\\s' /
token not_white_space            / '\\S' /
token vertical_white_space       / '\\v' /
token not_vertical_white_space   / '\\V' /
token word_char                  / '\\w' /
token not_word_char              / '\\W' /

# Backslash-C and backslash-X. These two are alternatives of atom only, not of
# shared_atom.
token one_data_unit              / '\\C' /
token extended_unicode_char      / '\\X' /

# Openers for the property escapes below: backslash-p and backslash-P.
token with_property_open         / '\\p' /
token without_property_open      / '\\P' /

# Property escape: backslash-p followed by a braced property name.
def char_with_property
	[with_property_open `{ underscore_alpha_numerics `}]
# Negated property escape: backslash-P followed by a braced property name.
def char_without_property
	[without_property_open `{ underscore_alpha_numerics `}]

# Everything that can be quantified (see element): class escapes, literal
# characters, character classes, every parenthesised group form, anchors, and
# the backslash-number family (backref, then literal_digits).
# CharClassEnd accepts a closing bracket that appears outside any class.
def atom
	[shared_atom]           :SharedAtom
|	[shared_literal]        :SharedLiteral
|	[char_class_end]        :CharClassEnd
|	[dot]                   :Dot
|	[character_class]       :CharacterClass
|	[capture]               :Capture
|	[non_capture]           :NonCapture
|	[anchor]                :Anchor
|	[look_around]           :LookAround
|	[option]                :Option
|	[newline_convention]    :NewlineConvention
|	[callout]               :Callout
|	[reset_start_match]     :ResetStartMatch
|	[one_data_unit]         :OneDataUnit
|	[extended_unicode_char] :ExtendedUnicodeChar
|	[backtrack_control]     :BacktrackControl
|	[backref]               :Backref
|	[literal_digits]        :LiteralDigits
|	[subroutine_reference]  :SubroutineReference
|	[conditional]           :Conditional
|	[comment]               :Comment

# An atom together with its (possibly empty) quantifier.
def element
	[atom quantifier] :Atom

# A concatenation: zero or more elements, built right-recursively.
def term
	[element term] :Element
|	[] :Base

# An alternation of terms separated by the pipe literal, built
# left-recursively.
def expr
	[expr `| term] :Union
|	[term] :Base

# Entry point for a complete (sub)pattern. Referenced by every group form and
# by line.
def regex
	[expr] :Expr

# Empty production whose action zeroes the Backrefs counter. It is the first
# item of the Regex alternative of line, so each line starts counting afresh.
def init
	[]
	{
		Backrefs = 0
	}

# Zero or more characters up to, but not including, a line feed.
token unparseable /[^\n]*/

# One input line: either a regex that parses, or the raw text of the line
# taken as a single unparseable token. Both alternatives end with NL and are
# marked commit.
def line
	[init regex NL] :Regex commit
|	[unparseable NL] :Unparseable commit

# The whole input: any number of lines.
def file
	[line*]


# Driver: parse all of standard input as a file, then report what was found.
parse F: file [stdin]

if ( !F ) {
	# No tree was produced; print the parser's error text.
	print "parse error: [error]
}
else {
	# Report each line that was taken as an unparseable token.
	for U: unparseable in F {
		print "unparseable: [U]
	}

	# Report every backref node found anywhere in the tree.
	for B: backref in F {
		print "backref: [B]
	}
}