moved grammars out to their own dirs

We also want some testing/validation support for each grammar. This dir will get quite messy if we move files.
author: Adrian Thurston <thurston@colm.net> 2019-12-01 09:05:03 -0800
committer: Adrian Thurston <thurston@colm.net> 2019-12-01 09:05:03 -0800
commit: 5a4407de2c4c03083f1c25b5c7fdb849cdc08a12 (patch)
tree: 4436efeb998fd54de016723d92d74c400d48e070 /grammar/pcre
parent: 479376c313208ee764d212846892cb904b8e5b89 (diff)
download: colm-5a4407de2c4c03083f1c25b5c7fdb849cdc08a12.tar.gz
4 files changed, 1462 insertions, 0 deletions
diff --git a/grammar/pcre/.gitignore b/grammar/pcre/.gitignore
new file mode 100644
index 00000000..05b136b8
--- /dev/null
+++ b/grammar/pcre/.gitignore
@@ -0,0 +1,4 @@
+/pcre-colm.c
+/pcre-colm
+/pcre-ragel.c
+/pcre-ragel
diff --git a/grammar/pcre/Makefile b/grammar/pcre/Makefile
new file mode 100644
index 00000000..6f895485
--- /dev/null
+++ b/grammar/pcre/Makefile
@@ -0,0 +1,11 @@
+COLM = ../../colm/colm
+RAGEL = ../../ragel/ragel
+
+all: pcre-colm pcre-ragel
+
+pcre-colm: pcre.lm
+	$(COLM) -o $@ pcre.lm
+
+pcre-ragel: pcre.rl $(RAGEL)
+	$(RAGEL) -G2 pcre.rl
+	gcc -g -Wall -o $@ pcre.c
diff --git a/grammar/pcre/pcre.lm b/grammar/pcre/pcre.lm
new file mode 100644
index 00000000..691b1307
--- /dev/null
+++ b/grammar/pcre/pcre.lm
@@ -0,0 +1,583 @@
+global Backrefs: int = 0
+
+lex
+	token pre_equals /'='/
+end
+
+token alpha_char
+	/ [a-zA-Z] /
+
+token digit_char
+	/ [0-9] /
+
+rl alpha_nums
+	/ (alpha_char | '_' ) (alpha_char | '_' | digit_char)* /
+
+rl alpha_numeric
+	/ 'a'..'z' | 'A'..'Z' | '0'..'9' /
+
+rl alpha_numerics
+	/ alpha_numeric+ /
+
+rl hex_digit
+	/ '0'..'9' | 'a'..'f' | 'A'..'F' /
+
+literal `| `^
+literal `. `? `+ `*
+literal `{ `}
+
+# It is important that these all go into the same lexical region, so we get a
+# longest-match with no backtracking among these lexical options. Probably need
+# to separate mainline regex from character class regex lexical, but for now
+# they are the same regions.
+lex
+	literal `[
+	token cc_open_caret /"[^"/
+	token cc_open_caret_close /"[^]"/
+	token cc_open_close /"[]"/
+end
+
+literal `] 
+literal `( `)
+literal `< `>
+literal `, `: `- `_ `= `!
+literal `# `& `$
+
+token NL
+	/ '\r' ? '\n' /
+
+token number
+	/[0-9]+/
+
+# With greedy (default) or lazy (?), we are always attempting all matches. But
+# possessive (+) prunes paths, so it must force the pattern to become a
+# prefilter.
+def quantifier_type
+	[`+]
+|	[`?]
+|	[]
+
+def general_repetition
+	[`{ number `} ]
+|	[`{ number comma `} ]
+|	[`{ number comma number `} ]
+
+def quantifier
+	[`? quantifier_type] :Question
+|	[`* quantifier_type] :Star
+|	[`+ quantifier_type] :Plus
+|	[general_repetition quantifier_type] :General
+|	[] :Base
+
+token sr_R /'R'/
+token sr_P /'P'/
+
+def subroutine_reference
+	[`( `? sr_R `)]
+|	[`( `? number `)]
+|	[`( `? `+ number `)]
+|	[`( `? `- number `)]
+|	[`( `? `& name `)]
+|	[`( `? sr_P `> name `)]
+|	[br_g `< name `>]
+|	[br_g `< number `>]
+|	[br_g `< `+ number `>]
+|	[br_g `< `- number `>]
+|	[br_g single_quote name single_quote ]
+|	[br_g single_quote number single_quote]
+|	[br_g single_quote `+ number single_quote]
+|	[br_g single_quote `- number single_quote]
+
+token ns_open /'[[:'/
+
+lex
+	token ns_caret /'^'/
+	token ns_word  /alpha_numerics/
+	token ns_close /':]]'/
+end
+
+def posix_named_set
+	[ns_open ns_caret? ns_word ns_close]
+
+token reset_start_match
+	/ '\\K' /
+
+def shared_atom
+	[decimal_digit]               :DecimalDigit
+|	[not_decimal_digit]           :NotDecimalDigit
+|	[horizontal_white_space]      :HorizonalWhiteSpace
+|	[not_horizontal_white_space]  :NotHorizontalWhiteSpace
+|	[not_new_line]                :NotNewLine
+|	[new_line_sequence]           :NewLineSequence
+|	[white_space]                 :WhiteSpace
+|	[not_white_space]             :NotWhiteSpace
+|	[vertical_white_space]        :VerticalWhiteSpace
+|	[not_vertical_white_space]    :NotVerticalWhiteSpace
+|	[word_char]                   :WordChar
+|	[not_word_char]               :NotWordChar
+|	[posix_named_set]             :PosixNamedSet
+|	[char_with_property]          :CharWithProperty
+|	[char_without_property]       :CharWithoutProperty
+|	[control_char]                :ControlChar
+
+def shared_literal
+	[octal]                       :Octal
+|	[alpha_char]                  :AlphaChar
+|	[digit_char]                  :DigitChar
+|	[bell_char]                   :BellChar
+|	[escape_char]                 :EscapeChar
+|	[form_feed]                   :FormFeed
+|	[new_line]                    :NewLine
+|	[carriage_ret]                :CarriageRet
+|	[tab]                         :Tab
+|	[hex_char_fixed]              :HexCharFixed
+|	[hex_char_var]                :HexCharVar
+|	[quoted]                      :Quoted
+|	[block_quoted]                :BlockQuoted
+|	[open_brace]                  :OpenBrace
+|	[close_brace]                 :CloseBrace
+|	[comma]                       :Comma
+|	[hyphen]                      :Hypen
+|	[less_than]                   :LessThan
+|	[greater_than]                :GreaterThan
+|	[single_quote]                :SingleQuote
+|	[underscore]                  :Underscore
+|	[colon]                       :Colon
+|	[hash]                        :Hash
+|	[equals]                      :Equals
+|	[exclamation]                 :Excalmation
+|	[ampersand]                   :Ampersand
+|	[other_char_printable]        :OtherCharPrintable
+|	[other_char_non_printable]    :OhterCharNonPrintable
+
+token name
+	/ alpha_nums /
+
+token bell_char    / '\\a' /
+token escape_char  / '\\e' /
+token form_feed    / '\\f' /
+token new_line     / '\\n' /
+token carriage_ret / '\\r' /
+token tab          / '\\t' /
+token control_char
+	/ '\\c' ( 0x00 .. 0x7c ) /
+
+token underscore_alpha_numerics
+	/ ('_' | alpha_numeric)+ /
+
+rl non_alpha_numeric
+	/ ^alpha_numeric /
+
+token quoted
+	/'\\' non_alpha_numeric/
+
+token bs_Q
+	/'\\Q'/
+
+lex
+	# String of non-backslash chars. Or a single backslash.
+	token block_data / ( [^\\]+ ) | '\\' /
+	token block_end /'\\E'/
+end
+
+token block_quoted
+	/bs_Q block_data* block_end/
+
+def hyphen        [ `- ]
+def less_than     [ `< ]
+def greater_than  [ `> ]
+def underscore    [ `_ ]
+def colon         [ `: ]
+def equals        [ `= ]
+def exclamation   [ `! ]
+def ampersand     [ `& ]
+def hash          [ `# ]
+def dollar        [ `$ ]
+
+token single_quote
+	/ "'" /
+
+token other_char_printable
+	/ ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/' /
+
+token other_char_non_printable
+	/ ^( 0 .. 127 ) /
+
+token P / 'P' /
+
+def capture_form
+	[`? `< name `>  regex] :NamedPerl
+|	[`? single_quote name single_quote regex] :NamedQuoted
+|	[`? P `< name `> regex] :NamedPython
+|	[regex] :Unamed
+	
+def capture
+	# This ID is for the ragel implementation. We use the nfa repetition
+	# operator, which needs an id. 
+	[`( capture_form `)] :Capture
+	{
+		Backrefs = Backrefs + 1
+	}
+
+def option_spec
+	[Add: option_flags `-  Remove: option_flags]
+|	[Add: option_flags]
+|	[`- Remove: option_flags]
+
+def non_capture
+	[`( `? `: regex `)]
+|	[`( `? option_spec `: regex `)]
+|	[`( `? `| regex `)]
+|	[`( `? `> regex `)]
+
+token non_close_parens
+	/ [^)]+ /
+
+def comment
+	[ `( `? `# non_close_parens? `) ]
+
+def option
+	[`( `? option_spec `)]
+|	[`( `* no_start_opt  `)]
+|	[`( `* utf8 `)]
+|	[`( `* utf16 `)]
+|	[`( `* ucp `)]
+
+def option_flags
+	[option_flag+]
+
+token option_flag / 'i' | 'J' | 'm' | 's' | 'U' | 'x' /
+
+token no_start_opt / 'NO_START_OPT' /
+token utf8  / 'UTF8' /
+token utf16 / 'UTF16' /
+token ucp   / 'UCP' /
+
+def look_ahead
+	[`( `? `= regex `)]
+|	[`( `? `! regex `)]
+
+def look_behind
+	[`( `? `< `= regex `)]
+|	[`( `? `< `! regex `)]
+
+def look_around
+	[look_ahead]
+|	[look_behind]
+
+token br_g / '\\g' /
+token br_k / '\\k' /
+
+token maybe_backref / '\\' [1-9] [0-9]* /
+
+lex
+	token maybe_octal /
+	   '\\' (
+			[1-3] [0-7] [0-7] |
+			[1-7] [0-7]
+	   )
+	/
+
+	token def_octal /
+		'\\' (
+			[0] [0-7] [0-7] |
+			[0] [0-7] |
+			[0]
+		)
+	/
+end
+
+token else_digits / '\\' [0-9]+ /
+
+bool is_backref( Num: str )
+{
+	Num = suffix( Num, 1 )
+	Ref: int = atoi( Num )
+	if ( Ref < 8 || Ref <= Backrefs )
+		return true
+	return false
+}
+
+# Simple disambig between octals and backrefs. Reject octals that can be a
+# backref, as determined by counting the number of captures.
+def octal
+	[maybe_octal] :Maybe
+	{
+		if ( is_backref( $lhs.maybe_octal ) )
+			reject
+	}
+|	[def_octal] :Def
+
+def backref
+	[maybe_backref]
+	{
+		if ( !is_backref( $lhs.maybe_backref ) )
+			reject
+	}
+|	[br_g number]
+|	[br_g `{ number `}]
+|	[br_g `{ `- number `}]
+|	[br_k `< name `>]
+|	[br_k single_quote name single_quote]
+|	[br_g `{ name `}]
+|	[br_k `{ name `}]
+|	[`( `? P `= name `)]
+
+def literal_digits
+	[else_digits]
+
+def cond_ref
+	[number]
+|	[`+ number]
+|	[`- number]
+|	[`< name `>]
+|	[single_quote name single_quote]
+|	[cond_ref_R number]
+|	[cond_ref_R]
+|	[cond_ref_R `& name]
+|	[cond_ref_DEFINE]
+|	[cond_ref_assert]
+|	[name]
+
+token cond_ref_DEFINE   / 'DEFINE' /
+token cond_ref_assert   / 'assert' /
+token cond_ref_R        / 'R' /
+
+def cond_false
+	[`| regex ]
+
+def conditional
+	[`( `? `( cond_ref `) regex cond_false? `)]
+
+token btc_accept      / 'ACCEPT' /
+token btc_fail        / 'F' ( 'AIL' )? /
+token btc_mark_name   /  ('MARK')? ':NAME' /
+token btc_commit      / 'COMMIT' /
+token btc_prune       / 'PRUNE' /
+token btc_prune_name  / 'PRUNE:NAME)' /
+token btc_skip        / 'SKIP' /
+token btc_skip_name   / 'SKIP:NAME' /
+token btc_then        / 'THEN' /
+token btc_then_name   / 'THEN:NAME' /
+
+def btc_type
+	[btc_accept]
+|	[btc_fail]
+|	[btc_mark_name]
+|	[btc_commit]
+|	[btc_prune]
+|	[btc_prune_name]
+|	[btc_skip]
+|	[btc_skip_name]
+|	[btc_then]
+|	[btc_then_name]
+
+def backtrack_control
+	[ `( `* btc_type `) ]
+
+token nlc_cr           / 'CR' /
+token nlc_lf           / 'LF' /
+token nlc_crlf         / 'CRLF' /
+token nlc_anycrlf      / 'ANYCRLF' /
+token nlc_any          / 'ANY' /
+token nlc_bsr_anycrlf  / 'BSR_ANYCRLF' /
+token nlc_bsr_unicodo  / 'BSR_UNICODE' /
+
+def nlc_type
+	[nlc_cr]
+|	[nlc_lf]
+|	[nlc_crlf]
+|	[nlc_anycrlf]
+|	[nlc_any]
+|	[nlc_bsr_anycrlf]
+|	[nlc_bsr_unicodo]
+
+def newline_convention
+	[ `( `* nlc_type `) ]
+
+token callout_C / 'C' /
+
+def callout
+	[ `( `? callout_C `) ]
+|	[ `( `? callout_C number `) ]
+
+def char_class_start [ `[ ]
+def char_class_end   [ `] ]
+def dot              [ `. ]
+def caret            [ `^ ]
+def question_mark    [ `? ]
+def plus             [ `+ ]
+def star             [ `* ]
+def open_brace       [ `{ ]
+def close_brace      [ `} ]
+def comma            [ `, ]
+def pipe             [ `| ]
+def open_paren       [ `( ]
+def close_paren      [ `) ]
+
+lex 
+	token hex_char_fixed
+		/ '\\x' hex_digit hex_digit /
+
+	token hex_char_var
+		/ '\\x' '{' hex_digit hex_digit hex_digit+ '}' /
+end
+
+#
+# Anchors
+#
+
+token word_boundary       / '\\b' /
+token non_word_boundary   / '\\B' /
+
+token sos_A
+	/ '\\A' /
+
+def start_of_subject
+	[`^]
+|	[sos_A]
+
+token eos_z / '\\z' /
+token eos_Z / '\\Z' /
+
+def end_of_subject
+	[`$]
+|	[eos_Z]
+|	[eos_z]
+
+token first_matching_pos
+	/ '\\G' /
+
+def anchor
+	[word_boundary]
+|	[non_word_boundary]
+|	[start_of_subject]
+|	[end_of_subject]
+|	[first_matching_pos]
+
+#
+# Character classes
+#
+
+def cc_atom_list
+	[cc_atom cc_atom*]
+
+def character_class
+	[`[ cc_atom_list `]]
+|	[cc_open_caret       cc_atom_list `]]
+|	[cc_open_caret_close cc_atom* `]]
+|	[cc_open_close       cc_atom* `]]
+|	[cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]]
+|	[cc_open_close       hyphen cc_atom_end_range cc_atom* `]]
+ 
+def cc_atom_end_range
+	[cc_atom]
+
+def cc_atom
+	[cc_literal hyphen cc_literal]
+|	[shared_atom]
+|	[cc_literal]
+|	[octal]
+
+def cc_literal
+	[shared_literal]
+|	[dot]
+|	[char_class_start]
+|	[caret]
+|	[question_mark]
+|	[plus]
+|	[star]
+|	[word_boundary]
+|	[non_word_boundary]
+|	[dollar]
+|	[pipe]
+|	[open_paren]
+|	[close_paren]
+
+token decimal_digit              / '\\d' /
+token not_decimal_digit          / '\\D' /
+token horizontal_white_space     / '\\h' /
+token not_horizontal_white_space / '\\H' /
+token not_new_line               / '\\N' /
+token new_line_sequence          / '\\R' /
+token white_space                / '\\s' /
+token not_white_space            / '\\S' /
+token vertical_white_space       / '\\v' /
+token not_vertical_white_space   / '\\V' /
+token word_char                  / '\\w' /
+token not_word_char              / '\\W' /
+
+token one_data_unit              / '\\C' /
+token extended_unicode_char      / '\\X' /
+
+token with_property_open         / '\\p' /
+token without_property_open      / '\\P' /
+
+def char_with_property
+	[with_property_open `{ underscore_alpha_numerics `}]
+def char_without_property
+	[without_property_open `{ underscore_alpha_numerics `}]
+
+def atom
+	[shared_atom]           :SharedAtom
+|	[shared_literal]        :SharedLiteral
+|	[char_class_end]        :CharClassEnd
+|	[dot]                   :Dot
+|	[character_class]       :CharacterClass
+|	[capture]               :Capture
+|	[non_capture]           :NonCapture
+|	[anchor]                :Anchor
+|	[look_around]           :LookAround
+|	[option]                :Option
+|	[newline_convention]    :NewlineConvention
+|	[callout]               :Callout
+|	[reset_start_match]     :ResetStartMatch
+|	[one_data_unit]         :OneDataUnit
+|	[extended_unicode_char] :ExtendedUnicodeChar
+|	[backtrack_control]     :BacktrackControl
+|	[backref]               :Backref
+|	[literal_digits]        :LiteralDigits
+|	[subroutine_reference]  :SubroutineReference
+|	[conditional]           :Conditional
+|	[comment]               :Comment
+
+def element
+	[atom quantifier] :Atom
+
+def term
+	[element term] :Element
+|	[] :Base
+
+def expr
+	[expr `| term] :Union
+|	[term] :Base
+
+def regex
+	[expr] :Expr
+
+def init
+	[]
+	{
+		Backrefs = 0
+	}
+
+token unparseable /[^\n]*/
+
+def line
+	[init regex NL] :Regex commit
+|	[unparseable NL] :Unparseable commit
+
+def file
+	[line*]
+
+
+parse F: file [stdin]
+
+if !F
+	print "parse error: [error]
+else {
+	for U: unparseable in F
+		print "unparseable: [U]
+	for B: backref in F
+		print "backref: [B]
+}
diff --git a/grammar/pcre/pcre.rl b/grammar/pcre/pcre.rl
new file mode 100644
index 00000000..307d5467
--- /dev/null
+++ b/grammar/pcre/pcre.rl
@@ -0,0 +1,864 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define LIST_APPEND( pphead, element ) \
+	{ \
+		element->next = *pphead; \
+		*pphead = element; \
+	}
+
+#define LIST_REMOVE( pphead ) \
+	{ \
+		*pphead = (*pphead)->next; \
+	}
+
+#define STACK_PUSH( pphead, element ) \
+	{ \
+		element->parent = *pphead; \
+		*pphead = element; \
+	}
+
+#define STACK_POP( pphead ) \
+	{ \
+		*pphead = (*pphead)->parent; \
+	}
+
+#define REVERSE_LIST( S, pphead ) \
+	if ( *pphead != 0 ) { \
+		S *front = (*pphead)->next; \
+		(*pphead)->next = 0; \
+		while ( front != NULL ) { \
+			S *next = front->next; \
+			front->next = *pphead; \
+			*pphead = front; \
+			front = next; \
+		} \
+	}
+
+#define MAX_VERB 40
+#define QUANTIFIER_INF -1
+
+struct value
+{
+	int v;
+
+	struct value *next;
+};
+
+enum value_type
+{
+	dot = 256,
+};
+
+enum quantifer_type
+{
+	quant_greedy = 1,
+	quant_lazy,
+	quant_possessive,
+};
+
+struct quantifier
+{
+	int min, max;
+	enum quantifer_type type;
+};
+
+enum element_type
+{
+	element_value_type = 1,
+	element_regex_type,
+	element_look_around_type,
+	element_char_class_type,
+	element_backref_type,
+	element_verb_type,
+};
+
+struct element
+{
+	enum element_type type;
+	struct regex *regex;
+	struct look_around *look_around;
+	struct quantifier *quantifier;
+	struct value *value_list;
+	struct backref *backref;
+	struct verb *verb;
+
+	struct element *next;
+};
+
+struct term
+{
+	struct element *element_list;
+
+	struct term *next;
+	struct term *parent;
+};
+
+enum regex_type
+{
+	regex_non_capture = 1,
+	regex_capture,
+	regex_look_around
+};
+
+#define REGEX_ATOMIC_GROUP   0x01
+#define REGEX_CAPTURE_RESET  0x02
+
+struct regex
+{
+	struct term *term_list;
+
+	enum regex_type type;
+	int capture;
+	int node_id;
+	int options;
+
+	int flags;
+
+	struct regex *next;
+	struct regex *parent;
+};
+
+struct pattern
+{
+	struct regex *regex;
+	int node_id;
+	int is_look_around;
+
+	struct look_around *look_arounds;
+	struct pattern *parent;
+};
+
+enum look_around_type {
+	lat_pos_ahead = 1,
+	lat_neg_ahead,
+	lat_pos_behind,
+	lat_neg_behind,
+};
+
+struct look_around
+{
+	enum look_around_type type;
+	struct pattern *pattern;
+	int node_id;
+
+	struct look_around *next;
+};
+
+struct backref
+{
+	int number;
+};
+
+struct verb
+{
+	char *name;
+	char *value;
+	int number;
+};
+
+struct quantifier *new_quantifier()
+{
+	struct quantifier *quant = malloc( sizeof( struct value ) );
+	memset( quant, 0, sizeof(struct quantifier) );
+	quant->type = quant_greedy;
+	return quant;
+}
+
+struct value *new_value( int v )
+{
+	struct value *value = malloc( sizeof( struct value ) );
+	memset( value, 0, sizeof(struct value) );
+	value->v = v;
+	return value;
+}
+
+struct term *new_term( void )
+{
+	struct term *term = malloc( sizeof( struct term ) );
+	memset( term, 0, sizeof(struct term) );
+	return term;
+}
+
+struct element *new_element( enum element_type type, struct regex *regex, struct look_around *look_around )
+{
+	struct element *element = malloc( sizeof( struct element ) );
+	memset( element, 0, sizeof(struct element) );
+	element->type = type;
+	element->regex = regex;
+	element->look_around = look_around;
+	return element;
+}
+
+struct regex *new_regex( enum regex_type type, int options )
+{
+	struct regex *regex = malloc( sizeof( struct regex ) );
+	memset( regex, 0, sizeof(struct regex) );
+	regex->type = type;
+	regex->options = options;
+	return regex;
+}
+
+struct look_around *new_look_around( enum look_around_type lat, struct pattern *pattern )
+{
+	struct look_around *look_around = malloc( sizeof( struct look_around ) );
+	memset( look_around, 0, sizeof(struct look_around) );
+	look_around->type = lat;
+	look_around->pattern = pattern;
+	return look_around;
+}
+
+struct pattern *new_pattern( struct regex *regex, int is_look_around )
+{
+	struct pattern *pattern = malloc( sizeof( struct pattern ) );
+	pattern->regex = regex;
+	pattern->is_look_around = is_look_around;
+	return pattern;
+}
+
+struct backref *new_backref( int number )
+{
+	struct backref *backref = malloc( sizeof( struct backref ) );
+	memset( backref, 0, sizeof(struct backref) );
+	backref->number = number;
+	return backref;
+}
+
+struct verb *new_verb( char *d, int len )
+{
+	struct verb *verb = malloc( sizeof( struct verb ) );
+	verb->name = malloc( len + 1 );
+	memcpy( verb->name, d, len );
+	verb->name[len] = 0;
+	verb->value = 0;
+	verb->number = -1;
+	return verb;
+}
+
+void append_element_value( struct term *term, int value )
+{
+	struct element *el = term->element_list;
+	if ( el == NULL ||
+			el->type != element_value_type ||
+			el->quantifier != 0 )
+	{
+		el = new_element( element_value_type, NULL, NULL );
+		LIST_APPEND( &term->element_list, el );
+	}
+	
+	struct value *v = new_value( value );
+	LIST_APPEND( &el->value_list, v );
+}
+
+void reverse_value_list( struct value **pphead )
+{
+	REVERSE_LIST( struct value, pphead );
+}
+
+void reverse_element_list( struct element **pphead )
+{
+	REVERSE_LIST( struct element, pphead );
+}
+
+void reverse_term_list( struct term **pphead )
+{
+	REVERSE_LIST( struct term, pphead );
+}
+
+int is_backref( int num, int num_closed_captures )
+{
+	if ( num < 8 || num <= num_closed_captures )
+		return 1;
+	return 0;
+}
+
+void append_backref( struct term *term, int number )
+{
+	struct element *el = new_element( element_backref_type, NULL, NULL );
+	el->backref = new_backref( number );
+	LIST_APPEND( &term->element_list, el );
+}
+
+/* Provides initial ragel call state stack. */
+int *init_ragel_stack( int *size )
+{
+	*size = 24;
+	return malloc( *size * sizeof(int) );
+}
+
+/* Grows the ragel call state stack. Use in pre-push. */
+int *grow_ragel_stack( int *size, int *stack )
+{
+	int new_size = *size * 2;
+	int *new_stack = malloc( new_size * sizeof(int) ) ;
+	memcpy( new_stack, stack, *size * sizeof(int) );
+	free( stack );
+
+	*size = new_size;
+	return new_stack;
+}
+
+void apply_quantifier( struct term *term, int min, int max )
+{
+	struct element *el = term->element_list;
+
+	/* We may need to remove the last value from the element's value list. */
+	if ( el->type == element_value_type && el->value_list != NULL && el->value_list->next != NULL ) {
+		/* Take the value off and add it to an element of its own. */
+		struct value *last = el->value_list;
+		LIST_REMOVE( &el->value_list );
+
+		el = new_element( element_value_type, NULL, NULL );
+
+		LIST_APPEND( &el->value_list, last );
+		LIST_APPEND( &term->element_list, el );
+	}
+
+	el->quantifier = new_quantifier();
+	el->quantifier->min = min;
+	el->quantifier->max = max;
+}
+
+void apply_lazy( struct term *term )
+{
+	struct element *el = term->element_list;
+	el->quantifier->type = quant_lazy;
+}
+
+void apply_possessive( struct term *term )
+{
+	struct element *el = term->element_list;
+	el->quantifier->type = quant_possessive;
+}
+
+%%{
+	machine PCRE;
+
+	action enter_term {
+		struct term *term = new_term();
+		STACK_PUSH( &s_term, term );
+		LIST_APPEND( &s_regex->term_list, term );
+	}
+
+	action leave_term {
+		reverse_element_list( &s_term->element_list );
+		struct element *el = s_term->element_list;
+		while ( el != 0 ) {
+			reverse_value_list( &el->value_list );
+			el = el->next;
+		}
+		STACK_POP( &s_term );
+	}
+
+	action enter_regex {
+		struct regex *regex = new_regex( regex_non_capture, s_regex->options );
+		struct element *element = new_element( element_regex_type, regex, NULL );
+		LIST_APPEND( &s_term->element_list, element );
+		STACK_PUSH( &s_regex, regex );
+	}
+
+	action enter_lookaround {
+		struct regex *regex = new_regex( regex_look_around, s_regex->options );
+		struct pattern *pattern = new_pattern( regex, 1 );
+		struct look_around *la = new_look_around( lat, pattern );
+		pattern->node_id = la->node_id;
+
+		LIST_APPEND( &s_pattern->look_arounds, la );
+
+		struct element *element = new_element( element_look_around_type, NULL, la );
+		LIST_APPEND( &s_term->element_list, element );
+
+		STACK_PUSH( &s_regex, regex );
+		STACK_PUSH( &s_pattern, pattern );
+	}
+
+	action leave_regex {
+		if ( s_regex->type == regex_capture )
+			closed_captures += 1;
+
+		if ( s_regex->type == regex_look_around )
+			STACK_POP( &s_pattern );
+
+		reverse_term_list( &s_regex->term_list );
+		STACK_POP( &s_regex );
+	}
+
+	#
+	# Numbers
+	#
+	action init_number
+		{ number = 0; }
+
+	action decimal_digit
+		{ number = number * 10 + ( *p - '0' ); }
+
+	number = [0-9]+ >init_number $decimal_digit;
+
+	#
+	# Verbs
+	#
+	verb = ( [a-zA-Z_] [a-zA-Z_0-9]* ) >{ verb_len = 0; } ${
+		if ( verb_len < MAX_VERB ) {
+			verb[verb_len++] = *p;
+		}
+	};
+
+	action append_verb {
+		struct element *el = new_element( element_verb_type, NULL, NULL );
+		cur_verb = el->verb = new_verb( verb, verb_len );
+
+		LIST_APPEND( &s_term->element_list, el );
+	}
+
+	action verb_value
+	{
+		cur_verb->value = malloc( verb_len + 1 );
+		memcpy( cur_verb->value, verb, verb_len );
+		cur_verb->value[verb_len] = 0;
+	}
+
+	action verb_number
+	{
+		cur_verb->number = number;
+	}
+
+	verb_forms =
+			verb %append_verb ( ':' verb %verb_value | '=' number %verb_number )?;
+
+	action options_only { fret; }
+
+	options = (
+			'c' # case-sensitive matching
+		|	'i' # case-insensitive matching
+	)*;
+
+	alpha_char = [a-zA-Z];
+	digit_char = [0-9];
+
+	open_paren    = '(';
+	close_paren   = ')';
+
+	char_class_start = '[';
+	char_class_end   = ']';
+
+	open_curly = '{';
+	close_curly = '}';
+
+	ampersand     = '&';
+	colon         = ':';
+	comma         = ',';
+	dollar        = '$';
+	dot           = '.';
+	equals        = '=';
+	exclamation   = '!';
+	greater_than  = '>';
+	hash          = '#';
+	hyphen        = '-';
+	less_than     = '<';
+	pipe          = '|';
+	single_quote  = "'";
+	underscore    = '_';
+
+	other_char_printable =
+		' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/';
+
+	other_char_non_printable = ^( 0 .. 127 );
+
+	capture_non_capture =
+		'(' @{ fcall paren_open; };
+
+	literal_sym =
+		comma | hyphen | less_than |
+		greater_than | single_quote | underscore | colon |
+		hash | equals | exclamation | ampersand;
+
+	action append_char {
+		append_element_value( s_term, *p );
+	}
+
+	literal = (
+		alpha_char               |
+		digit_char               |
+		literal_sym              |
+		other_char_printable     |
+		other_char_non_printable 
+	) @append_char;
+
+	#
+	# Octals and backrefs. There is an ambiguity between these two, therefore
+	# the grammar handles them together.
+	#
+
+	action init_octal
+		{ octal = 0; }
+
+	action octal_digit
+		{ octal = octal * 8 + ( *p - '0' ); }
+
+	backref_num = [1-9] [0-9]*;
+
+	octal_num = 
+			[0-3] [0-7] [0-7] |
+			[0-7] [0-7] |
+			[0];
+
+	action octal { append_element_value( s_term, octal ); }
+
+	action octal_or_backref
+	{
+		if ( is_backref( number, closed_captures ) )
+			append_backref( s_term, number );
+		else
+			append_element_value( s_term, octal );
+	}
+
+	action backref
+	{
+		if ( number > closed_captures ) {
+			printf( "invalid backref: \\%d\n", number );
+			fbreak;
+		}
+			
+		append_backref( s_term, number );
+	}
+
+	# Certainly octal. All octals, with possibly backrefs removed.
+	def_octal =
+		'\\' @init_octal
+		( octal_num - backref_num ) $octal_digit;
+	
+	# All cases that can be either octal or backref and we need to inspect the
+	# backref number to decide. Use the intersection of the two numbers as the
+	# pattern.
+	octal_or_backref =
+		'\\' @init_octal @init_number
+		( octal_num & backref_num ) $octal_digit $decimal_digit;
+
+	# Definitely backref. The backref pattern excluding anything with a prefix
+	# that could be octal.
+	def_backref =
+		'\\' @init_number
+		( backref_num - ( octal_num any* ) ) $decimal_digit;
+
+	octal =
+		def_octal %octal |
+		octal_or_backref %octal_or_backref;
+
+	backref =
+		def_backref       %backref |
+		octal_or_backref  %octal_or_backref;
+
+	atom =
+		literal |
+		char_class_end  @append_char |
+		dot             @{ append_element_value( s_term, dot ); } |
+		octal |
+		backref |
+		open_paren      @{ fcall open_paren_forms; }
+	;
+
+	non_greedy = 
+		'?' @{ apply_lazy( s_term ); } |
+		'+' @{ apply_possessive( s_term ); };
+
+	quant_min = number %{ quant_min = number; };
+	quant_max = number %{ quant_max = number; };
+		
+	action quant_zero_plus { apply_quantifier( s_term, 0, QUANTIFIER_INF ); }
+	action quant_one_plus  { apply_quantifier( s_term, 1, QUANTIFIER_INF ); }
+	action quant_optional  { apply_quantifier( s_term, 0, 1 ); }
+	action quant_exact     { apply_quantifier( s_term, quant_min, quant_min ); }
+	action quant_min       { apply_quantifier( s_term, quant_min, QUANTIFIER_INF ); }
+	action quant_min_max   { apply_quantifier( s_term, quant_min, quant_max ); }
+
+	quant_forms = 
+		'*' @quant_zero_plus |
+		'+' @quant_one_plus  |
+		'?' @quant_optional  |
+		open_curly quant_min close_curly               @quant_exact |
+		open_curly quant_min ',' close_curly           @quant_min |
+		open_curly quant_min ',' quant_max close_curly @quant_min_max;
+
+	quantifier = quant_forms non_greedy?;
+
+	element = ( atom quantifier? );
+
+	term = ( element** ) >enter_term;
+
+	#
+	# Expression
+	#
+	expr = term ( '|' @leave_term term )*;
+
+	#
+	# Regex
+	#
+	regex = expr;
+
+
+	open_paren_forms :=
+		# Look at the first few charcters to see what the form is. What we
+		# handle here:
+		#  (re)    capturing parens
+		#  (?:re)  non-capturing parens
+		#  (?=re)  positive lookahead
+		#  (?!re)  negative lookahead
+		#  (?<=re) positive lookbehind 
+		#  (?<!re) negative lookbehind
+		(
+			# Non-capture. 
+			'?' options (
+				')' @options_only |
+				':' @enter_regex
+			) |
+
+			# Atomic group.
+			'?>' @enter_regex
+				@{ s_regex->flags |= REGEX_ATOMIC_GROUP; } |
+
+			# Capture number reset on branch.
+			'?|' @enter_regex
+				@{ s_regex->flags |= REGEX_CAPTURE_RESET; } |
+
+			# Lookaround
+			(
+				'?=' @{ lat = lat_pos_ahead; } @enter_lookaround |
+				'?!' @{ lat = lat_neg_ahead; } @enter_lookaround |
+				'?<=' @{ lat = lat_pos_behind; } @enter_lookaround |
+				'?<!' @{ lat = lat_neg_behind; } @enter_lookaround
+			) |
+
+			# Verbs
+			'*' verb_forms ')' @{fret;} |
+
+			# Catpuring.
+			^( '?' | '*' ) @enter_regex @{
+				s_regex->type = regex_capture;
+				s_regex->capture = next_capture++;
+				fhold;
+			}
+
+		)
+		regex ')' @leave_term @leave_regex @{ fret; };
+
+
+	main := regex '\n' @leave_term @leave_regex @{ success = 1; };
+}%%
+
+%% write data;
+
+int pcre_parse( struct pattern **result_pattern, char *line, int len )
+{
+	int *stack;
+	int stack_size;
+	int cs, top;
+	char *p, *pe;
+	int success = 0;
+	int result = 1;
+
+	int number = 0;
+	int octal = 0;
+	int quant_min, quant_max;
+
+	struct pattern *s_pattern = NULL;
+	struct regex *s_regex = NULL;
+	struct term *s_term = NULL;
+
+	/* Collect into this. */
+	struct regex *root_regex = new_regex( regex_non_capture, 0 );
+	struct pattern *root_pattern = new_pattern( root_regex, 0 );
+	STACK_PUSH( &s_pattern, root_pattern );
+	STACK_PUSH( &s_regex, root_regex );
+
+	int closed_captures = 0;
+	int next_capture = 1;
+
+	enum look_around_type lat;
+
+	int verb_len;
+	char verb[MAX_VERB];
+	struct verb *cur_verb;
+
+	stack = init_ragel_stack( &stack_size );
+
+	%% write init;
+
+	p = line;
+	pe = line + len;
+
+	%% write exec;
+
+	if ( !success ) {
+		printf( "parse error at %ld\n", ( p - line + 1) );
+		result = 0;
+	}
+	else if (
+			s_pattern == NULL || s_pattern->parent != NULL ||
+			s_regex != NULL || 
+			s_term != NULL )
+	{
+		printf( "parse error: items not closed\n" );
+		result = 0;
+	}
+
+	if ( result ) {
+		*result_pattern = root_pattern;
+	}
+	else {
+	
+	}
+
+	free( stack );
+
+	return result;
+}
+
+extern void print_value( int indent, struct value *value );
+extern void print_element( int indent, struct element *el );
+extern void print_term( int indent, struct term *term );
+extern void print_regex( int indent, struct regex *regex );
+
+void print_indent( int indent )
+{
+	while ( indent-- > 0 )
+		printf( "  " );
+}
+
+void print_value( int indent, struct value *value )
+{
+	printf( "v(%d)", value->v );
+}
+
+void print_element( int indent, struct element *el )
+{
+	print_indent( indent );
+	printf( "el: " );
+	if ( el->type == element_value_type ) {
+		struct value *value = el->value_list;
+		while ( value != NULL ) {
+			print_value( indent + 1, value );
+			if ( value->next != 0 )
+				printf( " . " );
+			value = value->next;
+		}
+	}
+	else if ( el->type == element_regex_type ) {
+		if ( el->regex != 0 )
+			print_regex( indent, el->regex );
+	}
+	else if ( el->type == element_look_around_type ) {
+		printf( "lookaround " );
+		switch ( el->look_around->type ) {
+			case lat_pos_ahead: printf( "=" ); break;
+			case lat_neg_ahead: printf( "!" ); break;
+			case lat_pos_behind: printf( "<=" ); break;
+			case lat_neg_behind: printf( "<!" ); break;
+		}
+		printf( " " );
+		print_regex( indent, el->look_around->pattern->regex );
+	}
+	else if ( el->type == element_char_class_type )
+		printf( "cc" );
+	else if ( el->type == element_backref_type )
+		printf( "backref(%d)", el->backref->number );
+	else if ( el->type == element_verb_type ) {
+		printf( "verb(%s", el->verb->name );
+		if ( el->verb->value != NULL )
+			printf( ":%s", el->verb->value );
+		else if ( el->verb->number >= 0 )
+			printf( "=%d", el->verb->number );
+		printf( ")" );
+	}
+
+	if ( el->quantifier != NULL ) {
+		printf( " {%d,", el->quantifier->min );
+
+		if ( el->quantifier->max == QUANTIFIER_INF )
+			printf( "inf" );
+		else
+			printf( "%d", el->quantifier->max );
+
+		printf( "}" );
+
+		if ( el->quantifier->type == quant_lazy )
+			printf( "?" );
+		else if ( el->quantifier->type == quant_possessive )
+			printf( "+" );
+
+	}
+
+	printf("\n");
+}
+
+void print_term( int indent, struct term *term )
+{
+	print_indent( indent );
+	printf( "term:\n" );
+	struct element *el = term->element_list;
+	while ( el != NULL ) {
+		print_element( indent + 1, el );
+		el = el->next;
+	}
+}
+
+void print_regex( int indent, struct regex *regex )
+{
+	printf( "reg: " );
+	if ( regex->type == regex_capture )
+		printf( "cap(%d) ", regex->capture );
+	if ( regex->flags & REGEX_ATOMIC_GROUP )
+		printf( "atomic_group " );
+	if ( regex->flags & REGEX_CAPTURE_RESET )
+		printf( "capture_reset " );
+	printf( "(\n" );
+
+	struct term *term = regex->term_list;
+	while ( term != NULL ) {
+		print_term( indent + 1, term );
+		term = term->next;
+	}
+	print_indent( indent );
+	printf( ")" );
+}
+
+void print_pattern( int indent, struct pattern *pat )
+{
+	printf( "pat: " );
+	print_regex( indent, pat->regex );
+	printf( "\n" );
+}
+
+char linebuf[2048];
+
+int main( int argc, char **argv )
+{
+	if ( argc < 2 ) {
+		fprintf( stderr, "usage: ./pcre <regex>\n" );
+		return -1;
+	}
+
+	FILE *input = fopen( argv[1], "r" );
+	if ( input == NULL ) {
+		fprintf( stderr, "failed to open %s: %s", argv[0], strerror(errno) );
+		return -1;
+	}
+
+	char *line = NULL;
+	size_t n = 0;
+	ssize_t read;
+
+	while ( (read = getline( &line, &n, input )) != -1 ) {
+		struct pattern *pat;
+		int parsed = pcre_parse( &pat, line, (int)read );
+		if ( parsed ) {
+			print_pattern( 0, pat );
+		}
+	}
+
+	fclose( input );
+	free( line );
+	return 0;
+}
+
author	Adrian Thurston <thurston@colm.net>	2019-12-01 09:05:03 -0800
committer	Adrian Thurston <thurston@colm.net>	2019-12-01 09:05:03 -0800
commit	5a4407de2c4c03083f1c25b5c7fdb849cdc08a12 (patch)
tree	4436efeb998fd54de016723d92d74c400d48e070 /grammar/pcre
parent	479376c313208ee764d212846892cb904b8e5b89 (diff)
download	colm-5a4407de2c4c03083f1c25b5c7fdb849cdc08a12.tar.gz