summaryrefslogtreecommitdiff
path: root/grammar/pcre
diff options
context:
space:
mode:
authorAdrian Thurston <thurston@colm.net>2019-12-01 09:05:03 -0800
committerAdrian Thurston <thurston@colm.net>2019-12-01 09:05:03 -0800
commit5a4407de2c4c03083f1c25b5c7fdb849cdc08a12 (patch)
tree4436efeb998fd54de016723d92d74c400d48e070 /grammar/pcre
parent479376c313208ee764d212846892cb904b8e5b89 (diff)
downloadcolm-5a4407de2c4c03083f1c25b5c7fdb849cdc08a12.tar.gz
moved grammars out to their own dirs
We also want some testing/validation support for each grammar. This dir will get quite messy if we move files.
Diffstat (limited to 'grammar/pcre')
-rw-r--r--grammar/pcre/.gitignore4
-rw-r--r--grammar/pcre/Makefile11
-rw-r--r--grammar/pcre/pcre.lm583
-rw-r--r--grammar/pcre/pcre.rl864
4 files changed, 1462 insertions, 0 deletions
diff --git a/grammar/pcre/.gitignore b/grammar/pcre/.gitignore
new file mode 100644
index 00000000..05b136b8
--- /dev/null
+++ b/grammar/pcre/.gitignore
@@ -0,0 +1,4 @@
+/pcre-colm.c
+/pcre-colm
+/pcre-ragel.c
+/pcre-ragel
diff --git a/grammar/pcre/Makefile b/grammar/pcre/Makefile
new file mode 100644
index 00000000..6f895485
--- /dev/null
+++ b/grammar/pcre/Makefile
@@ -0,0 +1,11 @@
+COLM = ../../colm/colm
+RAGEL = ../../ragel/ragel
+
+all: pcre-colm pcre-ragel
+
+pcre-colm: pcre.lm
+ $(COLM) -o $@ pcre.lm
+
+pcre-ragel: pcre.rl $(RAGEL)
+ $(RAGEL) -G2 pcre.rl
+ gcc -g -Wall -o $@ pcre.c
diff --git a/grammar/pcre/pcre.lm b/grammar/pcre/pcre.lm
new file mode 100644
index 00000000..691b1307
--- /dev/null
+++ b/grammar/pcre/pcre.lm
@@ -0,0 +1,583 @@
+global Backrefs: int = 0
+
+lex
+ token pre_equals /'='/
+end
+
+token alpha_char
+ / [a-zA-Z] /
+
+token digit_char
+ / [0-9] /
+
+rl alpha_nums
+ / (alpha_char | '_' ) (alpha_char | '_' | digit_char)* /
+
+rl alpha_numeric
+ / 'a'..'z' | 'A'..'Z' | '0'..'9' /
+
+rl alpha_numerics
+ / alpha_numeric+ /
+
+rl hex_digit
+ / '0'..'9' | 'a'..'f' | 'A'..'F' /
+
+literal `| `^
+literal `. `? `+ `*
+literal `{ `}
+
+# It is important that these all go into the same lexical region, so we get a
+# longest-match with no backtracking among these lexical options. Probably need
+# to separate mainline regex from character class regex lexical, but for now
+# they are the same regions.
+lex
+ literal `[
+ token cc_open_caret /"[^"/
+ token cc_open_caret_close /"[^]"/
+ token cc_open_close /"[]"/
+end
+
+literal `]
+literal `( `)
+literal `< `>
+literal `, `: `- `_ `= `!
+literal `# `& `$
+
+token NL
+ / '\r' ? '\n' /
+
+token number
+ /[0-9]+/
+
+# With greedy (default) or lazy (?), we are always attempting all matches. But
+# possessive (+) prunes paths, so it must force the pattern to become a
+# prefilter.
+def quantifier_type
+ [`+]
+| [`?]
+| []
+
+def general_repetition
+ [`{ number `} ]
+| [`{ number comma `} ]
+| [`{ number comma number `} ]
+
+def quantifier
+ [`? quantifier_type] :Question
+| [`* quantifier_type] :Star
+| [`+ quantifier_type] :Plus
+| [general_repetition quantifier_type] :General
+| [] :Base
+
+token sr_R /'R'/
+token sr_P /'P'/
+
+def subroutine_reference
+ [`( `? sr_R `)]
+| [`( `? number `)]
+| [`( `? `+ number `)]
+| [`( `? `- number `)]
+| [`( `? `& name `)]
+| [`( `? sr_P `> name `)]
+| [br_g `< name `>]
+| [br_g `< number `>]
+| [br_g `< `+ number `>]
+| [br_g `< `- number `>]
+| [br_g single_quote name single_quote ]
+| [br_g single_quote number single_quote]
+| [br_g single_quote `+ number single_quote]
+| [br_g single_quote `- number single_quote]
+
+token ns_open /'[[:'/
+
+lex
+ token ns_caret /'^'/
+ token ns_word /alpha_numerics/
+ token ns_close /':]]'/
+end
+
+def posix_named_set
+ [ns_open ns_caret? ns_word ns_close]
+
+token reset_start_match
+ / '\\K' /
+
+def shared_atom
+ [decimal_digit] :DecimalDigit
+| [not_decimal_digit] :NotDecimalDigit
+| [horizontal_white_space] :HorizonalWhiteSpace
+| [not_horizontal_white_space] :NotHorizontalWhiteSpace
+| [not_new_line] :NotNewLine
+| [new_line_sequence] :NewLineSequence
+| [white_space] :WhiteSpace
+| [not_white_space] :NotWhiteSpace
+| [vertical_white_space] :VerticalWhiteSpace
+| [not_vertical_white_space] :NotVerticalWhiteSpace
+| [word_char] :WordChar
+| [not_word_char] :NotWordChar
+| [posix_named_set] :PosixNamedSet
+| [char_with_property] :CharWithProperty
+| [char_without_property] :CharWithoutProperty
+| [control_char] :ControlChar
+
+def shared_literal
+ [octal] :Octal
+| [alpha_char] :AlphaChar
+| [digit_char] :DigitChar
+| [bell_char] :BellChar
+| [escape_char] :EscapeChar
+| [form_feed] :FormFeed
+| [new_line] :NewLine
+| [carriage_ret] :CarriageRet
+| [tab] :Tab
+| [hex_char_fixed] :HexCharFixed
+| [hex_char_var] :HexCharVar
+| [quoted] :Quoted
+| [block_quoted] :BlockQuoted
+| [open_brace] :OpenBrace
+| [close_brace] :CloseBrace
+| [comma] :Comma
+| [hyphen] :Hypen
+| [less_than] :LessThan
+| [greater_than] :GreaterThan
+| [single_quote] :SingleQuote
+| [underscore] :Underscore
+| [colon] :Colon
+| [hash] :Hash
+| [equals] :Equals
+| [exclamation] :Excalmation
+| [ampersand] :Ampersand
+| [other_char_printable] :OtherCharPrintable
+| [other_char_non_printable] :OhterCharNonPrintable
+
+token name
+ / alpha_nums /
+
+token bell_char / '\\a' /
+token escape_char / '\\e' /
+token form_feed / '\\f' /
+token new_line / '\\n' /
+token carriage_ret / '\\r' /
+token tab / '\\t' /
+token control_char
+ / '\\c' ( 0x00 .. 0x7c ) /
+
+token underscore_alpha_numerics
+ / ('_' | alpha_numeric)+ /
+
+rl non_alpha_numeric
+ / ^alpha_numeric /
+
+token quoted
+ /'\\' non_alpha_numeric/
+
+token bs_Q
+ /'\\Q'/
+
+lex
+ # String of non-backslash chars. Or a single backslash.
+ token block_data / ( [^\\]+ ) | '\\' /
+ token block_end /'\\E'/
+end
+
+token block_quoted
+ /bs_Q block_data* block_end/
+
+def hyphen [ `- ]
+def less_than [ `< ]
+def greater_than [ `> ]
+def underscore [ `_ ]
+def colon [ `: ]
+def equals [ `= ]
+def exclamation [ `! ]
+def ampersand [ `& ]
+def hash [ `# ]
+def dollar [ `$ ]
+
+token single_quote
+ / "'" /
+
+token other_char_printable
+ / ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/' /
+
+token other_char_non_printable
+ / ^( 0 .. 127 ) /
+
+token P / 'P' /
+
+def capture_form
+ [`? `< name `> regex] :NamedPerl
+| [`? single_quote name single_quote regex] :NamedQuoted
+| [`? P `< name `> regex] :NamedPython
+| [regex] :Unamed
+
+def capture
+ # This ID is for the ragel implementation. We use the nfa repetition
+ # operator, which needs an id.
+ [`( capture_form `)] :Capture
+ {
+ Backrefs = Backrefs + 1
+ }
+
+def option_spec
+ [Add: option_flags `- Remove: option_flags]
+| [Add: option_flags]
+| [`- Remove: option_flags]
+
+def non_capture
+ [`( `? `: regex `)]
+| [`( `? option_spec `: regex `)]
+| [`( `? `| regex `)]
+| [`( `? `> regex `)]
+
+token non_close_parens
+ / [^)]+ /
+
+def comment
+ [ `( `? `# non_close_parens? `) ]
+
+def option
+ [`( `? option_spec `)]
+| [`( `* no_start_opt `)]
+| [`( `* utf8 `)]
+| [`( `* utf16 `)]
+| [`( `* ucp `)]
+
+def option_flags
+ [option_flag+]
+
+token option_flag / 'i' | 'J' | 'm' | 's' | 'U' | 'x' /
+
+token no_start_opt / 'NO_START_OPT' /
+token utf8 / 'UTF8' /
+token utf16 / 'UTF16' /
+token ucp / 'UCP' /
+
+def look_ahead
+ [`( `? `= regex `)]
+| [`( `? `! regex `)]
+
+def look_behind
+ [`( `? `< `= regex `)]
+| [`( `? `< `! regex `)]
+
+def look_around
+ [look_ahead]
+| [look_behind]
+
+token br_g / '\\g' /
+token br_k / '\\k' /
+
+token maybe_backref / '\\' [1-9] [0-9]* /
+
+lex
+ token maybe_octal /
+ '\\' (
+ [1-3] [0-7] [0-7] |
+ [1-7] [0-7]
+ )
+ /
+
+ token def_octal /
+ '\\' (
+ [0] [0-7] [0-7] |
+ [0] [0-7] |
+ [0]
+ )
+ /
+end
+
+token else_digits / '\\' [0-9]+ /
+
+bool is_backref( Num: str )
+{
+ Num = suffix( Num, 1 )
+ Ref: int = atoi( Num )
+ if ( Ref < 8 || Ref <= Backrefs )
+ return true
+ return false
+}
+
+# Simple disambig between octals and backrefs. Reject octals that can be a
+# backref, as determined by counting the number of captures.
+def octal
+ [maybe_octal] :Maybe
+ {
+ if ( is_backref( $lhs.maybe_octal ) )
+ reject
+ }
+| [def_octal] :Def
+
+def backref
+ [maybe_backref]
+ {
+ if ( !is_backref( $lhs.maybe_backref ) )
+ reject
+ }
+| [br_g number]
+| [br_g `{ number `}]
+| [br_g `{ `- number `}]
+| [br_k `< name `>]
+| [br_k single_quote name single_quote]
+| [br_g `{ name `}]
+| [br_k `{ name `}]
+| [`( `? P `= name `)]
+
+def literal_digits
+ [else_digits]
+
+def cond_ref
+ [number]
+| [`+ number]
+| [`- number]
+| [`< name `>]
+| [single_quote name single_quote]
+| [cond_ref_R number]
+| [cond_ref_R]
+| [cond_ref_R `& name]
+| [cond_ref_DEFINE]
+| [cond_ref_assert]
+| [name]
+
+token cond_ref_DEFINE / 'DEFINE' /
+token cond_ref_assert / 'assert' /
+token cond_ref_R / 'R' /
+
+def cond_false
+ [`| regex ]
+
+def conditional
+ [`( `? `( cond_ref `) regex cond_false? `)]
+
+token btc_accept / 'ACCEPT' /
+token btc_fail / 'F' ( 'AIL' )? /
+token btc_mark_name / ('MARK')? ':NAME' /
+token btc_commit / 'COMMIT' /
+token btc_prune / 'PRUNE' /
+token btc_prune_name / 'PRUNE:NAME)' /
+token btc_skip / 'SKIP' /
+token btc_skip_name / 'SKIP:NAME' /
+token btc_then / 'THEN' /
+token btc_then_name / 'THEN:NAME' /
+
+def btc_type
+ [btc_accept]
+| [btc_fail]
+| [btc_mark_name]
+| [btc_commit]
+| [btc_prune]
+| [btc_prune_name]
+| [btc_skip]
+| [btc_skip_name]
+| [btc_then]
+| [btc_then_name]
+
+def backtrack_control
+ [ `( `* btc_type `) ]
+
+token nlc_cr / 'CR' /
+token nlc_lf / 'LF' /
+token nlc_crlf / 'CRLF' /
+token nlc_anycrlf / 'ANYCRLF' /
+token nlc_any / 'ANY' /
+token nlc_bsr_anycrlf / 'BSR_ANYCRLF' /
+token nlc_bsr_unicodo / 'BSR_UNICODE' /
+
+def nlc_type
+ [nlc_cr]
+| [nlc_lf]
+| [nlc_crlf]
+| [nlc_anycrlf]
+| [nlc_any]
+| [nlc_bsr_anycrlf]
+| [nlc_bsr_unicodo]
+
+def newline_convention
+ [ `( `* nlc_type `) ]
+
+token callout_C / 'C' /
+
+def callout
+ [ `( `? callout_C `) ]
+| [ `( `? callout_C number `) ]
+
+def char_class_start [ `[ ]
+def char_class_end [ `] ]
+def dot [ `. ]
+def caret [ `^ ]
+def question_mark [ `? ]
+def plus [ `+ ]
+def star [ `* ]
+def open_brace [ `{ ]
+def close_brace [ `} ]
+def comma [ `, ]
+def pipe [ `| ]
+def open_paren [ `( ]
+def close_paren [ `) ]
+
+lex
+ token hex_char_fixed
+ / '\\x' hex_digit hex_digit /
+
+ token hex_char_var
+ / '\\x' '{' hex_digit hex_digit hex_digit+ '}' /
+end
+
+#
+# Anchors
+#
+
+token word_boundary / '\\b' /
+token non_word_boundary / '\\B' /
+
+token sos_A
+ / '\\A' /
+
+def start_of_subject
+ [`^]
+| [sos_A]
+
+token eos_z / '\\z' /
+token eos_Z / '\\Z' /
+
+def end_of_subject
+ [`$]
+| [eos_Z]
+| [eos_z]
+
+token first_matching_pos
+ / '\\G' /
+
+def anchor
+ [word_boundary]
+| [non_word_boundary]
+| [start_of_subject]
+| [end_of_subject]
+| [first_matching_pos]
+
+#
+# Character classes
+#
+
+def cc_atom_list
+ [cc_atom cc_atom*]
+
+def character_class
+ [`[ cc_atom_list `]]
+| [cc_open_caret cc_atom_list `]]
+| [cc_open_caret_close cc_atom* `]]
+| [cc_open_close cc_atom* `]]
+| [cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]]
+| [cc_open_close hyphen cc_atom_end_range cc_atom* `]]
+
+def cc_atom_end_range
+ [cc_atom]
+
+def cc_atom
+ [cc_literal hyphen cc_literal]
+| [shared_atom]
+| [cc_literal]
+| [octal]
+
+def cc_literal
+ [shared_literal]
+| [dot]
+| [char_class_start]
+| [caret]
+| [question_mark]
+| [plus]
+| [star]
+| [word_boundary]
+| [non_word_boundary]
+| [dollar]
+| [pipe]
+| [open_paren]
+| [close_paren]
+
+token decimal_digit / '\\d' /
+token not_decimal_digit / '\\D' /
+token horizontal_white_space / '\\h' /
+token not_horizontal_white_space / '\\H' /
+token not_new_line / '\\N' /
+token new_line_sequence / '\\R' /
+token white_space / '\\s' /
+token not_white_space / '\\S' /
+token vertical_white_space / '\\v' /
+token not_vertical_white_space / '\\V' /
+token word_char / '\\w' /
+token not_word_char / '\\W' /
+
+token one_data_unit / '\\C' /
+token extended_unicode_char / '\\X' /
+
+token with_property_open / '\\p' /
+token without_property_open / '\\P' /
+
+def char_with_property
+ [with_property_open `{ underscore_alpha_numerics `}]
+def char_without_property
+ [without_property_open `{ underscore_alpha_numerics `}]
+
+def atom
+ [shared_atom] :SharedAtom
+| [shared_literal] :SharedLiteral
+| [char_class_end] :CharClassEnd
+| [dot] :Dot
+| [character_class] :CharacterClass
+| [capture] :Capture
+| [non_capture] :NonCapture
+| [anchor] :Anchor
+| [look_around] :LookAround
+| [option] :Option
+| [newline_convention] :NewlineConvention
+| [callout] :Callout
+| [reset_start_match] :ResetStartMatch
+| [one_data_unit] :OneDataUnit
+| [extended_unicode_char] :ExtendedUnicodeChar
+| [backtrack_control] :BacktrackControl
+| [backref] :Backref
+| [literal_digits] :LiteralDigits
+| [subroutine_reference] :SubroutineReference
+| [conditional] :Conditional
+| [comment] :Comment
+
+def element
+ [atom quantifier] :Atom
+
+def term
+ [element term] :Element
+| [] :Base
+
+def expr
+ [expr `| term] :Union
+| [term] :Base
+
+def regex
+ [expr] :Expr
+
+def init
+ []
+ {
+ Backrefs = 0
+ }
+
+token unparseable /[^\n]*/
+
+def line
+ [init regex NL] :Regex commit
+| [unparseable NL] :Unparseable commit
+
+def file
+ [line*]
+
+
+parse F: file [stdin]
+
+if !F
+ print "parse error: [error]
+else {
+ for U: unparseable in F
+ print "unparseable: [U]
+ for B: backref in F
+ print "backref: [B]
+}
diff --git a/grammar/pcre/pcre.rl b/grammar/pcre/pcre.rl
new file mode 100644
index 00000000..307d5467
--- /dev/null
+++ b/grammar/pcre/pcre.rl
@@ -0,0 +1,864 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define LIST_APPEND( pphead, element ) \
+ { \
+ element->next = *pphead; \
+ *pphead = element; \
+ }
+
+#define LIST_REMOVE( pphead ) \
+ { \
+ *pphead = (*pphead)->next; \
+ }
+
+#define STACK_PUSH( pphead, element ) \
+ { \
+ element->parent = *pphead; \
+ *pphead = element; \
+ }
+
+#define STACK_POP( pphead ) \
+ { \
+ *pphead = (*pphead)->parent; \
+ }
+
+#define REVERSE_LIST( S, pphead ) \
+ if ( *pphead != 0 ) { \
+ S *front = (*pphead)->next; \
+ (*pphead)->next = 0; \
+ while ( front != NULL ) { \
+ S *next = front->next; \
+ front->next = *pphead; \
+ *pphead = front; \
+ front = next; \
+ } \
+ }
+
+#define MAX_VERB 40
+#define QUANTIFIER_INF -1
+
+struct value
+{
+ int v;
+
+ struct value *next;
+};
+
+enum value_type
+{
+ dot = 256,
+};
+
+enum quantifer_type
+{
+ quant_greedy = 1,
+ quant_lazy,
+ quant_possessive,
+};
+
+struct quantifier
+{
+ int min, max;
+ enum quantifer_type type;
+};
+
+enum element_type
+{
+ element_value_type = 1,
+ element_regex_type,
+ element_look_around_type,
+ element_char_class_type,
+ element_backref_type,
+ element_verb_type,
+};
+
+struct element
+{
+ enum element_type type;
+ struct regex *regex;
+ struct look_around *look_around;
+ struct quantifier *quantifier;
+ struct value *value_list;
+ struct backref *backref;
+ struct verb *verb;
+
+ struct element *next;
+};
+
+struct term
+{
+ struct element *element_list;
+
+ struct term *next;
+ struct term *parent;
+};
+
+enum regex_type
+{
+ regex_non_capture = 1,
+ regex_capture,
+ regex_look_around
+};
+
+#define REGEX_ATOMIC_GROUP 0x01
+#define REGEX_CAPTURE_RESET 0x02
+
+struct regex
+{
+ struct term *term_list;
+
+ enum regex_type type;
+ int capture;
+ int node_id;
+ int options;
+
+ int flags;
+
+ struct regex *next;
+ struct regex *parent;
+};
+
+struct pattern
+{
+ struct regex *regex;
+ int node_id;
+ int is_look_around;
+
+ struct look_around *look_arounds;
+ struct pattern *parent;
+};
+
+enum look_around_type {
+ lat_pos_ahead = 1,
+ lat_neg_ahead,
+ lat_pos_behind,
+ lat_neg_behind,
+};
+
+struct look_around
+{
+ enum look_around_type type;
+ struct pattern *pattern;
+ int node_id;
+
+ struct look_around *next;
+};
+
+struct backref
+{
+ int number;
+};
+
+struct verb
+{
+ char *name;
+ char *value;
+ int number;
+};
+
+struct quantifier *new_quantifier()
+{
+ struct quantifier *quant = malloc( sizeof( struct value ) );
+ memset( quant, 0, sizeof(struct quantifier) );
+ quant->type = quant_greedy;
+ return quant;
+}
+
+struct value *new_value( int v )
+{
+ struct value *value = malloc( sizeof( struct value ) );
+ memset( value, 0, sizeof(struct value) );
+ value->v = v;
+ return value;
+}
+
+struct term *new_term( void )
+{
+ struct term *term = malloc( sizeof( struct term ) );
+ memset( term, 0, sizeof(struct term) );
+ return term;
+}
+
+struct element *new_element( enum element_type type, struct regex *regex, struct look_around *look_around )
+{
+ struct element *element = malloc( sizeof( struct element ) );
+ memset( element, 0, sizeof(struct element) );
+ element->type = type;
+ element->regex = regex;
+ element->look_around = look_around;
+ return element;
+}
+
+struct regex *new_regex( enum regex_type type, int options )
+{
+ struct regex *regex = malloc( sizeof( struct regex ) );
+ memset( regex, 0, sizeof(struct regex) );
+ regex->type = type;
+ regex->options = options;
+ return regex;
+}
+
+struct look_around *new_look_around( enum look_around_type lat, struct pattern *pattern )
+{
+ struct look_around *look_around = malloc( sizeof( struct look_around ) );
+ memset( look_around, 0, sizeof(struct look_around) );
+ look_around->type = lat;
+ look_around->pattern = pattern;
+ return look_around;
+}
+
+struct pattern *new_pattern( struct regex *regex, int is_look_around )
+{
+ struct pattern *pattern = malloc( sizeof( struct pattern ) );
+ pattern->regex = regex;
+ pattern->is_look_around = is_look_around;
+ return pattern;
+}
+
+struct backref *new_backref( int number )
+{
+ struct backref *backref = malloc( sizeof( struct backref ) );
+ memset( backref, 0, sizeof(struct backref) );
+ backref->number = number;
+ return backref;
+}
+
+struct verb *new_verb( char *d, int len )
+{
+ struct verb *verb = malloc( sizeof( struct verb ) );
+ verb->name = malloc( len + 1 );
+ memcpy( verb->name, d, len );
+ verb->name[len] = 0;
+ verb->value = 0;
+ verb->number = -1;
+ return verb;
+}
+
+void append_element_value( struct term *term, int value )
+{
+ struct element *el = term->element_list;
+ if ( el == NULL ||
+ el->type != element_value_type ||
+ el->quantifier != 0 )
+ {
+ el = new_element( element_value_type, NULL, NULL );
+ LIST_APPEND( &term->element_list, el );
+ }
+
+ struct value *v = new_value( value );
+ LIST_APPEND( &el->value_list, v );
+}
+
+void reverse_value_list( struct value **pphead )
+{
+ REVERSE_LIST( struct value, pphead );
+}
+
+void reverse_element_list( struct element **pphead )
+{
+ REVERSE_LIST( struct element, pphead );
+}
+
+void reverse_term_list( struct term **pphead )
+{
+ REVERSE_LIST( struct term, pphead );
+}
+
+int is_backref( int num, int num_closed_captures )
+{
+ if ( num < 8 || num <= num_closed_captures )
+ return 1;
+ return 0;
+}
+
+void append_backref( struct term *term, int number )
+{
+ struct element *el = new_element( element_backref_type, NULL, NULL );
+ el->backref = new_backref( number );
+ LIST_APPEND( &term->element_list, el );
+}
+
+/* Provides initial ragel call state stack. */
+int *init_ragel_stack( int *size )
+{
+ *size = 24;
+ return malloc( *size * sizeof(int) );
+}
+
+/* Grows the ragel call state stack. Use in pre-push. */
+int *grow_ragel_stack( int *size, int *stack )
+{
+ int new_size = *size * 2;
+ int *new_stack = malloc( new_size * sizeof(int) ) ;
+ memcpy( new_stack, stack, *size * sizeof(int) );
+ free( stack );
+
+ *size = new_size;
+ return new_stack;
+}
+
+void apply_quantifier( struct term *term, int min, int max )
+{
+ struct element *el = term->element_list;
+
+ /* We may need to remove the last value from the element's value list. */
+ if ( el->type == element_value_type && el->value_list != NULL && el->value_list->next != NULL ) {
+ /* Take the value off and add it to an element of its own. */
+ struct value *last = el->value_list;
+ LIST_REMOVE( &el->value_list );
+
+ el = new_element( element_value_type, NULL, NULL );
+
+ LIST_APPEND( &el->value_list, last );
+ LIST_APPEND( &term->element_list, el );
+ }
+
+ el->quantifier = new_quantifier();
+ el->quantifier->min = min;
+ el->quantifier->max = max;
+}
+
+void apply_lazy( struct term *term )
+{
+ struct element *el = term->element_list;
+ el->quantifier->type = quant_lazy;
+}
+
+void apply_possessive( struct term *term )
+{
+ struct element *el = term->element_list;
+ el->quantifier->type = quant_possessive;
+}
+
+%%{
+ machine PCRE;
+
+ action enter_term {
+ struct term *term = new_term();
+ STACK_PUSH( &s_term, term );
+ LIST_APPEND( &s_regex->term_list, term );
+ }
+
+ action leave_term {
+ reverse_element_list( &s_term->element_list );
+ struct element *el = s_term->element_list;
+ while ( el != 0 ) {
+ reverse_value_list( &el->value_list );
+ el = el->next;
+ }
+ STACK_POP( &s_term );
+ }
+
+ action enter_regex {
+ struct regex *regex = new_regex( regex_non_capture, s_regex->options );
+ struct element *element = new_element( element_regex_type, regex, NULL );
+ LIST_APPEND( &s_term->element_list, element );
+ STACK_PUSH( &s_regex, regex );
+ }
+
+ action enter_lookaround {
+ struct regex *regex = new_regex( regex_look_around, s_regex->options );
+ struct pattern *pattern = new_pattern( regex, 1 );
+ struct look_around *la = new_look_around( lat, pattern );
+ pattern->node_id = la->node_id;
+
+ LIST_APPEND( &s_pattern->look_arounds, la );
+
+ struct element *element = new_element( element_look_around_type, NULL, la );
+ LIST_APPEND( &s_term->element_list, element );
+
+ STACK_PUSH( &s_regex, regex );
+ STACK_PUSH( &s_pattern, pattern );
+ }
+
+ action leave_regex {
+ if ( s_regex->type == regex_capture )
+ closed_captures += 1;
+
+ if ( s_regex->type == regex_look_around )
+ STACK_POP( &s_pattern );
+
+ reverse_term_list( &s_regex->term_list );
+ STACK_POP( &s_regex );
+ }
+
+ #
+ # Numbers
+ #
+ action init_number
+ { number = 0; }
+
+ action decimal_digit
+ { number = number * 10 + ( *p - '0' ); }
+
+ number = [0-9]+ >init_number $decimal_digit;
+
+ #
+ # Verbs
+ #
+ verb = ( [a-zA-Z_] [a-zA-Z_0-9]* ) >{ verb_len = 0; } ${
+ if ( verb_len < MAX_VERB ) {
+ verb[verb_len++] = *p;
+ }
+ };
+
+ action append_verb {
+ struct element *el = new_element( element_verb_type, NULL, NULL );
+ cur_verb = el->verb = new_verb( verb, verb_len );
+
+ LIST_APPEND( &s_term->element_list, el );
+ }
+
+ action verb_value
+ {
+ cur_verb->value = malloc( verb_len + 1 );
+ memcpy( cur_verb->value, verb, verb_len );
+ cur_verb->value[verb_len] = 0;
+ }
+
+ action verb_number
+ {
+ cur_verb->number = number;
+ }
+
+ verb_forms =
+ verb %append_verb ( ':' verb %verb_value | '=' number %verb_number )?;
+
+ action options_only { fret; }
+
+ options = (
+ 'c' # case-sensitive matching
+ | 'i' # case-insensitive matching
+ )*;
+
+ alpha_char = [a-zA-Z];
+ digit_char = [0-9];
+
+ open_paren = '(';
+ close_paren = ')';
+
+ char_class_start = '[';
+ char_class_end = ']';
+
+ open_curly = '{';
+ close_curly = '}';
+
+ ampersand = '&';
+ colon = ':';
+ comma = ',';
+ dollar = '$';
+ dot = '.';
+ equals = '=';
+ exclamation = '!';
+ greater_than = '>';
+ hash = '#';
+ hyphen = '-';
+ less_than = '<';
+ pipe = '|';
+ single_quote = "'";
+ underscore = '_';
+
+ other_char_printable =
+ ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/';
+
+ other_char_non_printable = ^( 0 .. 127 );
+
+ capture_non_capture =
+ '(' @{ fcall paren_open; };
+
+ literal_sym =
+ comma | hyphen | less_than |
+ greater_than | single_quote | underscore | colon |
+ hash | equals | exclamation | ampersand;
+
+ action append_char {
+ append_element_value( s_term, *p );
+ }
+
+ literal = (
+ alpha_char |
+ digit_char |
+ literal_sym |
+ other_char_printable |
+ other_char_non_printable
+ ) @append_char;
+
+ #
+ # Octals and backrefs. There is an ambiguity between these two, therefore
+ # the grammar handles them together.
+ #
+
+ action init_octal
+ { octal = 0; }
+
+ action octal_digit
+ { octal = octal * 8 + ( *p - '0' ); }
+
+ backref_num = [1-9] [0-9]*;
+
+ octal_num =
+ [0-3] [0-7] [0-7] |
+ [0-7] [0-7] |
+ [0];
+
+ action octal { append_element_value( s_term, octal ); }
+
+ action octal_or_backref
+ {
+ if ( is_backref( number, closed_captures ) )
+ append_backref( s_term, number );
+ else
+ append_element_value( s_term, octal );
+ }
+
+ action backref
+ {
+ if ( number > closed_captures ) {
+ printf( "invalid backref: \\%d\n", number );
+ fbreak;
+ }
+
+ append_backref( s_term, number );
+ }
+
+ # Certainly octal. All octals, with possibly backrefs removed.
+ def_octal =
+ '\\' @init_octal
+ ( octal_num - backref_num ) $octal_digit;
+
+ # All cases that can be either octal or backref and we need to inspect the
+ # backref number to decide. Use the intersection of the two numbers as the
+ # pattern.
+ octal_or_backref =
+ '\\' @init_octal @init_number
+ ( octal_num & backref_num ) $octal_digit $decimal_digit;
+
+ # Definitely backref. The backref pattern excluding anything with a prefix
+ # that could be octal.
+ def_backref =
+ '\\' @init_number
+ ( backref_num - ( octal_num any* ) ) $decimal_digit;
+
+ octal =
+ def_octal %octal |
+ octal_or_backref %octal_or_backref;
+
+ backref =
+ def_backref %backref |
+ octal_or_backref %octal_or_backref;
+
+ atom =
+ literal |
+ char_class_end @append_char |
+ dot @{ append_element_value( s_term, dot ); } |
+ octal |
+ backref |
+ open_paren @{ fcall open_paren_forms; }
+ ;
+
+ non_greedy =
+ '?' @{ apply_lazy( s_term ); } |
+ '+' @{ apply_possessive( s_term ); };
+
+ quant_min = number %{ quant_min = number; };
+ quant_max = number %{ quant_max = number; };
+
+ action quant_zero_plus { apply_quantifier( s_term, 0, QUANTIFIER_INF ); }
+ action quant_one_plus { apply_quantifier( s_term, 1, QUANTIFIER_INF ); }
+ action quant_optional { apply_quantifier( s_term, 0, 1 ); }
+ action quant_exact { apply_quantifier( s_term, quant_min, quant_min ); }
+ action quant_min { apply_quantifier( s_term, quant_min, QUANTIFIER_INF ); }
+ action quant_min_max { apply_quantifier( s_term, quant_min, quant_max ); }
+
+ quant_forms =
+ '*' @quant_zero_plus |
+ '+' @quant_one_plus |
+ '?' @quant_optional |
+ open_curly quant_min close_curly @quant_exact |
+ open_curly quant_min ',' close_curly @quant_min |
+ open_curly quant_min ',' quant_max close_curly @quant_min_max;
+
+ quantifier = quant_forms non_greedy?;
+
+ element = ( atom quantifier? );
+
+ term = ( element** ) >enter_term;
+
+ #
+ # Expression
+ #
+ expr = term ( '|' @leave_term term )*;
+
+ #
+ # Regex
+ #
+ regex = expr;
+
+
+ open_paren_forms :=
+ # Look at the first few charcters to see what the form is. What we
+ # handle here:
+ # (re) capturing parens
+ # (?:re) non-capturing parens
+ # (?=re) positive lookahead
+ # (?!re) negative lookahead
+ # (?<=re) positive lookbehind
+ # (?<!re) negative lookbehind
+ (
+ # Non-capture.
+ '?' options (
+ ')' @options_only |
+ ':' @enter_regex
+ ) |
+
+ # Atomic group.
+ '?>' @enter_regex
+ @{ s_regex->flags |= REGEX_ATOMIC_GROUP; } |
+
+ # Capture number reset on branch.
+ '?|' @enter_regex
+ @{ s_regex->flags |= REGEX_CAPTURE_RESET; } |
+
+ # Lookaround
+ (
+ '?=' @{ lat = lat_pos_ahead; } @enter_lookaround |
+ '?!' @{ lat = lat_neg_ahead; } @enter_lookaround |
+ '?<=' @{ lat = lat_pos_behind; } @enter_lookaround |
+ '?<!' @{ lat = lat_neg_behind; } @enter_lookaround
+ ) |
+
+ # Verbs
+ '*' verb_forms ')' @{fret;} |
+
+ # Catpuring.
+ ^( '?' | '*' ) @enter_regex @{
+ s_regex->type = regex_capture;
+ s_regex->capture = next_capture++;
+ fhold;
+ }
+
+ )
+ regex ')' @leave_term @leave_regex @{ fret; };
+
+
+ main := regex '\n' @leave_term @leave_regex @{ success = 1; };
+}%%
+
+%% write data;
+
+int pcre_parse( struct pattern **result_pattern, char *line, int len )
+{
+ int *stack;
+ int stack_size;
+ int cs, top;
+ char *p, *pe;
+ int success = 0;
+ int result = 1;
+
+ int number = 0;
+ int octal = 0;
+ int quant_min, quant_max;
+
+ struct pattern *s_pattern = NULL;
+ struct regex *s_regex = NULL;
+ struct term *s_term = NULL;
+
+ /* Collect into this. */
+ struct regex *root_regex = new_regex( regex_non_capture, 0 );
+ struct pattern *root_pattern = new_pattern( root_regex, 0 );
+ STACK_PUSH( &s_pattern, root_pattern );
+ STACK_PUSH( &s_regex, root_regex );
+
+ int closed_captures = 0;
+ int next_capture = 1;
+
+ enum look_around_type lat;
+
+ int verb_len;
+ char verb[MAX_VERB];
+ struct verb *cur_verb;
+
+ stack = init_ragel_stack( &stack_size );
+
+ %% write init;
+
+ p = line;
+ pe = line + len;
+
+ %% write exec;
+
+ if ( !success ) {
+ printf( "parse error at %ld\n", ( p - line + 1) );
+ result = 0;
+ }
+ else if (
+ s_pattern == NULL || s_pattern->parent != NULL ||
+ s_regex != NULL ||
+ s_term != NULL )
+ {
+ printf( "parse error: items not closed\n" );
+ result = 0;
+ }
+
+ if ( result ) {
+ *result_pattern = root_pattern;
+ }
+ else {
+
+ }
+
+ free( stack );
+
+ return result;
+}
+
+extern void print_value( int indent, struct value *value );
+extern void print_element( int indent, struct element *el );
+extern void print_term( int indent, struct term *term );
+extern void print_regex( int indent, struct regex *regex );
+
+void print_indent( int indent )
+{
+ while ( indent-- > 0 )
+ printf( " " );
+}
+
+void print_value( int indent, struct value *value )
+{
+ printf( "v(%d)", value->v );
+}
+
+void print_element( int indent, struct element *el )
+{
+ print_indent( indent );
+ printf( "el: " );
+ if ( el->type == element_value_type ) {
+ struct value *value = el->value_list;
+ while ( value != NULL ) {
+ print_value( indent + 1, value );
+ if ( value->next != 0 )
+ printf( " . " );
+ value = value->next;
+ }
+ }
+ else if ( el->type == element_regex_type ) {
+ if ( el->regex != 0 )
+ print_regex( indent, el->regex );
+ }
+ else if ( el->type == element_look_around_type ) {
+ printf( "lookaround " );
+ switch ( el->look_around->type ) {
+ case lat_pos_ahead: printf( "=" ); break;
+ case lat_neg_ahead: printf( "!" ); break;
+ case lat_pos_behind: printf( "<=" ); break;
+ case lat_neg_behind: printf( "<!" ); break;
+ }
+ printf( " " );
+ print_regex( indent, el->look_around->pattern->regex );
+ }
+ else if ( el->type == element_char_class_type )
+ printf( "cc" );
+ else if ( el->type == element_backref_type )
+ printf( "backref(%d)", el->backref->number );
+ else if ( el->type == element_verb_type ) {
+ printf( "verb(%s", el->verb->name );
+ if ( el->verb->value != NULL )
+ printf( ":%s", el->verb->value );
+ else if ( el->verb->number >= 0 )
+ printf( "=%d", el->verb->number );
+ printf( ")" );
+ }
+
+ if ( el->quantifier != NULL ) {
+ printf( " {%d,", el->quantifier->min );
+
+ if ( el->quantifier->max == QUANTIFIER_INF )
+ printf( "inf" );
+ else
+ printf( "%d", el->quantifier->max );
+
+ printf( "}" );
+
+ if ( el->quantifier->type == quant_lazy )
+ printf( "?" );
+ else if ( el->quantifier->type == quant_possessive )
+ printf( "+" );
+
+ }
+
+ printf("\n");
+}
+
+void print_term( int indent, struct term *term )
+{
+ print_indent( indent );
+ printf( "term:\n" );
+ struct element *el = term->element_list;
+ while ( el != NULL ) {
+ print_element( indent + 1, el );
+ el = el->next;
+ }
+}
+
+void print_regex( int indent, struct regex *regex )
+{
+ printf( "reg: " );
+ if ( regex->type == regex_capture )
+ printf( "cap(%d) ", regex->capture );
+ if ( regex->flags & REGEX_ATOMIC_GROUP )
+ printf( "atomic_group " );
+ if ( regex->flags & REGEX_CAPTURE_RESET )
+ printf( "capture_reset " );
+ printf( "(\n" );
+
+ struct term *term = regex->term_list;
+ while ( term != NULL ) {
+ print_term( indent + 1, term );
+ term = term->next;
+ }
+ print_indent( indent );
+ printf( ")" );
+}
+
+void print_pattern( int indent, struct pattern *pat )
+{
+ printf( "pat: " );
+ print_regex( indent, pat->regex );
+ printf( "\n" );
+}
+
+char linebuf[2048];
+
+int main( int argc, char **argv )
+{
+ if ( argc < 2 ) {
+ fprintf( stderr, "usage: ./pcre <regex>\n" );
+ return -1;
+ }
+
+ FILE *input = fopen( argv[1], "r" );
+ if ( input == NULL ) {
+ fprintf( stderr, "failed to open %s: %s", argv[0], strerror(errno) );
+ return -1;
+ }
+
+ char *line = NULL;
+ size_t n = 0;
+ ssize_t read;
+
+ while ( (read = getline( &line, &n, input )) != -1 ) {
+ struct pattern *pat;
+ int parsed = pcre_parse( &pat, line, (int)read );
+ if ( parsed ) {
+ print_pattern( 0, pat );
+ }
+ }
+
+ fclose( input );
+ free( line );
+ return 0;
+}
+