diff options
author | Adrian Thurston <thurston@colm.net> | 2019-12-01 09:05:03 -0800 |
---|---|---|
committer | Adrian Thurston <thurston@colm.net> | 2019-12-01 09:05:03 -0800 |
commit | 5a4407de2c4c03083f1c25b5c7fdb849cdc08a12 (patch) | |
tree | 4436efeb998fd54de016723d92d74c400d48e070 /grammar/pcre | |
parent | 479376c313208ee764d212846892cb904b8e5b89 (diff) | |
download | colm-5a4407de2c4c03083f1c25b5c7fdb849cdc08a12.tar.gz |
moved grammars out to their own dirs
We also want some testing/validation support for each grammar. This dir will
get quite messy if we move files.
Diffstat (limited to 'grammar/pcre')
-rw-r--r-- | grammar/pcre/.gitignore | 4 | ||||
-rw-r--r-- | grammar/pcre/Makefile | 11 | ||||
-rw-r--r-- | grammar/pcre/pcre.lm | 583 | ||||
-rw-r--r-- | grammar/pcre/pcre.rl | 864 |
4 files changed, 1462 insertions, 0 deletions
diff --git a/grammar/pcre/.gitignore b/grammar/pcre/.gitignore new file mode 100644 index 00000000..05b136b8 --- /dev/null +++ b/grammar/pcre/.gitignore @@ -0,0 +1,4 @@ +/pcre-colm.c +/pcre-colm +/pcre-ragel.c +/pcre-ragel diff --git a/grammar/pcre/Makefile b/grammar/pcre/Makefile new file mode 100644 index 00000000..6f895485 --- /dev/null +++ b/grammar/pcre/Makefile @@ -0,0 +1,11 @@ +COLM = ../../colm/colm +RAGEL = ../../ragel/ragel + +all: pcre-colm pcre-ragel + +pcre-colm: pcre.lm + $(COLM) -o $@ pcre.lm + +pcre-ragel: pcre.rl $(RAGEL) + $(RAGEL) -G2 pcre.rl + gcc -g -Wall -o $@ pcre.c diff --git a/grammar/pcre/pcre.lm b/grammar/pcre/pcre.lm new file mode 100644 index 00000000..691b1307 --- /dev/null +++ b/grammar/pcre/pcre.lm @@ -0,0 +1,583 @@ +global Backrefs: int = 0 + +lex + token pre_equals /'='/ +end + +token alpha_char + / [a-zA-Z] / + +token digit_char + / [0-9] / + +rl alpha_nums + / (alpha_char | '_' ) (alpha_char | '_' | digit_char)* / + +rl alpha_numeric + / 'a'..'z' | 'A'..'Z' | '0'..'9' / + +rl alpha_numerics + / alpha_numeric+ / + +rl hex_digit + / '0'..'9' | 'a'..'f' | 'A'..'F' / + +literal `| `^ +literal `. `? `+ `* +literal `{ `} + +# It is important that these all go into the same lexical region, so we get a +# longest-match with no backtracking among these lexical options. Probably need +# to separate mainline regex from character class regex lexical, but for now +# they are the same regions. +lex + literal `[ + token cc_open_caret /"[^"/ + token cc_open_caret_close /"[^]"/ + token cc_open_close /"[]"/ +end + +literal `] +literal `( `) +literal `< `> +literal `, `: `- `_ `= `! +literal `# `& `$ + +token NL + / '\r' ? '\n' / + +token number + /[0-9]+/ + +# With greedy (default) or lazy (?), we are always attempting all matches. But +# possessive (+) prunes paths, so it must force the pattern to become a +# prefilter. +def quantifier_type + [`+] +| [`?] +| [] + +def general_repetition + [`{ number `} ] +| [`{ number comma `} ] +| [`{ number comma number `} ] + +def quantifier + [`? quantifier_type] :Question +| [`* quantifier_type] :Star +| [`+ quantifier_type] :Plus +| [general_repetition quantifier_type] :General +| [] :Base + +token sr_R /'R'/ +token sr_P /'P'/ + +def subroutine_reference + [`( `? sr_R `)] +| [`( `? number `)] +| [`( `? `+ number `)] +| [`( `? `- number `)] +| [`( `? `& name `)] +| [`( `? sr_P `> name `)] +| [br_g `< name `>] +| [br_g `< number `>] +| [br_g `< `+ number `>] +| [br_g `< `- number `>] +| [br_g single_quote name single_quote ] +| [br_g single_quote number single_quote] +| [br_g single_quote `+ number single_quote] +| [br_g single_quote `- number single_quote] + +token ns_open /'[[:'/ + +lex + token ns_caret /'^'/ + token ns_word /alpha_numerics/ + token ns_close /':]]'/ +end + +def posix_named_set + [ns_open ns_caret? ns_word ns_close] + +token reset_start_match + / '\\K' / + +def shared_atom + [decimal_digit] :DecimalDigit +| [not_decimal_digit] :NotDecimalDigit +| [horizontal_white_space] :HorizonalWhiteSpace +| [not_horizontal_white_space] :NotHorizontalWhiteSpace +| [not_new_line] :NotNewLine +| [new_line_sequence] :NewLineSequence +| [white_space] :WhiteSpace +| [not_white_space] :NotWhiteSpace +| [vertical_white_space] :VerticalWhiteSpace +| [not_vertical_white_space] :NotVerticalWhiteSpace +| [word_char] :WordChar +| [not_word_char] :NotWordChar +| [posix_named_set] :PosixNamedSet +| [char_with_property] :CharWithProperty +| [char_without_property] :CharWithoutProperty +| [control_char] :ControlChar + +def shared_literal + [octal] :Octal +| [alpha_char] :AlphaChar +| [digit_char] :DigitChar +| [bell_char] :BellChar +| [escape_char] :EscapeChar +| [form_feed] :FormFeed +| [new_line] :NewLine +| [carriage_ret] :CarriageRet +| [tab] :Tab +| [hex_char_fixed] :HexCharFixed +| [hex_char_var] :HexCharVar +| [quoted] :Quoted +| [block_quoted] :BlockQuoted +| [open_brace] :OpenBrace +| [close_brace] :CloseBrace +| [comma] :Comma +| [hyphen] :Hypen +| [less_than] :LessThan +| [greater_than] :GreaterThan +| [single_quote] :SingleQuote +| [underscore] :Underscore +| [colon] :Colon +| [hash] :Hash +| [equals] :Equals +| [exclamation] :Excalmation +| [ampersand] :Ampersand +| [other_char_printable] :OtherCharPrintable +| [other_char_non_printable] :OhterCharNonPrintable + +token name + / alpha_nums / + +token bell_char / '\\a' / +token escape_char / '\\e' / +token form_feed / '\\f' / +token new_line / '\\n' / +token carriage_ret / '\\r' / +token tab / '\\t' / +token control_char + / '\\c' ( 0x00 .. 0x7c ) / + +token underscore_alpha_numerics + / ('_' | alpha_numeric)+ / + +rl non_alpha_numeric + / ^alpha_numeric / + +token quoted + /'\\' non_alpha_numeric/ + +token bs_Q + /'\\Q'/ + +lex + # String of non-backslash chars. Or a single backslash. + token block_data / ( [^\\]+ ) | '\\' / + token block_end /'\\E'/ +end + +token block_quoted + /bs_Q block_data* block_end/ + +def hyphen [ `- ] +def less_than [ `< ] +def greater_than [ `> ] +def underscore [ `_ ] +def colon [ `: ] +def equals [ `= ] +def exclamation [ `! ] +def ampersand [ `& ] +def hash [ `# ] +def dollar [ `$ ] + +token single_quote + / "'" / + +token other_char_printable + / ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/' / + +token other_char_non_printable + / ^( 0 .. 127 ) / + +token P / 'P' / + +def capture_form + [`? `< name `> regex] :NamedPerl +| [`? single_quote name single_quote regex] :NamedQuoted +| [`? P `< name `> regex] :NamedPython +| [regex] :Unamed + +def capture + # This ID is for the ragel implementation. We use the nfa repetition + # operator, which needs an id. + [`( capture_form `)] :Capture + { + Backrefs = Backrefs + 1 + } + +def option_spec + [Add: option_flags `- Remove: option_flags] +| [Add: option_flags] +| [`- Remove: option_flags] + +def non_capture + [`( `? `: regex `)] +| [`( `? option_spec `: regex `)] +| [`( `? `| regex `)] +| [`( `? `> regex `)] + +token non_close_parens + / [^)]+ / + +def comment + [ `( `? `# non_close_parens? `) ] + +def option + [`( `? option_spec `)] +| [`( `* no_start_opt `)] +| [`( `* utf8 `)] +| [`( `* utf16 `)] +| [`( `* ucp `)] + +def option_flags + [option_flag+] + +token option_flag / 'i' | 'J' | 'm' | 's' | 'U' | 'x' / + +token no_start_opt / 'NO_START_OPT' / +token utf8 / 'UTF8' / +token utf16 / 'UTF16' / +token ucp / 'UCP' / + +def look_ahead + [`( `? `= regex `)] +| [`( `? `! regex `)] + +def look_behind + [`( `? `< `= regex `)] +| [`( `? `< `! regex `)] + +def look_around + [look_ahead] +| [look_behind] + +token br_g / '\\g' / +token br_k / '\\k' / + +token maybe_backref / '\\' [1-9] [0-9]* / + +lex + token maybe_octal / + '\\' ( + [1-3] [0-7] [0-7] | + [1-7] [0-7] + ) + / + + token def_octal / + '\\' ( + [0] [0-7] [0-7] | + [0] [0-7] | + [0] + ) + / +end + +token else_digits / '\\' [0-9]+ / + +bool is_backref( Num: str ) +{ + Num = suffix( Num, 1 ) + Ref: int = atoi( Num ) + if ( Ref < 8 || Ref <= Backrefs ) + return true + return false +} + +# Simple disambig between octals and backrefs. Reject octals that can be a +# backref, as determined by counting the number of captures. +def octal + [maybe_octal] :Maybe + { + if ( is_backref( $lhs.maybe_octal ) ) + reject + } +| [def_octal] :Def + +def backref + [maybe_backref] + { + if ( !is_backref( $lhs.maybe_backref ) ) + reject + } +| [br_g number] +| [br_g `{ number `}] +| [br_g `{ `- number `}] +| [br_k `< name `>] +| [br_k single_quote name single_quote] +| [br_g `{ name `}] +| [br_k `{ name `}] +| [`( `? P `= name `)] + +def literal_digits + [else_digits] + +def cond_ref + [number] +| [`+ number] +| [`- number] +| [`< name `>] +| [single_quote name single_quote] +| [cond_ref_R number] +| [cond_ref_R] +| [cond_ref_R `& name] +| [cond_ref_DEFINE] +| [cond_ref_assert] +| [name] + +token cond_ref_DEFINE / 'DEFINE' / +token cond_ref_assert / 'assert' / +token cond_ref_R / 'R' / + +def cond_false + [`| regex ] + +def conditional + [`( `? `( cond_ref `) regex cond_false? `)] + +token btc_accept / 'ACCEPT' / +token btc_fail / 'F' ( 'AIL' )? / +token btc_mark_name / ('MARK')? ':NAME' / +token btc_commit / 'COMMIT' / +token btc_prune / 'PRUNE' / +token btc_prune_name / 'PRUNE:NAME)' / +token btc_skip / 'SKIP' / +token btc_skip_name / 'SKIP:NAME' / +token btc_then / 'THEN' / +token btc_then_name / 'THEN:NAME' / + +def btc_type + [btc_accept] +| [btc_fail] +| [btc_mark_name] +| [btc_commit] +| [btc_prune] +| [btc_prune_name] +| [btc_skip] +| [btc_skip_name] +| [btc_then] +| [btc_then_name] + +def backtrack_control + [ `( `* btc_type `) ] + +token nlc_cr / 'CR' / +token nlc_lf / 'LF' / +token nlc_crlf / 'CRLF' / +token nlc_anycrlf / 'ANYCRLF' / +token nlc_any / 'ANY' / +token nlc_bsr_anycrlf / 'BSR_ANYCRLF' / +token nlc_bsr_unicodo / 'BSR_UNICODE' / + +def nlc_type + [nlc_cr] +| [nlc_lf] +| [nlc_crlf] +| [nlc_anycrlf] +| [nlc_any] +| [nlc_bsr_anycrlf] +| [nlc_bsr_unicodo] + +def newline_convention + [ `( `* nlc_type `) ] + +token callout_C / 'C' / + +def callout + [ `( `? callout_C `) ] +| [ `( `? callout_C number `) ] + +def char_class_start [ `[ ] +def char_class_end [ `] ] +def dot [ `. ] +def caret [ `^ ] +def question_mark [ `? ] +def plus [ `+ ] +def star [ `* ] +def open_brace [ `{ ] +def close_brace [ `} ] +def comma [ `, ] +def pipe [ `| ] +def open_paren [ `( ] +def close_paren [ `) ] + +lex + token hex_char_fixed + / '\\x' hex_digit hex_digit / + + token hex_char_var + / '\\x' '{' hex_digit hex_digit hex_digit+ '}' / +end + +# +# Anchors +# + +token word_boundary / '\\b' / +token non_word_boundary / '\\B' / + +token sos_A + / '\\A' / + +def start_of_subject + [`^] +| [sos_A] + +token eos_z / '\\z' / +token eos_Z / '\\Z' / + +def end_of_subject + [`$] +| [eos_Z] +| [eos_z] + +token first_matching_pos + / '\\G' / + +def anchor + [word_boundary] +| [non_word_boundary] +| [start_of_subject] +| [end_of_subject] +| [first_matching_pos] + +# +# Character classes +# + +def cc_atom_list + [cc_atom cc_atom*] + +def character_class + [`[ cc_atom_list `]] +| [cc_open_caret cc_atom_list `]] +| [cc_open_caret_close cc_atom* `]] +| [cc_open_close cc_atom* `]] +| [cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]] +| [cc_open_close hyphen cc_atom_end_range cc_atom* `]] + +def cc_atom_end_range + [cc_atom] + +def cc_atom + [cc_literal hyphen cc_literal] +| [shared_atom] +| [cc_literal] +| [octal] + +def cc_literal + [shared_literal] +| [dot] +| [char_class_start] +| [caret] +| [question_mark] +| [plus] +| [star] +| [word_boundary] +| [non_word_boundary] +| [dollar] +| [pipe] +| [open_paren] +| [close_paren] + +token decimal_digit / '\\d' / +token not_decimal_digit / '\\D' / +token horizontal_white_space / '\\h' / +token not_horizontal_white_space / '\\H' / +token not_new_line / '\\N' / +token new_line_sequence / '\\R' / +token white_space / '\\s' / +token not_white_space / '\\S' / +token vertical_white_space / '\\v' / +token not_vertical_white_space / '\\V' / +token word_char / '\\w' / +token not_word_char / '\\W' / + +token one_data_unit / '\\C' / +token extended_unicode_char / '\\X' / + +token with_property_open / '\\p' / +token without_property_open / '\\P' / + +def char_with_property + [with_property_open `{ underscore_alpha_numerics `}] +def char_without_property + [without_property_open `{ underscore_alpha_numerics `}] + +def atom + [shared_atom] :SharedAtom +| [shared_literal] :SharedLiteral +| [char_class_end] :CharClassEnd +| [dot] :Dot +| [character_class] :CharacterClass +| [capture] :Capture +| [non_capture] :NonCapture +| [anchor] :Anchor +| [look_around] :LookAround +| [option] :Option +| [newline_convention] :NewlineConvention +| [callout] :Callout +| [reset_start_match] :ResetStartMatch +| [one_data_unit] :OneDataUnit +| [extended_unicode_char] :ExtendedUnicodeChar +| [backtrack_control] :BacktrackControl +| [backref] :Backref +| [literal_digits] :LiteralDigits +| [subroutine_reference] :SubroutineReference +| [conditional] :Conditional +| [comment] :Comment + +def element + [atom quantifier] :Atom + +def term + [element term] :Element +| [] :Base + +def expr + [expr `| term] :Union +| [term] :Base + +def regex + [expr] :Expr + +def init + [] + { + Backrefs = 0 + } + +token unparseable /[^\n]*/ + +def line + [init regex NL] :Regex commit +| [unparseable NL] :Unparseable commit + +def file + [line*] + + +parse F: file [stdin] + +if !F + print "parse error: [error] +else { + for U: unparseable in F + print "unparseable: [U] + for B: backref in F + print "backref: [B] +} diff --git a/grammar/pcre/pcre.rl b/grammar/pcre/pcre.rl new file mode 100644 index 00000000..307d5467 --- /dev/null +++ b/grammar/pcre/pcre.rl @@ -0,0 +1,864 @@ +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> + +#define LIST_APPEND( pphead, element ) \ + { \ + element->next = *pphead; \ + *pphead = element; \ + } + +#define LIST_REMOVE( pphead ) \ + { \ + *pphead = (*pphead)->next; \ + } + +#define STACK_PUSH( pphead, element ) \ + { \ + element->parent = *pphead; \ + *pphead = element; \ + } + +#define STACK_POP( pphead ) \ + { \ + *pphead = (*pphead)->parent; \ + } + +#define REVERSE_LIST( S, pphead ) \ + if ( *pphead != 0 ) { \ + S *front = (*pphead)->next; \ + (*pphead)->next = 0; \ + while ( front != NULL ) { \ + S *next = front->next; \ + front->next = *pphead; \ + *pphead = front; \ + front = next; \ + } \ + } + +#define MAX_VERB 40 +#define QUANTIFIER_INF -1 + +struct value +{ + int v; + + struct value *next; +}; + +enum value_type +{ + dot = 256, +}; + +enum quantifer_type +{ + quant_greedy = 1, + quant_lazy, + quant_possessive, +}; + +struct quantifier +{ + int min, max; + enum quantifer_type type; +}; + +enum element_type +{ + element_value_type = 1, + element_regex_type, + element_look_around_type, + element_char_class_type, + element_backref_type, + element_verb_type, +}; + +struct element +{ + enum element_type type; + struct regex *regex; + struct look_around *look_around; + struct quantifier *quantifier; + struct value *value_list; + struct backref *backref; + struct verb *verb; + + struct element *next; +}; + +struct term +{ + struct element *element_list; + + struct term *next; + struct term *parent; +}; + +enum regex_type +{ + regex_non_capture = 1, + regex_capture, + regex_look_around +}; + +#define REGEX_ATOMIC_GROUP 0x01 +#define REGEX_CAPTURE_RESET 0x02 + +struct regex +{ + struct term *term_list; + + enum regex_type type; + int capture; + int node_id; + int options; + + int flags; + + struct regex *next; + struct regex *parent; +}; + +struct pattern +{ + struct regex *regex; + int node_id; + int is_look_around; + + struct look_around *look_arounds; + struct pattern *parent; +}; + +enum look_around_type { + lat_pos_ahead = 1, + lat_neg_ahead, + lat_pos_behind, + lat_neg_behind, +}; + +struct look_around +{ + enum look_around_type type; + struct pattern *pattern; + int node_id; + + struct look_around *next; +}; + +struct backref +{ + int number; +}; + +struct verb +{ + char *name; + char *value; + int number; +}; + +struct quantifier *new_quantifier() +{ + struct quantifier *quant = malloc( sizeof( struct value ) ); + memset( quant, 0, sizeof(struct quantifier) ); + quant->type = quant_greedy; + return quant; +} + +struct value *new_value( int v ) +{ + struct value *value = malloc( sizeof( struct value ) ); + memset( value, 0, sizeof(struct value) ); + value->v = v; + return value; +} + +struct term *new_term( void ) +{ + struct term *term = malloc( sizeof( struct term ) ); + memset( term, 0, sizeof(struct term) ); + return term; +} + +struct element *new_element( enum element_type type, struct regex *regex, struct look_around *look_around ) +{ + struct element *element = malloc( sizeof( struct element ) ); + memset( element, 0, sizeof(struct element) ); + element->type = type; + element->regex = regex; + element->look_around = look_around; + return element; +} + +struct regex *new_regex( enum regex_type type, int options ) +{ + struct regex *regex = malloc( sizeof( struct regex ) ); + memset( regex, 0, sizeof(struct regex) ); + regex->type = type; + regex->options = options; + return regex; +} + +struct look_around *new_look_around( enum look_around_type lat, struct pattern *pattern ) +{ + struct look_around *look_around = malloc( sizeof( struct look_around ) ); + memset( look_around, 0, sizeof(struct look_around) ); + look_around->type = lat; + look_around->pattern = pattern; + return look_around; +} + +struct pattern *new_pattern( struct regex *regex, int is_look_around ) +{ + struct pattern *pattern = malloc( sizeof( struct pattern ) ); + pattern->regex = regex; + pattern->is_look_around = is_look_around; + return pattern; +} + +struct backref *new_backref( int number ) +{ + struct backref *backref = malloc( sizeof( struct backref ) ); + memset( backref, 0, sizeof(struct backref) ); + backref->number = number; + return backref; +} + +struct verb *new_verb( char *d, int len ) +{ + struct verb *verb = malloc( sizeof( struct verb ) ); + verb->name = malloc( len + 1 ); + memcpy( verb->name, d, len ); + verb->name[len] = 0; + verb->value = 0; + verb->number = -1; + return verb; +} + +void append_element_value( struct term *term, int value ) +{ + struct element *el = term->element_list; + if ( el == NULL || + el->type != element_value_type || + el->quantifier != 0 ) + { + el = new_element( element_value_type, NULL, NULL ); + LIST_APPEND( &term->element_list, el ); + } + + struct value *v = new_value( value ); + LIST_APPEND( &el->value_list, v ); +} + +void reverse_value_list( struct value **pphead ) +{ + REVERSE_LIST( struct value, pphead ); +} + +void reverse_element_list( struct element **pphead ) +{ + REVERSE_LIST( struct element, pphead ); +} + +void reverse_term_list( struct term **pphead ) +{ + REVERSE_LIST( struct term, pphead ); +} + +int is_backref( int num, int num_closed_captures ) +{ + if ( num < 8 || num <= num_closed_captures ) + return 1; + return 0; +} + +void append_backref( struct term *term, int number ) +{ + struct element *el = new_element( element_backref_type, NULL, NULL ); + el->backref = new_backref( number ); + LIST_APPEND( &term->element_list, el ); +} + +/* Provides initial ragel call state stack. */ +int *init_ragel_stack( int *size ) +{ + *size = 24; + return malloc( *size * sizeof(int) ); +} + +/* Grows the ragel call state stack. Use in pre-push. */ +int *grow_ragel_stack( int *size, int *stack ) +{ + int new_size = *size * 2; + int *new_stack = malloc( new_size * sizeof(int) ) ; + memcpy( new_stack, stack, *size * sizeof(int) ); + free( stack ); + + *size = new_size; + return new_stack; +} + +void apply_quantifier( struct term *term, int min, int max ) +{ + struct element *el = term->element_list; + + /* We may need to remove the last value from the element's value list. */ + if ( el->type == element_value_type && el->value_list != NULL && el->value_list->next != NULL ) { + /* Take the value off and add it to an element of its own. */ + struct value *last = el->value_list; + LIST_REMOVE( &el->value_list ); + + el = new_element( element_value_type, NULL, NULL ); + + LIST_APPEND( &el->value_list, last ); + LIST_APPEND( &term->element_list, el ); + } + + el->quantifier = new_quantifier(); + el->quantifier->min = min; + el->quantifier->max = max; +} + +void apply_lazy( struct term *term ) +{ + struct element *el = term->element_list; + el->quantifier->type = quant_lazy; +} + +void apply_possessive( struct term *term ) +{ + struct element *el = term->element_list; + el->quantifier->type = quant_possessive; +} + +%%{ + machine PCRE; + + action enter_term { + struct term *term = new_term(); + STACK_PUSH( &s_term, term ); + LIST_APPEND( &s_regex->term_list, term ); + } + + action leave_term { + reverse_element_list( &s_term->element_list ); + struct element *el = s_term->element_list; + while ( el != 0 ) { + reverse_value_list( &el->value_list ); + el = el->next; + } + STACK_POP( &s_term ); + } + + action enter_regex { + struct regex *regex = new_regex( regex_non_capture, s_regex->options ); + struct element *element = new_element( element_regex_type, regex, NULL ); + LIST_APPEND( &s_term->element_list, element ); + STACK_PUSH( &s_regex, regex ); + } + + action enter_lookaround { + struct regex *regex = new_regex( regex_look_around, s_regex->options ); + struct pattern *pattern = new_pattern( regex, 1 ); + struct look_around *la = new_look_around( lat, pattern ); + pattern->node_id = la->node_id; + + LIST_APPEND( &s_pattern->look_arounds, la ); + + struct element *element = new_element( element_look_around_type, NULL, la ); + LIST_APPEND( &s_term->element_list, element ); + + STACK_PUSH( &s_regex, regex ); + STACK_PUSH( &s_pattern, pattern ); + } + + action leave_regex { + if ( s_regex->type == regex_capture ) + closed_captures += 1; + + if ( s_regex->type == regex_look_around ) + STACK_POP( &s_pattern ); + + reverse_term_list( &s_regex->term_list ); + STACK_POP( &s_regex ); + } + + # + # Numbers + # + action init_number + { number = 0; } + + action decimal_digit + { number = number * 10 + ( *p - '0' ); } + + number = [0-9]+ >init_number $decimal_digit; + + # + # Verbs + # + verb = ( [a-zA-Z_] [a-zA-Z_0-9]* ) >{ verb_len = 0; } ${ + if ( verb_len < MAX_VERB ) { + verb[verb_len++] = *p; + } + }; + + action append_verb { + struct element *el = new_element( element_verb_type, NULL, NULL ); + cur_verb = el->verb = new_verb( verb, verb_len ); + + LIST_APPEND( &s_term->element_list, el ); + } + + action verb_value + { + cur_verb->value = malloc( verb_len + 1 ); + memcpy( cur_verb->value, verb, verb_len ); + cur_verb->value[verb_len] = 0; + } + + action verb_number + { + cur_verb->number = number; + } + + verb_forms = + verb %append_verb ( ':' verb %verb_value | '=' number %verb_number )?; + + action options_only { fret; } + + options = ( + 'c' # case-sensitive matching + | 'i' # case-insensitive matching + )*; + + alpha_char = [a-zA-Z]; + digit_char = [0-9]; + + open_paren = '('; + close_paren = ')'; + + char_class_start = '['; + char_class_end = ']'; + + open_curly = '{'; + close_curly = '}'; + + ampersand = '&'; + colon = ':'; + comma = ','; + dollar = '$'; + dot = '.'; + equals = '='; + exclamation = '!'; + greater_than = '>'; + hash = '#'; + hyphen = '-'; + less_than = '<'; + pipe = '|'; + single_quote = "'"; + underscore = '_'; + + other_char_printable = + ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/'; + + other_char_non_printable = ^( 0 .. 127 ); + + capture_non_capture = + '(' @{ fcall paren_open; }; + + literal_sym = + comma | hyphen | less_than | + greater_than | single_quote | underscore | colon | + hash | equals | exclamation | ampersand; + + action append_char { + append_element_value( s_term, *p ); + } + + literal = ( + alpha_char | + digit_char | + literal_sym | + other_char_printable | + other_char_non_printable + ) @append_char; + + # + # Octals and backrefs. There is an ambiguity between these two, therefore + # the grammar handles them together. + # + + action init_octal + { octal = 0; } + + action octal_digit + { octal = octal * 8 + ( *p - '0' ); } + + backref_num = [1-9] [0-9]*; + + octal_num = + [0-3] [0-7] [0-7] | + [0-7] [0-7] | + [0]; + + action octal { append_element_value( s_term, octal ); } + + action octal_or_backref + { + if ( is_backref( number, closed_captures ) ) + append_backref( s_term, number ); + else + append_element_value( s_term, octal ); + } + + action backref + { + if ( number > closed_captures ) { + printf( "invalid backref: \\%d\n", number ); + fbreak; + } + + append_backref( s_term, number ); + } + + # Certainly octal. All octals, with possibly backrefs removed. + def_octal = + '\\' @init_octal + ( octal_num - backref_num ) $octal_digit; + + # All cases that can be either octal or backref and we need to inspect the + # backref number to decide. Use the intersection of the two numbers as the + # pattern. + octal_or_backref = + '\\' @init_octal @init_number + ( octal_num & backref_num ) $octal_digit $decimal_digit; + + # Definitely backref. The backref pattern excluding anything with a prefix + # that could be octal. + def_backref = + '\\' @init_number + ( backref_num - ( octal_num any* ) ) $decimal_digit; + + octal = + def_octal %octal | + octal_or_backref %octal_or_backref; + + backref = + def_backref %backref | + octal_or_backref %octal_or_backref; + + atom = + literal | + char_class_end @append_char | + dot @{ append_element_value( s_term, dot ); } | + octal | + backref | + open_paren @{ fcall open_paren_forms; } + ; + + non_greedy = + '?' @{ apply_lazy( s_term ); } | + '+' @{ apply_possessive( s_term ); }; + + quant_min = number %{ quant_min = number; }; + quant_max = number %{ quant_max = number; }; + + action quant_zero_plus { apply_quantifier( s_term, 0, QUANTIFIER_INF ); } + action quant_one_plus { apply_quantifier( s_term, 1, QUANTIFIER_INF ); } + action quant_optional { apply_quantifier( s_term, 0, 1 ); } + action quant_exact { apply_quantifier( s_term, quant_min, quant_min ); } + action quant_min { apply_quantifier( s_term, quant_min, QUANTIFIER_INF ); } + action quant_min_max { apply_quantifier( s_term, quant_min, quant_max ); } + + quant_forms = + '*' @quant_zero_plus | + '+' @quant_one_plus | + '?' @quant_optional | + open_curly quant_min close_curly @quant_exact | + open_curly quant_min ',' close_curly @quant_min | + open_curly quant_min ',' quant_max close_curly @quant_min_max; + + quantifier = quant_forms non_greedy?; + + element = ( atom quantifier? ); + + term = ( element** ) >enter_term; + + # + # Expression + # + expr = term ( '|' @leave_term term )*; + + # + # Regex + # + regex = expr; + + + open_paren_forms := + # Look at the first few charcters to see what the form is. What we + # handle here: + # (re) capturing parens + # (?:re) non-capturing parens + # (?=re) positive lookahead + # (?!re) negative lookahead + # (?<=re) positive lookbehind + # (?<!re) negative lookbehind + ( + # Non-capture. + '?' options ( + ')' @options_only | + ':' @enter_regex + ) | + + # Atomic group. + '?>' @enter_regex + @{ s_regex->flags |= REGEX_ATOMIC_GROUP; } | + + # Capture number reset on branch. + '?|' @enter_regex + @{ s_regex->flags |= REGEX_CAPTURE_RESET; } | + + # Lookaround + ( + '?=' @{ lat = lat_pos_ahead; } @enter_lookaround | + '?!' @{ lat = lat_neg_ahead; } @enter_lookaround | + '?<=' @{ lat = lat_pos_behind; } @enter_lookaround | + '?<!' @{ lat = lat_neg_behind; } @enter_lookaround + ) | + + # Verbs + '*' verb_forms ')' @{fret;} | + + # Catpuring. + ^( '?' | '*' ) @enter_regex @{ + s_regex->type = regex_capture; + s_regex->capture = next_capture++; + fhold; + } + + ) + regex ')' @leave_term @leave_regex @{ fret; }; + + + main := regex '\n' @leave_term @leave_regex @{ success = 1; }; +}%% + +%% write data; + +int pcre_parse( struct pattern **result_pattern, char *line, int len ) +{ + int *stack; + int stack_size; + int cs, top; + char *p, *pe; + int success = 0; + int result = 1; + + int number = 0; + int octal = 0; + int quant_min, quant_max; + + struct pattern *s_pattern = NULL; + struct regex *s_regex = NULL; + struct term *s_term = NULL; + + /* Collect into this. */ + struct regex *root_regex = new_regex( regex_non_capture, 0 ); + struct pattern *root_pattern = new_pattern( root_regex, 0 ); + STACK_PUSH( &s_pattern, root_pattern ); + STACK_PUSH( &s_regex, root_regex ); + + int closed_captures = 0; + int next_capture = 1; + + enum look_around_type lat; + + int verb_len; + char verb[MAX_VERB]; + struct verb *cur_verb; + + stack = init_ragel_stack( &stack_size ); + + %% write init; + + p = line; + pe = line + len; + + %% write exec; + + if ( !success ) { + printf( "parse error at %ld\n", ( p - line + 1) ); + result = 0; + } + else if ( + s_pattern == NULL || s_pattern->parent != NULL || + s_regex != NULL || + s_term != NULL ) + { + printf( "parse error: items not closed\n" ); + result = 0; + } + + if ( result ) { + *result_pattern = root_pattern; + } + else { + + } + + free( stack ); + + return result; +} + +extern void print_value( int indent, struct value *value ); +extern void print_element( int indent, struct element *el ); +extern void print_term( int indent, struct term *term ); +extern void print_regex( int indent, struct regex *regex ); + +void print_indent( int indent ) +{ + while ( indent-- > 0 ) + printf( " " ); +} + +void print_value( int indent, struct value *value ) +{ + printf( "v(%d)", value->v ); +} + +void print_element( int indent, struct element *el ) +{ + print_indent( indent ); + printf( "el: " ); + if ( el->type == element_value_type ) { + struct value *value = el->value_list; + while ( value != NULL ) { + print_value( indent + 1, value ); + if ( value->next != 0 ) + printf( " . " ); + value = value->next; + } + } + else if ( el->type == element_regex_type ) { + if ( el->regex != 0 ) + print_regex( indent, el->regex ); + } + else if ( el->type == element_look_around_type ) { + printf( "lookaround " ); + switch ( el->look_around->type ) { + case lat_pos_ahead: printf( "=" ); break; + case lat_neg_ahead: printf( "!" ); break; + case lat_pos_behind: printf( "<=" ); break; + case lat_neg_behind: printf( "<!" ); break; + } + printf( " " ); + print_regex( indent, el->look_around->pattern->regex ); + } + else if ( el->type == element_char_class_type ) + printf( "cc" ); + else if ( el->type == element_backref_type ) + printf( "backref(%d)", el->backref->number ); + else if ( el->type == element_verb_type ) { + printf( "verb(%s", el->verb->name ); + if ( el->verb->value != NULL ) + printf( ":%s", el->verb->value ); + else if ( el->verb->number >= 0 ) + printf( "=%d", el->verb->number ); + printf( ")" ); + } + + if ( el->quantifier != NULL ) { + printf( " {%d,", el->quantifier->min ); + + if ( el->quantifier->max == QUANTIFIER_INF ) + printf( "inf" ); + else + printf( "%d", el->quantifier->max ); + + printf( "}" ); + + if ( el->quantifier->type == quant_lazy ) + printf( "?" ); + else if ( el->quantifier->type == quant_possessive ) + printf( "+" ); + + } + + printf("\n"); +} + +void print_term( int indent, struct term *term ) +{ + print_indent( indent ); + printf( "term:\n" ); + struct element *el = term->element_list; + while ( el != NULL ) { + print_element( indent + 1, el ); + el = el->next; + } +} + +void print_regex( int indent, struct regex *regex ) +{ + printf( "reg: " ); + if ( regex->type == regex_capture ) + printf( "cap(%d) ", regex->capture ); + if ( regex->flags & REGEX_ATOMIC_GROUP ) + printf( "atomic_group " ); + if ( regex->flags & REGEX_CAPTURE_RESET ) + printf( "capture_reset " ); + printf( "(\n" ); + + struct term *term = regex->term_list; + while ( term != NULL ) { + print_term( indent + 1, term ); + term = term->next; + } + print_indent( indent ); + printf( ")" ); +} + +void print_pattern( int indent, struct pattern *pat ) +{ + printf( "pat: " ); + print_regex( indent, pat->regex ); + printf( "\n" ); +} + +char linebuf[2048]; + +int main( int argc, char **argv ) +{ + if ( argc < 2 ) { + fprintf( stderr, "usage: ./pcre <regex>\n" ); + return -1; + } + + FILE *input = fopen( argv[1], "r" ); + if ( input == NULL ) { + fprintf( stderr, "failed to open %s: %s", argv[0], strerror(errno) ); + return -1; + } + + char *line = NULL; + size_t n = 0; + ssize_t read; + + while ( (read = getline( &line, &n, input )) != -1 ) { + struct pattern *pat; + int parsed = pcre_parse( &pat, line, (int)read ); + if ( parsed ) { + print_pattern( 0, pat ); + } + } + + fclose( input ); + free( line ); + return 0; +} + |