From a8ff0c82a3a26adf1547e8bef9919f29341849b1 Mon Sep 17 00:00:00 2001 From: Adrian Thurston Date: Thu, 24 Oct 2019 08:34:18 -0300 Subject: start of a PCRE parser in C/Ragel Basic structure is there, needs to be filled in. So far it handles sub-expressions, lookarounds, acaccepts characters a-z and optinos c and i. Most of what is done so far is tree building. --- grammar/.gitignore | 6 +- grammar/Makefile | 8 +- grammar/pcre.rl | 502 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 513 insertions(+), 3 deletions(-) create mode 100644 grammar/pcre.rl (limited to 'grammar') diff --git a/grammar/.gitignore b/grammar/.gitignore index 16067b3d..adcb030d 100644 --- a/grammar/.gitignore +++ b/grammar/.gitignore @@ -1,2 +1,4 @@ -/parserust -/parserust.c +/rust.c +/rust +/pcre.c +/pcre diff --git a/grammar/Makefile b/grammar/Makefile index b7613689..f269524f 100644 --- a/grammar/Makefile +++ b/grammar/Makefile @@ -1,2 +1,8 @@ +all: rust pcre + rust: rust.lm parserust.lm - ../colm/colm parserust.lm + ../colm/colm -o rust parserust.lm + +pcre: pcre.rl + ../ragel/ragel -G2 pcre.rl + gcc -g -Wall -o pcre pcre.c diff --git a/grammar/pcre.rl b/grammar/pcre.rl new file mode 100644 index 00000000..2a6191fe --- /dev/null +++ b/grammar/pcre.rl @@ -0,0 +1,502 @@ +#include +#include +#include +#include + +#define LIST_APPEND( pphead, element ) \ + { \ + element->next = *pphead; \ + *pphead = element; \ + } + +#define STACK_PUSH( pphead, element ) \ + { \ + element->parent = *pphead; \ + *pphead = element; \ + } + +#define STACK_POP( pphead ) \ + { \ + *pphead = (*pphead)->parent; \ + } + +#define REVERSE_LIST( S, pphead ) \ + if ( *pphead != 0 ) { \ + S *front = (*pphead)->next; \ + (*pphead)->next = 0; \ + while ( front != NULL ) { \ + S *next = front->next; \ + front->next = *pphead; \ + *pphead = front; \ + front = next; \ + } \ + } + +struct value +{ + int v; + + struct value *next; +}; + +struct quantifier +{ + +}; + +enum element_type +{ + element_value_type = 1, + element_regex_type, + element_look_around_type, + element_char_class_type, + element_back_ref_type +}; + +struct element +{ + enum element_type type; + struct regex *regex; + struct look_around *look_around; + struct quantifier *quantifier; + struct value *value_list; + + struct element *next; +}; + +struct term +{ + struct element *element_list; + + struct term *next; + struct term *parent; +}; + +enum regex_type +{ + regex_non_capture = 1, + regex_capture, + regex_look_around +}; + +struct regex +{ + struct term *term_list; + + enum regex_type type; + int capture; + int node_id; + int options; + + struct regex *next; + struct regex *parent; +}; + +struct pattern +{ + struct regex *regex; + int node_id; + int is_look_around; + + struct look_around *look_arounds; + struct pattern *parent; +}; + +enum look_around_type { + lat_pos_ahead = 1, + lat_neg_ahead, + lat_pos_behind, + lat_neg_behind, +}; + +struct look_around +{ + enum look_around_type type; + struct pattern *pattern; + int node_id; + + struct look_around *next; +}; + +struct value *new_value( int v ) +{ + struct value *value = malloc( sizeof( struct value ) ); + memset( value, 0, sizeof(struct value) ); + value->v = v; + return value; +} + +struct term *new_term( void ) +{ + struct term *term = malloc( sizeof( struct term ) ); + memset( term, 0, sizeof(struct term) ); + return term; +} + +struct element *new_element( enum element_type type, struct regex *regex, struct look_around *look_around ) +{ + struct element *element = malloc( sizeof( struct element ) ); + memset( element, 0, sizeof(struct element) ); + element->type = type; + element->regex = regex; + element->look_around = look_around; + return element; +} + +struct regex *new_regex( enum regex_type type, int options ) +{ + struct regex *regex = malloc( sizeof( struct regex ) ); + memset( regex, 0, sizeof(struct regex) ); + regex->type = type; + regex->options = options; + return regex; +} + +struct look_around *new_look_around( enum look_around_type lat, struct pattern *pattern ) +{ + struct look_around *look_around = malloc( sizeof( struct look_around ) ); + memset( look_around, 0, sizeof(struct look_around) ); + look_around->type = lat; + look_around->pattern = pattern; + return look_around; +} + +struct pattern *new_pattern( struct regex *regex, int is_look_around ) +{ + struct pattern *pattern = malloc( sizeof( struct pattern ) ); + pattern->regex = regex; + pattern->is_look_around = is_look_around; + return pattern; +} + +void append_element_value( struct term *term, int value ) +{ + struct element *el = term->element_list; + if ( el == NULL || + el->type != element_value_type || + el->quantifier != 0 ) + { + el = new_element( element_value_type, NULL, NULL ); + LIST_APPEND( &term->element_list, el ); + } + + struct value *v = new_value( value ); + LIST_APPEND( &el->value_list, v ); +} + +void reverse_value_list( struct value **pphead ) +{ + REVERSE_LIST( struct value, pphead ); +} + +void reverse_element_list( struct element **pphead ) +{ + REVERSE_LIST( struct element, pphead ); +} + +void reverse_term_list( struct term **pphead ) +{ + REVERSE_LIST( struct term, pphead ); +} + + +/* Provides initial ragel call state stack. */ +int *init_ragel_stack( int *size ) +{ + *size = 24; + return malloc( *size * sizeof(int) ); +} + +/* Grows the ragel call state stack. Use in pre-push. */ +int *grow_ragel_stack( int *size, int *stack ) +{ + int new_size = *size * 2; + int *new_stack = malloc( new_size * sizeof(int) ) ; + memcpy( new_stack, stack, *size * sizeof(int) ); + free( stack ); + + *size = new_size; + return new_stack; +} + +%%{ + machine PCRE; + + action enter_term { + struct term *term = new_term(); + STACK_PUSH( &s_term, term ); + LIST_APPEND( &s_regex->term_list, term ); + } + + action leave_term { + reverse_element_list( &s_term->element_list ); + struct element *el = s_term->element_list; + while ( el != 0 ) { + reverse_value_list( &el->value_list ); + el = el->next; + } + STACK_POP( &s_term ); + } + + action enter_regex { + struct regex *regex = new_regex( regex_non_capture, s_regex->options ); + struct element *element = new_element( element_regex_type, regex, NULL ); + LIST_APPEND( &s_term->element_list, element ); + STACK_PUSH( &s_regex, regex ); + } + + action enter_lookaround { + struct regex *regex = new_regex( regex_look_around, s_regex->options ); + struct pattern *pattern = new_pattern( regex, 1 ); + struct look_around *la = new_look_around( lat, pattern ); + pattern->node_id = la->node_id; + + LIST_APPEND( &s_pattern->look_arounds, la ); + + struct element *element = new_element( element_look_around_type, NULL, la ); + LIST_APPEND( &s_term->element_list, element ); + + STACK_PUSH( &s_regex, regex ); + STACK_PUSH( &s_pattern, pattern ); + } + + action leave_regex { + if ( s_regex->type == regex_capture ) + closed_captures += 1; + + if ( s_regex->type == regex_look_around ) + STACK_POP( &s_pattern ); + + reverse_term_list( &s_regex->term_list ); + STACK_POP( &s_regex ); + } + + action options_only { fret; } + + options = ( + 'c' # case-sensitive matching + | 'i' # case-insensitive matching + )*; + + quant_forms = '_'; + + atom = + [a-z] @{ append_element_value( s_term, *p ); } | + '(' @{ fcall paren_open; } + ; + + non_greedy = '_'; + + quantifier = quant_forms non_greedy?; + + element = ( atom quantifier? ); + + term = ( element** ) >enter_term; + + # + # Expression + # + expr = term ( '|' @leave_term term )*; + + # + # Regex + # + regex = expr; + + paren_open := + # Look at the first few charcters to see what the form is. What we + # handle here: + # (re) capturing parens + # (?:re) non-capturing parens + # (?=re) positive lookahead + # (?!re) negative lookahead + # (?<=re) positive lookbehind + # (?type = regex_capture; + s_regex->capture = next_capture++; + fhold; + } + ) + regex ')' @leave_term @leave_regex @{ fret; }; + + + main := regex '\n' @leave_term @leave_regex @{ success = 1; }; +}%% + +%% write data; + +int pcre_parse( struct pattern **result_pattern, char *line, int len ) +{ + int *stack; + int stack_size; + int cs, top; + char *p, *pe; + int success = 0; + int result = 1; + + struct pattern *s_pattern = 0; + struct regex *s_regex = 0; + struct term *s_term = 0; + + /* Collect into this. */ + struct regex *root_regex = new_regex( regex_non_capture, 0 ); + struct pattern *root_pattern = new_pattern( root_regex, 0 ); + STACK_PUSH( &s_pattern, root_pattern ); + STACK_PUSH( &s_regex, root_regex ); + + int closed_captures = 0; + int next_capture = 1; + + enum look_around_type lat; + + stack = init_ragel_stack( &stack_size ); + + %% write init; + + p = line; + pe = line + len; + + %% write exec; + + if ( !success ) { + printf( "parse error at %ld\n", ( p - line + 1) ); + result = 0; + } + else if ( + s_pattern == NULL || s_pattern->parent != NULL || + s_regex != NULL || + s_term != NULL ) + { + printf( "parse error: items not closed\n" ); + result = 0; + } + + if ( result ) { + *result_pattern = root_pattern; + } + else { + + } + + free( stack ); + + return result; +} + +extern void print_value( int indent, struct value *value ); +extern void print_element( int indent, struct element *el ); +extern void print_term( int indent, struct term *term ); +extern void print_regex( int indent, struct regex *regex ); + +void print_indent( int indent ) +{ + while ( indent-- > 0 ) + printf( " " ); +} + +void print_value( int indent, struct value *value ) +{ + printf( "v(%d)", value->v ); +} + +void print_element( int indent, struct element *el ) +{ + print_indent( indent ); + printf( "el: " ); + if ( el->type == element_value_type ) { + struct value *value = el->value_list; + while ( value != NULL ) { + print_value( indent + 1, value ); + if ( value->next != 0 ) + printf( " . " ); + value = value->next; + } + printf("\n"); + } + else if ( el->type == element_regex_type ) { + printf("\n"); + if ( el->regex != 0 ) + print_regex( indent + 1, el->regex ); + } + else if ( el->type == element_look_around_type ) + printf( "lookaround\n" ); + else if ( el->type == element_char_class_type ) + printf( "cc\n" ); + else if ( el->type == element_back_ref_type ) + printf( "backref\n" ); +} + +void print_term( int indent, struct term *term ) +{ + print_indent( indent ); + printf( "term:\n" ); + struct element *el = term->element_list; + while ( el != NULL ) { + print_element( indent + 1, el ); + el = el->next; + } +} + +void print_regex( int indent, struct regex *regex ) +{ + print_indent( indent ); + printf( "reg:\n" ); + + struct term *term = regex->term_list; + while ( term != NULL ) { + print_term( indent + 1, term ); + term = term->next; + } +} + +void print_pattern( int indent, struct pattern *pat ) +{ + printf( "pat:\n" ); + print_regex( indent + 1, pat->regex ); +} + +char linebuf[2048]; + +int main( int argc, char **argv ) +{ + if ( argc < 2 ) { + fprintf( stderr, "usage: ./pcre \n" ); + return -1; + } + + FILE *input = fopen( argv[1], "r" ); + if ( input == NULL ) { + fprintf( stderr, "failed to open %s: %s", argv[0], strerror(errno) ); + return -1; + } + + char *line = NULL; + size_t n = 0; + ssize_t read; + + while ( (read = getline( &line, &n, input )) != -1 ) { + struct pattern *pat; + int parsed = pcre_parse( &pat, line, (int)read ); + if ( parsed ) { + print_pattern( 0, pat ); + } + } + + fclose( input ); + free( line ); + return 0; +} + -- cgit v1.2.1