summaryrefslogtreecommitdiff
path: root/grammar/pcre/pcre.rl
diff options
context:
space:
mode:
Diffstat (limited to 'grammar/pcre/pcre.rl')
-rw-r--r--grammar/pcre/pcre.rl864
1 files changed, 864 insertions, 0 deletions
diff --git a/grammar/pcre/pcre.rl b/grammar/pcre/pcre.rl
new file mode 100644
index 00000000..307d5467
--- /dev/null
+++ b/grammar/pcre/pcre.rl
@@ -0,0 +1,864 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define LIST_APPEND( pphead, element ) \
+ { \
+ element->next = *pphead; \
+ *pphead = element; \
+ }
+
+#define LIST_REMOVE( pphead ) \
+ { \
+ *pphead = (*pphead)->next; \
+ }
+
+#define STACK_PUSH( pphead, element ) \
+ { \
+ element->parent = *pphead; \
+ *pphead = element; \
+ }
+
+#define STACK_POP( pphead ) \
+ { \
+ *pphead = (*pphead)->parent; \
+ }
+
+#define REVERSE_LIST( S, pphead ) \
+ if ( *pphead != 0 ) { \
+ S *front = (*pphead)->next; \
+ (*pphead)->next = 0; \
+ while ( front != NULL ) { \
+ S *next = front->next; \
+ front->next = *pphead; \
+ *pphead = front; \
+ front = next; \
+ } \
+ }
+
+#define MAX_VERB 40
+#define QUANTIFIER_INF -1
+
+struct value
+{
+ int v;
+
+ struct value *next;
+};
+
+enum value_type
+{
+ dot = 256,
+};
+
+enum quantifer_type
+{
+ quant_greedy = 1,
+ quant_lazy,
+ quant_possessive,
+};
+
+struct quantifier
+{
+ int min, max;
+ enum quantifer_type type;
+};
+
+enum element_type
+{
+ element_value_type = 1,
+ element_regex_type,
+ element_look_around_type,
+ element_char_class_type,
+ element_backref_type,
+ element_verb_type,
+};
+
+struct element
+{
+ enum element_type type;
+ struct regex *regex;
+ struct look_around *look_around;
+ struct quantifier *quantifier;
+ struct value *value_list;
+ struct backref *backref;
+ struct verb *verb;
+
+ struct element *next;
+};
+
+struct term
+{
+ struct element *element_list;
+
+ struct term *next;
+ struct term *parent;
+};
+
+enum regex_type
+{
+ regex_non_capture = 1,
+ regex_capture,
+ regex_look_around
+};
+
+#define REGEX_ATOMIC_GROUP 0x01
+#define REGEX_CAPTURE_RESET 0x02
+
+struct regex
+{
+ struct term *term_list;
+
+ enum regex_type type;
+ int capture;
+ int node_id;
+ int options;
+
+ int flags;
+
+ struct regex *next;
+ struct regex *parent;
+};
+
+struct pattern
+{
+ struct regex *regex;
+ int node_id;
+ int is_look_around;
+
+ struct look_around *look_arounds;
+ struct pattern *parent;
+};
+
+enum look_around_type {
+ lat_pos_ahead = 1,
+ lat_neg_ahead,
+ lat_pos_behind,
+ lat_neg_behind,
+};
+
+struct look_around
+{
+ enum look_around_type type;
+ struct pattern *pattern;
+ int node_id;
+
+ struct look_around *next;
+};
+
+struct backref
+{
+ int number;
+};
+
+struct verb
+{
+ char *name;
+ char *value;
+ int number;
+};
+
+struct quantifier *new_quantifier()
+{
+ struct quantifier *quant = malloc( sizeof( struct value ) );
+ memset( quant, 0, sizeof(struct quantifier) );
+ quant->type = quant_greedy;
+ return quant;
+}
+
+struct value *new_value( int v )
+{
+ struct value *value = malloc( sizeof( struct value ) );
+ memset( value, 0, sizeof(struct value) );
+ value->v = v;
+ return value;
+}
+
+struct term *new_term( void )
+{
+ struct term *term = malloc( sizeof( struct term ) );
+ memset( term, 0, sizeof(struct term) );
+ return term;
+}
+
+struct element *new_element( enum element_type type, struct regex *regex, struct look_around *look_around )
+{
+ struct element *element = malloc( sizeof( struct element ) );
+ memset( element, 0, sizeof(struct element) );
+ element->type = type;
+ element->regex = regex;
+ element->look_around = look_around;
+ return element;
+}
+
+struct regex *new_regex( enum regex_type type, int options )
+{
+ struct regex *regex = malloc( sizeof( struct regex ) );
+ memset( regex, 0, sizeof(struct regex) );
+ regex->type = type;
+ regex->options = options;
+ return regex;
+}
+
+struct look_around *new_look_around( enum look_around_type lat, struct pattern *pattern )
+{
+ struct look_around *look_around = malloc( sizeof( struct look_around ) );
+ memset( look_around, 0, sizeof(struct look_around) );
+ look_around->type = lat;
+ look_around->pattern = pattern;
+ return look_around;
+}
+
+struct pattern *new_pattern( struct regex *regex, int is_look_around )
+{
+ struct pattern *pattern = malloc( sizeof( struct pattern ) );
+ pattern->regex = regex;
+ pattern->is_look_around = is_look_around;
+ return pattern;
+}
+
+struct backref *new_backref( int number )
+{
+ struct backref *backref = malloc( sizeof( struct backref ) );
+ memset( backref, 0, sizeof(struct backref) );
+ backref->number = number;
+ return backref;
+}
+
+struct verb *new_verb( char *d, int len )
+{
+ struct verb *verb = malloc( sizeof( struct verb ) );
+ verb->name = malloc( len + 1 );
+ memcpy( verb->name, d, len );
+ verb->name[len] = 0;
+ verb->value = 0;
+ verb->number = -1;
+ return verb;
+}
+
+void append_element_value( struct term *term, int value )
+{
+ struct element *el = term->element_list;
+ if ( el == NULL ||
+ el->type != element_value_type ||
+ el->quantifier != 0 )
+ {
+ el = new_element( element_value_type, NULL, NULL );
+ LIST_APPEND( &term->element_list, el );
+ }
+
+ struct value *v = new_value( value );
+ LIST_APPEND( &el->value_list, v );
+}
+
+void reverse_value_list( struct value **pphead )
+{
+ REVERSE_LIST( struct value, pphead );
+}
+
+void reverse_element_list( struct element **pphead )
+{
+ REVERSE_LIST( struct element, pphead );
+}
+
+void reverse_term_list( struct term **pphead )
+{
+ REVERSE_LIST( struct term, pphead );
+}
+
+int is_backref( int num, int num_closed_captures )
+{
+ if ( num < 8 || num <= num_closed_captures )
+ return 1;
+ return 0;
+}
+
+void append_backref( struct term *term, int number )
+{
+ struct element *el = new_element( element_backref_type, NULL, NULL );
+ el->backref = new_backref( number );
+ LIST_APPEND( &term->element_list, el );
+}
+
+/* Provides initial ragel call state stack. */
+int *init_ragel_stack( int *size )
+{
+ *size = 24;
+ return malloc( *size * sizeof(int) );
+}
+
+/* Grows the ragel call state stack. Use in pre-push. */
+int *grow_ragel_stack( int *size, int *stack )
+{
+ int new_size = *size * 2;
+ int *new_stack = malloc( new_size * sizeof(int) ) ;
+ memcpy( new_stack, stack, *size * sizeof(int) );
+ free( stack );
+
+ *size = new_size;
+ return new_stack;
+}
+
+void apply_quantifier( struct term *term, int min, int max )
+{
+ struct element *el = term->element_list;
+
+ /* We may need to remove the last value from the element's value list. */
+ if ( el->type == element_value_type && el->value_list != NULL && el->value_list->next != NULL ) {
+ /* Take the value off and add it to an element of its own. */
+ struct value *last = el->value_list;
+ LIST_REMOVE( &el->value_list );
+
+ el = new_element( element_value_type, NULL, NULL );
+
+ LIST_APPEND( &el->value_list, last );
+ LIST_APPEND( &term->element_list, el );
+ }
+
+ el->quantifier = new_quantifier();
+ el->quantifier->min = min;
+ el->quantifier->max = max;
+}
+
+void apply_lazy( struct term *term )
+{
+ struct element *el = term->element_list;
+ el->quantifier->type = quant_lazy;
+}
+
+void apply_possessive( struct term *term )
+{
+ struct element *el = term->element_list;
+ el->quantifier->type = quant_possessive;
+}
+
+%%{
+ machine PCRE;
+
+ action enter_term {
+ struct term *term = new_term();
+ STACK_PUSH( &s_term, term );
+ LIST_APPEND( &s_regex->term_list, term );
+ }
+
+ action leave_term {
+ reverse_element_list( &s_term->element_list );
+ struct element *el = s_term->element_list;
+ while ( el != 0 ) {
+ reverse_value_list( &el->value_list );
+ el = el->next;
+ }
+ STACK_POP( &s_term );
+ }
+
+ action enter_regex {
+ struct regex *regex = new_regex( regex_non_capture, s_regex->options );
+ struct element *element = new_element( element_regex_type, regex, NULL );
+ LIST_APPEND( &s_term->element_list, element );
+ STACK_PUSH( &s_regex, regex );
+ }
+
+ action enter_lookaround {
+ struct regex *regex = new_regex( regex_look_around, s_regex->options );
+ struct pattern *pattern = new_pattern( regex, 1 );
+ struct look_around *la = new_look_around( lat, pattern );
+ pattern->node_id = la->node_id;
+
+ LIST_APPEND( &s_pattern->look_arounds, la );
+
+ struct element *element = new_element( element_look_around_type, NULL, la );
+ LIST_APPEND( &s_term->element_list, element );
+
+ STACK_PUSH( &s_regex, regex );
+ STACK_PUSH( &s_pattern, pattern );
+ }
+
+ action leave_regex {
+ if ( s_regex->type == regex_capture )
+ closed_captures += 1;
+
+ if ( s_regex->type == regex_look_around )
+ STACK_POP( &s_pattern );
+
+ reverse_term_list( &s_regex->term_list );
+ STACK_POP( &s_regex );
+ }
+
+ #
+ # Numbers
+ #
+ action init_number
+ { number = 0; }
+
+ action decimal_digit
+ { number = number * 10 + ( *p - '0' ); }
+
+ number = [0-9]+ >init_number $decimal_digit;
+
+ #
+ # Verbs
+ #
+ verb = ( [a-zA-Z_] [a-zA-Z_0-9]* ) >{ verb_len = 0; } ${
+ if ( verb_len < MAX_VERB ) {
+ verb[verb_len++] = *p;
+ }
+ };
+
+ action append_verb {
+ struct element *el = new_element( element_verb_type, NULL, NULL );
+ cur_verb = el->verb = new_verb( verb, verb_len );
+
+ LIST_APPEND( &s_term->element_list, el );
+ }
+
+ action verb_value
+ {
+ cur_verb->value = malloc( verb_len + 1 );
+ memcpy( cur_verb->value, verb, verb_len );
+ cur_verb->value[verb_len] = 0;
+ }
+
+ action verb_number
+ {
+ cur_verb->number = number;
+ }
+
+ verb_forms =
+ verb %append_verb ( ':' verb %verb_value | '=' number %verb_number )?;
+
+ action options_only { fret; }
+
+ options = (
+ 'c' # case-sensitive matching
+ | 'i' # case-insensitive matching
+ )*;
+
+ alpha_char = [a-zA-Z];
+ digit_char = [0-9];
+
+ open_paren = '(';
+ close_paren = ')';
+
+ char_class_start = '[';
+ char_class_end = ']';
+
+ open_curly = '{';
+ close_curly = '}';
+
+ ampersand = '&';
+ colon = ':';
+ comma = ',';
+ dollar = '$';
+ dot = '.';
+ equals = '=';
+ exclamation = '!';
+ greater_than = '>';
+ hash = '#';
+ hyphen = '-';
+ less_than = '<';
+ pipe = '|';
+ single_quote = "'";
+ underscore = '_';
+
+ other_char_printable =
+ ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/';
+
+ other_char_non_printable = ^( 0 .. 127 );
+
+ capture_non_capture =
+ '(' @{ fcall paren_open; };
+
+ literal_sym =
+ comma | hyphen | less_than |
+ greater_than | single_quote | underscore | colon |
+ hash | equals | exclamation | ampersand;
+
+ action append_char {
+ append_element_value( s_term, *p );
+ }
+
+ literal = (
+ alpha_char |
+ digit_char |
+ literal_sym |
+ other_char_printable |
+ other_char_non_printable
+ ) @append_char;
+
+ #
+ # Octals and backrefs. There is an ambiguity between these two, therefore
+ # the grammar handles them together.
+ #
+
+ action init_octal
+ { octal = 0; }
+
+ action octal_digit
+ { octal = octal * 8 + ( *p - '0' ); }
+
+ backref_num = [1-9] [0-9]*;
+
+ octal_num =
+ [0-3] [0-7] [0-7] |
+ [0-7] [0-7] |
+ [0];
+
+ action octal { append_element_value( s_term, octal ); }
+
+ action octal_or_backref
+ {
+ if ( is_backref( number, closed_captures ) )
+ append_backref( s_term, number );
+ else
+ append_element_value( s_term, octal );
+ }
+
+ action backref
+ {
+ if ( number > closed_captures ) {
+ printf( "invalid backref: \\%d\n", number );
+ fbreak;
+ }
+
+ append_backref( s_term, number );
+ }
+
+ # Certainly octal. All octals, with possibly backrefs removed.
+ def_octal =
+ '\\' @init_octal
+ ( octal_num - backref_num ) $octal_digit;
+
+ # All cases that can be either octal or backref and we need to inspect the
+ # backref number to decide. Use the intersection of the two numbers as the
+ # pattern.
+ octal_or_backref =
+ '\\' @init_octal @init_number
+ ( octal_num & backref_num ) $octal_digit $decimal_digit;
+
+ # Definitely backref. The backref pattern excluding anything with a prefix
+ # that could be octal.
+ def_backref =
+ '\\' @init_number
+ ( backref_num - ( octal_num any* ) ) $decimal_digit;
+
+ octal =
+ def_octal %octal |
+ octal_or_backref %octal_or_backref;
+
+ backref =
+ def_backref %backref |
+ octal_or_backref %octal_or_backref;
+
+ atom =
+ literal |
+ char_class_end @append_char |
+ dot @{ append_element_value( s_term, dot ); } |
+ octal |
+ backref |
+ open_paren @{ fcall open_paren_forms; }
+ ;
+
+ non_greedy =
+ '?' @{ apply_lazy( s_term ); } |
+ '+' @{ apply_possessive( s_term ); };
+
+ quant_min = number %{ quant_min = number; };
+ quant_max = number %{ quant_max = number; };
+
+ action quant_zero_plus { apply_quantifier( s_term, 0, QUANTIFIER_INF ); }
+ action quant_one_plus { apply_quantifier( s_term, 1, QUANTIFIER_INF ); }
+ action quant_optional { apply_quantifier( s_term, 0, 1 ); }
+ action quant_exact { apply_quantifier( s_term, quant_min, quant_min ); }
+ action quant_min { apply_quantifier( s_term, quant_min, QUANTIFIER_INF ); }
+ action quant_min_max { apply_quantifier( s_term, quant_min, quant_max ); }
+
+ quant_forms =
+ '*' @quant_zero_plus |
+ '+' @quant_one_plus |
+ '?' @quant_optional |
+ open_curly quant_min close_curly @quant_exact |
+ open_curly quant_min ',' close_curly @quant_min |
+ open_curly quant_min ',' quant_max close_curly @quant_min_max;
+
+ quantifier = quant_forms non_greedy?;
+
+ element = ( atom quantifier? );
+
+ term = ( element** ) >enter_term;
+
+ #
+ # Expression
+ #
+ expr = term ( '|' @leave_term term )*;
+
+ #
+ # Regex
+ #
+ regex = expr;
+
+
+ open_paren_forms :=
+ # Look at the first few charcters to see what the form is. What we
+ # handle here:
+ # (re) capturing parens
+ # (?:re) non-capturing parens
+ # (?=re) positive lookahead
+ # (?!re) negative lookahead
+ # (?<=re) positive lookbehind
+ # (?<!re) negative lookbehind
+ (
+ # Non-capture.
+ '?' options (
+ ')' @options_only |
+ ':' @enter_regex
+ ) |
+
+ # Atomic group.
+ '?>' @enter_regex
+ @{ s_regex->flags |= REGEX_ATOMIC_GROUP; } |
+
+ # Capture number reset on branch.
+ '?|' @enter_regex
+ @{ s_regex->flags |= REGEX_CAPTURE_RESET; } |
+
+ # Lookaround
+ (
+ '?=' @{ lat = lat_pos_ahead; } @enter_lookaround |
+ '?!' @{ lat = lat_neg_ahead; } @enter_lookaround |
+ '?<=' @{ lat = lat_pos_behind; } @enter_lookaround |
+ '?<!' @{ lat = lat_neg_behind; } @enter_lookaround
+ ) |
+
+ # Verbs
+ '*' verb_forms ')' @{fret;} |
+
+ # Catpuring.
+ ^( '?' | '*' ) @enter_regex @{
+ s_regex->type = regex_capture;
+ s_regex->capture = next_capture++;
+ fhold;
+ }
+
+ )
+ regex ')' @leave_term @leave_regex @{ fret; };
+
+
+ main := regex '\n' @leave_term @leave_regex @{ success = 1; };
+}%%
+
+%% write data;
+
+int pcre_parse( struct pattern **result_pattern, char *line, int len )
+{
+ int *stack;
+ int stack_size;
+ int cs, top;
+ char *p, *pe;
+ int success = 0;
+ int result = 1;
+
+ int number = 0;
+ int octal = 0;
+ int quant_min, quant_max;
+
+ struct pattern *s_pattern = NULL;
+ struct regex *s_regex = NULL;
+ struct term *s_term = NULL;
+
+ /* Collect into this. */
+ struct regex *root_regex = new_regex( regex_non_capture, 0 );
+ struct pattern *root_pattern = new_pattern( root_regex, 0 );
+ STACK_PUSH( &s_pattern, root_pattern );
+ STACK_PUSH( &s_regex, root_regex );
+
+ int closed_captures = 0;
+ int next_capture = 1;
+
+ enum look_around_type lat;
+
+ int verb_len;
+ char verb[MAX_VERB];
+ struct verb *cur_verb;
+
+ stack = init_ragel_stack( &stack_size );
+
+ %% write init;
+
+ p = line;
+ pe = line + len;
+
+ %% write exec;
+
+ if ( !success ) {
+ printf( "parse error at %ld\n", ( p - line + 1) );
+ result = 0;
+ }
+ else if (
+ s_pattern == NULL || s_pattern->parent != NULL ||
+ s_regex != NULL ||
+ s_term != NULL )
+ {
+ printf( "parse error: items not closed\n" );
+ result = 0;
+ }
+
+ if ( result ) {
+ *result_pattern = root_pattern;
+ }
+ else {
+
+ }
+
+ free( stack );
+
+ return result;
+}
+
+extern void print_value( int indent, struct value *value );
+extern void print_element( int indent, struct element *el );
+extern void print_term( int indent, struct term *term );
+extern void print_regex( int indent, struct regex *regex );
+
+void print_indent( int indent )
+{
+ while ( indent-- > 0 )
+ printf( " " );
+}
+
+void print_value( int indent, struct value *value )
+{
+ printf( "v(%d)", value->v );
+}
+
+void print_element( int indent, struct element *el )
+{
+ print_indent( indent );
+ printf( "el: " );
+ if ( el->type == element_value_type ) {
+ struct value *value = el->value_list;
+ while ( value != NULL ) {
+ print_value( indent + 1, value );
+ if ( value->next != 0 )
+ printf( " . " );
+ value = value->next;
+ }
+ }
+ else if ( el->type == element_regex_type ) {
+ if ( el->regex != 0 )
+ print_regex( indent, el->regex );
+ }
+ else if ( el->type == element_look_around_type ) {
+ printf( "lookaround " );
+ switch ( el->look_around->type ) {
+ case lat_pos_ahead: printf( "=" ); break;
+ case lat_neg_ahead: printf( "!" ); break;
+ case lat_pos_behind: printf( "<=" ); break;
+ case lat_neg_behind: printf( "<!" ); break;
+ }
+ printf( " " );
+ print_regex( indent, el->look_around->pattern->regex );
+ }
+ else if ( el->type == element_char_class_type )
+ printf( "cc" );
+ else if ( el->type == element_backref_type )
+ printf( "backref(%d)", el->backref->number );
+ else if ( el->type == element_verb_type ) {
+ printf( "verb(%s", el->verb->name );
+ if ( el->verb->value != NULL )
+ printf( ":%s", el->verb->value );
+ else if ( el->verb->number >= 0 )
+ printf( "=%d", el->verb->number );
+ printf( ")" );
+ }
+
+ if ( el->quantifier != NULL ) {
+ printf( " {%d,", el->quantifier->min );
+
+ if ( el->quantifier->max == QUANTIFIER_INF )
+ printf( "inf" );
+ else
+ printf( "%d", el->quantifier->max );
+
+ printf( "}" );
+
+ if ( el->quantifier->type == quant_lazy )
+ printf( "?" );
+ else if ( el->quantifier->type == quant_possessive )
+ printf( "+" );
+
+ }
+
+ printf("\n");
+}
+
+void print_term( int indent, struct term *term )
+{
+ print_indent( indent );
+ printf( "term:\n" );
+ struct element *el = term->element_list;
+ while ( el != NULL ) {
+ print_element( indent + 1, el );
+ el = el->next;
+ }
+}
+
+void print_regex( int indent, struct regex *regex )
+{
+ printf( "reg: " );
+ if ( regex->type == regex_capture )
+ printf( "cap(%d) ", regex->capture );
+ if ( regex->flags & REGEX_ATOMIC_GROUP )
+ printf( "atomic_group " );
+ if ( regex->flags & REGEX_CAPTURE_RESET )
+ printf( "capture_reset " );
+ printf( "(\n" );
+
+ struct term *term = regex->term_list;
+ while ( term != NULL ) {
+ print_term( indent + 1, term );
+ term = term->next;
+ }
+ print_indent( indent );
+ printf( ")" );
+}
+
+void print_pattern( int indent, struct pattern *pat )
+{
+ printf( "pat: " );
+ print_regex( indent, pat->regex );
+ printf( "\n" );
+}
+
+char linebuf[2048];
+
+int main( int argc, char **argv )
+{
+ if ( argc < 2 ) {
+ fprintf( stderr, "usage: ./pcre <regex>\n" );
+ return -1;
+ }
+
+ FILE *input = fopen( argv[1], "r" );
+ if ( input == NULL ) {
+ fprintf( stderr, "failed to open %s: %s", argv[0], strerror(errno) );
+ return -1;
+ }
+
+ char *line = NULL;
+ size_t n = 0;
+ ssize_t read;
+
+ while ( (read = getline( &line, &n, input )) != -1 ) {
+ struct pattern *pat;
+ int parsed = pcre_parse( &pat, line, (int)read );
+ if ( parsed ) {
+ print_pattern( 0, pat );
+ }
+ }
+
+ fclose( input );
+ free( line );
+ return 0;
+}
+