From 5a4407de2c4c03083f1c25b5c7fdb849cdc08a12 Mon Sep 17 00:00:00 2001 From: Adrian Thurston Date: Sun, 1 Dec 2019 09:05:03 -0800 Subject: moved grammars out to their own dirs We also want some testing/validation support for each grammar. This dir will get quite messy if we move files. --- grammar/.gitignore | 8 - grammar/Makefile | 18 +- grammar/dns.rl | 697 ------------------------ grammar/dns/.gitignore | 2 + grammar/dns/Makefile | 9 + grammar/dns/dns.rl | 697 ++++++++++++++++++++++++ grammar/parserust.lm | 81 --- grammar/pcre.lm | 583 -------------------- grammar/pcre.rl | 864 ------------------------------ grammar/pcre/.gitignore | 4 + grammar/pcre/Makefile | 11 + grammar/pcre/pcre.lm | 583 ++++++++++++++++++++ grammar/pcre/pcre.rl | 864 ++++++++++++++++++++++++++++++ grammar/rust.lm | 1305 --------------------------------------------- grammar/rust/.gitignore | 4 + grammar/rust/Makefile | 7 + grammar/rust/parserust.lm | 81 +++ grammar/rust/rust.lm | 1305 +++++++++++++++++++++++++++++++++++++++++++++ 18 files changed, 3570 insertions(+), 3553 deletions(-) delete mode 100644 grammar/.gitignore delete mode 100644 grammar/dns.rl create mode 100644 grammar/dns/.gitignore create mode 100644 grammar/dns/Makefile create mode 100644 grammar/dns/dns.rl delete mode 100644 grammar/parserust.lm delete mode 100644 grammar/pcre.lm delete mode 100644 grammar/pcre.rl create mode 100644 grammar/pcre/.gitignore create mode 100644 grammar/pcre/Makefile create mode 100644 grammar/pcre/pcre.lm create mode 100644 grammar/pcre/pcre.rl delete mode 100644 grammar/rust.lm create mode 100644 grammar/rust/.gitignore create mode 100644 grammar/rust/Makefile create mode 100644 grammar/rust/parserust.lm create mode 100644 grammar/rust/rust.lm (limited to 'grammar') diff --git a/grammar/.gitignore b/grammar/.gitignore deleted file mode 100644 index acbf5ac9..00000000 --- a/grammar/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -/rust.c -/rust -/pcre.c -/pcre -/pcre-colm.c -/pcre-colm -/dns.cc -/dns diff --git a/grammar/Makefile b/grammar/Makefile index 35f244f5..21a2603a 100644 --- a/grammar/Makefile +++ b/grammar/Makefile @@ -1,18 +1,6 @@ -all: rust pcre pcre-colm dns +SUBDIRS = "rust pcre dns" -RAGEL = ../ragel/ragel -COLM = ../colm/colm +all: rust pcre dns + for d in $(SUBDIRS); do cd $$d && $(MAKE); done -rust: rust.lm parserust.lm $(COLM) - $(COLM) -o rust parserust.lm -pcre: pcre.rl $(RAGEL) - $(RAGEL) -G2 pcre.rl - gcc -g -Wall -o pcre pcre.c - -pcre-colm: pcre.lm - $(COLM) -o pcre-colm pcre.lm - -dns: dns.rl - $(RAGEL) -G2 -o dns.cc dns.rl - g++ -g -Wall -o dns dns.cc -lpcap diff --git a/grammar/dns.rl b/grammar/dns.rl deleted file mode 100644 index 215c61bd..00000000 --- a/grammar/dns.rl +++ /dev/null @@ -1,697 +0,0 @@ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - - -struct Node -{ - enum Type - { - DnsPacket = 1, - DnsQuestion, - DnsAnswer, - DnsAuthority, - DnsAdditional, - DnsRrA, - DnsRrAAAA, - DnsRrCname, - - DnsName, - DnsAddr, - }; -}; - -const char *names[] = { - "<>", - "DnsPacket", - "DnsQuestion", - "DnsAnswer", - "DnsAuthority", - "DnsAdditional", - "DnsRrA", - "DnsRrAAAA", - "DnsRrCname", - "DnsName", - "DnsAddr", -}; - -struct VPT -{ - void push( Node::Type type ) - { - std::cout << "push " << names[type] << std::endl; - } - - void pop( Node::Type type ) - { - std::cout << "pop " << names[type] << std::endl; - } - - void setText( const char *txt ) - { - std::cout << "setText: " << txt << std::endl; - } - - void setData( const unsigned char *d, int len ) - { - std::cout << "setData: "; - for ( int i = 0; i < len; i++ ) - std::cout << std::hex << (unsigned int)d[i] << ' '; - std::cout << std::dec << std::endl; - } -}; - -struct Context -{ - VPT vpt; -}; - -struct Packet -{ - -}; - -typedef unsigned char wire_t; - -struct DnsSection -{ - unsigned long count; - Node::Type tree; -}; - -struct DnsParser -{ - DnsParser() {} - - int cs; - - unsigned int names[32]; - int n; - const unsigned char *packet_start; - - unsigned long id; - unsigned long options; - - /* Current section type. indexes into sect and rr arrays. */ - int s; - - static const int nsect = 4; - static const int qd = 0, an = 1; - static const int ns = 2, ar = 3; - - DnsSection sect[nsect]; - static const char *rr[nsect]; - - Context ctx; - - int copyName( char *nb, int max, const unsigned char *ptr ); - - const unsigned char *header( Context *ctx, Packet *packet, - const unsigned char *p, const unsigned char *pe ); - const unsigned char *question( Context *ctx, Packet *packet, - const unsigned char *p, const unsigned char *pe ); - const unsigned char *resource_record( Context *ctx, Packet *packet, - const unsigned char *p, const unsigned char *pe ); - - int data( Context *ctx, Packet *packet, const wire_t *data, int length ); -}; - - -const char *DnsParser::rr[] = { - "QD", "AN", "NS", "AR" -}; - -enum RR -{ - A = 1, CNAME, OTHER -}; - - -%%{ - machine dns; - - alphtype unsigned char; - - octet = any; - - # Tokens generated from RR_UNKNOWN. Used to pick the kind - # of resource record to attempt to parse. - # RR_A = 1; # 1 a host address - # RR_NS = 2; # 2 an authoritative name server - # RR_MD = 3; # 3 a mail destination (Obsolete - use MX) - # RR_MF = 4; # 4 a mail forwarder (Obsolete - use MX) - # RR_CNAME = 5; # 5 the canonical name for an alias - # RR_SOA = 6; # 6 marks the start of a zone of authority - # RR_MB = 7; # 7 a mailbox domain name (EXPERIMENTAL) - # RR_MG = 8; # 8 a mail group member (EXPERIMENTAL) - # RR_MR = 9; # 9 a mail rename domain name (EXPERIMENTAL) - # RR_NULL = 10; # 10 a null RR (EXPERIMENTAL) - # RR_WKS = 11; # 11 a well known service description - # RR_PTR = 12; # 12 a domain name pointer - # RR_HINFO = 13; # 13 host information - # RR_MINFO = 14; # 14 mailbox or mail list information - # RR_MX = 15; # 15 mail exchange - # RR_TXT = 16; # 16 text strings - - RR_A = 0x00 0x01; - RR_NS = 0x00 0x02; - RR_CNAME = 0x00 0x05; - RR_PTR = 0x00 0x0c; - RR_MX = 0x00 0x0f; - RR_AAAA = 0x00 0x1c; - - RR_ALL = RR_A | RR_NS | RR_CNAME | RR_PTR | RR_MX | RR_AAAA; - RR_UNKNOWN = ( octet octet ) - RR_ALL; - - # - # Names - # - - action part_len { - nbytes = *p; - } - - part_len = - ( 0x1 .. 0x3f ) @part_len; - - action nb_init { b = 0; } - action nb_inc { b++; } - action nb_min { b >= nbytes } - action nb_max { b < nbytes } - - nbytes = :condplus( octet, nb_init, nb_inc, nb_min, nb_max ):; - - name_part = - part_len nbytes; - - # Name part lists are terminated by a zero length or a pointer. - name_end = - # Zero length ending - 0 - - # Pointer ending - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | 1 1| OFFSET | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - | ( 0xc0 .. 0xff ) octet; - - action name_begin - { - name_start = p; - } - - # Copy names into 'namebuf'. This can be a pointer into a current buffer, - # in case there are multiple names we need. - action name_end { - if ( namebuf != 0 ) { - int r = copyName( namebuf, nblen, name_start ); - - /* Print names from the answer section. */ - if ( r < 0 ) { - std::cerr << "error copying name" << std::endl; - std::cerr << " " << rr[s] << " name: " << - namebuf << " off: " << name_start - packet_start << std::endl; - } - } - - /* Register the parts of the name. Doing this after the above copy - * prevents a self-reference. */ - const unsigned char *s = name_start; - while ( true ) { - unsigned int sl = *s; - if ( *s < 1 || *s > 63 ) - break; - else { - unsigned int off = s - packet_start; - names[n++] = off; - s += 1 + sl; - } - - } - } - - name = - ( name_part** <: name_end ) >name_begin @name_end; - - - # Message Header - # - # 1 1 1 1 1 1 - # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | ID | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # |QR| Opcode |AA|TC|RD|RA| Z | RCODE | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | QDCOUNT | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | ANCOUNT | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | NSCOUNT | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | ARCOUNT | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - - header_id = - octet @{ id = (unsigned long)*p << 8; } - octet @{ id |= (unsigned long)*p; }; - - header_fields = - octet @{ options = (unsigned long)*p << 8; } - octet @{ options |= (unsigned long)*p; }; - - count = - octet @{ count = (unsigned long)*p << 8; } - octet @{ count |= (unsigned long)*p; }; - - header = - header_id header_fields - count @{ sect[qd].count = count; sect[qd].tree = Node::DnsQuestion; } - count @{ sect[an].count = count; sect[an].tree = Node::DnsAnswer; } - count @{ sect[ns].count = count; sect[ns].tree = Node::DnsAuthority; } - count @{ sect[ar].count = count; sect[ar].tree = Node::DnsAdditional; } - @{ - std::cout << "dns header:" - " id: " << id << " opts: " << options << - " qd: " << sect[qd].count << " an: " << sect[an].count << - " ns: " << sect[ns].count << " ar: " << sect[ar].count << std::endl; - fbreak; - }; - - # - # Question - # - - # 1 1 1 1 1 1 - # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | | - # / QNAME / - # / / - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | QTYPE | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | QCLASS | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - - qtype = - octet octet; - - qclass = - octet octet; - - question = - name qtype qclass - @{ - std::cout << " QD " << namebuf << std::endl; - fbreak; - }; - - # - # Address - # - address4 = - octet @{ addr[0] = *p; } - octet @{ addr[1] = *p; } - octet @{ addr[2] = *p; } - octet @{ addr[3] = *p; } - ; - - address6 = - octet @{ addr[ 0] = *p; } octet @{ addr[ 1] = *p; } - octet @{ addr[ 2] = *p; } octet @{ addr[ 3] = *p; } - octet @{ addr[ 4] = *p; } octet @{ addr[ 5] = *p; } - octet @{ addr[ 6] = *p; } octet @{ addr[ 7] = *p; } - octet @{ addr[ 8] = *p; } octet @{ addr[ 9] = *p; } - octet @{ addr[10] = *p; } octet @{ addr[11] = *p; } - octet @{ addr[12] = *p; } octet @{ addr[13] = *p; } - octet @{ addr[14] = *p; } octet @{ addr[15] = *p; } - ; - - - # - # Resource Records - # - - # 1 1 1 1 1 1 - # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | | - # / / - # / NAME / - # | | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | TYPE | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | CLASS | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | TTL | - # | | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - # | RDLENGTH | - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--| - # / RDATA / - # / / - # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - - rr_type = - octet octet; - - rr_class = - octet octet; - - ttl = - octet octet octet octet; - - rdlength = - octet @{ rdlength = (unsigned long)*p << 8; } - octet @{ rdlength |= (unsigned long)*p; }; - - rdata_bytes = - /''/; - - rr_a = - RR_A rr_class ttl rdlength - address4 - @{ - std::cout << " " << rr[s] << " A record: " << namebuf1 << " " << - (unsigned int)addr[0] << '.' << - (unsigned int)addr[1] << '.' << - (unsigned int)addr[2] << '.' << - (unsigned int)addr[3] << std::endl; - fbreak; - }; - - rr_aaaa = - RR_AAAA rr_class ttl rdlength - address6 - @{ - std::cout << " " << rr[s] << " AAAA record: " << namebuf1 << " " << std::hex << - (uint)addr[ 0] << ':' << (uint)addr[ 1] << ':' << - (uint)addr[ 2] << ':' << (uint)addr[ 3] << ":" << - (uint)addr[ 4] << ':' << (uint)addr[ 5] << ':' << - (uint)addr[ 6] << ':' << (uint)addr[ 7] << ":" << - (uint)addr[ 8] << ':' << (uint)addr[ 9] << ':' << - (uint)addr[10] << ':' << (uint)addr[11] << ":" << - (uint)addr[12] << ':' << (uint)addr[13] << ':' << - (uint)addr[14] << ':' << (uint)addr[15] << std::dec << std::endl; - - fbreak; - }; - - - # Skip collection of second name by setting namebuf = 0; */ - rr_ns = - RR_NS @{ namebuf = 0; } rr_class ttl rdlength - name - @{ fbreak; }; - - rr_cname = - RR_CNAME @{ namebuf = namebuf2; } rr_class ttl rdlength - name - @{ - std::cout << " " << rr[s] << " CNAME record: " << namebuf1 << " " << namebuf2 << std::endl; - fbreak; - }; - - rr_ptr = - RR_PTR @{ namebuf = namebuf2; } rr_class ttl rdlength - name - @{ - std::cout << " " << rr[s] << " PTR record: " << namebuf1 << " " << namebuf2 << std::endl; - fbreak; - }; - - preference = octet octet; - - rr_mx = - RR_MX @{ namebuf = 0; } rr_class ttl rdlength - preference name - @{ fbreak; }; - - rr_unknown = - RR_UNKNOWN rr_class ttl rdlength - @{ - std::cout << "skipping other section" << std::endl; - - if ( p + rdlength < pe ) { - /* Position ourselves on the last item and break. Implied - * advance of fbreak will take p to the next char to parse. */ - p = p + rdlength; - fbreak; - } - else { - fgoto *resource_record_error; - } - }; - - resource_record = - name ( rr_a | rr_ns | rr_cname | rr_ptr | rr_mx | rr_aaaa | rr_unknown ); -}%% - -/* - * header - */ - -%%{ - machine header; - include dns; - - main := header; -}%% - -%% write data; - -const unsigned char *DnsParser::header( Context *ctx, Packet *packet, const unsigned char *p, const unsigned char *pe ) -{ - unsigned long count; - - %% write init; - %% write exec; - - if ( cs == %%{ write error; }%% ) { - std::cerr << "header: DNS PARSE FAILED" << std::endl; - p = 0; - } - - return p; -} - -/* - * question - */ - -%%{ - machine question; - include dns; - - main := question; -}%% - -%% write data; - -const unsigned char *DnsParser::question( Context *ctx, Packet *packet, - const unsigned char *p, const unsigned char *pe ) -{ - unsigned long nbytes, b; - - const int nblen = 256; - char namebuf[nblen]; - const unsigned char *name_start; - - %% write init; - %% write exec; - - if ( cs == %%{ write error; }%% ) { - // log_debug( DBG_DNS, "question: DNS PARSE FAILED" ); - return 0; - } - - return p; -} - -/* - * resource_record - */ - -%%{ - machine resource_record; - include dns; - - main := resource_record; -}%% - -%% write data; - -const unsigned char *DnsParser::resource_record( Context *ctx, Packet *packet, - const unsigned char *p, const unsigned char *pe ) -{ - unsigned long nbytes, b; - unsigned long rdlength; - unsigned char addr[16]; - - const int nblen = 256; - char namebuf1[nblen], namebuf2[nblen]; - const unsigned char *name_start; - - char *namebuf = namebuf1; - - %% write init; - %% write exec; - - if ( cs == %%{ write error; }%% ) { - // log_debug( DBG_DNS, "resource_record: DNS PARSE FAILED: " << - // p - packet_start << " " << (unsigned int)*p ); - return 0; - } - - return p; -} - -int DnsParser::copyName( char *nb, int max, const unsigned char *ptr ) -{ - int dlen = 0; - - while ( true ) { - if ( *ptr == 0 ) { - /* done */ - nb[dlen++] = 0; - return 0; - } - else if ( *ptr >= 1 && *ptr <= 63 ) { - /* name section. */ - unsigned int pl = *ptr++; - if ( dlen > 0 ) - nb[dlen++] = '.'; - memcpy( nb + dlen, ptr, pl ); - ptr += pl; - dlen += pl; - } - - else if ( *ptr >= 192 ) { - unsigned int off = ( *ptr++ & 0x3f ) << 8; - off |= *ptr; - - /* find in the set of names. */ - bool good = false; - for ( int i = 0; i < n; i++ ) { - if ( names[i] == off ) { - /* good name. */ - good = true; - } - } - - if ( !good ) { - nb[0] = 0; - return -1; - } - - ptr = packet_start + off; - } - } - - return 0; -} - -int DnsParser::data( Context *ctx, Packet *packet, const unsigned char *data, int length ) -{ - const unsigned char *p = data; - const unsigned char *pe = data + length; - - //ctx->vpt.push( Node::DnsPacket ); - - packet_start = data; - n = 0; - - p = header( ctx, packet, p, pe ); - - if ( p == 0 ) - return -1; - - /* Iterate over the section type. */ - for ( s = 0; s < 4; s++ ) { - if ( sect[s].count > 0 ) { - - //ctx->vpt.push( sect[s].tree ); - - while ( p != 0 && sect[s].count > 0 ) { - p = s == qd ? question( ctx, packet, p, pe ) : resource_record( ctx, packet, p, pe ); - sect[s].count -= 1; - } - - //ctx->vpt.pop( sect[s].tree ); - } - } - - //ctx->vpt.pop( Node::DnsPacket ); - - return 0; -} - -void handler( u_char *user, const struct pcap_pkthdr *h, const u_char *bytes ) -{ - DnsParser *dnsParser = reinterpret_cast( user ); - - const ethhdr *eh = reinterpret_cast( bytes ); - -// dnsParser->data( &dnsParser->ctx, 0, bytes, Packet::UnknownDir, h, bytes ); - //std::cerr << "handler" << std::endl; - - if ( eh->h_proto == htons( ETH_P_IP ) ) { - const iphdr *ih = reinterpret_cast(bytes + sizeof(struct ethhdr)); - const int ihlen = ih->ihl * 4; - - if ( ih->protocol == IPPROTO_UDP ) { - - const udphdr *uh = reinterpret_cast( (u_char*)ih + ihlen ); - const int uhlen = sizeof( struct udphdr ); - - /* TCP data and data length. */ - u_char *data = (u_char*)uh + uhlen; - int dlen = ntohs(ih->tot_len) - uhlen - ihlen; - int caplen = h->caplen - uhlen - ihlen - sizeof(struct ethhdr); - - if ( ntohs(uh->source) == 53 || ntohs(uh->dest) == 53 ) { - dnsParser->data( &dnsParser->ctx, 0, data, dlen ); - } - } - } -} - -int main( int argc, const char **argv ) -{ - char errbuf[PCAP_ERRBUF_SIZE]; - DnsParser dnsParser; - - if ( argc < 2 ) { - std::cerr << "usage: ./dns " << std::endl; - return -1; - } - - pcap_t *pcap = pcap_open_offline( argv[1], errbuf ); - if ( pcap == NULL ) { - std::cerr << "couldn't open file " << argv[1] << ": " << errbuf << std::endl; - return -1; - } - - int datalink = pcap_datalink( pcap ); - if ( datalink != DLT_EN10MB ) { - std::cerr << "expected EN10MB datalink\n" << std::endl; - return -1; - } - - int packets = pcap_dispatch( pcap, -1, &handler, (u_char*)&dnsParser ); - if ( packets < 0 ) { - std::cerr << "error from pcap_dispatch: " << packets << std::endl; - } - - pcap_close( pcap ); - return 0; -} diff --git a/grammar/dns/.gitignore b/grammar/dns/.gitignore new file mode 100644 index 00000000..f2c6cd5b --- /dev/null +++ b/grammar/dns/.gitignore @@ -0,0 +1,2 @@ +/dns.cc +/dns diff --git a/grammar/dns/Makefile b/grammar/dns/Makefile new file mode 100644 index 00000000..b1a87107 --- /dev/null +++ b/grammar/dns/Makefile @@ -0,0 +1,9 @@ +COLM = ../../colm/colm +RAGEL = ../../ragel/ragel + +all: dns + +dns: dns.rl + $(RAGEL) -G2 -o dns.cc dns.rl + g++ -g -Wall -o dns dns.cc -lpcap + diff --git a/grammar/dns/dns.rl b/grammar/dns/dns.rl new file mode 100644 index 00000000..215c61bd --- /dev/null +++ b/grammar/dns/dns.rl @@ -0,0 +1,697 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +struct Node +{ + enum Type + { + DnsPacket = 1, + DnsQuestion, + DnsAnswer, + DnsAuthority, + DnsAdditional, + DnsRrA, + DnsRrAAAA, + DnsRrCname, + + DnsName, + DnsAddr, + }; +}; + +const char *names[] = { + "<>", + "DnsPacket", + "DnsQuestion", + "DnsAnswer", + "DnsAuthority", + "DnsAdditional", + "DnsRrA", + "DnsRrAAAA", + "DnsRrCname", + "DnsName", + "DnsAddr", +}; + +struct VPT +{ + void push( Node::Type type ) + { + std::cout << "push " << names[type] << std::endl; + } + + void pop( Node::Type type ) + { + std::cout << "pop " << names[type] << std::endl; + } + + void setText( const char *txt ) + { + std::cout << "setText: " << txt << std::endl; + } + + void setData( const unsigned char *d, int len ) + { + std::cout << "setData: "; + for ( int i = 0; i < len; i++ ) + std::cout << std::hex << (unsigned int)d[i] << ' '; + std::cout << std::dec << std::endl; + } +}; + +struct Context +{ + VPT vpt; +}; + +struct Packet +{ + +}; + +typedef unsigned char wire_t; + +struct DnsSection +{ + unsigned long count; + Node::Type tree; +}; + +struct DnsParser +{ + DnsParser() {} + + int cs; + + unsigned int names[32]; + int n; + const unsigned char *packet_start; + + unsigned long id; + unsigned long options; + + /* Current section type. indexes into sect and rr arrays. */ + int s; + + static const int nsect = 4; + static const int qd = 0, an = 1; + static const int ns = 2, ar = 3; + + DnsSection sect[nsect]; + static const char *rr[nsect]; + + Context ctx; + + int copyName( char *nb, int max, const unsigned char *ptr ); + + const unsigned char *header( Context *ctx, Packet *packet, + const unsigned char *p, const unsigned char *pe ); + const unsigned char *question( Context *ctx, Packet *packet, + const unsigned char *p, const unsigned char *pe ); + const unsigned char *resource_record( Context *ctx, Packet *packet, + const unsigned char *p, const unsigned char *pe ); + + int data( Context *ctx, Packet *packet, const wire_t *data, int length ); +}; + + +const char *DnsParser::rr[] = { + "QD", "AN", "NS", "AR" +}; + +enum RR +{ + A = 1, CNAME, OTHER +}; + + +%%{ + machine dns; + + alphtype unsigned char; + + octet = any; + + # Tokens generated from RR_UNKNOWN. Used to pick the kind + # of resource record to attempt to parse. + # RR_A = 1; # 1 a host address + # RR_NS = 2; # 2 an authoritative name server + # RR_MD = 3; # 3 a mail destination (Obsolete - use MX) + # RR_MF = 4; # 4 a mail forwarder (Obsolete - use MX) + # RR_CNAME = 5; # 5 the canonical name for an alias + # RR_SOA = 6; # 6 marks the start of a zone of authority + # RR_MB = 7; # 7 a mailbox domain name (EXPERIMENTAL) + # RR_MG = 8; # 8 a mail group member (EXPERIMENTAL) + # RR_MR = 9; # 9 a mail rename domain name (EXPERIMENTAL) + # RR_NULL = 10; # 10 a null RR (EXPERIMENTAL) + # RR_WKS = 11; # 11 a well known service description + # RR_PTR = 12; # 12 a domain name pointer + # RR_HINFO = 13; # 13 host information + # RR_MINFO = 14; # 14 mailbox or mail list information + # RR_MX = 15; # 15 mail exchange + # RR_TXT = 16; # 16 text strings + + RR_A = 0x00 0x01; + RR_NS = 0x00 0x02; + RR_CNAME = 0x00 0x05; + RR_PTR = 0x00 0x0c; + RR_MX = 0x00 0x0f; + RR_AAAA = 0x00 0x1c; + + RR_ALL = RR_A | RR_NS | RR_CNAME | RR_PTR | RR_MX | RR_AAAA; + RR_UNKNOWN = ( octet octet ) - RR_ALL; + + # + # Names + # + + action part_len { + nbytes = *p; + } + + part_len = + ( 0x1 .. 0x3f ) @part_len; + + action nb_init { b = 0; } + action nb_inc { b++; } + action nb_min { b >= nbytes } + action nb_max { b < nbytes } + + nbytes = :condplus( octet, nb_init, nb_inc, nb_min, nb_max ):; + + name_part = + part_len nbytes; + + # Name part lists are terminated by a zero length or a pointer. + name_end = + # Zero length ending + 0 + + # Pointer ending + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | 1 1| OFFSET | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + | ( 0xc0 .. 0xff ) octet; + + action name_begin + { + name_start = p; + } + + # Copy names into 'namebuf'. This can be a pointer into a current buffer, + # in case there are multiple names we need. + action name_end { + if ( namebuf != 0 ) { + int r = copyName( namebuf, nblen, name_start ); + + /* Print names from the answer section. */ + if ( r < 0 ) { + std::cerr << "error copying name" << std::endl; + std::cerr << " " << rr[s] << " name: " << + namebuf << " off: " << name_start - packet_start << std::endl; + } + } + + /* Register the parts of the name. Doing this after the above copy + * prevents a self-reference. */ + const unsigned char *s = name_start; + while ( true ) { + unsigned int sl = *s; + if ( *s < 1 || *s > 63 ) + break; + else { + unsigned int off = s - packet_start; + names[n++] = off; + s += 1 + sl; + } + + } + } + + name = + ( name_part** <: name_end ) >name_begin @name_end; + + + # Message Header + # + # 1 1 1 1 1 1 + # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | ID | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # |QR| Opcode |AA|TC|RD|RA| Z | RCODE | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | QDCOUNT | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | ANCOUNT | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | NSCOUNT | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | ARCOUNT | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + + header_id = + octet @{ id = (unsigned long)*p << 8; } + octet @{ id |= (unsigned long)*p; }; + + header_fields = + octet @{ options = (unsigned long)*p << 8; } + octet @{ options |= (unsigned long)*p; }; + + count = + octet @{ count = (unsigned long)*p << 8; } + octet @{ count |= (unsigned long)*p; }; + + header = + header_id header_fields + count @{ sect[qd].count = count; sect[qd].tree = Node::DnsQuestion; } + count @{ sect[an].count = count; sect[an].tree = Node::DnsAnswer; } + count @{ sect[ns].count = count; sect[ns].tree = Node::DnsAuthority; } + count @{ sect[ar].count = count; sect[ar].tree = Node::DnsAdditional; } + @{ + std::cout << "dns header:" + " id: " << id << " opts: " << options << + " qd: " << sect[qd].count << " an: " << sect[an].count << + " ns: " << sect[ns].count << " ar: " << sect[ar].count << std::endl; + fbreak; + }; + + # + # Question + # + + # 1 1 1 1 1 1 + # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | | + # / QNAME / + # / / + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | QTYPE | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | QCLASS | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + + qtype = + octet octet; + + qclass = + octet octet; + + question = + name qtype qclass + @{ + std::cout << " QD " << namebuf << std::endl; + fbreak; + }; + + # + # Address + # + address4 = + octet @{ addr[0] = *p; } + octet @{ addr[1] = *p; } + octet @{ addr[2] = *p; } + octet @{ addr[3] = *p; } + ; + + address6 = + octet @{ addr[ 0] = *p; } octet @{ addr[ 1] = *p; } + octet @{ addr[ 2] = *p; } octet @{ addr[ 3] = *p; } + octet @{ addr[ 4] = *p; } octet @{ addr[ 5] = *p; } + octet @{ addr[ 6] = *p; } octet @{ addr[ 7] = *p; } + octet @{ addr[ 8] = *p; } octet @{ addr[ 9] = *p; } + octet @{ addr[10] = *p; } octet @{ addr[11] = *p; } + octet @{ addr[12] = *p; } octet @{ addr[13] = *p; } + octet @{ addr[14] = *p; } octet @{ addr[15] = *p; } + ; + + + # + # Resource Records + # + + # 1 1 1 1 1 1 + # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | | + # / / + # / NAME / + # | | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | TYPE | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | CLASS | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | TTL | + # | | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + # | RDLENGTH | + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--| + # / RDATA / + # / / + # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + + rr_type = + octet octet; + + rr_class = + octet octet; + + ttl = + octet octet octet octet; + + rdlength = + octet @{ rdlength = (unsigned long)*p << 8; } + octet @{ rdlength |= (unsigned long)*p; }; + + rdata_bytes = + /''/; + + rr_a = + RR_A rr_class ttl rdlength + address4 + @{ + std::cout << " " << rr[s] << " A record: " << namebuf1 << " " << + (unsigned int)addr[0] << '.' << + (unsigned int)addr[1] << '.' << + (unsigned int)addr[2] << '.' << + (unsigned int)addr[3] << std::endl; + fbreak; + }; + + rr_aaaa = + RR_AAAA rr_class ttl rdlength + address6 + @{ + std::cout << " " << rr[s] << " AAAA record: " << namebuf1 << " " << std::hex << + (uint)addr[ 0] << ':' << (uint)addr[ 1] << ':' << + (uint)addr[ 2] << ':' << (uint)addr[ 3] << ":" << + (uint)addr[ 4] << ':' << (uint)addr[ 5] << ':' << + (uint)addr[ 6] << ':' << (uint)addr[ 7] << ":" << + (uint)addr[ 8] << ':' << (uint)addr[ 9] << ':' << + (uint)addr[10] << ':' << (uint)addr[11] << ":" << + (uint)addr[12] << ':' << (uint)addr[13] << ':' << + (uint)addr[14] << ':' << (uint)addr[15] << std::dec << std::endl; + + fbreak; + }; + + + # Skip collection of second name by setting namebuf = 0; */ + rr_ns = + RR_NS @{ namebuf = 0; } rr_class ttl rdlength + name + @{ fbreak; }; + + rr_cname = + RR_CNAME @{ namebuf = namebuf2; } rr_class ttl rdlength + name + @{ + std::cout << " " << rr[s] << " CNAME record: " << namebuf1 << " " << namebuf2 << std::endl; + fbreak; + }; + + rr_ptr = + RR_PTR @{ namebuf = namebuf2; } rr_class ttl rdlength + name + @{ + std::cout << " " << rr[s] << " PTR record: " << namebuf1 << " " << namebuf2 << std::endl; + fbreak; + }; + + preference = octet octet; + + rr_mx = + RR_MX @{ namebuf = 0; } rr_class ttl rdlength + preference name + @{ fbreak; }; + + rr_unknown = + RR_UNKNOWN rr_class ttl rdlength + @{ + std::cout << "skipping other section" << std::endl; + + if ( p + rdlength < pe ) { + /* Position ourselves on the last item and break. Implied + * advance of fbreak will take p to the next char to parse. */ + p = p + rdlength; + fbreak; + } + else { + fgoto *resource_record_error; + } + }; + + resource_record = + name ( rr_a | rr_ns | rr_cname | rr_ptr | rr_mx | rr_aaaa | rr_unknown ); +}%% + +/* + * header + */ + +%%{ + machine header; + include dns; + + main := header; +}%% + +%% write data; + +const unsigned char *DnsParser::header( Context *ctx, Packet *packet, const unsigned char *p, const unsigned char *pe ) +{ + unsigned long count; + + %% write init; + %% write exec; + + if ( cs == %%{ write error; }%% ) { + std::cerr << "header: DNS PARSE FAILED" << std::endl; + p = 0; + } + + return p; +} + +/* + * question + */ + +%%{ + machine question; + include dns; + + main := question; +}%% + +%% write data; + +const unsigned char *DnsParser::question( Context *ctx, Packet *packet, + const unsigned char *p, const unsigned char *pe ) +{ + unsigned long nbytes, b; + + const int nblen = 256; + char namebuf[nblen]; + const unsigned char *name_start; + + %% write init; + %% write exec; + + if ( cs == %%{ write error; }%% ) { + // log_debug( DBG_DNS, "question: DNS PARSE FAILED" ); + return 0; + } + + return p; +} + +/* + * resource_record + */ + +%%{ + machine resource_record; + include dns; + + main := resource_record; +}%% + +%% write data; + +const unsigned char *DnsParser::resource_record( Context *ctx, Packet *packet, + const unsigned char *p, const unsigned char *pe ) +{ + unsigned long nbytes, b; + unsigned long rdlength; + unsigned char addr[16]; + + const int nblen = 256; + char namebuf1[nblen], namebuf2[nblen]; + const unsigned char *name_start; + + char *namebuf = namebuf1; + + %% write init; + %% write exec; + + if ( cs == %%{ write error; }%% ) { + // log_debug( DBG_DNS, "resource_record: DNS PARSE FAILED: " << + // p - packet_start << " " << (unsigned int)*p ); + return 0; + } + + return p; +} + +int DnsParser::copyName( char *nb, int max, const unsigned char *ptr ) +{ + int dlen = 0; + + while ( true ) { + if ( *ptr == 0 ) { + /* done */ + nb[dlen++] = 0; + return 0; + } + else if ( *ptr >= 1 && *ptr <= 63 ) { + /* name section. */ + unsigned int pl = *ptr++; + if ( dlen > 0 ) + nb[dlen++] = '.'; + memcpy( nb + dlen, ptr, pl ); + ptr += pl; + dlen += pl; + } + + else if ( *ptr >= 192 ) { + unsigned int off = ( *ptr++ & 0x3f ) << 8; + off |= *ptr; + + /* find in the set of names. */ + bool good = false; + for ( int i = 0; i < n; i++ ) { + if ( names[i] == off ) { + /* good name. */ + good = true; + } + } + + if ( !good ) { + nb[0] = 0; + return -1; + } + + ptr = packet_start + off; + } + } + + return 0; +} + +int DnsParser::data( Context *ctx, Packet *packet, const unsigned char *data, int length ) +{ + const unsigned char *p = data; + const unsigned char *pe = data + length; + + //ctx->vpt.push( Node::DnsPacket ); + + packet_start = data; + n = 0; + + p = header( ctx, packet, p, pe ); + + if ( p == 0 ) + return -1; + + /* Iterate over the section type. */ + for ( s = 0; s < 4; s++ ) { + if ( sect[s].count > 0 ) { + + //ctx->vpt.push( sect[s].tree ); + + while ( p != 0 && sect[s].count > 0 ) { + p = s == qd ? question( ctx, packet, p, pe ) : resource_record( ctx, packet, p, pe ); + sect[s].count -= 1; + } + + //ctx->vpt.pop( sect[s].tree ); + } + } + + //ctx->vpt.pop( Node::DnsPacket ); + + return 0; +} + +void handler( u_char *user, const struct pcap_pkthdr *h, const u_char *bytes ) +{ + DnsParser *dnsParser = reinterpret_cast( user ); + + const ethhdr *eh = reinterpret_cast( bytes ); + +// dnsParser->data( &dnsParser->ctx, 0, bytes, Packet::UnknownDir, h, bytes ); + //std::cerr << "handler" << std::endl; + + if ( eh->h_proto == htons( ETH_P_IP ) ) { + const iphdr *ih = reinterpret_cast(bytes + sizeof(struct ethhdr)); + const int ihlen = ih->ihl * 4; + + if ( ih->protocol == IPPROTO_UDP ) { + + const udphdr *uh = reinterpret_cast( (u_char*)ih + ihlen ); + const int uhlen = sizeof( struct udphdr ); + + /* TCP data and data length. */ + u_char *data = (u_char*)uh + uhlen; + int dlen = ntohs(ih->tot_len) - uhlen - ihlen; + int caplen = h->caplen - uhlen - ihlen - sizeof(struct ethhdr); + + if ( ntohs(uh->source) == 53 || ntohs(uh->dest) == 53 ) { + dnsParser->data( &dnsParser->ctx, 0, data, dlen ); + } + } + } +} + +int main( int argc, const char **argv ) +{ + char errbuf[PCAP_ERRBUF_SIZE]; + DnsParser dnsParser; + + if ( argc < 2 ) { + std::cerr << "usage: ./dns " << std::endl; + return -1; + } + + pcap_t *pcap = pcap_open_offline( argv[1], errbuf ); + if ( pcap == NULL ) { + std::cerr << "couldn't open file " << argv[1] << ": " << errbuf << std::endl; + return -1; + } + + int datalink = pcap_datalink( pcap ); + if ( datalink != DLT_EN10MB ) { + std::cerr << "expected EN10MB datalink\n" << std::endl; + return -1; + } + + int packets = pcap_dispatch( pcap, -1, &handler, (u_char*)&dnsParser ); + if ( packets < 0 ) { + std::cerr << "error from pcap_dispatch: " << packets << std::endl; + } + + pcap_close( pcap ); + return 0; +} diff --git a/grammar/parserust.lm b/grammar/parserust.lm deleted file mode 100644 index 3e17a3b6..00000000 --- a/grammar/parserust.lm +++ /dev/null @@ -1,81 +0,0 @@ -include 'rust.lm' - -parse P: program [stdin] - -if P { - for FN: function in P { - print "function: [FN.id] - - for CE: compound_expression in FN { - if match CE [assignment_expression compound_op compound_expression] - print " compound expression: [CE] - } - - for AE: assignment_expression in FN { - if match AE [range_expression `= assignment_expression] - print " assignment expression: [AE] - } - - for RE: range_expression in FN { - if !match RE [lazy_disjunction] - print " range expression: [RE] - } - - for LD: lazy_disjunction in FN { - if !match LD [lazy_conjunction] - print " lazy disjunction: [LD] - } - - for LC: lazy_conjunction in FN { - if !match LC [comparison] - print " lazy conjunction: [LC] - } - - for C: comparison in FN { - if !match C [bitwise_or] - print " comparison: [C] - } - - for P: pattern in FN { - print " pattern: [P] - } - - for MA: match_arm in FN { - print " match_arm: [MA] - } - - for CL: cons_list in FN { - print " construct list: [^CL] - } - - for M: mult in FN { - if !match M [as] - print " mult: [^M] - } - - for TP: tuple_pattern in FN { - print " tuple pattern: [TP] - } - - for TP: grouped_pattern in FN { - print " grouped pattern: [TP] - } - - for CP: closure_param in FN { - print " closure param: [CP] - } - } - - for M: method in P { - print "method: [M.id] - } - - for MR: macro_rule in P { - print " macro matcher: [MR.macro_matcher] - } -} -else { - send stderr "failed to parse input [error] - exit(1) -} - diff --git a/grammar/pcre.lm b/grammar/pcre.lm deleted file mode 100644 index 691b1307..00000000 --- a/grammar/pcre.lm +++ /dev/null @@ -1,583 +0,0 @@ -global Backrefs: int = 0 - -lex - token pre_equals /'='/ -end - -token alpha_char - / [a-zA-Z] / - -token digit_char - / [0-9] / - -rl alpha_nums - / (alpha_char | '_' ) (alpha_char | '_' | digit_char)* / - -rl alpha_numeric - / 'a'..'z' | 'A'..'Z' | '0'..'9' / - -rl alpha_numerics - / alpha_numeric+ / - -rl hex_digit - / '0'..'9' | 'a'..'f' | 'A'..'F' / - -literal `| `^ -literal `. `? `+ `* -literal `{ `} - -# It is important that these all go into the same lexical region, so we get a -# longest-match with no backtracking among these lexical options. Probably need -# to separate mainline regex from character class regex lexical, but for now -# they are the same regions. -lex - literal `[ - token cc_open_caret /"[^"/ - token cc_open_caret_close /"[^]"/ - token cc_open_close /"[]"/ -end - -literal `] -literal `( `) -literal `< `> -literal `, `: `- `_ `= `! -literal `# `& `$ - -token NL - / '\r' ? '\n' / - -token number - /[0-9]+/ - -# With greedy (default) or lazy (?), we are always attempting all matches. But -# possessive (+) prunes paths, so it must force the pattern to become a -# prefilter. -def quantifier_type - [`+] -| [`?] -| [] - -def general_repetition - [`{ number `} ] -| [`{ number comma `} ] -| [`{ number comma number `} ] - -def quantifier - [`? quantifier_type] :Question -| [`* quantifier_type] :Star -| [`+ quantifier_type] :Plus -| [general_repetition quantifier_type] :General -| [] :Base - -token sr_R /'R'/ -token sr_P /'P'/ - -def subroutine_reference - [`( `? sr_R `)] -| [`( `? number `)] -| [`( `? `+ number `)] -| [`( `? `- number `)] -| [`( `? `& name `)] -| [`( `? sr_P `> name `)] -| [br_g `< name `>] -| [br_g `< number `>] -| [br_g `< `+ number `>] -| [br_g `< `- number `>] -| [br_g single_quote name single_quote ] -| [br_g single_quote number single_quote] -| [br_g single_quote `+ number single_quote] -| [br_g single_quote `- number single_quote] - -token ns_open /'[[:'/ - -lex - token ns_caret /'^'/ - token ns_word /alpha_numerics/ - token ns_close /':]]'/ -end - -def posix_named_set - [ns_open ns_caret? ns_word ns_close] - -token reset_start_match - / '\\K' / - -def shared_atom - [decimal_digit] :DecimalDigit -| [not_decimal_digit] :NotDecimalDigit -| [horizontal_white_space] :HorizonalWhiteSpace -| [not_horizontal_white_space] :NotHorizontalWhiteSpace -| [not_new_line] :NotNewLine -| [new_line_sequence] :NewLineSequence -| [white_space] :WhiteSpace -| [not_white_space] :NotWhiteSpace -| [vertical_white_space] :VerticalWhiteSpace -| [not_vertical_white_space] :NotVerticalWhiteSpace -| [word_char] :WordChar -| [not_word_char] :NotWordChar -| [posix_named_set] :PosixNamedSet -| [char_with_property] :CharWithProperty -| [char_without_property] :CharWithoutProperty -| [control_char] :ControlChar - -def shared_literal - [octal] :Octal -| [alpha_char] :AlphaChar -| [digit_char] :DigitChar -| [bell_char] :BellChar -| [escape_char] :EscapeChar -| [form_feed] :FormFeed -| [new_line] :NewLine -| [carriage_ret] :CarriageRet -| [tab] :Tab -| [hex_char_fixed] :HexCharFixed -| [hex_char_var] :HexCharVar -| [quoted] :Quoted -| [block_quoted] :BlockQuoted -| [open_brace] :OpenBrace -| [close_brace] :CloseBrace -| [comma] :Comma -| [hyphen] :Hypen -| [less_than] :LessThan -| [greater_than] :GreaterThan -| [single_quote] :SingleQuote -| [underscore] :Underscore -| [colon] :Colon -| [hash] :Hash -| [equals] :Equals -| [exclamation] :Excalmation -| [ampersand] :Ampersand -| [other_char_printable] :OtherCharPrintable -| [other_char_non_printable] :OhterCharNonPrintable - -token name - / alpha_nums / - -token bell_char / '\\a' / -token escape_char / '\\e' / -token form_feed / '\\f' / -token new_line / '\\n' / -token carriage_ret / '\\r' / -token tab / '\\t' / -token control_char - / '\\c' ( 0x00 .. 0x7c ) / - -token underscore_alpha_numerics - / ('_' | alpha_numeric)+ / - -rl non_alpha_numeric - / ^alpha_numeric / - -token quoted - /'\\' non_alpha_numeric/ - -token bs_Q - /'\\Q'/ - -lex - # String of non-backslash chars. Or a single backslash. - token block_data / ( [^\\]+ ) | '\\' / - token block_end /'\\E'/ -end - -token block_quoted - /bs_Q block_data* block_end/ - -def hyphen [ `- ] -def less_than [ `< ] -def greater_than [ `> ] -def underscore [ `_ ] -def colon [ `: ] -def equals [ `= ] -def exclamation [ `! ] -def ampersand [ `& ] -def hash [ `# ] -def dollar [ `$ ] - -token single_quote - / "'" / - -token other_char_printable - / ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/' / - -token other_char_non_printable - / ^( 0 .. 127 ) / - -token P / 'P' / - -def capture_form - [`? `< name `> regex] :NamedPerl -| [`? single_quote name single_quote regex] :NamedQuoted -| [`? P `< name `> regex] :NamedPython -| [regex] :Unamed - -def capture - # This ID is for the ragel implementation. We use the nfa repetition - # operator, which needs an id. - [`( capture_form `)] :Capture - { - Backrefs = Backrefs + 1 - } - -def option_spec - [Add: option_flags `- Remove: option_flags] -| [Add: option_flags] -| [`- Remove: option_flags] - -def non_capture - [`( `? `: regex `)] -| [`( `? option_spec `: regex `)] -| [`( `? `| regex `)] -| [`( `? `> regex `)] - -token non_close_parens - / [^)]+ / - -def comment - [ `( `? `# non_close_parens? `) ] - -def option - [`( `? option_spec `)] -| [`( `* no_start_opt `)] -| [`( `* utf8 `)] -| [`( `* utf16 `)] -| [`( `* ucp `)] - -def option_flags - [option_flag+] - -token option_flag / 'i' | 'J' | 'm' | 's' | 'U' | 'x' / - -token no_start_opt / 'NO_START_OPT' / -token utf8 / 'UTF8' / -token utf16 / 'UTF16' / -token ucp / 'UCP' / - -def look_ahead - [`( `? `= regex `)] -| [`( `? `! regex `)] - -def look_behind - [`( `? `< `= regex `)] -| [`( `? `< `! regex `)] - -def look_around - [look_ahead] -| [look_behind] - -token br_g / '\\g' / -token br_k / '\\k' / - -token maybe_backref / '\\' [1-9] [0-9]* / - -lex - token maybe_octal / - '\\' ( - [1-3] [0-7] [0-7] | - [1-7] [0-7] - ) - / - - token def_octal / - '\\' ( - [0] [0-7] [0-7] | - [0] [0-7] | - [0] - ) - / -end - -token else_digits / '\\' [0-9]+ / - -bool is_backref( Num: str ) -{ - Num = suffix( Num, 1 ) - Ref: int = atoi( Num ) - if ( Ref < 8 || Ref <= Backrefs ) - return true - return false -} - -# Simple disambig between octals and backrefs. Reject octals that can be a -# backref, as determined by counting the number of captures. -def octal - [maybe_octal] :Maybe - { - if ( is_backref( $lhs.maybe_octal ) ) - reject - } -| [def_octal] :Def - -def backref - [maybe_backref] - { - if ( !is_backref( $lhs.maybe_backref ) ) - reject - } -| [br_g number] -| [br_g `{ number `}] -| [br_g `{ `- number `}] -| [br_k `< name `>] -| [br_k single_quote name single_quote] -| [br_g `{ name `}] -| [br_k `{ name `}] -| [`( `? P `= name `)] - -def literal_digits - [else_digits] - -def cond_ref - [number] -| [`+ number] -| [`- number] -| [`< name `>] -| [single_quote name single_quote] -| [cond_ref_R number] -| [cond_ref_R] -| [cond_ref_R `& name] -| [cond_ref_DEFINE] -| [cond_ref_assert] -| [name] - -token cond_ref_DEFINE / 'DEFINE' / -token cond_ref_assert / 'assert' / -token cond_ref_R / 'R' / - -def cond_false - [`| regex ] - -def conditional - [`( `? `( cond_ref `) regex cond_false? `)] - -token btc_accept / 'ACCEPT' / -token btc_fail / 'F' ( 'AIL' )? / -token btc_mark_name / ('MARK')? ':NAME' / -token btc_commit / 'COMMIT' / -token btc_prune / 'PRUNE' / -token btc_prune_name / 'PRUNE:NAME)' / -token btc_skip / 'SKIP' / -token btc_skip_name / 'SKIP:NAME' / -token btc_then / 'THEN' / -token btc_then_name / 'THEN:NAME' / - -def btc_type - [btc_accept] -| [btc_fail] -| [btc_mark_name] -| [btc_commit] -| [btc_prune] -| [btc_prune_name] -| [btc_skip] -| [btc_skip_name] -| [btc_then] -| [btc_then_name] - -def backtrack_control - [ `( `* btc_type `) ] - -token nlc_cr / 'CR' / -token nlc_lf / 'LF' / -token nlc_crlf / 'CRLF' / -token nlc_anycrlf / 'ANYCRLF' / -token nlc_any / 'ANY' / -token nlc_bsr_anycrlf / 'BSR_ANYCRLF' / -token nlc_bsr_unicodo / 'BSR_UNICODE' / - -def nlc_type - [nlc_cr] -| [nlc_lf] -| [nlc_crlf] -| [nlc_anycrlf] -| [nlc_any] -| [nlc_bsr_anycrlf] -| [nlc_bsr_unicodo] - -def newline_convention - [ `( `* nlc_type `) ] - -token callout_C / 'C' / - -def callout - [ `( `? callout_C `) ] -| [ `( `? callout_C number `) ] - -def char_class_start [ `[ ] -def char_class_end [ `] ] -def dot [ `. ] -def caret [ `^ ] -def question_mark [ `? ] -def plus [ `+ ] -def star [ `* ] -def open_brace [ `{ ] -def close_brace [ `} ] -def comma [ `, ] -def pipe [ `| ] -def open_paren [ `( ] -def close_paren [ `) ] - -lex - token hex_char_fixed - / '\\x' hex_digit hex_digit / - - token hex_char_var - / '\\x' '{' hex_digit hex_digit hex_digit+ '}' / -end - -# -# Anchors -# - -token word_boundary / '\\b' / -token non_word_boundary / '\\B' / - -token sos_A - / '\\A' / - -def start_of_subject - [`^] -| [sos_A] - -token eos_z / '\\z' / -token eos_Z / '\\Z' / - -def end_of_subject - [`$] -| [eos_Z] -| [eos_z] - -token first_matching_pos - / '\\G' / - -def anchor - [word_boundary] -| [non_word_boundary] -| [start_of_subject] -| [end_of_subject] -| [first_matching_pos] - -# -# Character classes -# - -def cc_atom_list - [cc_atom cc_atom*] - -def character_class - [`[ cc_atom_list `]] -| [cc_open_caret cc_atom_list `]] -| [cc_open_caret_close cc_atom* `]] -| [cc_open_close cc_atom* `]] -| [cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]] -| [cc_open_close hyphen cc_atom_end_range cc_atom* `]] - -def cc_atom_end_range - [cc_atom] - -def cc_atom - [cc_literal hyphen cc_literal] -| [shared_atom] -| [cc_literal] -| [octal] - -def cc_literal - [shared_literal] -| [dot] -| [char_class_start] -| [caret] -| [question_mark] -| [plus] -| [star] -| [word_boundary] -| [non_word_boundary] -| [dollar] -| [pipe] -| [open_paren] -| [close_paren] - -token decimal_digit / '\\d' / -token not_decimal_digit / '\\D' / -token horizontal_white_space / '\\h' / -token not_horizontal_white_space / '\\H' / -token not_new_line / '\\N' / -token new_line_sequence / '\\R' / -token white_space / '\\s' / -token not_white_space / '\\S' / -token vertical_white_space / '\\v' / -token not_vertical_white_space / '\\V' / -token word_char / '\\w' / -token not_word_char / '\\W' / - -token one_data_unit / '\\C' / -token extended_unicode_char / '\\X' / - -token with_property_open / '\\p' / -token without_property_open / '\\P' / - -def char_with_property - [with_property_open `{ underscore_alpha_numerics `}] -def char_without_property - [without_property_open `{ underscore_alpha_numerics `}] - -def atom - [shared_atom] :SharedAtom -| [shared_literal] :SharedLiteral -| [char_class_end] :CharClassEnd -| [dot] :Dot -| [character_class] :CharacterClass -| [capture] :Capture -| [non_capture] :NonCapture -| [anchor] :Anchor -| [look_around] :LookAround -| [option] :Option -| [newline_convention] :NewlineConvention -| [callout] :Callout -| [reset_start_match] :ResetStartMatch -| [one_data_unit] :OneDataUnit -| [extended_unicode_char] :ExtendedUnicodeChar -| [backtrack_control] :BacktrackControl -| [backref] :Backref -| [literal_digits] :LiteralDigits -| [subroutine_reference] :SubroutineReference -| [conditional] :Conditional -| [comment] :Comment - -def element - [atom quantifier] :Atom - -def term - [element term] :Element -| [] :Base - -def expr - [expr `| term] :Union -| [term] :Base - -def regex - [expr] :Expr - -def init - [] - { - Backrefs = 0 - } - -token unparseable /[^\n]*/ - -def line - [init regex NL] :Regex commit -| [unparseable NL] :Unparseable commit - -def file - [line*] - - -parse F: file [stdin] - -if !F - print "parse error: [error] -else { - for U: unparseable in F - print "unparseable: [U] - for B: backref in F - print "backref: [B] -} diff --git a/grammar/pcre.rl b/grammar/pcre.rl deleted file mode 100644 index 307d5467..00000000 --- a/grammar/pcre.rl +++ /dev/null @@ -1,864 +0,0 @@ -#include -#include -#include -#include - -#define LIST_APPEND( pphead, element ) \ - { \ - element->next = *pphead; \ - *pphead = element; \ - } - -#define LIST_REMOVE( pphead ) \ - { \ - *pphead = (*pphead)->next; \ - } - -#define STACK_PUSH( pphead, element ) \ - { \ - element->parent = *pphead; \ - *pphead = element; \ - } - -#define STACK_POP( pphead ) \ - { \ - *pphead = (*pphead)->parent; \ - } - -#define REVERSE_LIST( S, pphead ) \ - if ( *pphead != 0 ) { \ - S *front = (*pphead)->next; \ - (*pphead)->next = 0; \ - while ( front != NULL ) { \ - S *next = front->next; \ - front->next = *pphead; \ - *pphead = front; \ - front = next; \ - } \ - } - -#define MAX_VERB 40 -#define QUANTIFIER_INF -1 - -struct value -{ - int v; - - struct value *next; -}; - -enum value_type -{ - dot = 256, -}; - -enum quantifer_type -{ - quant_greedy = 1, - quant_lazy, - quant_possessive, -}; - -struct quantifier -{ - int min, max; - enum quantifer_type type; -}; - -enum element_type -{ - element_value_type = 1, - element_regex_type, - element_look_around_type, - element_char_class_type, - element_backref_type, - element_verb_type, -}; - -struct element -{ - enum element_type type; - struct regex *regex; - struct look_around *look_around; - struct quantifier *quantifier; - struct value *value_list; - struct backref *backref; - struct verb *verb; - - struct element *next; -}; - -struct term -{ - struct element *element_list; - - struct term *next; - struct term *parent; -}; - -enum regex_type -{ - regex_non_capture = 1, - regex_capture, - regex_look_around -}; - -#define REGEX_ATOMIC_GROUP 0x01 -#define REGEX_CAPTURE_RESET 0x02 - -struct regex -{ - struct term *term_list; - - enum regex_type type; - int capture; - int node_id; - int options; - - int flags; - - struct regex *next; - struct regex *parent; -}; - -struct pattern -{ - struct regex *regex; - int node_id; - int is_look_around; - - struct look_around *look_arounds; - struct pattern *parent; -}; - -enum look_around_type { - lat_pos_ahead = 1, - lat_neg_ahead, - lat_pos_behind, - lat_neg_behind, -}; - -struct look_around -{ - enum look_around_type type; - struct pattern *pattern; - int node_id; - - struct look_around *next; -}; - -struct backref -{ - int number; -}; - -struct verb -{ - char *name; - char *value; - int number; -}; - -struct quantifier *new_quantifier() -{ - struct quantifier *quant = malloc( sizeof( struct value ) ); - memset( quant, 0, sizeof(struct quantifier) ); - quant->type = quant_greedy; - return quant; -} - -struct value *new_value( int v ) -{ - struct value *value = malloc( sizeof( struct value ) ); - memset( value, 0, sizeof(struct value) ); - value->v = v; - return value; -} - -struct term *new_term( void ) -{ - struct term *term = malloc( sizeof( struct term ) ); - memset( term, 0, sizeof(struct term) ); - return term; -} - -struct element *new_element( enum element_type type, struct regex *regex, struct look_around *look_around ) -{ - struct element *element = malloc( sizeof( struct element ) ); - memset( element, 0, sizeof(struct element) ); - element->type = type; - element->regex = regex; - element->look_around = look_around; - return element; -} - -struct regex *new_regex( enum regex_type type, int options ) -{ - struct regex *regex = malloc( sizeof( struct regex ) ); - memset( regex, 0, sizeof(struct regex) ); - regex->type = type; - regex->options = options; - return regex; -} - -struct look_around *new_look_around( enum look_around_type lat, struct pattern *pattern ) -{ - struct look_around *look_around = malloc( sizeof( struct look_around ) ); - memset( look_around, 0, sizeof(struct look_around) ); - look_around->type = lat; - look_around->pattern = pattern; - return look_around; -} - -struct pattern *new_pattern( struct regex *regex, int is_look_around ) -{ - struct pattern *pattern = malloc( sizeof( struct pattern ) ); - pattern->regex = regex; - pattern->is_look_around = is_look_around; - return pattern; -} - -struct backref *new_backref( int number ) -{ - struct backref *backref = malloc( sizeof( struct backref ) ); - memset( backref, 0, sizeof(struct backref) ); - backref->number = number; - return backref; -} - -struct verb *new_verb( char *d, int len ) -{ - struct verb *verb = malloc( sizeof( struct verb ) ); - verb->name = malloc( len + 1 ); - memcpy( verb->name, d, len ); - verb->name[len] = 0; - verb->value = 0; - verb->number = -1; - return verb; -} - -void append_element_value( struct term *term, int value ) -{ - struct element *el = term->element_list; - if ( el == NULL || - el->type != element_value_type || - el->quantifier != 0 ) - { - el = new_element( element_value_type, NULL, NULL ); - LIST_APPEND( &term->element_list, el ); - } - - struct value *v = new_value( value ); - LIST_APPEND( &el->value_list, v ); -} - -void reverse_value_list( struct value **pphead ) -{ - REVERSE_LIST( struct value, pphead ); -} - -void reverse_element_list( struct element **pphead ) -{ - REVERSE_LIST( struct element, pphead ); -} - -void reverse_term_list( struct term **pphead ) -{ - REVERSE_LIST( struct term, pphead ); -} - -int is_backref( int num, int num_closed_captures ) -{ - if ( num < 8 || num <= num_closed_captures ) - return 1; - return 0; -} - -void append_backref( struct term *term, int number ) -{ - struct element *el = new_element( element_backref_type, NULL, NULL ); - el->backref = new_backref( number ); - LIST_APPEND( &term->element_list, el ); -} - -/* Provides initial ragel call state stack. */ -int *init_ragel_stack( int *size ) -{ - *size = 24; - return malloc( *size * sizeof(int) ); -} - -/* Grows the ragel call state stack. Use in pre-push. */ -int *grow_ragel_stack( int *size, int *stack ) -{ - int new_size = *size * 2; - int *new_stack = malloc( new_size * sizeof(int) ) ; - memcpy( new_stack, stack, *size * sizeof(int) ); - free( stack ); - - *size = new_size; - return new_stack; -} - -void apply_quantifier( struct term *term, int min, int max ) -{ - struct element *el = term->element_list; - - /* We may need to remove the last value from the element's value list. */ - if ( el->type == element_value_type && el->value_list != NULL && el->value_list->next != NULL ) { - /* Take the value off and add it to an element of its own. */ - struct value *last = el->value_list; - LIST_REMOVE( &el->value_list ); - - el = new_element( element_value_type, NULL, NULL ); - - LIST_APPEND( &el->value_list, last ); - LIST_APPEND( &term->element_list, el ); - } - - el->quantifier = new_quantifier(); - el->quantifier->min = min; - el->quantifier->max = max; -} - -void apply_lazy( struct term *term ) -{ - struct element *el = term->element_list; - el->quantifier->type = quant_lazy; -} - -void apply_possessive( struct term *term ) -{ - struct element *el = term->element_list; - el->quantifier->type = quant_possessive; -} - -%%{ - machine PCRE; - - action enter_term { - struct term *term = new_term(); - STACK_PUSH( &s_term, term ); - LIST_APPEND( &s_regex->term_list, term ); - } - - action leave_term { - reverse_element_list( &s_term->element_list ); - struct element *el = s_term->element_list; - while ( el != 0 ) { - reverse_value_list( &el->value_list ); - el = el->next; - } - STACK_POP( &s_term ); - } - - action enter_regex { - struct regex *regex = new_regex( regex_non_capture, s_regex->options ); - struct element *element = new_element( element_regex_type, regex, NULL ); - LIST_APPEND( &s_term->element_list, element ); - STACK_PUSH( &s_regex, regex ); - } - - action enter_lookaround { - struct regex *regex = new_regex( regex_look_around, s_regex->options ); - struct pattern *pattern = new_pattern( regex, 1 ); - struct look_around *la = new_look_around( lat, pattern ); - pattern->node_id = la->node_id; - - LIST_APPEND( &s_pattern->look_arounds, la ); - - struct element *element = new_element( element_look_around_type, NULL, la ); - LIST_APPEND( &s_term->element_list, element ); - - STACK_PUSH( &s_regex, regex ); - STACK_PUSH( &s_pattern, pattern ); - } - - action leave_regex { - if ( s_regex->type == regex_capture ) - closed_captures += 1; - - if ( s_regex->type == regex_look_around ) - STACK_POP( &s_pattern ); - - reverse_term_list( &s_regex->term_list ); - STACK_POP( &s_regex ); - } - - # - # Numbers - # - action init_number - { number = 0; } - - action decimal_digit - { number = number * 10 + ( *p - '0' ); } - - number = [0-9]+ >init_number $decimal_digit; - - # - # Verbs - # - verb = ( [a-zA-Z_] [a-zA-Z_0-9]* ) >{ verb_len = 0; } ${ - if ( verb_len < MAX_VERB ) { - verb[verb_len++] = *p; - } - }; - - action append_verb { - struct element *el = new_element( element_verb_type, NULL, NULL ); - cur_verb = el->verb = new_verb( verb, verb_len ); - - LIST_APPEND( &s_term->element_list, el ); - } - - action verb_value - { - cur_verb->value = malloc( verb_len + 1 ); - memcpy( cur_verb->value, verb, verb_len ); - cur_verb->value[verb_len] = 0; - } - - action verb_number - { - cur_verb->number = number; - } - - verb_forms = - verb %append_verb ( ':' verb %verb_value | '=' number %verb_number )?; - - action options_only { fret; } - - options = ( - 'c' # case-sensitive matching - | 'i' # case-insensitive matching - )*; - - alpha_char = [a-zA-Z]; - digit_char = [0-9]; - - open_paren = '('; - close_paren = ')'; - - char_class_start = '['; - char_class_end = ']'; - - open_curly = '{'; - close_curly = '}'; - - ampersand = '&'; - colon = ':'; - comma = ','; - dollar = '$'; - dot = '.'; - equals = '='; - exclamation = '!'; - greater_than = '>'; - hash = '#'; - hyphen = '-'; - less_than = '<'; - pipe = '|'; - single_quote = "'"; - underscore = '_'; - - other_char_printable = - ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/'; - - other_char_non_printable = ^( 0 .. 127 ); - - capture_non_capture = - '(' @{ fcall paren_open; }; - - literal_sym = - comma | hyphen | less_than | - greater_than | single_quote | underscore | colon | - hash | equals | exclamation | ampersand; - - action append_char { - append_element_value( s_term, *p ); - } - - literal = ( - alpha_char | - digit_char | - literal_sym | - other_char_printable | - other_char_non_printable - ) @append_char; - - # - # Octals and backrefs. There is an ambiguity between these two, therefore - # the grammar handles them together. - # - - action init_octal - { octal = 0; } - - action octal_digit - { octal = octal * 8 + ( *p - '0' ); } - - backref_num = [1-9] [0-9]*; - - octal_num = - [0-3] [0-7] [0-7] | - [0-7] [0-7] | - [0]; - - action octal { append_element_value( s_term, octal ); } - - action octal_or_backref - { - if ( is_backref( number, closed_captures ) ) - append_backref( s_term, number ); - else - append_element_value( s_term, octal ); - } - - action backref - { - if ( number > closed_captures ) { - printf( "invalid backref: \\%d\n", number ); - fbreak; - } - - append_backref( s_term, number ); - } - - # Certainly octal. All octals, with possibly backrefs removed. - def_octal = - '\\' @init_octal - ( octal_num - backref_num ) $octal_digit; - - # All cases that can be either octal or backref and we need to inspect the - # backref number to decide. Use the intersection of the two numbers as the - # pattern. - octal_or_backref = - '\\' @init_octal @init_number - ( octal_num & backref_num ) $octal_digit $decimal_digit; - - # Definitely backref. The backref pattern excluding anything with a prefix - # that could be octal. - def_backref = - '\\' @init_number - ( backref_num - ( octal_num any* ) ) $decimal_digit; - - octal = - def_octal %octal | - octal_or_backref %octal_or_backref; - - backref = - def_backref %backref | - octal_or_backref %octal_or_backref; - - atom = - literal | - char_class_end @append_char | - dot @{ append_element_value( s_term, dot ); } | - octal | - backref | - open_paren @{ fcall open_paren_forms; } - ; - - non_greedy = - '?' @{ apply_lazy( s_term ); } | - '+' @{ apply_possessive( s_term ); }; - - quant_min = number %{ quant_min = number; }; - quant_max = number %{ quant_max = number; }; - - action quant_zero_plus { apply_quantifier( s_term, 0, QUANTIFIER_INF ); } - action quant_one_plus { apply_quantifier( s_term, 1, QUANTIFIER_INF ); } - action quant_optional { apply_quantifier( s_term, 0, 1 ); } - action quant_exact { apply_quantifier( s_term, quant_min, quant_min ); } - action quant_min { apply_quantifier( s_term, quant_min, QUANTIFIER_INF ); } - action quant_min_max { apply_quantifier( s_term, quant_min, quant_max ); } - - quant_forms = - '*' @quant_zero_plus | - '+' @quant_one_plus | - '?' @quant_optional | - open_curly quant_min close_curly @quant_exact | - open_curly quant_min ',' close_curly @quant_min | - open_curly quant_min ',' quant_max close_curly @quant_min_max; - - quantifier = quant_forms non_greedy?; - - element = ( atom quantifier? ); - - term = ( element** ) >enter_term; - - # - # Expression - # - expr = term ( '|' @leave_term term )*; - - # - # Regex - # - regex = expr; - - - open_paren_forms := - # Look at the first few charcters to see what the form is. What we - # handle here: - # (re) capturing parens - # (?:re) non-capturing parens - # (?=re) positive lookahead - # (?!re) negative lookahead - # (?<=re) positive lookbehind - # (?' @enter_regex - @{ s_regex->flags |= REGEX_ATOMIC_GROUP; } | - - # Capture number reset on branch. - '?|' @enter_regex - @{ s_regex->flags |= REGEX_CAPTURE_RESET; } | - - # Lookaround - ( - '?=' @{ lat = lat_pos_ahead; } @enter_lookaround | - '?!' @{ lat = lat_neg_ahead; } @enter_lookaround | - '?<=' @{ lat = lat_pos_behind; } @enter_lookaround | - '?type = regex_capture; - s_regex->capture = next_capture++; - fhold; - } - - ) - regex ')' @leave_term @leave_regex @{ fret; }; - - - main := regex '\n' @leave_term @leave_regex @{ success = 1; }; -}%% - -%% write data; - -int pcre_parse( struct pattern **result_pattern, char *line, int len ) -{ - int *stack; - int stack_size; - int cs, top; - char *p, *pe; - int success = 0; - int result = 1; - - int number = 0; - int octal = 0; - int quant_min, quant_max; - - struct pattern *s_pattern = NULL; - struct regex *s_regex = NULL; - struct term *s_term = NULL; - - /* Collect into this. */ - struct regex *root_regex = new_regex( regex_non_capture, 0 ); - struct pattern *root_pattern = new_pattern( root_regex, 0 ); - STACK_PUSH( &s_pattern, root_pattern ); - STACK_PUSH( &s_regex, root_regex ); - - int closed_captures = 0; - int next_capture = 1; - - enum look_around_type lat; - - int verb_len; - char verb[MAX_VERB]; - struct verb *cur_verb; - - stack = init_ragel_stack( &stack_size ); - - %% write init; - - p = line; - pe = line + len; - - %% write exec; - - if ( !success ) { - printf( "parse error at %ld\n", ( p - line + 1) ); - result = 0; - } - else if ( - s_pattern == NULL || s_pattern->parent != NULL || - s_regex != NULL || - s_term != NULL ) - { - printf( "parse error: items not closed\n" ); - result = 0; - } - - if ( result ) { - *result_pattern = root_pattern; - } - else { - - } - - free( stack ); - - return result; -} - -extern void print_value( int indent, struct value *value ); -extern void print_element( int indent, struct element *el ); -extern void print_term( int indent, struct term *term ); -extern void print_regex( int indent, struct regex *regex ); - -void print_indent( int indent ) -{ - while ( indent-- > 0 ) - printf( " " ); -} - -void print_value( int indent, struct value *value ) -{ - printf( "v(%d)", value->v ); -} - -void print_element( int indent, struct element *el ) -{ - print_indent( indent ); - printf( "el: " ); - if ( el->type == element_value_type ) { - struct value *value = el->value_list; - while ( value != NULL ) { - print_value( indent + 1, value ); - if ( value->next != 0 ) - printf( " . " ); - value = value->next; - } - } - else if ( el->type == element_regex_type ) { - if ( el->regex != 0 ) - print_regex( indent, el->regex ); - } - else if ( el->type == element_look_around_type ) { - printf( "lookaround " ); - switch ( el->look_around->type ) { - case lat_pos_ahead: printf( "=" ); break; - case lat_neg_ahead: printf( "!" ); break; - case lat_pos_behind: printf( "<=" ); break; - case lat_neg_behind: printf( "look_around->pattern->regex ); - } - else if ( el->type == element_char_class_type ) - printf( "cc" ); - else if ( el->type == element_backref_type ) - printf( "backref(%d)", el->backref->number ); - else if ( el->type == element_verb_type ) { - printf( "verb(%s", el->verb->name ); - if ( el->verb->value != NULL ) - printf( ":%s", el->verb->value ); - else if ( el->verb->number >= 0 ) - printf( "=%d", el->verb->number ); - printf( ")" ); - } - - if ( el->quantifier != NULL ) { - printf( " {%d,", el->quantifier->min ); - - if ( el->quantifier->max == QUANTIFIER_INF ) - printf( "inf" ); - else - printf( "%d", el->quantifier->max ); - - printf( "}" ); - - if ( el->quantifier->type == quant_lazy ) - printf( "?" ); - else if ( el->quantifier->type == quant_possessive ) - printf( "+" ); - - } - - printf("\n"); -} - -void print_term( int indent, struct term *term ) -{ - print_indent( indent ); - printf( "term:\n" ); - struct element *el = term->element_list; - while ( el != NULL ) { - print_element( indent + 1, el ); - el = el->next; - } -} - -void print_regex( int indent, struct regex *regex ) -{ - printf( "reg: " ); - if ( regex->type == regex_capture ) - printf( "cap(%d) ", regex->capture ); - if ( regex->flags & REGEX_ATOMIC_GROUP ) - printf( "atomic_group " ); - if ( regex->flags & REGEX_CAPTURE_RESET ) - printf( "capture_reset " ); - printf( "(\n" ); - - struct term *term = regex->term_list; - while ( term != NULL ) { - print_term( indent + 1, term ); - term = term->next; - } - print_indent( indent ); - printf( ")" ); -} - -void print_pattern( int indent, struct pattern *pat ) -{ - printf( "pat: " ); - print_regex( indent, pat->regex ); - printf( "\n" ); -} - -char linebuf[2048]; - -int main( int argc, char **argv ) -{ - if ( argc < 2 ) { - fprintf( stderr, "usage: ./pcre \n" ); - return -1; - } - - FILE *input = fopen( argv[1], "r" ); - if ( input == NULL ) { - fprintf( stderr, "failed to open %s: %s", argv[0], strerror(errno) ); - return -1; - } - - char *line = NULL; - size_t n = 0; - ssize_t read; - - while ( (read = getline( &line, &n, input )) != -1 ) { - struct pattern *pat; - int parsed = pcre_parse( &pat, line, (int)read ); - if ( parsed ) { - print_pattern( 0, pat ); - } - } - - fclose( input ); - free( line ); - return 0; -} - diff --git a/grammar/pcre/.gitignore b/grammar/pcre/.gitignore new file mode 100644 index 00000000..05b136b8 --- /dev/null +++ b/grammar/pcre/.gitignore @@ -0,0 +1,4 @@ +/pcre-colm.c +/pcre-colm +/pcre-ragel.c +/pcre-ragel diff --git a/grammar/pcre/Makefile b/grammar/pcre/Makefile new file mode 100644 index 00000000..6f895485 --- /dev/null +++ b/grammar/pcre/Makefile @@ -0,0 +1,11 @@ +COLM = ../../colm/colm +RAGEL = ../../ragel/ragel + +all: pcre-colm pcre-ragel + +pcre-colm: pcre.lm + $(COLM) -o $@ pcre.lm + +pcre-ragel: pcre.rl $(RAGEL) + $(RAGEL) -G2 pcre.rl + gcc -g -Wall -o $@ pcre.c diff --git a/grammar/pcre/pcre.lm b/grammar/pcre/pcre.lm new file mode 100644 index 00000000..691b1307 --- /dev/null +++ b/grammar/pcre/pcre.lm @@ -0,0 +1,583 @@ +global Backrefs: int = 0 + +lex + token pre_equals /'='/ +end + +token alpha_char + / [a-zA-Z] / + +token digit_char + / [0-9] / + +rl alpha_nums + / (alpha_char | '_' ) (alpha_char | '_' | digit_char)* / + +rl alpha_numeric + / 'a'..'z' | 'A'..'Z' | '0'..'9' / + +rl alpha_numerics + / alpha_numeric+ / + +rl hex_digit + / '0'..'9' | 'a'..'f' | 'A'..'F' / + +literal `| `^ +literal `. `? `+ `* +literal `{ `} + +# It is important that these all go into the same lexical region, so we get a +# longest-match with no backtracking among these lexical options. Probably need +# to separate mainline regex from character class regex lexical, but for now +# they are the same regions. +lex + literal `[ + token cc_open_caret /"[^"/ + token cc_open_caret_close /"[^]"/ + token cc_open_close /"[]"/ +end + +literal `] +literal `( `) +literal `< `> +literal `, `: `- `_ `= `! +literal `# `& `$ + +token NL + / '\r' ? '\n' / + +token number + /[0-9]+/ + +# With greedy (default) or lazy (?), we are always attempting all matches. But +# possessive (+) prunes paths, so it must force the pattern to become a +# prefilter. +def quantifier_type + [`+] +| [`?] +| [] + +def general_repetition + [`{ number `} ] +| [`{ number comma `} ] +| [`{ number comma number `} ] + +def quantifier + [`? quantifier_type] :Question +| [`* quantifier_type] :Star +| [`+ quantifier_type] :Plus +| [general_repetition quantifier_type] :General +| [] :Base + +token sr_R /'R'/ +token sr_P /'P'/ + +def subroutine_reference + [`( `? sr_R `)] +| [`( `? number `)] +| [`( `? `+ number `)] +| [`( `? `- number `)] +| [`( `? `& name `)] +| [`( `? sr_P `> name `)] +| [br_g `< name `>] +| [br_g `< number `>] +| [br_g `< `+ number `>] +| [br_g `< `- number `>] +| [br_g single_quote name single_quote ] +| [br_g single_quote number single_quote] +| [br_g single_quote `+ number single_quote] +| [br_g single_quote `- number single_quote] + +token ns_open /'[[:'/ + +lex + token ns_caret /'^'/ + token ns_word /alpha_numerics/ + token ns_close /':]]'/ +end + +def posix_named_set + [ns_open ns_caret? ns_word ns_close] + +token reset_start_match + / '\\K' / + +def shared_atom + [decimal_digit] :DecimalDigit +| [not_decimal_digit] :NotDecimalDigit +| [horizontal_white_space] :HorizonalWhiteSpace +| [not_horizontal_white_space] :NotHorizontalWhiteSpace +| [not_new_line] :NotNewLine +| [new_line_sequence] :NewLineSequence +| [white_space] :WhiteSpace +| [not_white_space] :NotWhiteSpace +| [vertical_white_space] :VerticalWhiteSpace +| [not_vertical_white_space] :NotVerticalWhiteSpace +| [word_char] :WordChar +| [not_word_char] :NotWordChar +| [posix_named_set] :PosixNamedSet +| [char_with_property] :CharWithProperty +| [char_without_property] :CharWithoutProperty +| [control_char] :ControlChar + +def shared_literal + [octal] :Octal +| [alpha_char] :AlphaChar +| [digit_char] :DigitChar +| [bell_char] :BellChar +| [escape_char] :EscapeChar +| [form_feed] :FormFeed +| [new_line] :NewLine +| [carriage_ret] :CarriageRet +| [tab] :Tab +| [hex_char_fixed] :HexCharFixed +| [hex_char_var] :HexCharVar +| [quoted] :Quoted +| [block_quoted] :BlockQuoted +| [open_brace] :OpenBrace +| [close_brace] :CloseBrace +| [comma] :Comma +| [hyphen] :Hypen +| [less_than] :LessThan +| [greater_than] :GreaterThan +| [single_quote] :SingleQuote +| [underscore] :Underscore +| [colon] :Colon +| [hash] :Hash +| [equals] :Equals +| [exclamation] :Excalmation +| [ampersand] :Ampersand +| [other_char_printable] :OtherCharPrintable +| [other_char_non_printable] :OhterCharNonPrintable + +token name + / alpha_nums / + +token bell_char / '\\a' / +token escape_char / '\\e' / +token form_feed / '\\f' / +token new_line / '\\n' / +token carriage_ret / '\\r' / +token tab / '\\t' / +token control_char + / '\\c' ( 0x00 .. 0x7c ) / + +token underscore_alpha_numerics + / ('_' | alpha_numeric)+ / + +rl non_alpha_numeric + / ^alpha_numeric / + +token quoted + /'\\' non_alpha_numeric/ + +token bs_Q + /'\\Q'/ + +lex + # String of non-backslash chars. Or a single backslash. + token block_data / ( [^\\]+ ) | '\\' / + token block_end /'\\E'/ +end + +token block_quoted + /bs_Q block_data* block_end/ + +def hyphen [ `- ] +def less_than [ `< ] +def greater_than [ `> ] +def underscore [ `_ ] +def colon [ `: ] +def equals [ `= ] +def exclamation [ `! ] +def ampersand [ `& ] +def hash [ `# ] +def dollar [ `$ ] + +token single_quote + / "'" / + +token other_char_printable + / ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/' / + +token other_char_non_printable + / ^( 0 .. 127 ) / + +token P / 'P' / + +def capture_form + [`? `< name `> regex] :NamedPerl +| [`? single_quote name single_quote regex] :NamedQuoted +| [`? P `< name `> regex] :NamedPython +| [regex] :Unamed + +def capture + # This ID is for the ragel implementation. We use the nfa repetition + # operator, which needs an id. + [`( capture_form `)] :Capture + { + Backrefs = Backrefs + 1 + } + +def option_spec + [Add: option_flags `- Remove: option_flags] +| [Add: option_flags] +| [`- Remove: option_flags] + +def non_capture + [`( `? `: regex `)] +| [`( `? option_spec `: regex `)] +| [`( `? `| regex `)] +| [`( `? `> regex `)] + +token non_close_parens + / [^)]+ / + +def comment + [ `( `? `# non_close_parens? `) ] + +def option + [`( `? option_spec `)] +| [`( `* no_start_opt `)] +| [`( `* utf8 `)] +| [`( `* utf16 `)] +| [`( `* ucp `)] + +def option_flags + [option_flag+] + +token option_flag / 'i' | 'J' | 'm' | 's' | 'U' | 'x' / + +token no_start_opt / 'NO_START_OPT' / +token utf8 / 'UTF8' / +token utf16 / 'UTF16' / +token ucp / 'UCP' / + +def look_ahead + [`( `? `= regex `)] +| [`( `? `! regex `)] + +def look_behind + [`( `? `< `= regex `)] +| [`( `? `< `! regex `)] + +def look_around + [look_ahead] +| [look_behind] + +token br_g / '\\g' / +token br_k / '\\k' / + +token maybe_backref / '\\' [1-9] [0-9]* / + +lex + token maybe_octal / + '\\' ( + [1-3] [0-7] [0-7] | + [1-7] [0-7] + ) + / + + token def_octal / + '\\' ( + [0] [0-7] [0-7] | + [0] [0-7] | + [0] + ) + / +end + +token else_digits / '\\' [0-9]+ / + +bool is_backref( Num: str ) +{ + Num = suffix( Num, 1 ) + Ref: int = atoi( Num ) + if ( Ref < 8 || Ref <= Backrefs ) + return true + return false +} + +# Simple disambig between octals and backrefs. Reject octals that can be a +# backref, as determined by counting the number of captures. +def octal + [maybe_octal] :Maybe + { + if ( is_backref( $lhs.maybe_octal ) ) + reject + } +| [def_octal] :Def + +def backref + [maybe_backref] + { + if ( !is_backref( $lhs.maybe_backref ) ) + reject + } +| [br_g number] +| [br_g `{ number `}] +| [br_g `{ `- number `}] +| [br_k `< name `>] +| [br_k single_quote name single_quote] +| [br_g `{ name `}] +| [br_k `{ name `}] +| [`( `? P `= name `)] + +def literal_digits + [else_digits] + +def cond_ref + [number] +| [`+ number] +| [`- number] +| [`< name `>] +| [single_quote name single_quote] +| [cond_ref_R number] +| [cond_ref_R] +| [cond_ref_R `& name] +| [cond_ref_DEFINE] +| [cond_ref_assert] +| [name] + +token cond_ref_DEFINE / 'DEFINE' / +token cond_ref_assert / 'assert' / +token cond_ref_R / 'R' / + +def cond_false + [`| regex ] + +def conditional + [`( `? `( cond_ref `) regex cond_false? `)] + +token btc_accept / 'ACCEPT' / +token btc_fail / 'F' ( 'AIL' )? / +token btc_mark_name / ('MARK')? ':NAME' / +token btc_commit / 'COMMIT' / +token btc_prune / 'PRUNE' / +token btc_prune_name / 'PRUNE:NAME)' / +token btc_skip / 'SKIP' / +token btc_skip_name / 'SKIP:NAME' / +token btc_then / 'THEN' / +token btc_then_name / 'THEN:NAME' / + +def btc_type + [btc_accept] +| [btc_fail] +| [btc_mark_name] +| [btc_commit] +| [btc_prune] +| [btc_prune_name] +| [btc_skip] +| [btc_skip_name] +| [btc_then] +| [btc_then_name] + +def backtrack_control + [ `( `* btc_type `) ] + +token nlc_cr / 'CR' / +token nlc_lf / 'LF' / +token nlc_crlf / 'CRLF' / +token nlc_anycrlf / 'ANYCRLF' / +token nlc_any / 'ANY' / +token nlc_bsr_anycrlf / 'BSR_ANYCRLF' / +token nlc_bsr_unicodo / 'BSR_UNICODE' / + +def nlc_type + [nlc_cr] +| [nlc_lf] +| [nlc_crlf] +| [nlc_anycrlf] +| [nlc_any] +| [nlc_bsr_anycrlf] +| [nlc_bsr_unicodo] + +def newline_convention + [ `( `* nlc_type `) ] + +token callout_C / 'C' / + +def callout + [ `( `? callout_C `) ] +| [ `( `? callout_C number `) ] + +def char_class_start [ `[ ] +def char_class_end [ `] ] +def dot [ `. ] +def caret [ `^ ] +def question_mark [ `? ] +def plus [ `+ ] +def star [ `* ] +def open_brace [ `{ ] +def close_brace [ `} ] +def comma [ `, ] +def pipe [ `| ] +def open_paren [ `( ] +def close_paren [ `) ] + +lex + token hex_char_fixed + / '\\x' hex_digit hex_digit / + + token hex_char_var + / '\\x' '{' hex_digit hex_digit hex_digit+ '}' / +end + +# +# Anchors +# + +token word_boundary / '\\b' / +token non_word_boundary / '\\B' / + +token sos_A + / '\\A' / + +def start_of_subject + [`^] +| [sos_A] + +token eos_z / '\\z' / +token eos_Z / '\\Z' / + +def end_of_subject + [`$] +| [eos_Z] +| [eos_z] + +token first_matching_pos + / '\\G' / + +def anchor + [word_boundary] +| [non_word_boundary] +| [start_of_subject] +| [end_of_subject] +| [first_matching_pos] + +# +# Character classes +# + +def cc_atom_list + [cc_atom cc_atom*] + +def character_class + [`[ cc_atom_list `]] +| [cc_open_caret cc_atom_list `]] +| [cc_open_caret_close cc_atom* `]] +| [cc_open_close cc_atom* `]] +| [cc_open_caret_close hyphen cc_atom_end_range cc_atom* `]] +| [cc_open_close hyphen cc_atom_end_range cc_atom* `]] + +def cc_atom_end_range + [cc_atom] + +def cc_atom + [cc_literal hyphen cc_literal] +| [shared_atom] +| [cc_literal] +| [octal] + +def cc_literal + [shared_literal] +| [dot] +| [char_class_start] +| [caret] +| [question_mark] +| [plus] +| [star] +| [word_boundary] +| [non_word_boundary] +| [dollar] +| [pipe] +| [open_paren] +| [close_paren] + +token decimal_digit / '\\d' / +token not_decimal_digit / '\\D' / +token horizontal_white_space / '\\h' / +token not_horizontal_white_space / '\\H' / +token not_new_line / '\\N' / +token new_line_sequence / '\\R' / +token white_space / '\\s' / +token not_white_space / '\\S' / +token vertical_white_space / '\\v' / +token not_vertical_white_space / '\\V' / +token word_char / '\\w' / +token not_word_char / '\\W' / + +token one_data_unit / '\\C' / +token extended_unicode_char / '\\X' / + +token with_property_open / '\\p' / +token without_property_open / '\\P' / + +def char_with_property + [with_property_open `{ underscore_alpha_numerics `}] +def char_without_property + [without_property_open `{ underscore_alpha_numerics `}] + +def atom + [shared_atom] :SharedAtom +| [shared_literal] :SharedLiteral +| [char_class_end] :CharClassEnd +| [dot] :Dot +| [character_class] :CharacterClass +| [capture] :Capture +| [non_capture] :NonCapture +| [anchor] :Anchor +| [look_around] :LookAround +| [option] :Option +| [newline_convention] :NewlineConvention +| [callout] :Callout +| [reset_start_match] :ResetStartMatch +| [one_data_unit] :OneDataUnit +| [extended_unicode_char] :ExtendedUnicodeChar +| [backtrack_control] :BacktrackControl +| [backref] :Backref +| [literal_digits] :LiteralDigits +| [subroutine_reference] :SubroutineReference +| [conditional] :Conditional +| [comment] :Comment + +def element + [atom quantifier] :Atom + +def term + [element term] :Element +| [] :Base + +def expr + [expr `| term] :Union +| [term] :Base + +def regex + [expr] :Expr + +def init + [] + { + Backrefs = 0 + } + +token unparseable /[^\n]*/ + +def line + [init regex NL] :Regex commit +| [unparseable NL] :Unparseable commit + +def file + [line*] + + +parse F: file [stdin] + +if !F + print "parse error: [error] +else { + for U: unparseable in F + print "unparseable: [U] + for B: backref in F + print "backref: [B] +} diff --git a/grammar/pcre/pcre.rl b/grammar/pcre/pcre.rl new file mode 100644 index 00000000..307d5467 --- /dev/null +++ b/grammar/pcre/pcre.rl @@ -0,0 +1,864 @@ +#include +#include +#include +#include + +#define LIST_APPEND( pphead, element ) \ + { \ + element->next = *pphead; \ + *pphead = element; \ + } + +#define LIST_REMOVE( pphead ) \ + { \ + *pphead = (*pphead)->next; \ + } + +#define STACK_PUSH( pphead, element ) \ + { \ + element->parent = *pphead; \ + *pphead = element; \ + } + +#define STACK_POP( pphead ) \ + { \ + *pphead = (*pphead)->parent; \ + } + +#define REVERSE_LIST( S, pphead ) \ + if ( *pphead != 0 ) { \ + S *front = (*pphead)->next; \ + (*pphead)->next = 0; \ + while ( front != NULL ) { \ + S *next = front->next; \ + front->next = *pphead; \ + *pphead = front; \ + front = next; \ + } \ + } + +#define MAX_VERB 40 +#define QUANTIFIER_INF -1 + +struct value +{ + int v; + + struct value *next; +}; + +enum value_type +{ + dot = 256, +}; + +enum quantifer_type +{ + quant_greedy = 1, + quant_lazy, + quant_possessive, +}; + +struct quantifier +{ + int min, max; + enum quantifer_type type; +}; + +enum element_type +{ + element_value_type = 1, + element_regex_type, + element_look_around_type, + element_char_class_type, + element_backref_type, + element_verb_type, +}; + +struct element +{ + enum element_type type; + struct regex *regex; + struct look_around *look_around; + struct quantifier *quantifier; + struct value *value_list; + struct backref *backref; + struct verb *verb; + + struct element *next; +}; + +struct term +{ + struct element *element_list; + + struct term *next; + struct term *parent; +}; + +enum regex_type +{ + regex_non_capture = 1, + regex_capture, + regex_look_around +}; + +#define REGEX_ATOMIC_GROUP 0x01 +#define REGEX_CAPTURE_RESET 0x02 + +struct regex +{ + struct term *term_list; + + enum regex_type type; + int capture; + int node_id; + int options; + + int flags; + + struct regex *next; + struct regex *parent; +}; + +struct pattern +{ + struct regex *regex; + int node_id; + int is_look_around; + + struct look_around *look_arounds; + struct pattern *parent; +}; + +enum look_around_type { + lat_pos_ahead = 1, + lat_neg_ahead, + lat_pos_behind, + lat_neg_behind, +}; + +struct look_around +{ + enum look_around_type type; + struct pattern *pattern; + int node_id; + + struct look_around *next; +}; + +struct backref +{ + int number; +}; + +struct verb +{ + char *name; + char *value; + int number; +}; + +struct quantifier *new_quantifier() +{ + struct quantifier *quant = malloc( sizeof( struct value ) ); + memset( quant, 0, sizeof(struct quantifier) ); + quant->type = quant_greedy; + return quant; +} + +struct value *new_value( int v ) +{ + struct value *value = malloc( sizeof( struct value ) ); + memset( value, 0, sizeof(struct value) ); + value->v = v; + return value; +} + +struct term *new_term( void ) +{ + struct term *term = malloc( sizeof( struct term ) ); + memset( term, 0, sizeof(struct term) ); + return term; +} + +struct element *new_element( enum element_type type, struct regex *regex, struct look_around *look_around ) +{ + struct element *element = malloc( sizeof( struct element ) ); + memset( element, 0, sizeof(struct element) ); + element->type = type; + element->regex = regex; + element->look_around = look_around; + return element; +} + +struct regex *new_regex( enum regex_type type, int options ) +{ + struct regex *regex = malloc( sizeof( struct regex ) ); + memset( regex, 0, sizeof(struct regex) ); + regex->type = type; + regex->options = options; + return regex; +} + +struct look_around *new_look_around( enum look_around_type lat, struct pattern *pattern ) +{ + struct look_around *look_around = malloc( sizeof( struct look_around ) ); + memset( look_around, 0, sizeof(struct look_around) ); + look_around->type = lat; + look_around->pattern = pattern; + return look_around; +} + +struct pattern *new_pattern( struct regex *regex, int is_look_around ) +{ + struct pattern *pattern = malloc( sizeof( struct pattern ) ); + pattern->regex = regex; + pattern->is_look_around = is_look_around; + return pattern; +} + +struct backref *new_backref( int number ) +{ + struct backref *backref = malloc( sizeof( struct backref ) ); + memset( backref, 0, sizeof(struct backref) ); + backref->number = number; + return backref; +} + +struct verb *new_verb( char *d, int len ) +{ + struct verb *verb = malloc( sizeof( struct verb ) ); + verb->name = malloc( len + 1 ); + memcpy( verb->name, d, len ); + verb->name[len] = 0; + verb->value = 0; + verb->number = -1; + return verb; +} + +void append_element_value( struct term *term, int value ) +{ + struct element *el = term->element_list; + if ( el == NULL || + el->type != element_value_type || + el->quantifier != 0 ) + { + el = new_element( element_value_type, NULL, NULL ); + LIST_APPEND( &term->element_list, el ); + } + + struct value *v = new_value( value ); + LIST_APPEND( &el->value_list, v ); +} + +void reverse_value_list( struct value **pphead ) +{ + REVERSE_LIST( struct value, pphead ); +} + +void reverse_element_list( struct element **pphead ) +{ + REVERSE_LIST( struct element, pphead ); +} + +void reverse_term_list( struct term **pphead ) +{ + REVERSE_LIST( struct term, pphead ); +} + +int is_backref( int num, int num_closed_captures ) +{ + if ( num < 8 || num <= num_closed_captures ) + return 1; + return 0; +} + +void append_backref( struct term *term, int number ) +{ + struct element *el = new_element( element_backref_type, NULL, NULL ); + el->backref = new_backref( number ); + LIST_APPEND( &term->element_list, el ); +} + +/* Provides initial ragel call state stack. */ +int *init_ragel_stack( int *size ) +{ + *size = 24; + return malloc( *size * sizeof(int) ); +} + +/* Grows the ragel call state stack. Use in pre-push. */ +int *grow_ragel_stack( int *size, int *stack ) +{ + int new_size = *size * 2; + int *new_stack = malloc( new_size * sizeof(int) ) ; + memcpy( new_stack, stack, *size * sizeof(int) ); + free( stack ); + + *size = new_size; + return new_stack; +} + +void apply_quantifier( struct term *term, int min, int max ) +{ + struct element *el = term->element_list; + + /* We may need to remove the last value from the element's value list. */ + if ( el->type == element_value_type && el->value_list != NULL && el->value_list->next != NULL ) { + /* Take the value off and add it to an element of its own. */ + struct value *last = el->value_list; + LIST_REMOVE( &el->value_list ); + + el = new_element( element_value_type, NULL, NULL ); + + LIST_APPEND( &el->value_list, last ); + LIST_APPEND( &term->element_list, el ); + } + + el->quantifier = new_quantifier(); + el->quantifier->min = min; + el->quantifier->max = max; +} + +void apply_lazy( struct term *term ) +{ + struct element *el = term->element_list; + el->quantifier->type = quant_lazy; +} + +void apply_possessive( struct term *term ) +{ + struct element *el = term->element_list; + el->quantifier->type = quant_possessive; +} + +%%{ + machine PCRE; + + action enter_term { + struct term *term = new_term(); + STACK_PUSH( &s_term, term ); + LIST_APPEND( &s_regex->term_list, term ); + } + + action leave_term { + reverse_element_list( &s_term->element_list ); + struct element *el = s_term->element_list; + while ( el != 0 ) { + reverse_value_list( &el->value_list ); + el = el->next; + } + STACK_POP( &s_term ); + } + + action enter_regex { + struct regex *regex = new_regex( regex_non_capture, s_regex->options ); + struct element *element = new_element( element_regex_type, regex, NULL ); + LIST_APPEND( &s_term->element_list, element ); + STACK_PUSH( &s_regex, regex ); + } + + action enter_lookaround { + struct regex *regex = new_regex( regex_look_around, s_regex->options ); + struct pattern *pattern = new_pattern( regex, 1 ); + struct look_around *la = new_look_around( lat, pattern ); + pattern->node_id = la->node_id; + + LIST_APPEND( &s_pattern->look_arounds, la ); + + struct element *element = new_element( element_look_around_type, NULL, la ); + LIST_APPEND( &s_term->element_list, element ); + + STACK_PUSH( &s_regex, regex ); + STACK_PUSH( &s_pattern, pattern ); + } + + action leave_regex { + if ( s_regex->type == regex_capture ) + closed_captures += 1; + + if ( s_regex->type == regex_look_around ) + STACK_POP( &s_pattern ); + + reverse_term_list( &s_regex->term_list ); + STACK_POP( &s_regex ); + } + + # + # Numbers + # + action init_number + { number = 0; } + + action decimal_digit + { number = number * 10 + ( *p - '0' ); } + + number = [0-9]+ >init_number $decimal_digit; + + # + # Verbs + # + verb = ( [a-zA-Z_] [a-zA-Z_0-9]* ) >{ verb_len = 0; } ${ + if ( verb_len < MAX_VERB ) { + verb[verb_len++] = *p; + } + }; + + action append_verb { + struct element *el = new_element( element_verb_type, NULL, NULL ); + cur_verb = el->verb = new_verb( verb, verb_len ); + + LIST_APPEND( &s_term->element_list, el ); + } + + action verb_value + { + cur_verb->value = malloc( verb_len + 1 ); + memcpy( cur_verb->value, verb, verb_len ); + cur_verb->value[verb_len] = 0; + } + + action verb_number + { + cur_verb->number = number; + } + + verb_forms = + verb %append_verb ( ':' verb %verb_value | '=' number %verb_number )?; + + action options_only { fret; } + + options = ( + 'c' # case-sensitive matching + | 'i' # case-insensitive matching + )*; + + alpha_char = [a-zA-Z]; + digit_char = [0-9]; + + open_paren = '('; + close_paren = ')'; + + char_class_start = '['; + char_class_end = ']'; + + open_curly = '{'; + close_curly = '}'; + + ampersand = '&'; + colon = ':'; + comma = ','; + dollar = '$'; + dot = '.'; + equals = '='; + exclamation = '!'; + greater_than = '>'; + hash = '#'; + hyphen = '-'; + less_than = '<'; + pipe = '|'; + single_quote = "'"; + underscore = '_'; + + other_char_printable = + ' ' | '~' | ';' | '@' | '%' | '`' | '"' | '/'; + + other_char_non_printable = ^( 0 .. 127 ); + + capture_non_capture = + '(' @{ fcall paren_open; }; + + literal_sym = + comma | hyphen | less_than | + greater_than | single_quote | underscore | colon | + hash | equals | exclamation | ampersand; + + action append_char { + append_element_value( s_term, *p ); + } + + literal = ( + alpha_char | + digit_char | + literal_sym | + other_char_printable | + other_char_non_printable + ) @append_char; + + # + # Octals and backrefs. There is an ambiguity between these two, therefore + # the grammar handles them together. + # + + action init_octal + { octal = 0; } + + action octal_digit + { octal = octal * 8 + ( *p - '0' ); } + + backref_num = [1-9] [0-9]*; + + octal_num = + [0-3] [0-7] [0-7] | + [0-7] [0-7] | + [0]; + + action octal { append_element_value( s_term, octal ); } + + action octal_or_backref + { + if ( is_backref( number, closed_captures ) ) + append_backref( s_term, number ); + else + append_element_value( s_term, octal ); + } + + action backref + { + if ( number > closed_captures ) { + printf( "invalid backref: \\%d\n", number ); + fbreak; + } + + append_backref( s_term, number ); + } + + # Certainly octal. All octals, with possibly backrefs removed. + def_octal = + '\\' @init_octal + ( octal_num - backref_num ) $octal_digit; + + # All cases that can be either octal or backref and we need to inspect the + # backref number to decide. Use the intersection of the two numbers as the + # pattern. + octal_or_backref = + '\\' @init_octal @init_number + ( octal_num & backref_num ) $octal_digit $decimal_digit; + + # Definitely backref. The backref pattern excluding anything with a prefix + # that could be octal. + def_backref = + '\\' @init_number + ( backref_num - ( octal_num any* ) ) $decimal_digit; + + octal = + def_octal %octal | + octal_or_backref %octal_or_backref; + + backref = + def_backref %backref | + octal_or_backref %octal_or_backref; + + atom = + literal | + char_class_end @append_char | + dot @{ append_element_value( s_term, dot ); } | + octal | + backref | + open_paren @{ fcall open_paren_forms; } + ; + + non_greedy = + '?' @{ apply_lazy( s_term ); } | + '+' @{ apply_possessive( s_term ); }; + + quant_min = number %{ quant_min = number; }; + quant_max = number %{ quant_max = number; }; + + action quant_zero_plus { apply_quantifier( s_term, 0, QUANTIFIER_INF ); } + action quant_one_plus { apply_quantifier( s_term, 1, QUANTIFIER_INF ); } + action quant_optional { apply_quantifier( s_term, 0, 1 ); } + action quant_exact { apply_quantifier( s_term, quant_min, quant_min ); } + action quant_min { apply_quantifier( s_term, quant_min, QUANTIFIER_INF ); } + action quant_min_max { apply_quantifier( s_term, quant_min, quant_max ); } + + quant_forms = + '*' @quant_zero_plus | + '+' @quant_one_plus | + '?' @quant_optional | + open_curly quant_min close_curly @quant_exact | + open_curly quant_min ',' close_curly @quant_min | + open_curly quant_min ',' quant_max close_curly @quant_min_max; + + quantifier = quant_forms non_greedy?; + + element = ( atom quantifier? ); + + term = ( element** ) >enter_term; + + # + # Expression + # + expr = term ( '|' @leave_term term )*; + + # + # Regex + # + regex = expr; + + + open_paren_forms := + # Look at the first few charcters to see what the form is. What we + # handle here: + # (re) capturing parens + # (?:re) non-capturing parens + # (?=re) positive lookahead + # (?!re) negative lookahead + # (?<=re) positive lookbehind + # (?' @enter_regex + @{ s_regex->flags |= REGEX_ATOMIC_GROUP; } | + + # Capture number reset on branch. + '?|' @enter_regex + @{ s_regex->flags |= REGEX_CAPTURE_RESET; } | + + # Lookaround + ( + '?=' @{ lat = lat_pos_ahead; } @enter_lookaround | + '?!' @{ lat = lat_neg_ahead; } @enter_lookaround | + '?<=' @{ lat = lat_pos_behind; } @enter_lookaround | + '?type = regex_capture; + s_regex->capture = next_capture++; + fhold; + } + + ) + regex ')' @leave_term @leave_regex @{ fret; }; + + + main := regex '\n' @leave_term @leave_regex @{ success = 1; }; +}%% + +%% write data; + +int pcre_parse( struct pattern **result_pattern, char *line, int len ) +{ + int *stack; + int stack_size; + int cs, top; + char *p, *pe; + int success = 0; + int result = 1; + + int number = 0; + int octal = 0; + int quant_min, quant_max; + + struct pattern *s_pattern = NULL; + struct regex *s_regex = NULL; + struct term *s_term = NULL; + + /* Collect into this. */ + struct regex *root_regex = new_regex( regex_non_capture, 0 ); + struct pattern *root_pattern = new_pattern( root_regex, 0 ); + STACK_PUSH( &s_pattern, root_pattern ); + STACK_PUSH( &s_regex, root_regex ); + + int closed_captures = 0; + int next_capture = 1; + + enum look_around_type lat; + + int verb_len; + char verb[MAX_VERB]; + struct verb *cur_verb; + + stack = init_ragel_stack( &stack_size ); + + %% write init; + + p = line; + pe = line + len; + + %% write exec; + + if ( !success ) { + printf( "parse error at %ld\n", ( p - line + 1) ); + result = 0; + } + else if ( + s_pattern == NULL || s_pattern->parent != NULL || + s_regex != NULL || + s_term != NULL ) + { + printf( "parse error: items not closed\n" ); + result = 0; + } + + if ( result ) { + *result_pattern = root_pattern; + } + else { + + } + + free( stack ); + + return result; +} + +extern void print_value( int indent, struct value *value ); +extern void print_element( int indent, struct element *el ); +extern void print_term( int indent, struct term *term ); +extern void print_regex( int indent, struct regex *regex ); + +void print_indent( int indent ) +{ + while ( indent-- > 0 ) + printf( " " ); +} + +void print_value( int indent, struct value *value ) +{ + printf( "v(%d)", value->v ); +} + +void print_element( int indent, struct element *el ) +{ + print_indent( indent ); + printf( "el: " ); + if ( el->type == element_value_type ) { + struct value *value = el->value_list; + while ( value != NULL ) { + print_value( indent + 1, value ); + if ( value->next != 0 ) + printf( " . " ); + value = value->next; + } + } + else if ( el->type == element_regex_type ) { + if ( el->regex != 0 ) + print_regex( indent, el->regex ); + } + else if ( el->type == element_look_around_type ) { + printf( "lookaround " ); + switch ( el->look_around->type ) { + case lat_pos_ahead: printf( "=" ); break; + case lat_neg_ahead: printf( "!" ); break; + case lat_pos_behind: printf( "<=" ); break; + case lat_neg_behind: printf( "look_around->pattern->regex ); + } + else if ( el->type == element_char_class_type ) + printf( "cc" ); + else if ( el->type == element_backref_type ) + printf( "backref(%d)", el->backref->number ); + else if ( el->type == element_verb_type ) { + printf( "verb(%s", el->verb->name ); + if ( el->verb->value != NULL ) + printf( ":%s", el->verb->value ); + else if ( el->verb->number >= 0 ) + printf( "=%d", el->verb->number ); + printf( ")" ); + } + + if ( el->quantifier != NULL ) { + printf( " {%d,", el->quantifier->min ); + + if ( el->quantifier->max == QUANTIFIER_INF ) + printf( "inf" ); + else + printf( "%d", el->quantifier->max ); + + printf( "}" ); + + if ( el->quantifier->type == quant_lazy ) + printf( "?" ); + else if ( el->quantifier->type == quant_possessive ) + printf( "+" ); + + } + + printf("\n"); +} + +void print_term( int indent, struct term *term ) +{ + print_indent( indent ); + printf( "term:\n" ); + struct element *el = term->element_list; + while ( el != NULL ) { + print_element( indent + 1, el ); + el = el->next; + } +} + +void print_regex( int indent, struct regex *regex ) +{ + printf( "reg: " ); + if ( regex->type == regex_capture ) + printf( "cap(%d) ", regex->capture ); + if ( regex->flags & REGEX_ATOMIC_GROUP ) + printf( "atomic_group " ); + if ( regex->flags & REGEX_CAPTURE_RESET ) + printf( "capture_reset " ); + printf( "(\n" ); + + struct term *term = regex->term_list; + while ( term != NULL ) { + print_term( indent + 1, term ); + term = term->next; + } + print_indent( indent ); + printf( ")" ); +} + +void print_pattern( int indent, struct pattern *pat ) +{ + printf( "pat: " ); + print_regex( indent, pat->regex ); + printf( "\n" ); +} + +char linebuf[2048]; + +int main( int argc, char **argv ) +{ + if ( argc < 2 ) { + fprintf( stderr, "usage: ./pcre \n" ); + return -1; + } + + FILE *input = fopen( argv[1], "r" ); + if ( input == NULL ) { + fprintf( stderr, "failed to open %s: %s", argv[0], strerror(errno) ); + return -1; + } + + char *line = NULL; + size_t n = 0; + ssize_t read; + + while ( (read = getline( &line, &n, input )) != -1 ) { + struct pattern *pat; + int parsed = pcre_parse( &pat, line, (int)read ); + if ( parsed ) { + print_pattern( 0, pat ); + } + } + + fclose( input ); + free( line ); + return 0; +} + diff --git a/grammar/rust.lm b/grammar/rust.lm deleted file mode 100644 index f6dd8c15..00000000 --- a/grammar/rust.lm +++ /dev/null @@ -1,1305 +0,0 @@ -lex - literal `as `break `const `continue `crate - literal `$crate `dyn `else `enum `extern - literal `false `fn `for `if `impl `in `let - literal `loop `macro_rules `match `mod `move - literal `mut `pub `ref `return `self `static - literal `struct `super `trait `true `type - literal `unsafe `use `where `while - - literal `; `: `:: `( `) `{ `} `[ `] `< `> - literal `. `, `@ `-> `=> `? - literal `- `+ `/ `* `% `! `^ `| `& - - literal `<< `== `!= `>= `<= `|| `&& - literal `.. `..= `... - literal `= `+= `-= `*= `/= `%= `&= `|= `^= `<<= `>>= - - literal `# `_ - - token id / [A-Za-z_] [A-Za-z_0-9]* / - token string / 'b'? '"' ( [^\"] | '\\' any )* '"' / - token char / 'b'? "'" ( [^\'] | '\\' any ) "'" / - token lifetime / "'" id / - token number / - ( - [0-9] [_0-9]* | - '0x' [_a-fA-F0-9]+ | - '0b' [_0-1]+ | - '0o' [_0-7]+ - ) - ( ( 'u' | 'i' ) [a-z0-9]+ )? - / - - rl float_exponent - / ( [eE] [+\-]? [0-9_]* [0-9] [0-9_]* )? / - - rl float_suffix - / ( 'f32' | 'f64' )? / - - # Handles: DEC_LITERAL . (not immediately followed by ., _ or an identifier) - token hanging_float - / [0-9]+ '.' [^0-9._a-zA-Z] / - { - Float: str = input->pull( match_length - 1 ) - input->push( make_token( typeid, Float ) ) - } - - token float/ - [0-9]+ float_exponent | - [0-9]+ '.' [0-9]+ float_exponent? | - [0-9]+ ( '.' [0-9]+ )? float_exponent? float_suffix - / - - # Raw open. Rest handled in a its own lexical region. - token raw_open / 'r' '#' * '"' / - { - # Stash the length (not including r) for comparison against potential - # close strings. - RawOpenLength = match_length - 1 - RawOpen: str = input->pull( match_length ) - input->push( make_token( typeid, RawOpen ) ) - } - - ignore / "//" [^\n]* '\n' / - ignore / "/*" any* :>> "*/" / - ignore / [ \t\n]+ / -end - -# Raw strings. -def raw_string - [raw_open raw_content* raw_close] - -global RawOpenLength: int = 0 - -# Lexical region dedicated to raw strings. Attempts to close by matching -# candidates and then testing the length. -lex - token raw_close / '"' '#'* / - { - # Check the length. We use >= to match the close because rust is lazy - # in matching it. If it is longer we just chop it. Probably will result - # in a parse error. - if match_length >= RawOpenLength { - # Chop it by using RawOpenLength in the pull from input. - Candidate: str = input->pull( RawOpenLength ) - input->push( make_token( typeid, Candidate ) ) - } - else { - # Otherwise just send it as raw content. - Candidate: str = input->pull( match_length ) - input->push( make_token( typeid, Candidate ) ) - } - } - - # Content, send out strings not containing # or ". Or single such chars - # that are not part of a sequence that is first matched by close candidate. - token raw_content / [^"#]+ | any / -end - -namespace attr - lex - token id / [A-Za-z_] [A-Za-z_0-9]* / - token string / '"' ( [^\"] | '\\' any )* '"' / - token char / "'" ( [^\'] | '\\' any ) "'" / - token lifetime / "'" id / - token number / [0-9]+ / - token float / [0-9]+ '.' [0-9]+ / - - literal `[ `] - - ignore / "//" [^\n]* '\n' / - ignore / "/*" any* :>> "*/" / - ignore / [ \t\n]+ / - - token sym / any / - end - - def item - [id] - | [string] - | [char] - | [lifetime] - | [number] - | [float] - | [sym] - | [_list] - - def _list - [ `[ item* `] ] -end - -def attribute - [`# `! attr::_list] -| [`# attr::_list] - -namespace macro - lex - token id / [A-Za-z_] [A-Za-z_0-9]* / - token string / '"' ( [^\"] | '\\' any )* '"' / - token char / "'" ( [^\'] | '\\' any ) "'" / - token lifetime / "'" id / - token number / [0-9]+ / - token float / [0-9]+ '.' [0-9]+ / - - literal `( `) `[ `] `{ `} `; - - ignore / "//" [^\n]* '\n' / - ignore / "/*" any* :>> "*/" / - ignore / [ \t\n]+ / - - token sym / any / - end - - def item - [id] - | [string] - | [char] - | [lifetime] - | [number] - | [float] - | [sym] - | [macro] - | [`;] - - def macro_semi - [ `( item* `) `; ] - | [ `[ item* `] `; ] - | [ `{ item* `} ] - - def macro - [ `( item* `) ] - | [ `[ item* `] ] - | [ `{ item* `} ] -end - -def macro_invocation - [simple_path `! macro::macro] - -def macro_invocation_semi - [simple_path `! macro::macro_semi] - -# -# Macro Defition -# - -def macro_matcher - [macro::macro] - -def macro_transcriber - [macro::macro] - -def macro_rule - [macro_matcher `=> macro_transcriber] - -def macro_rules_tail - [macro_rules_tail `; macro_rule] -| [] - -def macro_rules - [macro_rule macro_rules_tail `;`?] - -def macro_rules_def - [`( macro_rules `) `;] -| [`[ macro_rules `] `;] -| [`{ macro_rules `}] - -def macro_rules_definition - [`macro_rules `! id macro_rules_def] - -# -# Use statments -# - - -def simple_path_segment - [id] -| [`super] -| [`self] -| [`crate] -| [`$crate] - -def sp_tail - [sp_tail `:: simple_path_segment] -| [] - -def simple_path - [opt_sep simple_path_segment sp_tail] - -def use_list_tail - [use_list_tail `, use_tree] -| [] - -def use_path_opt - [simple_path `::] -| [`::] -| [] - -def use_as_opt - [`as id] -| [`as `_] -| [] - -def use_tree - [use_path_opt `*] -| [use_path_opt `{ use_tree use_list_tail `,`? `}] -| [simple_path use_as_opt] - -def use_declaration - [`use use_tree `;] - -# -# Patterns -# - -def literal_pattern - [`true] -| [`false] -| [char] -| [string] -| [raw_string] -| [`-`? number] -| [`-`? float] - -def identifier_pattern - [`ref`? `mut`? id] -| [`ref`? `mut`? id `@ pattern] - -def wildcard_pattern - [`_] - -def range_pattern_bound - [literal_pattern] - -def range_pattern - [range_pattern_bound `..= range_pattern_bound] -| [range_pattern_bound `... range_pattern_bound] - -def reference_pattern - [`& `mut`? pattern] -| [`&& `mut`? pattern] - -# -# struct pattern -# -def struct_pattern_field - [number `: pattern] -| [id `: pattern] -| [`ref`? `mut`? id] - -def struct_pattern_fields - [struct_pattern_fields `, struct_pattern_field] -| [struct_pattern_field] - -def struct_pattern_et_cetera - [`..] - -def struct_pattern_elements - [struct_pattern_fields] -| [struct_pattern_fields `, ] -| [struct_pattern_fields `, struct_pattern_et_cetera] -| [struct_pattern_et_cetera] - -def opt_struct_pattern_elements - [struct_pattern_elements] -| [] - -def struct_pattern - [path_in_expression `{ opt_struct_pattern_elements `}] - -def tuple_struct_item - [pattern] -| [`..] - -def tuple_struct_items - [tuple_struct_items `, tuple_struct_item] -| [tuple_struct_item] - -def tuple_struct_pattern - [path_in_expression `( tuple_struct_items `,`? `)] - -def tuple_pattern_item - [pattern] -| [`..] - -def tuple_pattern_items - [tuple_pattern_items `, tuple_pattern_item] -| [tuple_pattern_item] - -def tuple_pattern - [`( `)] -| [`( tuple_pattern_item `, `)] -| [`( tuple_pattern_item `, tuple_pattern_items `,`? `)] - -def grouped_pattern - [`( pattern `)] - -def pattern_list - [pattern_list `, pattern] -| [pattern] - -# -# Grammar doesnt seem to support empty slice patterns, but found some code -# allowing it. -# -def slice_pattern - [`[ pattern `, pattern_list `,`? `]] -| [`[ pattern `, `]] -| [`[ pattern `]] -| [`[ `]] - -def path_pattern - [path_in_expression] -| [qualified_path_in_expression] - -def pattern - [literal_pattern] -| [identifier_pattern] -| [wildcard_pattern] -| [range_pattern] -| [reference_pattern] -| [struct_pattern] -| [tuple_struct_pattern] -| [tuple_pattern] -| [grouped_pattern] -| [slice_pattern] -| [path_pattern] - -# Range Pattern - - -# -# Match Expressions. -# - -def match_expression - [`match expression `{ match_arms? `}] - -def match_arms_last - [match_arm `=> block_expression `,`?] -| [match_arm `=> expression `,`?] - -def match_arms_first_case - [match_arm `=> block_expression `,`?] -| [match_arm `=> expression `,] - -def match_arms_first_list - [match_arms_first_list match_arms_first_case] -| [] - -def match_arms - [match_arms_first_list match_arms_last] - -def match_arm - [match_arm_patterns opt_match_arm_guard] - -def match_arms_pattern_tail - [match_arms_pattern_tail `| pattern] -| [] - -def match_arm_patterns - [`|`? pattern match_arms_pattern_tail] - -def opt_match_arm_guard - [`if expression] -| [] - -# -# Return expressions -# -def return_expression - [`return expression] -| [`return] - -# -# Break expression -# -def break_expression - [`break expression] -| [`break lifetime expression] -| [`break lifetime] -| [`break] - -# -# Continue expression -# -def continue_expression - [`continue lifetime] -| [`continue] - -# -# Generic Args -# - -def lifetime_tail - [lifetime_tail `, lifetime] -| [] - -def lifetime_list - [lifetime lifetime_tail] - -def opt_type_params - [`< lifetime_list `,`? `>] -| [`< type_list `,`? `>] -| [`< lifetime_list `, type_list `,`? `>] -| [] - - -# -# Function declaration -# - -def opt_return - [] -| [ `-> type] - -def extern_abi - [`extern string] - -def function_qualifiers - [`const ? `unsafe ? extern_abi?] - -def function_param - [pattern `: type] - -def func_param_tail - [func_param_tail `, function_param] -| [] - -def function_parameters - [function_param func_param_tail `,`?] - -def function - [ - function_qualifiers `fn id opt_generics `( function_parameters? `) - opt_return opt_where_clause block_expression - ] - -# -# Method declaration -# - -def self_param - [`mut`? `self] -| [`& `mut`? `self] -| [`& lifetime `mut`? `self] -| [`mut`? `self `: type] - -def opt_method_params - [`, function_parameters] -| [`,] -| [] - -def method - [ - function_qualifiers `fn id opt_generics `( self_param opt_method_params `) - opt_return opt_where_clause block_expression - ] - -# -# Types -# - -def qual_tail - [qual_tail `:: id] -| [] - -def qual_id - [id qual_tail] - -def type_id - [id opt_type_params] - -def type_path_segment - [path_ident_segment `:: ? generic_args] -| [path_ident_segment `:: ? type_path_fn] -| [path_ident_segment `:: ?] - -def type_path_fn - [`( type_path_fn_inputs? `) opt_arrow_type] - -def opt_arrow_type - [`-> type] -| [] - -def type_path_fn_inputs - [type_list `,`?] - -def type_path_tail - [type_path_tail `:: type_path_segment] -| [] - -def opt_sep - [`::] -| [] - -def type_path - [opt_sep type_path_segment type_path_tail] - -def opt_lifetime - [lifetime] -| [] - -def array_type - [`[ type `; expression `]] - -def slice_type - [`[ type `]] - -def raw_pointer_type - [`* `mut type_no_bounds] -| [`* `const type_no_bounds] - -def tuple_type - [`( `)] -| [`( type `, `)] -| [`( type `, type_list `,`? `)] - -def trait_object_type - [`dyn ? type_param_bounds] - -def impl_trait_type - [`impl type_param_bounds] - -def impl_trait_type_one_bound - [`impl trait_bound] - -def qualified_path_type - [`< type `as type_path `>] -| [`< type `>] - -def type_path_segment_list - [] - -def qualified_path_in_type - [qualified_path_type type_path_tail] - -# -# Bare Function Type -# - -def bare_function_return_type - [`-> type_no_bounds] - -def function_parameters_maybe_named_variadic - [maybe_named_function_parameters] -| [maybe_named_function_parameters_variadic] - -def maybe_named_param_tail - [maybe_named_param_tail `, maybe_named_param] -| [] - -def maybe_named_function_parameters - [maybe_named_param maybe_named_param_tail `,`?] - -def maybe_named_param - [id `: type] -| [`_ `: type] -| [type] - -def maybe_named_function_parameters_variadic - [maybe_named_param maybe_named_param_tail `, `...] - -def bare_function_type - [ - for_lifetimes? function_qualifiers `fn - `( function_parameters_maybe_named_variadic? `) bare_function_return_type? - ] - -# -# Type -# - -def type - [type_no_bounds] -| [impl_trait_type] -| [trait_object_type] - -def type_no_bounds - [impl_trait_type_one_bound] -| [type_path] -| [array_type] -| [slice_type] -| [raw_pointer_type] -| [`& opt_lifetime type] -| [`& `mut type] -| [`& lifetime `mut type] -| [tuple_type] -| [`_] -| [`!] -| [qualified_path_in_type] -| [bare_function_type] - -def type_list - [type_list `, type] -| [type] - -def opt_type - [`: type] -| [] - -def let_rvalue - [expression] -| [`{ statements `}] - -def expr_tail - [expr_tail `, expression] -| [] - -def expr_list - [expression expr_tail] -| [] - -def _construct - [attribute* id] -| [attribute* id `: expression] - -def cons_plus - [cons_plus `, _construct] -| [_construct] - -def cons_list - [cons_plus `,`?] -| [] - - -# -# Expression -# - -def path_ident_segment - [id] -| [`self] -| [`crate] -| [`super] - -def path_expr_segment - [path_ident_segment] -| [path_ident_segment `:: generic_args] - -def generic_args - [`< `>] -| [`< generic_args_lifetimes `,`? `>] -| [`< generic_args_types `,`? `>] -| [`< generic_args_bindings `,`? `>] -| [`< generic_args_types `, generic_args_bindings `,`? `>] -| [`< generic_args_lifetimes `, generic_args_types `,`? `>] -| [`< generic_args_lifetimes `, generic_args_bindings `,`? `>] -| [`< generic_args_lifetimes `, generic_args_types `, generic_args_bindings `,`? `>] - -def generic_args_lifetimes - [lifetime_list] - -def generic_args_types - [type_list] - -def generic_args_binding - [id `= type] - -def generic_args_bindings - [generic_args_bindings `, generic_args_binding] -| [generic_args_binding] - -def pie_tail - [pie_tail `:: path_expr_segment] -| [] - -def opt_path_sep - [`::] -| [] - -def path_in_expression - [opt_path_sep path_expr_segment pie_tail] - -def qualified_path_in_expression - [qualified_path_type type_path_tail] - -def path_expression - [path_in_expression] -| [qualified_path_in_expression] - -def tuple_expression - [`( expression `, `)] -| [`( expression `, expr_list `,`? `)] - -def array_expression - [`[ expr_list `,`? `]] -| [`[ expression `; expression `]] - -def closure_parameters - [closure_parameters `, closure_param] -| [closure_param] - -def closure_param - [pattern] -| [pattern `: type] - -def closure_expression_param_forms - [`| closure_parameters `,`? `|] -| [`||] - -def closure_expression - [`move ? closure_expression_param_forms expression] -| [`move ? closure_expression_param_forms `-> type_no_bounds block_expression] - -def paths - [path_expression] -| [char] -| [string] -| [raw_string] -| [number] -| [float] -| [path_in_expression `{ cons_list `}] -| [path_in_expression `{ `.. expression `}] -| [path_in_expression `{ cons_list `, `.. expression `}] -| [`( `)] -| [`true] -| [`false] -| [`( expression `)] -| [tuple_expression] -| [array_expression] -| [macro_invocation] -| [closure_expression] -| [block_expression] -| [match_expression] -| [if_expression] -| [if_let_expression] - - -def func_index - [func_index `. path_expr_segment] -| [func_index `. number] -| [func_index `( expr_list `,`? `)] -| [func_index `[ expr_list `,`? `]] -| [func_index `?] -| [paths] - -def question - [func_index] - -def unary - [question] -| [`- unary] -| [`* unary] -| [`! unary] -| [`& unary] -| [`& `mut unary] -| [`mut unary] - -def as - [as `as type_no_bounds] -| [unary] - -def mult - [mult `* as] -| [mult `/ as] -| [mult `% as] -| [as] - -def add_sub - [add_sub `- mult] -| [add_sub `+ mult] -| [mult] - -def shift - [shift `> `> add_sub] -| [shift `<< add_sub] -| [add_sub] - -def bitwise_and - [bitwise_and `& shift] -| [shift] - -def bitwise_xor - [bitwise_xor `^ bitwise_and] -| [bitwise_and] - -def bitwise_or - [bitwise_or `| bitwise_xor] -| [bitwise_xor] - -def comp_op - [`==] | [`!=] -| [`>] | [`<] -| [`>=] | [`<=] - -def comparison - [comparison comp_op bitwise_or] -| [bitwise_or] - -def lazy_conjunction - [lazy_conjunction `&& comparison] -| [comparison] - -def lazy_disjunction - [lazy_disjunction `|| lazy_conjunction] -| [lazy_conjunction] - -def range_expression - [range_expression `.. lazy_disjunction] -| [lazy_disjunction `..] -| [`.. lazy_disjunction] -| [`..] -| [range_expression `..= lazy_disjunction] -| [`..= lazy_disjunction] -| [lazy_disjunction] - -# Evaluates right to left. -def assignment_expression - [range_expression `= expression] -| [range_expression] - -def compound_op - [`+=] | [`-=] | [`*=] | [`/=] | [`%=] -| [`&=] | [`|=] | [`^=] | [`<<=] | [`>>=] - -# Evaluates right to left. -def compound_expression - [assignment_expression compound_op compound_expression] -| [assignment_expression] - -def expression - [expression_without_block] -| [expression_with_block] - -# -# Statements -# - -def block_expression - [`unsafe ? `{ statements? `}] - -def let_statement - [`let pattern opt_type `= let_rvalue `;] -| [`let pattern opt_type `;] - -def expression_without_block - [compound_expression `? ?] -| [return_expression] -| [break_expression] -| [continue_expression] - -def expression_with_block - [block_expression] -| [loop_expression] -| [if_expression] -| [if_let_expression] -| [match_expression] - -def expression_statement - [expression_without_block `;] -| [expression_with_block] - -def statement - [`;] -| [item] -| [let_statement] -| [expression_statement] -| [use_declaration] -| [macro_invocation_semi] - -def statement_list - [statement_list statement] -| [statement] - -def statements - [statement_list] -| [statement_list expression_without_block] -| [expression_without_block] - -def loop_label - [lifetime `:] - -def loop_expression - [loop_label? `loop block_expression] -| [loop_label? `while expression block_expression] -| [loop_label? `while `let match_arm_patterns `= expression block_expression] -| [loop_label? `for pattern `in expression block_expression] - -def if_expression - [`if expression block_expression opt_else_expression] - -def opt_else_expression - [`else block_expression] -| [`else if_expression] -| [`else if_let_expression] -| [] - -def if_let_expression - [ - `if `let match_arm_patterns `= expression block_expression - opt_else_expression - ] - -def visibility - [`pub] -| [`pub `( `crate `)] -| [`pub `( `self `)] -| [`pub `( `super `)] -| [`pub `( `in simple_path `)] - -def field - [attribute* visibility? id `: type] - -def field_plus - [field_plus `, field] -| [field] - -def field_list - [field_plus] -| [] - -# -# Lifetime Params -# - -def colon_lifetime_bounds - [`: lifetime_bounds] - -def lifetime_param - [lifetime colon_lifetime_bounds?] - -def lifetime_param_list - [lifetime_param_list `, lifetime_param ] -| [lifetime_param] - -# -# Type param bounds -# - -def trait_bound - [`? ? for_lifetimes? type_path] -| [`( `? ? for_lifetimes? type_path `)] - -def type_param_bound - [lifetime] -| [trait_bound] - -def tpb_tail - [`+ type_param_bound] - -def type_param_bounds - [type_param_bound tpb_tail* `+`?] - -# -# Type Params -# - -def opt_eq_type - [`= type] -| [] - -def opt_type_param_bounds - [`: type_param_bounds] -| [] - -def type_param - [id opt_type_param_bounds opt_eq_type] - -def type_param_tail - [type_param_tail `, type_param] -| [] - -def type_param_list - [type_param type_param_tail] - -# -# Generics -# - -def generic_params - [lifetime_param_list `, type_param_list] -| [lifetime_param_list] -| [type_param_list] -| [] - -def opt_generics - [`< generic_params `>] -| [] - -# -# Where clause -# - -def lifetime_params - [lifetime_param_list? `,`?] - -def for_lifetimes - [`for `< lifetime_params `>] - -def lifetime_bounds_list - [lifetime_bounds_list `+ lifetime] -| [lifetime] - -def lifetime_bounds - [lifetime_bounds_list? `+`?] - -def lifetime_where_clause_item - [lifetime `: lifetime_bounds] - -def type_bound_where_clause_item - [for_lifetimes? type `: type_param_bounds?] - -def where_clause_item - [lifetime_where_clause_item] -| [type_bound_where_clause_item] - -def where_clause_item_list - [where_clause_item_list `, where_clause_item] -| [where_clause_item] - -def opt_where_clause - [`where where_clause_item_list `,`? ] -| [] - - -# -# Tuple List -# - -def tuple_field - [attribute* visibility? type] - -def tuple_field_list - [tuple_field_list `, tuple_field] -| [tuple_field] - -def tuple_fields - [tuple_field_list `,`?] - -# -# Structure -# - -def struct_field - [attribute* visibility? id `: type] - -def struct_field_list - [struct_field_list `, struct_field] -| [struct_field] - -def struct_fields - [struct_field_list `,`?] - -def struct_struct - [`struct id opt_generics opt_where_clause `{ struct_fields? `}] -| [`struct id opt_generics opt_where_clause `;] - -def tuple_struct - [`struct id opt_generics `( tuple_fields? `) opt_where_clause `; ] - -def structure - [struct_struct] -| [tuple_struct] - -# -# Union -# -def union - [Union: id id opt_generics opt_where_clause `{ struct_fields? `}] - { - if $lhs.Union != "union" - reject - } - -# -# Trait -# - -def opt_type_param_bounds_opt - [`: type_param_bounds?] -| [] - -def trait - [ - `trait id opt_generics opt_type_param_bounds_opt opt_where_clause - `{ - trait_item* - `} - ] - -def trait_item - [attribute* trait_func] -| [attribute* trait_method] -| [attribute* trait_const] -| [attribute* trait_type] -| [attribute* macro_invocation_semi] - -def trait_func - [trait_func_decl `;] -| [trait_func_decl block_expression] - -def trait_method - [trait_method_decl `;] -| [trait_method_decl block_expression] - -def trait_func_decl - [function_qualifiers `fn id opt_generics - `( trait_function_parameters `) opt_return opt_where_clause] - -def trait_method_decl - [function_qualifiers `fn id opt_generics - `( self_param trait_method_parameters `) opt_return opt_where_clause] - -def trait_function_parameters - [trait_param_list `,] -| [trait_param_list ] -| [] - -def trait_method_parameters - [`, trait_param_list `,] -| [`, trait_param_list ] -| [`,] -| [] - -def trait_param_list - [trait_param_list `, trait_function_param] -| [trait_function_param] - -def trait_function_param - [pattern `: type] -| [type] - -def trait_const - [`const id `: type `;] -| [`const id `: type `= expression `;] - -def trait_type - [`type id opt_type_param_bounds_opt `;] - -# -# Implementation -# - -def inherent_impl_item - [attribute* visibility? function] commit -| [attribute* visibility? method] commit - -def inherent_impl - [`impl opt_generics type opt_where_clause `{ inherent_impl_item* `}] - -def trait_impl_item - [attribute* visibility? type_alias] -| [attribute* visibility? constant_item] -| [attribute* visibility? function] -| [attribute* visibility? method] - -def constant_item - [`const id `: type `= expression `;] -| [`const `_ `: type `= expression `;] - -def trait_impl - [`unsafe ? `impl opt_generics `! ? type_path `for type opt_where_clause `{ trait_impl_item* `}] - -def implementation - [inherent_impl] -| [trait_impl] - -def type_alias - [`type id opt_generics opt_where_clause `= type `;] - -def const_item - [`const id `: type `= expression `;] -| [`const `_ `: type `= expression `;] - -def module - [`mod id `;] -| [`mod id `{ item* `}] - -def crate_ref - [id] | [`self] - -def as_clause - [`as id] -| [`as `_] - -def extern_crate - [`extern `crate crate_ref as_clause? `;] - -def enum - [`enum id opt_generics opt_where_clause `{ enum_items? `,`? `}] - -def enum_items - [enum_items `, enum_item] -| [enum_item] - -def enum_item - [attribute* id enum_item_tuple] -| [attribute* id enum_item_struct] -| [attribute* id enum_item_discriminant] -| [attribute* id] - -def enum_item_tuple - [`( tuple_fields? `)] - -def enum_item_struct - [`{ field_list `,`? `}] - -def enum_item_discriminant - [`= expression] - -def static_item - [`static `mut`? id `: type `= expression `;] - -def abi - [string] -| [raw_string] - -def external_static_item - [`static `mut`? id `: type `;] - -def function_return_type - [`-> type] - -def external_function_item - [`fn id opt_generics `( named_function_parameters? `) function_return_type? opt_where_clause `;] - -def named_function_param - [id `: type] -| [`_ `: type] - -def named_function_parameters - [named_function_parameter_list `, `...] -| [named_function_parameter_list `,] -| [named_function_parameter_list ] - -def named_function_parameter_list - [named_function_parameter_list `, named_function_param] -| [named_function_param] - -def external_item - [attribute* visibility? external_static_item] -| [attribute* visibility? external_function_item] - -def extern_block - [`extern abi? `{ external_item* `}] - - -# -# All Items. -# - -def item - [visibility? vis_item] commit -| [macro_invocation_semi] commit -| [macro_rules_definition] commit - -def vis_item - [attribute] commit -| [function] commit -| [structure] commit -| [union] commit -| [trait] commit -| [implementation] commit -| [use_declaration] commit -| [type_alias] commit -| [const_item] commit -| [static_item] commit -| [module] commit -| [extern_crate] commit -| [extern_block] commit -| [enum] commit - -def program - [item*] - diff --git a/grammar/rust/.gitignore b/grammar/rust/.gitignore new file mode 100644 index 00000000..5bf8231b --- /dev/null +++ b/grammar/rust/.gitignore @@ -0,0 +1,4 @@ +/rust.c +/rust +/query.c +/query diff --git a/grammar/rust/Makefile b/grammar/rust/Makefile new file mode 100644 index 00000000..20e3edeb --- /dev/null +++ b/grammar/rust/Makefile @@ -0,0 +1,7 @@ +COLM = ../../colm/colm +RAGEL = ../../ragel/ragel + +all: rust + +rust: rust.lm parserust.lm $(COLM) + $(COLM) -o rust parserust.lm diff --git a/grammar/rust/parserust.lm b/grammar/rust/parserust.lm new file mode 100644 index 00000000..3e17a3b6 --- /dev/null +++ b/grammar/rust/parserust.lm @@ -0,0 +1,81 @@ +include 'rust.lm' + +parse P: program [stdin] + +if P { + for FN: function in P { + print "function: [FN.id] + + for CE: compound_expression in FN { + if match CE [assignment_expression compound_op compound_expression] + print " compound expression: [CE] + } + + for AE: assignment_expression in FN { + if match AE [range_expression `= assignment_expression] + print " assignment expression: [AE] + } + + for RE: range_expression in FN { + if !match RE [lazy_disjunction] + print " range expression: [RE] + } + + for LD: lazy_disjunction in FN { + if !match LD [lazy_conjunction] + print " lazy disjunction: [LD] + } + + for LC: lazy_conjunction in FN { + if !match LC [comparison] + print " lazy conjunction: [LC] + } + + for C: comparison in FN { + if !match C [bitwise_or] + print " comparison: [C] + } + + for P: pattern in FN { + print " pattern: [P] + } + + for MA: match_arm in FN { + print " match_arm: [MA] + } + + for CL: cons_list in FN { + print " construct list: [^CL] + } + + for M: mult in FN { + if !match M [as] + print " mult: [^M] + } + + for TP: tuple_pattern in FN { + print " tuple pattern: [TP] + } + + for TP: grouped_pattern in FN { + print " grouped pattern: [TP] + } + + for CP: closure_param in FN { + print " closure param: [CP] + } + } + + for M: method in P { + print "method: [M.id] + } + + for MR: macro_rule in P { + print " macro matcher: [MR.macro_matcher] + } +} +else { + send stderr "failed to parse input [error] + exit(1) +} + diff --git a/grammar/rust/rust.lm b/grammar/rust/rust.lm new file mode 100644 index 00000000..f6dd8c15 --- /dev/null +++ b/grammar/rust/rust.lm @@ -0,0 +1,1305 @@ +lex + literal `as `break `const `continue `crate + literal `$crate `dyn `else `enum `extern + literal `false `fn `for `if `impl `in `let + literal `loop `macro_rules `match `mod `move + literal `mut `pub `ref `return `self `static + literal `struct `super `trait `true `type + literal `unsafe `use `where `while + + literal `; `: `:: `( `) `{ `} `[ `] `< `> + literal `. `, `@ `-> `=> `? + literal `- `+ `/ `* `% `! `^ `| `& + + literal `<< `== `!= `>= `<= `|| `&& + literal `.. `..= `... + literal `= `+= `-= `*= `/= `%= `&= `|= `^= `<<= `>>= + + literal `# `_ + + token id / [A-Za-z_] [A-Za-z_0-9]* / + token string / 'b'? '"' ( [^\"] | '\\' any )* '"' / + token char / 'b'? "'" ( [^\'] | '\\' any ) "'" / + token lifetime / "'" id / + token number / + ( + [0-9] [_0-9]* | + '0x' [_a-fA-F0-9]+ | + '0b' [_0-1]+ | + '0o' [_0-7]+ + ) + ( ( 'u' | 'i' ) [a-z0-9]+ )? + / + + rl float_exponent + / ( [eE] [+\-]? [0-9_]* [0-9] [0-9_]* )? / + + rl float_suffix + / ( 'f32' | 'f64' )? / + + # Handles: DEC_LITERAL . (not immediately followed by ., _ or an identifier) + token hanging_float + / [0-9]+ '.' [^0-9._a-zA-Z] / + { + Float: str = input->pull( match_length - 1 ) + input->push( make_token( typeid, Float ) ) + } + + token float/ + [0-9]+ float_exponent | + [0-9]+ '.' [0-9]+ float_exponent? | + [0-9]+ ( '.' [0-9]+ )? float_exponent? float_suffix + / + + # Raw open. Rest handled in a its own lexical region. + token raw_open / 'r' '#' * '"' / + { + # Stash the length (not including r) for comparison against potential + # close strings. + RawOpenLength = match_length - 1 + RawOpen: str = input->pull( match_length ) + input->push( make_token( typeid, RawOpen ) ) + } + + ignore / "//" [^\n]* '\n' / + ignore / "/*" any* :>> "*/" / + ignore / [ \t\n]+ / +end + +# Raw strings. +def raw_string + [raw_open raw_content* raw_close] + +global RawOpenLength: int = 0 + +# Lexical region dedicated to raw strings. Attempts to close by matching +# candidates and then testing the length. +lex + token raw_close / '"' '#'* / + { + # Check the length. We use >= to match the close because rust is lazy + # in matching it. If it is longer we just chop it. Probably will result + # in a parse error. + if match_length >= RawOpenLength { + # Chop it by using RawOpenLength in the pull from input. + Candidate: str = input->pull( RawOpenLength ) + input->push( make_token( typeid, Candidate ) ) + } + else { + # Otherwise just send it as raw content. + Candidate: str = input->pull( match_length ) + input->push( make_token( typeid, Candidate ) ) + } + } + + # Content, send out strings not containing # or ". Or single such chars + # that are not part of a sequence that is first matched by close candidate. + token raw_content / [^"#]+ | any / +end + +namespace attr + lex + token id / [A-Za-z_] [A-Za-z_0-9]* / + token string / '"' ( [^\"] | '\\' any )* '"' / + token char / "'" ( [^\'] | '\\' any ) "'" / + token lifetime / "'" id / + token number / [0-9]+ / + token float / [0-9]+ '.' [0-9]+ / + + literal `[ `] + + ignore / "//" [^\n]* '\n' / + ignore / "/*" any* :>> "*/" / + ignore / [ \t\n]+ / + + token sym / any / + end + + def item + [id] + | [string] + | [char] + | [lifetime] + | [number] + | [float] + | [sym] + | [_list] + + def _list + [ `[ item* `] ] +end + +def attribute + [`# `! attr::_list] +| [`# attr::_list] + +namespace macro + lex + token id / [A-Za-z_] [A-Za-z_0-9]* / + token string / '"' ( [^\"] | '\\' any )* '"' / + token char / "'" ( [^\'] | '\\' any ) "'" / + token lifetime / "'" id / + token number / [0-9]+ / + token float / [0-9]+ '.' [0-9]+ / + + literal `( `) `[ `] `{ `} `; + + ignore / "//" [^\n]* '\n' / + ignore / "/*" any* :>> "*/" / + ignore / [ \t\n]+ / + + token sym / any / + end + + def item + [id] + | [string] + | [char] + | [lifetime] + | [number] + | [float] + | [sym] + | [macro] + | [`;] + + def macro_semi + [ `( item* `) `; ] + | [ `[ item* `] `; ] + | [ `{ item* `} ] + + def macro + [ `( item* `) ] + | [ `[ item* `] ] + | [ `{ item* `} ] +end + +def macro_invocation + [simple_path `! macro::macro] + +def macro_invocation_semi + [simple_path `! macro::macro_semi] + +# +# Macro Defition +# + +def macro_matcher + [macro::macro] + +def macro_transcriber + [macro::macro] + +def macro_rule + [macro_matcher `=> macro_transcriber] + +def macro_rules_tail + [macro_rules_tail `; macro_rule] +| [] + +def macro_rules + [macro_rule macro_rules_tail `;`?] + +def macro_rules_def + [`( macro_rules `) `;] +| [`[ macro_rules `] `;] +| [`{ macro_rules `}] + +def macro_rules_definition + [`macro_rules `! id macro_rules_def] + +# +# Use statments +# + + +def simple_path_segment + [id] +| [`super] +| [`self] +| [`crate] +| [`$crate] + +def sp_tail + [sp_tail `:: simple_path_segment] +| [] + +def simple_path + [opt_sep simple_path_segment sp_tail] + +def use_list_tail + [use_list_tail `, use_tree] +| [] + +def use_path_opt + [simple_path `::] +| [`::] +| [] + +def use_as_opt + [`as id] +| [`as `_] +| [] + +def use_tree + [use_path_opt `*] +| [use_path_opt `{ use_tree use_list_tail `,`? `}] +| [simple_path use_as_opt] + +def use_declaration + [`use use_tree `;] + +# +# Patterns +# + +def literal_pattern + [`true] +| [`false] +| [char] +| [string] +| [raw_string] +| [`-`? number] +| [`-`? float] + +def identifier_pattern + [`ref`? `mut`? id] +| [`ref`? `mut`? id `@ pattern] + +def wildcard_pattern + [`_] + +def range_pattern_bound + [literal_pattern] + +def range_pattern + [range_pattern_bound `..= range_pattern_bound] +| [range_pattern_bound `... range_pattern_bound] + +def reference_pattern + [`& `mut`? pattern] +| [`&& `mut`? pattern] + +# +# struct pattern +# +def struct_pattern_field + [number `: pattern] +| [id `: pattern] +| [`ref`? `mut`? id] + +def struct_pattern_fields + [struct_pattern_fields `, struct_pattern_field] +| [struct_pattern_field] + +def struct_pattern_et_cetera + [`..] + +def struct_pattern_elements + [struct_pattern_fields] +| [struct_pattern_fields `, ] +| [struct_pattern_fields `, struct_pattern_et_cetera] +| [struct_pattern_et_cetera] + +def opt_struct_pattern_elements + [struct_pattern_elements] +| [] + +def struct_pattern + [path_in_expression `{ opt_struct_pattern_elements `}] + +def tuple_struct_item + [pattern] +| [`..] + +def tuple_struct_items + [tuple_struct_items `, tuple_struct_item] +| [tuple_struct_item] + +def tuple_struct_pattern + [path_in_expression `( tuple_struct_items `,`? `)] + +def tuple_pattern_item + [pattern] +| [`..] + +def tuple_pattern_items + [tuple_pattern_items `, tuple_pattern_item] +| [tuple_pattern_item] + +def tuple_pattern + [`( `)] +| [`( tuple_pattern_item `, `)] +| [`( tuple_pattern_item `, tuple_pattern_items `,`? `)] + +def grouped_pattern + [`( pattern `)] + +def pattern_list + [pattern_list `, pattern] +| [pattern] + +# +# Grammar doesnt seem to support empty slice patterns, but found some code +# allowing it. +# +def slice_pattern + [`[ pattern `, pattern_list `,`? `]] +| [`[ pattern `, `]] +| [`[ pattern `]] +| [`[ `]] + +def path_pattern + [path_in_expression] +| [qualified_path_in_expression] + +def pattern + [literal_pattern] +| [identifier_pattern] +| [wildcard_pattern] +| [range_pattern] +| [reference_pattern] +| [struct_pattern] +| [tuple_struct_pattern] +| [tuple_pattern] +| [grouped_pattern] +| [slice_pattern] +| [path_pattern] + +# Range Pattern + + +# +# Match Expressions. +# + +def match_expression + [`match expression `{ match_arms? `}] + +def match_arms_last + [match_arm `=> block_expression `,`?] +| [match_arm `=> expression `,`?] + +def match_arms_first_case + [match_arm `=> block_expression `,`?] +| [match_arm `=> expression `,] + +def match_arms_first_list + [match_arms_first_list match_arms_first_case] +| [] + +def match_arms + [match_arms_first_list match_arms_last] + +def match_arm + [match_arm_patterns opt_match_arm_guard] + +def match_arms_pattern_tail + [match_arms_pattern_tail `| pattern] +| [] + +def match_arm_patterns + [`|`? pattern match_arms_pattern_tail] + +def opt_match_arm_guard + [`if expression] +| [] + +# +# Return expressions +# +def return_expression + [`return expression] +| [`return] + +# +# Break expression +# +def break_expression + [`break expression] +| [`break lifetime expression] +| [`break lifetime] +| [`break] + +# +# Continue expression +# +def continue_expression + [`continue lifetime] +| [`continue] + +# +# Generic Args +# + +def lifetime_tail + [lifetime_tail `, lifetime] +| [] + +def lifetime_list + [lifetime lifetime_tail] + +def opt_type_params + [`< lifetime_list `,`? `>] +| [`< type_list `,`? `>] +| [`< lifetime_list `, type_list `,`? `>] +| [] + + +# +# Function declaration +# + +def opt_return + [] +| [ `-> type] + +def extern_abi + [`extern string] + +def function_qualifiers + [`const ? `unsafe ? extern_abi?] + +def function_param + [pattern `: type] + +def func_param_tail + [func_param_tail `, function_param] +| [] + +def function_parameters + [function_param func_param_tail `,`?] + +def function + [ + function_qualifiers `fn id opt_generics `( function_parameters? `) + opt_return opt_where_clause block_expression + ] + +# +# Method declaration +# + +def self_param + [`mut`? `self] +| [`& `mut`? `self] +| [`& lifetime `mut`? `self] +| [`mut`? `self `: type] + +def opt_method_params + [`, function_parameters] +| [`,] +| [] + +def method + [ + function_qualifiers `fn id opt_generics `( self_param opt_method_params `) + opt_return opt_where_clause block_expression + ] + +# +# Types +# + +def qual_tail + [qual_tail `:: id] +| [] + +def qual_id + [id qual_tail] + +def type_id + [id opt_type_params] + +def type_path_segment + [path_ident_segment `:: ? generic_args] +| [path_ident_segment `:: ? type_path_fn] +| [path_ident_segment `:: ?] + +def type_path_fn + [`( type_path_fn_inputs? `) opt_arrow_type] + +def opt_arrow_type + [`-> type] +| [] + +def type_path_fn_inputs + [type_list `,`?] + +def type_path_tail + [type_path_tail `:: type_path_segment] +| [] + +def opt_sep + [`::] +| [] + +def type_path + [opt_sep type_path_segment type_path_tail] + +def opt_lifetime + [lifetime] +| [] + +def array_type + [`[ type `; expression `]] + +def slice_type + [`[ type `]] + +def raw_pointer_type + [`* `mut type_no_bounds] +| [`* `const type_no_bounds] + +def tuple_type + [`( `)] +| [`( type `, `)] +| [`( type `, type_list `,`? `)] + +def trait_object_type + [`dyn ? type_param_bounds] + +def impl_trait_type + [`impl type_param_bounds] + +def impl_trait_type_one_bound + [`impl trait_bound] + +def qualified_path_type + [`< type `as type_path `>] +| [`< type `>] + +def type_path_segment_list + [] + +def qualified_path_in_type + [qualified_path_type type_path_tail] + +# +# Bare Function Type +# + +def bare_function_return_type + [`-> type_no_bounds] + +def function_parameters_maybe_named_variadic + [maybe_named_function_parameters] +| [maybe_named_function_parameters_variadic] + +def maybe_named_param_tail + [maybe_named_param_tail `, maybe_named_param] +| [] + +def maybe_named_function_parameters + [maybe_named_param maybe_named_param_tail `,`?] + +def maybe_named_param + [id `: type] +| [`_ `: type] +| [type] + +def maybe_named_function_parameters_variadic + [maybe_named_param maybe_named_param_tail `, `...] + +def bare_function_type + [ + for_lifetimes? function_qualifiers `fn + `( function_parameters_maybe_named_variadic? `) bare_function_return_type? + ] + +# +# Type +# + +def type + [type_no_bounds] +| [impl_trait_type] +| [trait_object_type] + +def type_no_bounds + [impl_trait_type_one_bound] +| [type_path] +| [array_type] +| [slice_type] +| [raw_pointer_type] +| [`& opt_lifetime type] +| [`& `mut type] +| [`& lifetime `mut type] +| [tuple_type] +| [`_] +| [`!] +| [qualified_path_in_type] +| [bare_function_type] + +def type_list + [type_list `, type] +| [type] + +def opt_type + [`: type] +| [] + +def let_rvalue + [expression] +| [`{ statements `}] + +def expr_tail + [expr_tail `, expression] +| [] + +def expr_list + [expression expr_tail] +| [] + +def _construct + [attribute* id] +| [attribute* id `: expression] + +def cons_plus + [cons_plus `, _construct] +| [_construct] + +def cons_list + [cons_plus `,`?] +| [] + + +# +# Expression +# + +def path_ident_segment + [id] +| [`self] +| [`crate] +| [`super] + +def path_expr_segment + [path_ident_segment] +| [path_ident_segment `:: generic_args] + +def generic_args + [`< `>] +| [`< generic_args_lifetimes `,`? `>] +| [`< generic_args_types `,`? `>] +| [`< generic_args_bindings `,`? `>] +| [`< generic_args_types `, generic_args_bindings `,`? `>] +| [`< generic_args_lifetimes `, generic_args_types `,`? `>] +| [`< generic_args_lifetimes `, generic_args_bindings `,`? `>] +| [`< generic_args_lifetimes `, generic_args_types `, generic_args_bindings `,`? `>] + +def generic_args_lifetimes + [lifetime_list] + +def generic_args_types + [type_list] + +def generic_args_binding + [id `= type] + +def generic_args_bindings + [generic_args_bindings `, generic_args_binding] +| [generic_args_binding] + +def pie_tail + [pie_tail `:: path_expr_segment] +| [] + +def opt_path_sep + [`::] +| [] + +def path_in_expression + [opt_path_sep path_expr_segment pie_tail] + +def qualified_path_in_expression + [qualified_path_type type_path_tail] + +def path_expression + [path_in_expression] +| [qualified_path_in_expression] + +def tuple_expression + [`( expression `, `)] +| [`( expression `, expr_list `,`? `)] + +def array_expression + [`[ expr_list `,`? `]] +| [`[ expression `; expression `]] + +def closure_parameters + [closure_parameters `, closure_param] +| [closure_param] + +def closure_param + [pattern] +| [pattern `: type] + +def closure_expression_param_forms + [`| closure_parameters `,`? `|] +| [`||] + +def closure_expression + [`move ? closure_expression_param_forms expression] +| [`move ? closure_expression_param_forms `-> type_no_bounds block_expression] + +def paths + [path_expression] +| [char] +| [string] +| [raw_string] +| [number] +| [float] +| [path_in_expression `{ cons_list `}] +| [path_in_expression `{ `.. expression `}] +| [path_in_expression `{ cons_list `, `.. expression `}] +| [`( `)] +| [`true] +| [`false] +| [`( expression `)] +| [tuple_expression] +| [array_expression] +| [macro_invocation] +| [closure_expression] +| [block_expression] +| [match_expression] +| [if_expression] +| [if_let_expression] + + +def func_index + [func_index `. path_expr_segment] +| [func_index `. number] +| [func_index `( expr_list `,`? `)] +| [func_index `[ expr_list `,`? `]] +| [func_index `?] +| [paths] + +def question + [func_index] + +def unary + [question] +| [`- unary] +| [`* unary] +| [`! unary] +| [`& unary] +| [`& `mut unary] +| [`mut unary] + +def as + [as `as type_no_bounds] +| [unary] + +def mult + [mult `* as] +| [mult `/ as] +| [mult `% as] +| [as] + +def add_sub + [add_sub `- mult] +| [add_sub `+ mult] +| [mult] + +def shift + [shift `> `> add_sub] +| [shift `<< add_sub] +| [add_sub] + +def bitwise_and + [bitwise_and `& shift] +| [shift] + +def bitwise_xor + [bitwise_xor `^ bitwise_and] +| [bitwise_and] + +def bitwise_or + [bitwise_or `| bitwise_xor] +| [bitwise_xor] + +def comp_op + [`==] | [`!=] +| [`>] | [`<] +| [`>=] | [`<=] + +def comparison + [comparison comp_op bitwise_or] +| [bitwise_or] + +def lazy_conjunction + [lazy_conjunction `&& comparison] +| [comparison] + +def lazy_disjunction + [lazy_disjunction `|| lazy_conjunction] +| [lazy_conjunction] + +def range_expression + [range_expression `.. lazy_disjunction] +| [lazy_disjunction `..] +| [`.. lazy_disjunction] +| [`..] +| [range_expression `..= lazy_disjunction] +| [`..= lazy_disjunction] +| [lazy_disjunction] + +# Evaluates right to left. +def assignment_expression + [range_expression `= expression] +| [range_expression] + +def compound_op + [`+=] | [`-=] | [`*=] | [`/=] | [`%=] +| [`&=] | [`|=] | [`^=] | [`<<=] | [`>>=] + +# Evaluates right to left. +def compound_expression + [assignment_expression compound_op compound_expression] +| [assignment_expression] + +def expression + [expression_without_block] +| [expression_with_block] + +# +# Statements +# + +def block_expression + [`unsafe ? `{ statements? `}] + +def let_statement + [`let pattern opt_type `= let_rvalue `;] +| [`let pattern opt_type `;] + +def expression_without_block + [compound_expression `? ?] +| [return_expression] +| [break_expression] +| [continue_expression] + +def expression_with_block + [block_expression] +| [loop_expression] +| [if_expression] +| [if_let_expression] +| [match_expression] + +def expression_statement + [expression_without_block `;] +| [expression_with_block] + +def statement + [`;] +| [item] +| [let_statement] +| [expression_statement] +| [use_declaration] +| [macro_invocation_semi] + +def statement_list + [statement_list statement] +| [statement] + +def statements + [statement_list] +| [statement_list expression_without_block] +| [expression_without_block] + +def loop_label + [lifetime `:] + +def loop_expression + [loop_label? `loop block_expression] +| [loop_label? `while expression block_expression] +| [loop_label? `while `let match_arm_patterns `= expression block_expression] +| [loop_label? `for pattern `in expression block_expression] + +def if_expression + [`if expression block_expression opt_else_expression] + +def opt_else_expression + [`else block_expression] +| [`else if_expression] +| [`else if_let_expression] +| [] + +def if_let_expression + [ + `if `let match_arm_patterns `= expression block_expression + opt_else_expression + ] + +def visibility + [`pub] +| [`pub `( `crate `)] +| [`pub `( `self `)] +| [`pub `( `super `)] +| [`pub `( `in simple_path `)] + +def field + [attribute* visibility? id `: type] + +def field_plus + [field_plus `, field] +| [field] + +def field_list + [field_plus] +| [] + +# +# Lifetime Params +# + +def colon_lifetime_bounds + [`: lifetime_bounds] + +def lifetime_param + [lifetime colon_lifetime_bounds?] + +def lifetime_param_list + [lifetime_param_list `, lifetime_param ] +| [lifetime_param] + +# +# Type param bounds +# + +def trait_bound + [`? ? for_lifetimes? type_path] +| [`( `? ? for_lifetimes? type_path `)] + +def type_param_bound + [lifetime] +| [trait_bound] + +def tpb_tail + [`+ type_param_bound] + +def type_param_bounds + [type_param_bound tpb_tail* `+`?] + +# +# Type Params +# + +def opt_eq_type + [`= type] +| [] + +def opt_type_param_bounds + [`: type_param_bounds] +| [] + +def type_param + [id opt_type_param_bounds opt_eq_type] + +def type_param_tail + [type_param_tail `, type_param] +| [] + +def type_param_list + [type_param type_param_tail] + +# +# Generics +# + +def generic_params + [lifetime_param_list `, type_param_list] +| [lifetime_param_list] +| [type_param_list] +| [] + +def opt_generics + [`< generic_params `>] +| [] + +# +# Where clause +# + +def lifetime_params + [lifetime_param_list? `,`?] + +def for_lifetimes + [`for `< lifetime_params `>] + +def lifetime_bounds_list + [lifetime_bounds_list `+ lifetime] +| [lifetime] + +def lifetime_bounds + [lifetime_bounds_list? `+`?] + +def lifetime_where_clause_item + [lifetime `: lifetime_bounds] + +def type_bound_where_clause_item + [for_lifetimes? type `: type_param_bounds?] + +def where_clause_item + [lifetime_where_clause_item] +| [type_bound_where_clause_item] + +def where_clause_item_list + [where_clause_item_list `, where_clause_item] +| [where_clause_item] + +def opt_where_clause + [`where where_clause_item_list `,`? ] +| [] + + +# +# Tuple List +# + +def tuple_field + [attribute* visibility? type] + +def tuple_field_list + [tuple_field_list `, tuple_field] +| [tuple_field] + +def tuple_fields + [tuple_field_list `,`?] + +# +# Structure +# + +def struct_field + [attribute* visibility? id `: type] + +def struct_field_list + [struct_field_list `, struct_field] +| [struct_field] + +def struct_fields + [struct_field_list `,`?] + +def struct_struct + [`struct id opt_generics opt_where_clause `{ struct_fields? `}] +| [`struct id opt_generics opt_where_clause `;] + +def tuple_struct + [`struct id opt_generics `( tuple_fields? `) opt_where_clause `; ] + +def structure + [struct_struct] +| [tuple_struct] + +# +# Union +# +def union + [Union: id id opt_generics opt_where_clause `{ struct_fields? `}] + { + if $lhs.Union != "union" + reject + } + +# +# Trait +# + +def opt_type_param_bounds_opt + [`: type_param_bounds?] +| [] + +def trait + [ + `trait id opt_generics opt_type_param_bounds_opt opt_where_clause + `{ + trait_item* + `} + ] + +def trait_item + [attribute* trait_func] +| [attribute* trait_method] +| [attribute* trait_const] +| [attribute* trait_type] +| [attribute* macro_invocation_semi] + +def trait_func + [trait_func_decl `;] +| [trait_func_decl block_expression] + +def trait_method + [trait_method_decl `;] +| [trait_method_decl block_expression] + +def trait_func_decl + [function_qualifiers `fn id opt_generics + `( trait_function_parameters `) opt_return opt_where_clause] + +def trait_method_decl + [function_qualifiers `fn id opt_generics + `( self_param trait_method_parameters `) opt_return opt_where_clause] + +def trait_function_parameters + [trait_param_list `,] +| [trait_param_list ] +| [] + +def trait_method_parameters + [`, trait_param_list `,] +| [`, trait_param_list ] +| [`,] +| [] + +def trait_param_list + [trait_param_list `, trait_function_param] +| [trait_function_param] + +def trait_function_param + [pattern `: type] +| [type] + +def trait_const + [`const id `: type `;] +| [`const id `: type `= expression `;] + +def trait_type + [`type id opt_type_param_bounds_opt `;] + +# +# Implementation +# + +def inherent_impl_item + [attribute* visibility? function] commit +| [attribute* visibility? method] commit + +def inherent_impl + [`impl opt_generics type opt_where_clause `{ inherent_impl_item* `}] + +def trait_impl_item + [attribute* visibility? type_alias] +| [attribute* visibility? constant_item] +| [attribute* visibility? function] +| [attribute* visibility? method] + +def constant_item + [`const id `: type `= expression `;] +| [`const `_ `: type `= expression `;] + +def trait_impl + [`unsafe ? `impl opt_generics `! ? type_path `for type opt_where_clause `{ trait_impl_item* `}] + +def implementation + [inherent_impl] +| [trait_impl] + +def type_alias + [`type id opt_generics opt_where_clause `= type `;] + +def const_item + [`const id `: type `= expression `;] +| [`const `_ `: type `= expression `;] + +def module + [`mod id `;] +| [`mod id `{ item* `}] + +def crate_ref + [id] | [`self] + +def as_clause + [`as id] +| [`as `_] + +def extern_crate + [`extern `crate crate_ref as_clause? `;] + +def enum + [`enum id opt_generics opt_where_clause `{ enum_items? `,`? `}] + +def enum_items + [enum_items `, enum_item] +| [enum_item] + +def enum_item + [attribute* id enum_item_tuple] +| [attribute* id enum_item_struct] +| [attribute* id enum_item_discriminant] +| [attribute* id] + +def enum_item_tuple + [`( tuple_fields? `)] + +def enum_item_struct + [`{ field_list `,`? `}] + +def enum_item_discriminant + [`= expression] + +def static_item + [`static `mut`? id `: type `= expression `;] + +def abi + [string] +| [raw_string] + +def external_static_item + [`static `mut`? id `: type `;] + +def function_return_type + [`-> type] + +def external_function_item + [`fn id opt_generics `( named_function_parameters? `) function_return_type? opt_where_clause `;] + +def named_function_param + [id `: type] +| [`_ `: type] + +def named_function_parameters + [named_function_parameter_list `, `...] +| [named_function_parameter_list `,] +| [named_function_parameter_list ] + +def named_function_parameter_list + [named_function_parameter_list `, named_function_param] +| [named_function_param] + +def external_item + [attribute* visibility? external_static_item] +| [attribute* visibility? external_function_item] + +def extern_block + [`extern abi? `{ external_item* `}] + + +# +# All Items. +# + +def item + [visibility? vis_item] commit +| [macro_invocation_semi] commit +| [macro_rules_definition] commit + +def vis_item + [attribute] commit +| [function] commit +| [structure] commit +| [union] commit +| [trait] commit +| [implementation] commit +| [use_declaration] commit +| [type_alias] commit +| [const_item] commit +| [static_item] commit +| [module] commit +| [extern_crate] commit +| [extern_block] commit +| [enum] commit + +def program + [item*] + -- cgit v1.2.1