diff options
Diffstat (limited to 'examples')
-rw-r--r-- | examples/.gitignore | 39 | ||||
-rw-r--r-- | examples/CMakeLists.txt | 11 | ||||
-rw-r--r-- | examples/Makefile.am | 72 | ||||
-rw-r--r-- | examples/README | 40 | ||||
-rw-r--r-- | examples/atoi.rl | 59 | ||||
-rw-r--r-- | examples/awkemu.rl | 116 | ||||
-rwxr-xr-x | examples/awkequiv.awk | 10 | ||||
-rw-r--r-- | examples/clang.rl | 150 | ||||
-rw-r--r-- | examples/concurrent.rl | 126 | ||||
-rw-r--r-- | examples/cppscan.lex | 143 | ||||
-rw-r--r-- | examples/cppscan.rec | 183 | ||||
-rw-r--r-- | examples/cppscan.rl | 208 | ||||
-rw-r--r-- | examples/format.rl | 191 | ||||
-rw-r--r-- | examples/go/.gitignore | 5 | ||||
-rw-r--r-- | examples/go/Makefile | 32 | ||||
-rw-r--r-- | examples/go/README | 36 | ||||
-rw-r--r-- | examples/go/atoi.rl | 89 | ||||
-rw-r--r-- | examples/go/rpn.rl | 159 | ||||
-rw-r--r-- | examples/go/url.rl | 414 | ||||
-rw-r--r-- | examples/go/url_authority.rl | 165 | ||||
-rw-r--r-- | examples/gotocallret.rl | 96 | ||||
-rw-r--r-- | examples/mailbox.rl | 207 | ||||
-rw-r--r-- | examples/params.rl | 102 | ||||
-rw-r--r-- | examples/pullscan.rl | 170 | ||||
-rw-r--r-- | examples/rlscan.rl | 300 | ||||
-rw-r--r-- | examples/statechart.rl | 116 | ||||
-rw-r--r-- | examples/uri.rl | 31 |
27 files changed, 3270 insertions, 0 deletions
diff --git a/examples/.gitignore b/examples/.gitignore new file mode 100644 index 00000000..b309591b --- /dev/null +++ b/examples/.gitignore @@ -0,0 +1,39 @@ +/Makefile.in +/Makefile +/concurrent +/concurrent.exe +/rlscan +/rlscan.exe +/clang +/clang.exe +/statechart +/statechart.exe +/gotocallret +/gotocallret.exe +/pullscan +/pullscan.exe +/cppscan +/cppscan.exe +/format +/format.exe +/awkemu +/awkemu.exe +/mailbox +/mailbox.exe +/atoi +/atoi.exe +/params +/params.exe +/statechart.cpp +/gotocallret.cpp +/clang.c +/cppscan.cpp +/mailbox.cpp +/atoi.cpp +/pullscan.c +/concurrent.cpp +/rlscan.cpp +/params.c +/format.c +/awkemu.c +/.deps diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 00000000..6ff75544 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,11 @@ +foreach(_example atoi awkemu clang concurrent cppscan format gotocallret + mailbox params rlscan statechart pullscan) + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${_example}.cpp" + DEPENDS ${_example}.rl + COMMAND ragel + ARGS -G2 -o "${CMAKE_CURRENT_BINARY_DIR}/${_example}.cpp" + "${CMAKE_CURRENT_LIST_DIR}/${_example}.rl" + WORKING_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}") + add_executable(${_example} "${CMAKE_CURRENT_BINARY_DIR}/${_example}.cpp") +endforeach() diff --git a/examples/Makefile.am b/examples/Makefile.am new file mode 100644 index 00000000..7cda0171 --- /dev/null +++ b/examples/Makefile.am @@ -0,0 +1,72 @@ + +RAGEL = ../src/ragel +FLEX = flex +RE2C = re2c + +noinst_PROGRAMS = \ + atoi concurrent cppscan format gotocallret mailbox params \ + statechart + +EXTRA_DIST = \ + gotocallret.rl pullscan.rl concurrent.rl rlscan.rl statechart.rl \ + params.rl clang.rl cppscan.rl format.rl awkemu.rl mailbox.rl atoi.rl + +gotocallret_SOURCES = gotocallret.cpp +pullscan_SOURCES = pullscan.c +concurrent_SOURCES = concurrent.cpp +rlscan_SOURCES = rlscan.cpp +statechart_SOURCES = statechart.cpp +params_SOURCES = params.c +clang_SOURCES = clang.c 
+cppscan_SOURCES = cppscan.cpp +format_SOURCES = format.c +awkemu_SOURCES = awkemu.c +mailbox_SOURCES = mailbox.cpp +atoi_SOURCES = atoi.cpp + +gotocallret.cpp: gotocallret.rl + $(RAGEL) -G2 -o gotocallret.cpp gotocallret.rl + +pullscan.c: pullscan.rl $(RAGEL) + $(RAGEL) -G2 -o $@ pullscan.rl + +concurrent.cpp: concurrent.rl $(RAGEL) + $(RAGEL) -G2 -o concurrent.cpp concurrent.rl + +rlscan.cpp: rlscan.rl + $(RAGEL) -G2 -o rlscan.cpp rlscan.rl + +statechart.cpp: statechart.rl + $(RAGEL) -G2 -o statechart.cpp statechart.rl + +params.c: params.rl + $(RAGEL) -G2 -o params.c params.rl + +clang.c: clang.rl + $(RAGEL) -G2 -o clang.c clang.rl + +cppscan.cpp: cppscan.rl + $(RAGEL) -G2 -o $@ cppscan.rl + +format.c: format.rl + $(RAGEL) -G2 -o format.c format.rl + +awkemu.c: awkemu.rl + $(RAGEL) -G2 -o awkemu.c awkemu.rl + +mailbox.cpp: mailbox.rl + $(RAGEL) -G2 -o mailbox.cpp mailbox.rl + +atoi.cpp: atoi.rl + $(RAGEL) -G2 -o atoi.cpp atoi.rl + +### + +lex-cppscan.cpp: cppscan.lex + $(FLEX) -f -o $@ $< + +re2c-cppscan.cpp: cppscan.rec + $(RE2C) -s $< > $@ + +example.cpp: example.rec + $(RE2C) -s $< > $@ diff --git a/examples/README b/examples/README new file mode 100644 index 00000000..12773cb3 --- /dev/null +++ b/examples/README @@ -0,0 +1,40 @@ + + Ragel State Machine Compiler -- Examples + ======================================== + +atoi -- Converts a string to an integer. + +awkemu -- Performs the basic parsing that the awk program performs on input. + The awk equivalent to awkemu is in awkequiv.awk + +clang -- A scanner for a simple C like language. It breaks input up into + words, numbers, strings and symbols and strips out whitespace + and comments. It is a suitable template for writing a parser + that finds a sequence of tokens. + +concurrent -- Demonstrates the ability of ragel to produce parsers that + perform independent tasks concurrently. + +cppscan -- A C++ scanner that uses the longest match scanning method. 
This + example differs from other examples of scanning. Each run of the + state machine matches one token. This method results in a + smaller state machine since the final kleene star is omitted and + therefore every state does not need to get all the transitions + of the start state. + +format -- Partial printf implementation. + +gotocallret -- Demonstrate the use of fgoto, fcall and fret. + +mailbox -- Parses unix mailbox files. It breaks files into messages, and + messages into headers and body. It demonstrates Ragel's ability + to make parsers for structured file formats. + +params -- Parses command line arguments. + +rlscan -- Lexes Ragel input files. + +statechart -- Demonstrate the use of labels, the epsilon operator, and the + join operator for creating machines using the named state and + transition list paradigm. This implements the same machine as + the atoi example. diff --git a/examples/atoi.rl b/examples/atoi.rl new file mode 100644 index 00000000..7164b68d --- /dev/null +++ b/examples/atoi.rl @@ -0,0 +1,59 @@ +/* + * Convert a string to an integer. + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +%%{ + machine atoi; + write data; +}%% + +long long atoi( char *str ) +{ + char *p = str, *pe = str + strlen( str ); + int cs; + long long val = 0; + bool neg = false; + + %%{ + action see_neg { + neg = true; + } + + action add_digit { + val = val * 10 + (fc - '0'); + } + + main := + ( '-'@see_neg | '+' )? ( digit @add_digit )+ + '\n'; + + # Initialize and execute. 
+ write init; + write exec; + }%% + + if ( neg ) + val = -1 * val; + + if ( cs < atoi_first_final ) + fprintf( stderr, "atoi: there was an error\n" ); + + return val; +}; + + +#define BUFSIZE 1024 + +int main() +{ + char buf[BUFSIZE]; + while ( fgets( buf, sizeof(buf), stdin ) != 0 ) { + long long value = atoi( buf ); + printf( "%lld\n", value ); + } + return 0; +} diff --git a/examples/awkemu.rl b/examples/awkemu.rl new file mode 100644 index 00000000..6615943d --- /dev/null +++ b/examples/awkemu.rl @@ -0,0 +1,116 @@ +/* + * Perform the basic line parsing of input performed by awk. + */ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> + +%%{ + machine awkemu; + + action start_word { + ws[nwords] = fpc; + } + + action end_word { + we[nwords++] = fpc; + } + + action start_line { + nwords = 0; + ls = fpc; + } + + action end_line { + printf("endline(%i): ", nwords ); + fwrite( ls, 1, p - ls, stdout ); + printf("\n"); + + for ( i = 0; i < nwords; i++ ) { + printf(" word: "); + fwrite( ws[i], 1, we[i] - ws[i], stdout ); + printf("\n"); + } + } + + # Words in a line. + word = ^[ \t\n]+; + + # The whitespace separating words in a line. + whitespace = [ \t]; + + # The components in a line to break up. Either a word or a single char of + # whitespace. On the word capture characters. + blineElements = word >start_word %end_word | whitespace; + + # Star the break line elements. Just be careful to decrement the leaving + # priority as we don't want multiple character identifiers to be treated as + # multiple single char identifiers. + line = ( blineElements** '\n' ) >start_line @end_line; + + # Any number of lines. 
+ main := line*; +}%% + +%% write data noerror nofinal; + +#define MAXWORDS 256 +#define BUFSIZE 4096 +char buf[BUFSIZE]; + +int main() +{ + int i, nwords = 0; + char *ls = 0; + char *ws[MAXWORDS]; + char *we[MAXWORDS]; + + int cs; + int have = 0; + + %% write init; + + while ( 1 ) { + char *p, *pe, *data = buf + have; + int len, space = BUFSIZE - have; + /* fprintf( stderr, "space: %i\n", space ); */ + + if ( space == 0 ) { + fprintf(stderr, "buffer out of space\n"); + exit(1); + } + + len = fread( data, 1, space, stdin ); + /* fprintf( stderr, "len: %i\n", len ); */ + if ( len == 0 ) + break; + + /* Find the last newline by searching backwards. This is where + * we will stop processing on this iteration. */ + p = buf; + pe = buf + have + len - 1; + while ( *pe != '\n' && pe >= buf ) + pe--; + pe += 1; + + /* fprintf( stderr, "running on: %i\n", pe - p ); */ + + %% write exec; + + /* How much is still in the buffer. */ + have = data + len - pe; + if ( have > 0 ) + memmove( buf, pe, have ); + + /* fprintf(stderr, "have: %i\n", have ); */ + + if ( len < space ) + break; + } + + if ( have > 0 ) + fprintf(stderr, "input not newline terminated\n"); + return 0; +} diff --git a/examples/awkequiv.awk b/examples/awkequiv.awk new file mode 100755 index 00000000..9877dd36 --- /dev/null +++ b/examples/awkequiv.awk @@ -0,0 +1,10 @@ +#!/usr/bin/awk -f +# + + +{ + print "endline(" NF "): " $0 + for ( i = 1; i <= NF; i++ ) { + print " word: " $i + } +} diff --git a/examples/clang.rl b/examples/clang.rl new file mode 100644 index 00000000..60491e5e --- /dev/null +++ b/examples/clang.rl @@ -0,0 +1,150 @@ +/* + * A mini C-like language scanner. + */ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +%%{ + machine clang; + + newline = '\n' @{curline += 1;}; + any_count_line = any | newline; + + # Consume a C comment. + c_comment := any_count_line* :>> '*/' @{fgoto main;}; + + main := |* + + # Alpha numberic characters or underscore. 
+ alnum_u = alnum | '_'; + + # Alpha charactres or underscore. + alpha_u = alpha | '_'; + + # Symbols. Upon entering clear the buffer. On all transitions + # buffer a character. Upon leaving dump the symbol. + ( punct - [_'"] ) { + printf( "symbol(%i): %c\n", curline, ts[0] ); + }; + + # Identifier. Upon entering clear the buffer. On all transitions + # buffer a character. Upon leaving, dump the identifier. + alpha_u alnum_u* { + printf( "ident(%i): ", curline ); + fwrite( ts, 1, te-ts, stdout ); + printf("\n"); + }; + + # Single Quote. + sliteralChar = [^'\\] | newline | ( '\\' . any_count_line ); + '\'' . sliteralChar* . '\'' { + printf( "single_lit(%i): ", curline ); + fwrite( ts, 1, te-ts, stdout ); + printf("\n"); + }; + + # Double Quote. + dliteralChar = [^"\\] | newline | ( '\\' any_count_line ); + '"' . dliteralChar* . '"' { + printf( "double_lit(%i): ", curline ); + fwrite( ts, 1, te-ts, stdout ); + printf("\n"); + }; + + # Whitespace is standard ws, newlines and control codes. + any_count_line - 0x21..0x7e; + + # Describe both c style comments and c++ style comments. The + # priority bump on tne terminator of the comments brings us + # out of the extend* which matches everything. + '//' [^\n]* newline; + + '/*' { fgoto c_comment; }; + + # Match an integer. We don't bother clearing the buf or filling it. + # The float machine overlaps with int and it will do it. + digit+ { + printf( "int(%i): ", curline ); + fwrite( ts, 1, te-ts, stdout ); + printf("\n"); + }; + + # Match a float. Upon entering the machine clear the buf, buffer + # characters on every trans and dump the float upon leaving. + digit+ '.' digit+ { + printf( "float(%i): ", curline ); + fwrite( ts, 1, te-ts, stdout ); + printf("\n"); + }; + + # Match a hex. Upon entering the hex part, clear the buf, buffer characters + # on every trans and dump the hex on leaving transitions. 
+ '0x' xdigit+ { + printf( "hex(%i): ", curline ); + fwrite( ts, 1, te-ts, stdout ); + printf("\n"); + }; + + *|; +}%% + +%% write data nofinal; + +#define BUFSIZE 128 + +void scanner() +{ + static char buf[BUFSIZE]; + int cs, act, have = 0, curline = 1; + char *ts, *te = 0; + int done = 0; + + %% write init; + + while ( !done ) { + char *p = buf + have, *pe, *eof = 0; + int len, space = BUFSIZE - have; + + if ( space == 0 ) { + /* We've used up the entire buffer storing an already-parsed token + * prefix that must be preserved. */ + fprintf(stderr, "OUT OF BUFFER SPACE\n" ); + exit(1); + } + + len = fread( p, 1, space, stdin ); + pe = p + len; + + /* Check if this is the end of file. */ + if ( len < space ) { + eof = pe; + done = 1; + } + + %% write exec; + + if ( cs == clang_error ) { + fprintf(stderr, "PARSE ERROR\n" ); + break; + } + + if ( ts == 0 ) + have = 0; + else { + /* There is a prefix to preserve, shift it over. */ + have = pe - ts; + memmove( buf, ts, have ); + te = buf + (te-ts); + ts = buf; + } + } +} + +int main() +{ + scanner(); + return 0; +} + diff --git a/examples/concurrent.rl b/examples/concurrent.rl new file mode 100644 index 00000000..224f9601 --- /dev/null +++ b/examples/concurrent.rl @@ -0,0 +1,126 @@ +/* + * Show off concurrent abilities. 
+ */ + +#include <iostream> +#include <stdlib.h> +#include <stdio.h> + +using namespace std; + +#define BUFSIZE 2048 + +struct Concurrent +{ + int cur_char; + int start_word; + int start_comment; + int start_literal; + + int cs; + + int init( ); + int execute( const char *data, int len, bool isEof ); + int finish( ); +}; + +%%{ + machine Concurrent; + + action next_char { + cur_char += 1; + } + + action start_word { + start_word = cur_char; + } + action end_word { + cout << "word: " << start_word << + " " << cur_char-1 << endl; + } + + action start_comment { + start_comment = cur_char; + } + action end_comment { + cout << "comment: " << start_comment << + " " << cur_char-1 << endl; + } + + action start_literal { + start_literal = cur_char; + } + action end_literal { + cout << "literal: " << start_literal << + " " << cur_char-1 << endl; + } + + # Count characters. + chars = ( any @next_char )*; + + # Words are non-whitespace. + word = ( any-space )+ >start_word %end_word; + words = ( ( word | space ) $1 %0 )*; + + # Finds C style comments. + comment = ( '/*' any* :>> '*/' ) >start_comment %end_comment; + comments = ( comment | any )**; + + # Finds single quoted strings. + literalChar = ( any - ['\\] ) | ( '\\' . any ); + literal = ('\'' literalChar* '\'' ) >start_literal %end_literal; + literals = ( ( literal | (any-'\'') ) $1 %0 )*; + + main := chars | words | comments | literals; +}%% + +%% write data; + +int Concurrent::init( ) +{ + %% write init; + cur_char = 0; + return 1; +} + +int Concurrent::execute( const char *data, int len, bool isEof ) +{ + const char *p = data; + const char *pe = data + len; + const char *eof = isEof ? 
pe : 0; + + %% write exec; + + if ( cs == Concurrent_error ) + return -1; + if ( cs >= Concurrent_first_final ) + return 1; + return 0; +} + +int Concurrent::finish( ) +{ + if ( cs == Concurrent_error ) + return -1; + if ( cs >= Concurrent_first_final ) + return 1; + return 0; +} + +Concurrent concurrent; +char buf[BUFSIZE]; + +int main() +{ + concurrent.init(); + while ( 1 ) { + int len = fread( buf, 1, BUFSIZE, stdin ); + concurrent.execute( buf, len, len != BUFSIZE ); + if ( len != BUFSIZE ) + break; + } + + if ( concurrent.finish() <= 0 ) + cerr << "concurrent: error parsing input" << endl; + return 0; +} diff --git a/examples/cppscan.lex b/examples/cppscan.lex new file mode 100644 index 00000000..fb662538 --- /dev/null +++ b/examples/cppscan.lex @@ -0,0 +1,143 @@ +/* + * flex equivalent to cppscan.rl + */ + +%{ + +#include <stdio.h> + +#define TK_Dlit 256 +#define TK_Slit 257 +#define TK_Float 258 +#define TK_Id 259 +#define TK_NameSep 260 +#define TK_Arrow 261 +#define TK_PlusPlus 262 +#define TK_MinusMinus 263 +#define TK_ArrowStar 264 +#define TK_DotStar 265 +#define TK_ShiftLeft 266 +#define TK_ShiftRight 267 +#define TK_IntegerDecimal 268 +#define TK_IntegerOctal 269 +#define TK_IntegerHex 270 +#define TK_EqualsEquals 271 +#define TK_NotEquals 272 +#define TK_AndAnd 273 +#define TK_OrOr 274 +#define TK_MultAssign 275 +#define TK_DivAssign 276 +#define TK_PercentAssign 277 +#define TK_PlusAssign 278 +#define TK_MinusAssign 279 +#define TK_AmpAssign 280 +#define TK_CaretAssign 281 +#define TK_BarAssign 282 +#define TK_DotDotDot 283 +#define TK_Whitespace 284 +#define TK_Comment 285 + +int line = 1, col = 1; + +void token( int tok, char *data, int len ) +{ + printf( "<%i> ", tok ); + for ( int i = 0; i < len; i++ ) + fputc( data[i], stdout ); + fputc( '\n', stdout ); + + /* Count newlines and columns. This code is here mainly for having some + * code in the token routine when commenting out the above output during + * performance testing. 
*/ + for ( int i = 0; i < len; i ++ ) { + if ( data[i] == '\n' ) { + line += 1; + col = 1; + } + else { + col += 1; + } + } +} + + +%} + +%x COMMENT + +FRACT_CONST [0-9]*\.[0-9]+|[0-9]+\. +EXPONENT [eE][+\-]?[0-9]+ +FLOAT_SUFFIX [flFL] + +%% + + /* Single and double literals. */ +L?\'([^\'\\\n]|\\.)*\' { + token( TK_Slit, yytext, yyleng ); +} + +L?\"([^\"\\\n]|\\.)*\" { + token( TK_Dlit, yytext, yyleng ); +} + +[a-zA-Z_][a-zA-Z0-9_]* { + token( TK_Id, yytext, yyleng ); +} + +{FRACT_CONST}{EXPONENT}?{FLOAT_SUFFIX}?|[0-9]+{EXPONENT}{FLOAT_SUFFIX}? { + token( TK_Float, yytext, yyleng ); +} + +(0|[1-9][0-9]*)[ulUL]{0,3} { + token( TK_IntegerDecimal, yytext, yyleng ); +} + +0[0-9]+[ulUL]{0,2} { + token( TK_IntegerOctal, yytext, yyleng ); +} + +0x[0-9a-fA-F]+[ulUL]{0,2} { + token( TK_IntegerHex, yytext, yyleng ); +} + +:: token( TK_NameSep, yytext, yyleng ); +== token( TK_EqualsEquals, yytext, yyleng ); +!= token( TK_NotEquals, yytext, yyleng ); +&& token( TK_AndAnd, yytext, yyleng ); +\|\| token( TK_OrOr, yytext, yyleng ); +\*= token( TK_MultAssign, yytext, yyleng ); +\/= token( TK_DivAssign, yytext, yyleng ); +%= token( TK_PercentAssign, yytext, yyleng ); +\+= token( TK_PlusAssign, yytext, yyleng ); +-= token( TK_MinusAssign, yytext, yyleng ); +&= token( TK_AmpAssign, yytext, yyleng ); +^= token( TK_CaretAssign, yytext, yyleng ); +\|= token( TK_BarAssign, yytext, yyleng ); +\+\+ token( TK_PlusPlus, yytext, yyleng ); +-- token( TK_MinusMinus, yytext, yyleng ); +-> token( TK_Arrow, yytext, yyleng ); +->\* token( TK_ArrowStar, yytext, yyleng ); +\.\* token( TK_DotStar, yytext, yyleng ); +\.\.\. token( TK_DotDotDot, yytext, yyleng ); + +\/\* BEGIN(COMMENT); +<COMMENT>\*\/ BEGIN(INITIAL); +<COMMENT>(.|\n) { } + +\/\/.*\n {} +[^!-~]+ {} + +[!-/:-@\[-`{-~] token( yytext[0], yytext, yyleng ); + +%% + +int yywrap() +{ + /* Once the input is done, no more. 
*/ + return 1; +} + +int main() +{ + yylex(); +} diff --git a/examples/cppscan.rec b/examples/cppscan.rec new file mode 100644 index 00000000..43f297d8 --- /dev/null +++ b/examples/cppscan.rec @@ -0,0 +1,183 @@ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#define TK_Dlit 256 +#define TK_Slit 257 +#define TK_Float 258 +#define TK_Id 259 +#define TK_NameSep 260 +#define TK_Arrow 261 +#define TK_PlusPlus 262 +#define TK_MinusMinus 263 +#define TK_ArrowStar 264 +#define TK_DotStar 265 +#define TK_ShiftLeft 266 +#define TK_ShiftRight 267 +#define TK_IntegerDecimal 268 +#define TK_IntegerOctal 269 +#define TK_IntegerHex 270 +#define TK_EqualsEquals 271 +#define TK_NotEquals 272 +#define TK_AndAnd 273 +#define TK_OrOr 274 +#define TK_MultAssign 275 +#define TK_DivAssign 276 +#define TK_PercentAssign 277 +#define TK_PlusAssign 278 +#define TK_MinusAssign 279 +#define TK_AmpAssign 280 +#define TK_CaretAssign 281 +#define TK_BarAssign 282 +#define TK_DotDotDot 283 +#define TK_Whitespace 284 +#define TK_Comment 285 + +int line = 1, col = 1; + +void token( int tok, char *data, int len ) +{ + printf( "<%i> ", tok ); + for ( int i = 0; i < len; i++ ) + fputc( data[i], stdout ); + fputc( '\n', stdout ); + + /* Count newlines and columns. This code is here mainly for having some + * code in the token routine when commenting out the above output during + * performance testing. */ + for ( int i = 0; i < len; i ++ ) { + if ( data[i] == '\n' ) { + line += 1; + col = 1; + } + else { + col += 1; + } + } +} + +#define BUFSIZE 8192 +char buf[BUFSIZE]; + +void fill( int n ) +{ + printf("fill(%i)\n", n); + exit(1); +} + +int main() +{ + char *start, *p = buf, *lim = buf, *marker; + int len, have, want, shift; + int done = 0; + +#define YYCTYPE char + +#define YYCURSOR p +#define YYLIMIT lim +#define YYMARKER marker + +#define YYFILL(n) { \ + if ( ! 
done ) { \ + have = lim-start; \ + if ( start > buf ) { \ + shift = start-buf; \ + memmove( buf, start, have ); \ + start -= shift; \ + p -= shift; \ + lim -= shift; \ + marker -= shift; \ + } \ + want = BUFSIZE - have - 1; \ + len = fread( lim, 1, want, stdin ); \ + lim += len; \ + if ( len < want ) { \ + *lim++ = 0; \ + done = 1; \ + } \ + } \ + } + +again: + start = p; + +/*!re2c + +ANY = [\000-\377]; +FRACTCONST = ( [0-9]* "." [0-9]+ ) | [0-9]+ "."; +EXPONENT = [eE] [+\-]? [0-9]+; +FLOATSUFFIX = [flFL]; + + "L"? "\'" ( ANY \ [\'\\\n] | "\\" ANY )* "\'" { + token( TK_Slit, start, p-start ); + goto again; + } + + "L"? "\"" ( ANY \ [\"\\\n] | "\\" ANY )* "\"" { + token( TK_Dlit, start, p-start ); + goto again; + } + + [a-zA-Z_][a-zA-Z0-9_]* { + token( TK_Id, start, p-start ); + goto again; + } + + ( FRACTCONST EXPONENT? FLOATSUFFIX? ) | ( [0-9]+ EXPONENT FLOATSUFFIX? ) { + token( TK_Float, start, p-start ); + goto again; + } + + + ( "0" | [1-9][0-9]* ) [ulUL]* { + token( TK_IntegerDecimal, start, p-start ); + goto again; + } + + "0" [0-9]+ [ulUL]* { + token( TK_IntegerOctal, start, p-start ); + goto again; + } + + "0x" [0-9a-fA-F]+[ulUL]* { + token( TK_IntegerHex, start, p-start ); + goto again; + } + + "::" { token( TK_NameSep, start, p-start ); goto again; } + "==" { token( TK_EqualsEquals, start, p-start ); goto again; } + "!=" { token( TK_NotEquals, start, p-start ); goto again; } + "&&" { token( TK_AndAnd, start, p-start ); goto again; } + "||" { token( TK_OrOr, start, p-start ); goto again; } + "*=" { token( TK_MultAssign, start, p-start ); goto again; } + "/=" { token( TK_DivAssign, start, p-start ); goto again; } + "%=" { token( TK_PercentAssign, start, p-start ); goto again; } + "+=" { token( TK_PlusAssign, start, p-start ); goto again; } + "-=" { token( TK_MinusAssign, start, p-start ); goto again; } + "&=" { token( TK_AmpAssign, start, p-start ); goto again; } + "^=" { token( TK_CaretAssign, start, p-start ); goto again; } + "|=" { token( TK_BarAssign, 
start, p-start ); goto again; } + "++" { token( TK_PlusPlus, start, p-start ); goto again; } + "--" { token( TK_MinusMinus, start, p-start ); goto again; } + "->" { token( TK_Arrow, start, p-start ); goto again; } + "->*" { token( TK_ArrowStar, start, p-start ); goto again; } + ".*" { token( TK_DotStar, start, p-start ); goto again; } + "..." { token( TK_DotDotDot, start, p-start ); goto again; } + + "/*" { goto comment; } + "//" (ANY\"\n")* "\n" { goto again; } + [\001-\040\177]+ { goto again; } + + [\041-\057\072-\100\133-\140\173-\176] { + token( *start, start, p-start ); + goto again; + } + "\000" { return 0; } +*/ + +comment: +/*!re2c + "*/" { goto again; } + ANY { goto comment; } +*/ +} diff --git a/examples/cppscan.rl b/examples/cppscan.rl new file mode 100644 index 00000000..1ead5aa6 --- /dev/null +++ b/examples/cppscan.rl @@ -0,0 +1,208 @@ +/* + * A C++ scanner. Uses the longest match construction. + * << <= <<= >> >= >>= are left out since angle brackets are used in templates. + */ + +#include <string.h> +#include <stdlib.h> +#include <iostream> + +#define TK_Dlit 256 +#define TK_Slit 257 +#define TK_Float 258 +#define TK_Id 259 +#define TK_NameSep 260 +#define TK_Arrow 261 +#define TK_PlusPlus 262 +#define TK_MinusMinus 263 +#define TK_ArrowStar 264 +#define TK_DotStar 265 +#define TK_ShiftLeft 266 +#define TK_ShiftRight 267 +#define TK_IntegerDecimal 268 +#define TK_IntegerOctal 269 +#define TK_IntegerHex 270 +#define TK_EqualsEquals 271 +#define TK_NotEquals 272 +#define TK_AndAnd 273 +#define TK_OrOr 274 +#define TK_MultAssign 275 +#define TK_DivAssign 276 +#define TK_PercentAssign 277 +#define TK_PlusAssign 278 +#define TK_MinusAssign 279 +#define TK_AmpAssign 280 +#define TK_CaretAssign 281 +#define TK_BarAssign 282 +#define TK_DotDotDot 283 +#define TK_Whitespace 284 +#define TK_Comment 285 + +#define BUFSIZE 16384 + +/* EOF char used to flush out that last token. This should be a whitespace + * token. 
*/ + +#define LAST_CHAR 0 + +using std::cerr; +using std::cout; +using std::cin; +using std::endl; + +static char buf[BUFSIZE]; +static int line = 1, col = 1; +static char *ts, *te; +static int act, have = 0; +static int cs; + +%%{ + machine Scanner; + write data nofinal; + + # Floating literals. + fract_const = digit* '.' digit+ | digit+ '.'; + exponent = [eE] [+\-]? digit+; + float_suffix = [flFL]; + + c_comment := + any* :>> '*/' + @{ fgoto main; }; + + main := |* + + # Single and double literals. + ( 'L'? "'" ( [^'\\\n] | /\\./ )* "'" ) + {token( TK_Slit );}; + ( 'L'? '"' ( [^"\\\n] | /\\./ )* '"' ) + {token( TK_Dlit );}; + + # Identifiers + ( [a-zA-Z_] [a-zA-Z0-9_]* ) + {token( TK_Id );}; + + # Floating literals. + ( fract_const exponent? float_suffix? | digit+ exponent float_suffix? ) + {token( TK_Float );}; + + # Integer decimal. Leading part buffered by float. + ( ( '0' | [1-9] [0-9]* ) [ulUL]{0,3} ) + {token( TK_IntegerDecimal );}; + + # Integer octal. Leading part buffered by float. + ( '0' [0-9]+ [ulUL]{0,2} ) + {token( TK_IntegerOctal );}; + + # Integer hex. Leading 0 buffered by float. + ( '0' ( 'x' [0-9a-fA-F]+ [ulUL]{0,2} ) ) + {token( TK_IntegerHex );}; + + # Only buffer the second item, first buffered by symbol. */ + '::' {token( TK_NameSep );}; + '==' {token( TK_EqualsEquals );}; + '!=' {token( TK_NotEquals );}; + '&&' {token( TK_AndAnd );}; + '||' {token( TK_OrOr );}; + '*=' {token( TK_MultAssign );}; + '/=' {token( TK_DivAssign );}; + '%=' {token( TK_PercentAssign );}; + '+=' {token( TK_PlusAssign );}; + '-=' {token( TK_MinusAssign );}; + '&=' {token( TK_AmpAssign );}; + '^=' {token( TK_CaretAssign );}; + '|=' {token( TK_BarAssign );}; + '++' {token( TK_PlusPlus );}; + '--' {token( TK_MinusMinus );}; + '->' {token( TK_Arrow );}; + '->*' {token( TK_ArrowStar );}; + '.*' {token( TK_DotStar );}; + + # Three char compounds, first item already buffered. */ + '...' {token( TK_DotDotDot );}; + + # Single char symbols. 
+ ( punct - [_"'] ) {token( ts[0] );}; + + # Comments and whitespace. + '/*' { fgoto c_comment; }; + '//' [^\n]* '\n'; + ( any - 33..126 )+; + + *|; +}%% + +void token( int tok ) +{ + char *data = ts; + int len = te - ts; + + cout << '<' << tok << "> "; + cout.write( data, len ); + cout << '\n'; + + /* Count newlines and columns. This code is here mainly for having some + * code in the token routine when commenting out the above output during + * performance testing. */ + for ( int i = 0; i < len; i ++ ) { + if ( data[i] == '\n' ) { + line += 1; + col = 1; + } + else { + col += 1; + } + } +} + +int main() +{ + std::ios::sync_with_stdio(false); + + %% write init; + + /* Do the first read. */ + bool done = false; + while ( !done ) { + char *p = buf + have; + int space = BUFSIZE - have; + + if ( space == 0 ) { + /* We filled up the buffer trying to scan a token. */ + cerr << "OUT OF BUFFER SPACE" << endl; + exit(1); + } + + cin.read( p, space ); + int len = cin.gcount(); + char *pe = p + len; + char *eof = 0; + + /* If we see eof then append the EOF char. */ + if ( cin.eof() ) { + eof = pe; + done = true; + } + + %% write exec; + + /* Check if we failed. */ + if ( cs == Scanner_error ) { + /* Machine failed before finding a token. */ + cerr << "PARSE ERROR" << endl; + exit(1); + } + + /* Now set up the prefix. */ + if ( ts == 0 ) + have = 0; + else { + /* There is data that needs to be shifted over. */ + have = pe - ts; + memmove( buf, ts, have ); + te -= (ts-buf); + ts = buf; + } + } + + return 0; +} diff --git a/examples/format.rl b/examples/format.rl new file mode 100644 index 00000000..f8a37beb --- /dev/null +++ b/examples/format.rl @@ -0,0 +1,191 @@ +/* + * Partial printf implementation. 
+ */ + +#define BUFLEN 1024 +#include <stdio.h> + +typedef void (*WriteFunc)( char *data, int len ); + +struct format +{ + char buf[BUFLEN+1]; + int buflen; + WriteFunc write; + + int flags; + int width; + int prec; + int cs; +}; + +void do_conv( struct format *fsm, char c ) +{ + printf( "flags: %x\n", fsm->flags ); + printf( "width: %i\n", fsm->width ); + printf( "prec: %i\n", fsm->prec ); + printf( "conv: %c\n", c ); + printf( "\n" ); +} + +#define FL_HASH 0x01 +#define FL_ZERO 0x02 +#define FL_DASH 0x04 +#define FL_SPACE 0x08 +#define FL_PLUS 0x10 + +#define FL_HAS_WIDTH 0x0100 +#define FL_WIDTH_ARG 0x0200 +#define FL_HAS_PREC 0x0400 +#define FL_PREC_ARG 0x0800 + +#define FL_LEN_H 0x010000 +#define FL_LEN_HH 0x020000 +#define FL_LEN_L 0x040000 +#define FL_LEN_LL 0x080000 + +%%{ + machine format; + access fsm->; + + action clear { + fsm->flags = 0; + fsm->width = 0; + fsm->prec = 0; + } + + # A non-zero number. + nznum = [1-9] [0-9]*; + + # Width + action width_num { fsm->width = 10 * fsm->width + (fc-'0'); } + action width_arg { fsm->flags |= FL_WIDTH_ARG; } + action width { fsm->flags |= FL_HAS_WIDTH; } + width = ( ( nznum $width_num | '*' @width_arg ) %width )?; + + # Precision + action prec_num { fsm->prec = 10 * fsm->prec + (fc-'0'); } + action prec_arg { fsm->flags |= FL_PREC_ARG; } + action prec { fsm->flags |= FL_HAS_PREC; } + precision = ( '.' 
( digit* $prec_num %prec | '*' @prec_arg ) )?; + + # Flags + action flags_hash { fsm->flags |= FL_HASH; } + action flags_zero { fsm->flags |= FL_ZERO; } + action flags_dash { fsm->flags |= FL_DASH; } + action flags_space { fsm->flags |= FL_SPACE; } + action flags_plus { fsm->flags |= FL_PLUS; } + + flags = ( + '#' @flags_hash | + '0' @flags_zero | + '-' @flags_dash | + ' ' @flags_space | + '+' @flags_plus )*; + + action length_h { fsm->flags |= FL_LEN_H; } + action length_l { fsm->flags |= FL_LEN_L; } + action length_hh { fsm->flags |= FL_LEN_HH; } + action length_ll { fsm->flags |= FL_LEN_LL; } + + # Must use leaving transitions on 'h' and 'l' because they are + # prefixes for 'hh' and 'll'. + length = ( + 'h' %length_h | + 'l' %length_l | + 'hh' @length_hh | + 'll' @length_ll )?; + + action conversion { + do_conv( fsm, fc ); + } + + conversion = [diouxXcsp] @conversion; + + fmt_spec = + '%' @clear + flags + width + precision + length + conversion; + + action emit { + if ( fsm->buflen == BUFLEN ) { + fsm->write( fsm->buf, fsm->buflen ); + fsm->buflen = 0; + } + fsm->buf[fsm->buflen++] = fc; + } + + action finish_ok { + if ( fsm->buflen > 0 ) + fsm->write( fsm->buf, fsm->buflen ); + } + action finish_err { + printf("EOF IN FORMAT\n"); + } + action err_char { + printf("ERROR ON CHAR: 0x%x\n", fc ); + } + + main := ( + [^%] @emit | + '%%' @emit | + fmt_spec + )* @/finish_err %/finish_ok $!err_char; +}%% + +%% write data; + +void format_init( struct format *fsm ) +{ + fsm->buflen = 0; + %% write init; +} + +void format_execute( struct format *fsm, const char *data, int len, int isEof ) +{ + const char *p = data; + const char *pe = data + len; + const char *eof = isEof ? 
pe : 0; + + %% write exec; +} + +int format_finish( struct format *fsm ) +{ + if ( fsm->cs == format_error ) + return -1; + if ( fsm->cs >= format_first_final ) + return 1; + return 0; +} + + +#define INPUT_BUFSIZE 2048 + +struct format fsm; +char buf[INPUT_BUFSIZE]; + +void write(char *data, int len ) +{ + fwrite( data, 1, len, stdout ); +} + +int main() +{ + fsm.write = write; + format_init( &fsm ); + while ( 1 ) { + int len = fread( buf, 1, INPUT_BUFSIZE, stdin ); + int eof = len != INPUT_BUFSIZE; + format_execute( &fsm, buf, len, eof ); + if ( eof ) + break; + } + if ( format_finish( &fsm ) <= 0 ) + printf("FAIL\n"); + return 0; +} + diff --git a/examples/go/.gitignore b/examples/go/.gitignore new file mode 100644 index 00000000..f8b421d6 --- /dev/null +++ b/examples/go/.gitignore @@ -0,0 +1,5 @@ +/*.dot +/*.go +/atoi +/rpn +/url diff --git a/examples/go/Makefile b/examples/go/Makefile new file mode 100644 index 00000000..536afcc7 --- /dev/null +++ b/examples/go/Makefile @@ -0,0 +1,32 @@ +ragel = ragel + +check: atoi rpn url + ./atoi + ./rpn + ./url + @echo PASS + +graph: atoi.dot rpn.dot url.dot url_authority.dot + xdot atoi.dot + xdot rpn.dot + xdot url.dot + xdot url_authority.dot + +atoi: atoi.go +atoi.go: atoi.rl +atoi.dot: atoi.rl + +rpn: rpn.go +rpn.go: rpn.rl +rpn.dot: rpn.rl + +url: url.go url_authority.go +url.go: url.rl +url.dot: url.rl +url_authority.go: url_authority.rl +url_authority.dot: url_authority.rl + +clean: ; rm -f *.go *.dot atoi rpn url +%: %.go ; go build -o $@ $^ +%.go: %.rl ; $(ragel) -Z -T0 -o $@ $< +%.dot: %.rl ; $(ragel) -V -Z -p -o $@ $< diff --git a/examples/go/README b/examples/go/README new file mode 100644 index 00000000..bdb924b8 --- /dev/null +++ b/examples/go/README @@ -0,0 +1,36 @@ +.. 
-*-rst-*- + +Ragel Examples for Go +===================== + +These examples serve the following purposes: + +- Help you learn Ragel +- Test the correctness of the code I wrote for Ragel +- Benchmark Ragel's performance on your machine +- And hopefully give you some code you can steal ;] + +To get started you should first ``make install`` ragel. Then navigate +to this directory and run:: + + make + +To automatically compile/test/benchmark these examples. + +The following examples are provided: + +- atoi.rl: Convert string to integer (very simple) +- rpn.rl: Reverse polish notation calculator (simple) +- url.rl: Very fast and robust HTTP/SIP URL parser (very complicated) + +To see graphviz diagrams of the state machines generated by Ragel in +these examples, run the following commands:: + + sudo apt-get install xdot + make graph + +Those diagrams (along with the pdf manual) are super important for +troubleshooting and simplifying your Ragel code. + +I truly hope these examples help you in your personal and professional +endeavors. If you have any questions my email is: jtunney@gmail.com diff --git a/examples/go/atoi.rl b/examples/go/atoi.rl new file mode 100644 index 00000000..97c5163e --- /dev/null +++ b/examples/go/atoi.rl @@ -0,0 +1,89 @@ +// -*-go-*- +// +// Convert a string to an integer. +// +// To compile: +// +// ragel -Z -T0 -o atoi.go atoi.rl +// go build -o atoi atoi.go +// ./atoi +// +// To show a diagram of your state machine: +// +// ragel -V -Z -p -o atoi.dot atoi.rl +// xdot atoi.dot +// + +package main + +import ( + "os" + "fmt" +) + +%%{ + machine atoi; + write data; +}%% + +func atoi(data string) (val int) { + cs, p, pe := 0, 0, len(data) + neg := false + + %%{ + action see_neg { neg = true } + action add_digit { val = val * 10 + (int(fc) - '0') } + + main := + ( '-'@see_neg | '+' )? ( digit @add_digit )+ + '\n'? 
+ ; + + write init; + write exec; + }%% + + if neg { + val = -1 * val; + } + + if cs < atoi_first_final { + fmt.Println("atoi: there was an error:", cs, "<", atoi_first_final) + fmt.Println(data) + for i := 0; i < p; i++ { + fmt.Print(" ") + } + fmt.Println("^") + } + + return val +} + +////////////////////////////////////////////////////////////////////// + +type atoiTest struct { + s string + v int +} + +var atoiTests = []atoiTest{ + atoiTest{"7", 7}, + atoiTest{"666", 666}, + atoiTest{"-666", -666}, + atoiTest{"+666", 666}, + atoiTest{"1234567890", 1234567890}, + atoiTest{"+1234567890\n", 1234567890}, + // atoiTest{"+ 1234567890", 1234567890}, // i will fail +} + +func main() { + res := 0 + for _, test := range atoiTests { + res := atoi(test.s) + if res != test.v { + fmt.Fprintf(os.Stderr, "FAIL atoi(%#v) != %#v\n", test.s, test.v) + res = 1 + } + } + os.Exit(res) +} diff --git a/examples/go/rpn.rl b/examples/go/rpn.rl new file mode 100644 index 00000000..2ad0a2db --- /dev/null +++ b/examples/go/rpn.rl @@ -0,0 +1,159 @@ +// -*-go-*- +// +// Reverse Polish Notation Calculator +// Copyright (c) 2010 J.A. 
// stack is the operand stack of the calculator. items is the backing
// storage and count is the number of live values, i.e. items[:count].
type stack struct {
	items []int
	count int
}

// pop removes and returns the value most recently pushed.
// It panics, leaving the stack untouched, when the stack is empty.
func (s *stack) pop() int {
	if s.count == 0 {
		panic("rpn: pop from empty stack")
	}
	s.count--
	return s.items[s.count]
}

// push places v on top of the stack. Slots already present in items are
// reused; once they are used up the slice is grown with append, so the
// stack depth is not limited by the initial length of items.
func (s *stack) push(v int) {
	if s.count < len(s.items) {
		s.items[s.count] = v
	} else {
		s.items = append(s.items[:s.count], v)
	}
	s.count++
}
3 *\n", 12}, + rpnTest{"6 2 /\n", 3}, + rpnTest{"0 3 -\n", -3}, + rpnTest{"0 3 - abs\n", 3}, + rpnTest{" 2 2 + 3 - \n", 1}, + rpnTest{"10 7 3 2 * - +\n", 11}, + rpnTest{"abba abba add\n", 1332}, +} + +type rpnFailTest struct { + s string + e string +} + +var rpnFailTests = []rpnFailTest{ + rpnFailTest{"\n", "rpn stack empty on result"}, +} + +func main() { + rc := 0 + + for _, test := range rpnTests { + res, err := rpn(test.s) + if err != nil { + fmt.Fprintf(os.Stderr, "FAIL rpn(%#v) %s\n", test.s, err) + rc = 1 + } else if res != test.v { + fmt.Fprintf(os.Stderr, "FAIL rpn(%#v) -> %#v != %#v\n", + test.s, res, test.v) + rc = 1 + } + } + + for _, test := range rpnFailTests { + res, err := rpn(test.s) + if err == nil { + fmt.Fprintf(os.Stderr, "FAIL rpn(%#v) -> %#v should fail: %#v\n", + test.s, res, test.e) + } else if err.Error() != test.e { + fmt.Fprintf(os.Stderr, "FAIL rpn(%#v) %#v should be %#v\n", + test.s, err.Error(), test.e) + } + } + + os.Exit(rc) +} diff --git a/examples/go/url.rl b/examples/go/url.rl new file mode 100644 index 00000000..e94d59c6 --- /dev/null +++ b/examples/go/url.rl @@ -0,0 +1,414 @@ +// -*-go-*- +// +// URL Parser +// Copyright (c) 2010 J.A. Roberts Tunney +// MIT License +// +// To compile: +// +// ragel -Z -T0 -o url.go url.rl +// ragel -Z -T0 -o url_authority.go url_authority.rl +// go build -o url url.go url_authority.go +// ./url +// +// To show a diagram of your state machine: +// +// ragel -V -Z -p -o url.dot url.rl +// xdot url.dot +// +// ragel -V -Z -p -o url_authority.dot url_authority.rl +// xdot url_authority.dot +// +// Reference: +// +// - http://tools.ietf.org/html/rfc3986 +// + +package main + +import ( + "errors" + "fmt" + "os" + "time" +) + +type URL struct { + Scheme string // http, sip, file, etc. 
(never blank, always lowercase) + User string // who is you yo + Pass string // for like, logging in + Host string // IP 4/6 address or hostname (mandatory) + Port int // like 80 or 5060 (default 0) + Params string // stuff after ';' (NOT UNESCAPED, used in sip) + Path string // stuff starting with '/' + Query string // stuff after '?' (NOT UNESCAPED) + Fragment string // stuff after '#' +} + +%% machine url; +%% write data; + +// i parse absolute urls and don't suck at it. i'll parse just about +// any type of url you can think of and give you a human-friendly data +// structure. +// +// this routine takes no more than a few microseconds, is reentrant, +// performs in a predictable manner (for security/soft-realtime,) +// doesn't modify your `data` buffer, and under no circumstances will +// it panic (i hope!) +func URLParse(data []byte) (url *URL, err error) { + cs, p, pe, eof := 0, 0, len(data), len(data) + mark := 0 + url = new(URL) + + // this buffer is so we can unescape while we roll + var hex byte + buf := make([]byte, len(data)) + amt := 0 + + %%{ + action mark { mark = p } + action str_start { amt = 0 } + action str_char { buf[amt] = fc; amt++ } + action str_lower { buf[amt] = fc + 0x20; amt++ } + action hex_hi { hex = unhex(fc) * 16 } + action hex_lo { hex += unhex(fc) + buf[amt] = hex; amt++ } + action scheme { url.Scheme = string(buf[0:amt]) } + action authority { err = url.parseAuthority(data[mark:p]) + if err != nil { return nil, err } } + action path { url.Path = string(buf[0:amt]) } + action query { url.Query = string(data[mark:p]) } + action fragment { url.Fragment = string(buf[0:amt]) } + + # # do this instead if you *actually* use URNs (lol) + # action authority { url.Authority = string(data[mark:p]) } + + # define what a single character is allowed to be + toxic = ( cntrl | 127 ) ; + scary = ( toxic | " " | "\"" | "#" | "%" | "<" | ">" ) ; + schmchars = ( lower | digit | "+" | "-" | "." ) ; + authchars = any -- ( scary | "/" | "?" 
// unhex returns the numeric value of one ASCII hexadecimal digit
// ('0'-'9', 'a'-'f' or 'A'-'F'). Every other byte maps to 0.
func unhex(b byte) byte {
	if b >= '0' && b <= '9' {
		return b - '0'
	}
	if b >= 'a' && b <= 'f' {
		return b - 'a' + 10
	}
	if b >= 'A' && b <= 'F' {
		return b - 'A' + 10
	}
	return 0
}
Scheme: "gopher", + Host: "@example.com@", + Path: "/", + }, + }, + + urlTest{ + []byte("ldap://[2001:db8::7]/c=GB?objectClass/?one"), + URL{ + Scheme: "ldap", + Host: "2001:db8::7", + Path: "/c=GB", + Query: "objectClass/?one", + }, + }, + + urlTest{ + []byte("http://user@example.com"), + URL{ + Scheme: "http", + User: "user", + Host: "example.com", + }, + }, + + urlTest{ + []byte("http://品研发和研发管@☃.com:65000;%20"), + URL{ + Scheme: "http", + User: "品研发和研发管", + Host: "☃.com", + Port: 65000, + Params: "%20", + }, + }, + + urlTest{ + []byte("https://example.com:80"), + URL{ + Scheme: "https", + Host: "example.com", + Port: 80, + }, + }, + + urlTest{ + []byte("file:///etc/passwd"), + URL{ + Scheme: "file", + Path: "/etc/passwd", + }, + }, + + urlTest{ + []byte("file:///c:/WINDOWS/clock.avi"), + URL{ + Scheme: "file", + Path: "/c:/WINDOWS/clock.avi", // <-- is this kosher? + }, + }, + + urlTest{ + []byte("file://hostname/path/to/the%20file.txt"), + URL{ + Scheme: "file", + Host: "hostname", + Path: "/path/to/the file.txt", + }, + }, + + urlTest{ + []byte("sip:example.com"), + URL{ + Scheme: "sip", + Host: "example.com", + }, + }, + + urlTest{ + []byte("sip:example.com:5060"), + URL{ + Scheme: "sip", + Host: "example.com", + Port: 5060, + }, + }, + + urlTest{ + []byte("mailto:ditto@pokémon.com"), + URL{ + Scheme: "mailto", + User: "ditto", + Host: "pokémon.com", + }, + }, + + urlTest{ + []byte("sip:[dead:beef::666]:5060"), + URL{ + Scheme: "sip", + Host: "dead:beef::666", + Port: 5060, + }, + }, + + urlTest{ + []byte("tel:+12126660420"), + URL{ + Scheme: "tel", + Host: "+12126660420", + }, + }, + + urlTest{ + []byte("sip:bob%20barker:priceisright@[dead:beef::666]:5060;isup-oli=00/palfun.html?haha#omg"), + URL{ + Scheme: "sip", + User: "bob barker", + Pass: "priceisright", + Host: "dead:beef::666", + Port: 5060, + Params: "isup-oli=00", + Path: "/palfun.html", + Query: "haha", + Fragment: "omg", + }, + }, + + urlTest{ + 
[]byte("http://www.google.com/search?%68l=en&safe=off&q=omfg&aq=f&aqi=g2g-s1g1g-s1g5&aql=&oq=&gs_rfai="), + URL{ + Scheme: "http", + Host: "www.google.com", + Path: "/search", + Query: "%68l=en&safe=off&q=omfg&aq=f&aqi=g2g-s1g1g-s1g5&aql=&oq=&gs_rfai=", + }, + }, + +} + +func (test *urlTest) compare(url *URL) (passed bool) { + if url.Scheme != test.url.Scheme { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) scheme: %#v != %#v\n", + string(test.s), url.Scheme, test.url.Scheme) + passed = true + } + if url.User != test.url.User { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) user: %#v != %#v\n", + string(test.s), url.User, test.url.User) + passed = true + } + if url.Pass != test.url.Pass { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) pass: %#v != %#v\n", + string(test.s), url.Pass, test.url.Pass) + passed = true + } + if url.Host != test.url.Host { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) host: %#v != %#v\n", + string(test.s), url.Host, test.url.Host) + passed = true + } + if url.Port != test.url.Port { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) port: %#v != %#v\n", + string(test.s), url.Port, test.url.Port) + passed = true + } + if url.Port != test.url.Port { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) port: %#v != %#v\n", + string(test.s), url.Port, test.url.Port) + passed = true + } + if url.Params != test.url.Params { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) params: %#v != %#v\n", + string(test.s), url.Params, test.url.Params) + passed = true + } + if url.Path != test.url.Path { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) path: %#v != %#v\n", + string(test.s), url.Path, test.url.Path) + passed = true + } + if url.Query != test.url.Query { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) query: %#v != %#v\n", + string(test.s), url.Query, test.url.Query) + passed = true + } + if url.Fragment != test.url.Fragment { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) fragment: %#v != %#v\n", + string(test.s), url.Fragment, test.url.Fragment) + passed = true + } + return !passed +} + +func bench() { + const rounds = 
10000 + for _, s := range [][]byte{ + []byte("a:a"), + []byte("http://google.com/"), + []byte("sip:jtunney@lobstertech.com"), + []byte("http://user:pass@example.com:80;hello/lol.php?fun#omg"), + []byte("file:///etc/passwd"), + } { + ts1 := time.Now() + for i := 0; i < rounds; i++ { + URLParse(s) + } + ts2 := time.Now() + fmt.Printf("BENCH URLParse(%s) -> %d ns\n", s, ts2.Sub(ts1).Nanoseconds() / rounds) + } +} + +func test() (rc int) { + for _, test := range urlTests { + url, err := URLParse(test.s) + if err != nil { + fmt.Fprintf(os.Stderr, "FAIL url(%#v) %s\n", string(test.s), err) + rc = 1 + continue + } + if !test.compare(url) { + rc = 1 + } + } + return rc +} + +func main() { + rc := test() + if rc == 0 { + bench() + } + os.Exit(rc) +} diff --git a/examples/go/url_authority.rl b/examples/go/url_authority.rl new file mode 100644 index 00000000..3e651ad0 --- /dev/null +++ b/examples/go/url_authority.rl @@ -0,0 +1,165 @@ +// -*-go-*- +// +// URL Parser +// Copyright (c) 2010 J.A. Roberts Tunney +// MIT License +// + +package main + +import ( + "errors" + "fmt" + "strconv" +) + +%% machine url_authority; +%% write data; + +// i parse strings like `alice@pokémon.com`. +// +// sounds simple right? but i also parse stuff like: +// +// bob%20barker:priceisright@[dead:beef::666]:5060;isup-oli=00 +// +// which in actual reality is: +// +// - User: "bob barker" +// - Pass: "priceisright" +// - Host: "dead:beef::666" +// - Port: 5060 +// - Params: "isup-oli=00" +// +// which was probably extracted from an absolute url that looked like: +// +// sip:bob%20barker:priceisright@[dead:beef::666]:5060;isup-oli=00/palfun.html?haha#omg +// +// which was probably extracted from its address form: +// +// "Bob Barker" <sip:bob%20barker:priceisright@[dead:beef::666]:5060;isup-oli=00/palfun.html?haha#omg>;tag=666 +// +// who would have thought this could be so hard ._. 
// parseAuthority fills in url.User, url.Pass, url.Host, url.Port and
// url.Params from data. Success is reported only from the EOF actions
// (alldone, params_eof), which return nil directly; whenever control leaves
// the generated exec code without hitting one of them it falls through to
// the fail label below and a "bad url authority" error is returned.
func (url *URL) parseAuthority(data []byte) (err error) {
	cs, p, pe, eof := 0, 0, len(data), len(data)
	mark := 0

	// temporary holding place for user:pass and/or host:port cuz an
	// optional term (user[:pass]) coming before a mandatory term
	// (host[:port]) would require backtracking and all that
	// evil nondeterministic stuff which ragel seems to hate.  (for
	// this same reason you're also allowed to use square brackets
	// around the username.)
	var b1, b2 string

	// this buffer is so we can unescape while we roll
	var hex byte
	buf := make([]byte, len(data))
	amt := 0

	%%{
		action mark      { mark = p                         }
		action str_start { amt = 0                          }
		action str_char  { buf[amt] = fc; amt++             }
		action hex_hi    { hex = unhex(fc) * 16             }
		action hex_lo    { hex += unhex(fc)
		                   buf[amt] = hex; amt++            }
		action copy_b1   { b1 = string(buf[0:amt]); amt = 0 }
		action copy_b2   { b2 = string(buf[0:amt]); amt = 0 }
		action copy_host { url.Host = string(b1); amt = 0   }

		action copy_port {
			if b2 != "" {
				url.Port, err = strconv.Atoi(string(b2))
				if err != nil { goto fail }
				if url.Port > 65535 { goto fail }
			}
		}

		action params {
			url.Params = string(data[mark:p])
		}

		action params_eof {
			url.Params = string(data[mark:p])
			return nil
		}

		action atsymbol {
			url.User = string(b1)
			url.Pass = string(b2)
			b2 = ""
		}

		action alldone {
			url.Host = string(b1)
			if url.Host == "" {
				url.Host = string(buf[0:amt])
			} else {
				if amt > 0 {
					b2 = string(buf[0:amt])
				}
				if b2 != "" {
					url.Port, err = strconv.Atoi(string(b2))
					if err != nil { goto fail }
					if url.Port > 65535 { goto fail }
				}
			}
			return nil
		}

		# define what a single character is allowed to be
		toxic         = ( cntrl | 127 ) ;
		scary         = ( toxic | space | "\"" | "#" | "%" | "<" | ">" ) ;
		authdelims    = ( "/" | "?" | "#" | ":" | "@" | ";" | "[" | "]" ) ;
		userchars     = any -- ( authdelims | scary ) ;
		userchars_esc = userchars | ":" ;
		passchars     = userchars ;
		hostchars     = passchars | "@" ;
		hostchars_esc = hostchars | ":" ;
		portchars     = digit ;
		paramchars    = hostchars | ":" | ";" ;

		# define how characters trigger actions
		escape        = "%" xdigit xdigit ;
		unescape      = "%" ( xdigit @hex_hi ) ( xdigit @hex_lo ) ;
		userchar      = unescape | ( userchars @str_char ) ;
		userchar_esc  = unescape | ( userchars_esc @str_char ) ;
		passchar      = unescape | ( passchars @str_char ) ;
		hostchar      = unescape | ( hostchars @str_char ) ;
		hostchar_esc  = unescape | ( hostchars_esc @str_char ) ;
		portchar      = unescape | ( portchars @str_char ) ;
		paramchar     = escape | paramchars ;

		# define multi-character patterns
		user_plain    = userchar+ >str_start %copy_b1 ;
		user_quoted   = "[" ( userchar_esc+ >str_start %copy_b1 ) "]" ;
		user          = ( user_quoted | user_plain ) %/alldone ;
		pass          = passchar+ >str_start %copy_b2 %/alldone ;
		host_plain    = hostchar+ >str_start %copy_b1 %copy_host ;
		host_quoted   = "[" ( hostchar_esc+ >str_start %copy_b1 %copy_host ) "]" ;
		host          = ( host_quoted | host_plain ) %/alldone ;
		port          = portchar* >str_start %copy_b2 %copy_port %/alldone ;
		params        = ";" ( paramchar* >mark %params %/params_eof ) ;
		userpass      = user ( ":" pass )? ;
		hostport      = host ( ":" port )? ;
		authority     = ( userpass ( "@" @atsymbol ) )? hostport params? ;

		main := authority;
		write init;
		write exec;
	}%%

	// if cs >= url_authority_first_final {
	// 	return nil
	// }

	// Reached both by falling out of the exec code above and by the
	// `goto fail` statements in the port actions. The error built here uses
	// a fixed message, so a strconv error stored in err by those actions is
	// not what the caller receives.
fail:
	// fmt.Println("error state", cs)
	// fmt.Println(string(data))
	// for i := 0; i < p; i++ {
	// 	fmt.Print(" ")
	// }
	// fmt.Println("^")
	// fmt.Println(url)
	return errors.New(fmt.Sprintf("bad url authority: %#v", string(data)))
}
/*
 * A growable buffer for collecting headers.
 *
 * data points at malloc'd storage of `allocated` bytes, of which the first
 * `length` are in use. The storage is released by empty(), which the
 * destructor calls.
 */
struct Buffer
{
	Buffer() : data(0), allocated(0), length(0) { }
	~Buffer() { empty(); }

	/* Append one character, enlarging the storage to twice the new length
	 * whenever the character would not fit. */
	void append( char p ) {
		if ( ++length > allocated )
			upAllocate( length*2 );
		data[length-1] = p;
	}

	/* Forget the contents but keep the storage for reuse. */
	void clear() { length = 0; }
	void upAllocate( int len );
	void empty();

	char *data;
	int allocated;
	int length;

private:
	/* Not copyable: a member-wise copy would leave two objects freeing the
	 * same block in their destructors. Declared, intentionally never
	 * defined. */
	Buffer( const Buffer & );
	Buffer &operator=( const Buffer & );
};
+ action blankLine { cout << endl; } + + # Helpers we will use in matching the date section of the from line. + day = /[A-Z][a-z][a-z]/; + month = /[A-Z][a-z][a-z]/; + year = /[0-9][0-9][0-9][0-9]/; + time = /[0-9][0-9]:[0-9][0-9]/ . ( /:[0-9][0-9]/ | '' ); + letterZone = /[A-Z][A-Z][A-Z]/; + numZone = /[+\-][0-9][0-9][0-9][0-9]/; + zone = letterZone | numZone; + dayNum = /[0-9 ][0-9]/; + + # These are the different formats of the date minus an obscure + # type that has a funny string 'remote from xxx' on the end. Taken + # from c-client in the imap-2000 distribution. + date = day . ' ' . month . ' ' . dayNum . ' ' . time . ' ' . + ( year | year . ' ' . zone | zone . ' ' . year ); + + # From lines separate messages. We will exclude fromLine from a message + # body line. This will cause us to stay in message line up until an + # entirely correct from line is matched. + fromLine = 'From ' . (any-'\n')* . ' ' . date . '\n'; + + # The types of characters that can be used as a header name. + hchar = print - [ :]; + + # Simply eat up an uninteresting header. Return at the first non-ws + # character following a newline. + consumeHeader := ( + [^\n] | + '\n' [ \t] | + '\n' [^ \t] @{fhold; fret;} + )*; + + action hchar {headContent.append(fc);} + action hspace {headContent.append(' ');} + + action hfinish { + headContent.append(0); + cout << headContent.data << endl; + headContent.clear(); + fhold; + fret; + } + + # Display the contents of a header as it is consumed. Collapses line + # continuations to a single space. + printHeader := ( + [^\n] @hchar | + ( '\n' ( [ \t]+ '\n' )* [ \t]+ ) %hspace + )** $!hfinish; + + action onHeader + { + headName.append(0); + if ( strcmp( headName.data, "From" ) == 0 || + strcmp( headName.data, "To" ) == 0 || + strcmp( headName.data, "Subject" ) == 0 ) + { + /* Print the header name, then jump to a machine the will display + * the contents. 
*/ + cout << headName.data << ":"; + headName.clear(); + fcall printHeader; + } + + headName.clear(); + fcall consumeHeader; + } + + header = hchar+ $bufHeadName ':' @onHeader; + + # Exclude fromLine from a messageLine, otherwise when encountering a + # fromLine we will be simultaneously matching the old message and a new + # message. + messageLine = ( [^\n]* '\n' - fromLine ); + + # An entire message. + message = ( fromLine . header* . '\n' @blankLine . messageLine* ); + + # File is a series of messages. + main := message*; +}%% + +%% write data; + +int MailboxScanner::init( ) +{ + %% write init; + return 1; +} + +int MailboxScanner::execute( const char *data, int len, bool isEof ) +{ + const char *p = data; + const char *pe = data + len; + const char *eof = isEof ? pe : 0; + + %% write exec; + + if ( cs == MailboxScanner_error ) + return -1; + if ( cs >= MailboxScanner_first_final ) + return 1; + return 0; +} + +int MailboxScanner::finish( ) +{ + if ( cs == MailboxScanner_error ) + return -1; + if ( cs >= MailboxScanner_first_final ) + return 1; + return 0; +} + + +void Buffer::empty() +{ + if ( data != 0 ) { + free( data ); + + data = 0; + length = 0; + allocated = 0; + } +} + +void Buffer::upAllocate( int len ) +{ + if ( data == 0 ) + data = (char*) malloc( len ); + else + data = (char*) realloc( data, len ); + allocated = len; +} + +MailboxScanner mailbox; +char buf[BUFSIZE]; + +int main() +{ + mailbox.init(); + while ( 1 ) { + int len = fread( buf, 1, BUFSIZE, stdin ); + mailbox.execute( buf, len, len != BUFSIZE ); + if ( len != BUFSIZE ) + break; + } + if ( mailbox.finish() <= 0 ) + cerr << "mailbox: error parsing input" << endl; + return 0; +} diff --git a/examples/params.rl b/examples/params.rl new file mode 100644 index 00000000..a8ffeae9 --- /dev/null +++ b/examples/params.rl @@ -0,0 +1,102 @@ +/* + * Parse command line arguments. 
+ */ + +#include <stdio.h> +#include <string.h> + +#define BUFLEN 1024 + +struct params +{ + char buffer[BUFLEN+1]; + int buflen; + int cs; +}; + +%%{ + machine params; + access fsm->; + + # A buffer to collect argurments + + # Append to the buffer. + action append { + if ( fsm->buflen < BUFLEN ) + fsm->buffer[fsm->buflen++] = fc; + } + + # Terminate a buffer. + action term { + if ( fsm->buflen < BUFLEN ) + fsm->buffer[fsm->buflen++] = 0; + } + + # Clear out the buffer + action clear { fsm->buflen = 0; } + + action help { printf("help\n"); } + action version { printf("version\n"); } + action output { printf("output: \"%s\"\n", fsm->buffer); } + action spec { printf("spec: \"%s\"\n", fsm->buffer); } + action mach { printf("machine: \"%s\"\n", fsm->buffer); } + + # Helpers that collect strings + string = [^\0]+ >clear $append %term; + + # Different arguments. + help = ( '-h' | '-H' | '-?' | '--help' ) 0 @help; + version = ( '-v' | '--version' ) 0 @version; + output = '-o' 0? string 0 @output; + spec = '-S' 0? string 0 @spec; + mach = '-M' 0? 
string 0 @mach; + + main := ( + help | + version | + output | + spec | + mach + )*; +}%% + +%% write data; + +void params_init( struct params *fsm ) +{ + fsm->buflen = 0; + %% write init; +} + +void params_execute( struct params *fsm, const char *data, int len ) +{ + const char *p = data; + const char *pe = data + len; + + %% write exec; +} + +int params_finish( struct params *fsm ) +{ + if ( fsm->cs == params_error ) + return -1; + if ( fsm->cs >= params_first_final ) + return 1; + return 0; +} + +#define BUFSIZE 2048 + +int main( int argc, char **argv ) +{ + int a; + struct params params; + + params_init( ¶ms ); + for ( a = 1; a < argc; a++ ) + params_execute( ¶ms, argv[a], strlen(argv[a])+1 ); + if ( params_finish( ¶ms ) != 1 ) + fprintf( stderr, "params: error processing arguments\n" ); + + return 0; +} diff --git a/examples/pullscan.rl b/examples/pullscan.rl new file mode 100644 index 00000000..d9e8a579 --- /dev/null +++ b/examples/pullscan.rl @@ -0,0 +1,170 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define BUFSIZE 4096 + +typedef struct _Scanner { + /* Scanner state. 
*/ + int cs; + int act; + int have; + int curline; + char *ts; + char *te; + char *p; + char *pe; + char *eof; + FILE *file; + int done; + + /* Token data */ + char *data; + int len; + int value; + + char buf[BUFSIZE]; +} Scanner; + + +%%{ + machine Scanner; + write data; +}%% + +void scan_init( Scanner *s, FILE *file ) +{ + memset (s, '\0', sizeof(Scanner)); + s->curline = 1; + s->file = file; + s->eof = 0; + %% write init; +} + +#define TK_NO_TOKEN (-1) +#define TK_ERR 128 +#define TK_EOF 129 +#define TK_Identifier 130 +#define TK_Number 131 +#define TK_String 132 + +#define ret_tok( _tok ) token = _tok; s->data = s->ts + +int scan( Scanner *s ) +{ + int token = TK_NO_TOKEN; + int space, readlen; + + while ( 1 ) { + if ( s->p == s->pe ) { + printf("scanner: need more data\n"); + + if ( s->ts == 0 ) + s->have = 0; + else { + /* There is data that needs to be shifted over. */ + printf("scanner: buffer broken mid token\n"); + s->have = s->pe - s->ts; + memmove( s->buf, s->ts, s->have ); + s->te -= (s->ts-s->buf); + s->ts = s->buf; + } + + s->p = s->buf + s->have; + space = BUFSIZE - s->have; + + if ( space == 0 ) { + /* We filled up the buffer trying to scan a token. 
*/ + printf("scanner: out of buffer space\n"); + return TK_ERR; + } + + if ( s->done ) { + printf("scanner: end of file\n"); + s->p[0] = 0; + readlen = 1; + } + else { + readlen = fread( s->p, 1, space, s->file ); + if ( readlen < space ) + s->done = 1; + } + + s->pe = s->p + readlen; + } + + %%{ + machine Scanner; + access s->; + variable p s->p; + variable pe s->pe; + variable eof s->eof; + + main := |* + + # Identifiers + ( [a-zA-Z_] [a-zA-Z0-9_]* ) => + { ret_tok( TK_Identifier ); fbreak; }; + + # Whitespace + [ \t\n]; + + '"' ( [^\\"] | '\\' any ) * '"' => + { ret_tok( TK_String ); fbreak; }; + + # Number + digit+ => + { ret_tok( TK_Number ); fbreak; }; + + # EOF + 0 => + { ret_tok( TK_EOF ); fbreak; }; + + # Anything else + any => + { ret_tok( *s->p ); fbreak; }; + + *|; + + write exec; + }%% + + if ( s->cs == Scanner_error ) + return TK_ERR; + + if ( token != TK_NO_TOKEN ) { + s->len = s->p - s->data; + return token; + } + } +} + + +int main (int argc, char** argv) +{ + Scanner ss; + int tok; + + scan_init(&ss, stdin); + + while ( 1 ) { + tok = scan (&ss); + if ( tok == TK_EOF ) { + printf ("parser: EOF\n"); + break; + } + else if ( tok == TK_ERR ) { + printf ("parser: ERR\n"); + break; + } + else { + printf ("parser: %d \"", tok); + fwrite ( ss.data, 1, ss.len, stdout ); + printf ("\"\n" ); + } + } + + return 0; +} + + diff --git a/examples/rlscan.rl b/examples/rlscan.rl new file mode 100644 index 00000000..d4d4bf97 --- /dev/null +++ b/examples/rlscan.rl @@ -0,0 +1,300 @@ +/* + * Lexes Ragel input files. 
+ */ + +#include <iostream> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +using namespace std; + +void escapeXML( char *data ) +{ + while ( *data != 0 ) { + switch ( *data ) { + case '<': cout << "&lt;"; break; + case '>': cout << "&gt;"; break; + case '&': cout << "&amp;"; break; + default: cout << *data; break; + } + data += 1; + } +} + +void escapeXML( char c ) +{ + switch ( c ) { + case '<': cout << "&lt;"; break; + case '>': cout << "&gt;"; break; + case '&': cout << "&amp;"; break; + default: cout << c; break; + } +} + +void escapeXML( char *data, int len ) +{ + for ( char *end = data + len; data != end; data++ ) { + switch ( *data ) { + case '<': cout << "&lt;"; break; + case '>': cout << "&gt;"; break; + case '&': cout << "&amp;"; break; + default: cout << *data; break; + } + } +} + +inline void write( const char *data ) +{ + cout << data; +} + +inline void write( char c ) +{ + cout << c; +} + +inline void write( char *data, int len ) +{ + cout.write( data, len ); +} + + +%%{ + machine RagelScan; + + word = [a-zA-Z_][a-zA-Z_0-9]*; + integer = [0-9]+; + hex = '0x' [0-9a-fA-F] [0-9a-fA-F]*; + + default = ^0; + EOF = 0; + + # Handles comments in outside code and inline blocks. + c_comment := + ( default* :>> '*/' ) + ${ escapeXML( fc ); } + @{ fret; }; + + action emit { + escapeXML( ts, te-ts ); + } + + # + # Inline action code + # + + ilscan := |* + + "'" ( [^'\\] | /\\./ )* "'" => emit; + '"' ( [^"\\] | /\\./ )* '"' => emit; + '/*' { + write( "/*" ); + fcall c_comment; + }; + '//' [^\n]* '\n' => emit; + + '{' { + write( '{' ); + inline_depth += 1; + }; + + '}' { + write( '}' ); + /* If dropping down to the last } then return + * to ragel code. 
*/ + if ( --inline_depth == 0 ) { + write( "</inline>\n" ); + fgoto rlscan; + } + }; + + default => { escapeXML( *ts ); }; + *|; + + # + # Ragel Tokens + # + + rlscan := |* + '}%%' { + if ( !single_line ) { + write( "</section>\n" ); + fgoto main; + } + }; + + '\n' { + if ( single_line ) { + write( "</section>\n" ); + fgoto main; + } + }; + + # Word + word { + write( "<word>" ); + write( ts, te-ts ); + write( "</word>\n" ); + }; + + # Decimal integer. + integer { + write( "<int>" ); + write( ts, te-ts ); + write( "</int>\n" ); + }; + + # Hexadecimal integer. + hex { + write( "<hex>" ); + write( ts, te-ts ); + write( "</hex>\n" ); + }; + + # Consume comments. + '#' [^\n]* '\n'; + + # Single literal string. + "'" ( [^'\\] | /\\./ )* "'" { + write( "<single_lit>" ); + escapeXML( ts, te-ts ); + write( "</single_lit>\n" ); + }; + + # Double literal string. + '"' ( [^"\\] | /\\./ )* '"' { + write( "<double_lit>" ); + escapeXML( ts, te-ts ); + write( "</double_lit>\n" ); + }; + + # Or literal. + '[' ( [^\]\\] | /\\./ )* ']' { + write( "<or_lit>" ); + escapeXML( ts, te-ts ); + write( "</or_lit>\n" ); + }; + + # Regex Literal. + '/' ( [^/\\] | /\\./ ) * '/' { + write( "<re_lit>" ); + escapeXML( ts, te-ts ); + write( "</re_lit>\n" ); + }; + + # Open an inline block + '{' { + inline_depth = 1; + write( "<inline>{" ); + fgoto ilscan; + }; + + punct { + write( "<symbol>" ); + escapeXML( fc ); + write( "</symbol>\n" ); + }; + + default; + *|; + + # + # Outside code. + # + + main := |* + + "'" ( [^'\\] | /\\./ )* "'" => emit; + '"' ( [^"\\] | /\\./ )* '"' => emit; + + '/*' { + escapeXML( ts, te-ts ); + fcall c_comment; + }; + + '//' [^\n]* '\n' => emit; + + '%%{' { + write( "<section>\n" ); + single_line = false; + fgoto rlscan; + }; + + '%%' { + write( "<section>\n" ); + single_line = true; + fgoto rlscan; + }; + + default { + escapeXML( *ts ); + }; + + # EOF. 
+ EOF; + *|; +}%% + +%% write data nofinal; + +#define BUFSIZE 2048 + +int main() +{ + std::ios::sync_with_stdio(false); + + int cs, act; + char *ts, *te; + int stack[1], top; + + static char inbuf[BUFSIZE]; + bool single_line = false; + int inline_depth = 0; + + %% write init; + + bool done = false; + int have = 0; + while ( !done ) { + /* How much space is in the buffer? */ + int space = BUFSIZE - have; + if ( space == 0 ) { + /* Buffer is full. */ + cerr << "TOKEN TOO BIG" << endl; + exit(1); + } + + /* Read in a block. */ + char *p = inbuf + have; + cin.read( p, space ); + int len = cin.gcount(); + char *pe = p + len; + char *eof = 0; + + /* Check for EOF. */ + if ( len == 0 ) { + eof = pe; + done = true; + } + + %% write exec; + + if ( cs == RagelScan_error ) { + /* Machine failed before finding a token. */ + cerr << "PARSE ERROR" << endl; + exit(1); + } + + if ( ts == 0 ) + have = 0; + else { + /* There is a prefix to preserve, shift it over. */ + have = pe - ts; + memmove( inbuf, ts, have ); + te = inbuf + (te-ts); + ts = inbuf; + } + } + return 0; +} diff --git a/examples/statechart.rl b/examples/statechart.rl new file mode 100644 index 00000000..a04471b5 --- /dev/null +++ b/examples/statechart.rl @@ -0,0 +1,116 @@ +/* + * Demonstrate the use of labels, the epsilon operator, and the join operator + * for creating machines using the named state and transition list paradigm. + * This implements the same machine as the atoi example. 
+ */ + +#include <iostream> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +using namespace std; + +struct StateChart +{ + bool neg; + int val; + int cs; + + int init( ); + int execute( const char *data, int len ); + int finish( ); +}; + +%%{ + machine StateChart; + + action begin { + neg = false; + val = 0; + } + + action see_neg { + neg = true; + } + + action add_digit { + val = val * 10 + (fc - '0'); + } + + action finish { + if ( neg ) + val = -1 * val; + } + + atoi = ( + start: ( + '-' @see_neg ->om_num | + '+' ->om_num | + [0-9] @add_digit ->more_nums + ), + + # One or more nums. + om_num: ( + [0-9] @add_digit ->more_nums + ), + + # Zero or more nums. + more_nums: ( + [0-9] @add_digit ->more_nums | + '' -> final + ) + ) >begin %finish; + + main := ( atoi '\n' @{ cout << val << endl; } )*; +}%% + +%% write data; + +int StateChart::init( ) +{ + neg = false; + val = false; + %% write init; + return 1; +} + +int StateChart::execute( const char *data, int len ) +{ + const char *p = data; + const char *pe = data + len; + + %% write exec; + + if ( cs == StateChart_error ) + return -1; + if ( cs >= StateChart_first_final ) + return 1; + return 0; +} + +int StateChart::finish( ) +{ + if ( cs == StateChart_error ) + return -1; + if ( cs >= StateChart_first_final ) + return 1; + return 0; +} + + +#define BUFSIZE 1024 + +int main() +{ + char buf[BUFSIZE]; + + StateChart atoi; + atoi.init(); + while ( fgets( buf, sizeof(buf), stdin ) != 0 ) { + atoi.execute( buf, strlen(buf) ); + } + if ( atoi.finish() <= 0 ) + cerr << "statechart: error: parsing input" << endl; + return 0; +} diff --git a/examples/uri.rl b/examples/uri.rl new file mode 100644 index 00000000..185a76c6 --- /dev/null +++ b/examples/uri.rl @@ -0,0 +1,31 @@ +%%{ + machine uri; + + action scheme {} + action loc {} + action item {} + action query {} + action last {} + action nothing {} + + main := + # Scheme machine. This is ambiguous with the item machine. 
We commit + # to the scheme machine on colon. + ( [^:/?#]+ ':' @(colon,1) @scheme )? + + # Location machine. This is ambiguous with the item machine. We remain + # ambiguous until a second slash, at that point and all points after + # we place a higher priority on staying in the location machine over + # moving into the item machine. + ( ( '/' ( '/' [^/?#]* ) $(loc,1) ) %loc %/loc )? + + # Item machine. Ambiguous with both scheme and location, which both + # get a higher priority on the characters causing ambiguity. + ( ( [^?#]+ ) $(loc,0) $(colon,0) %item %/item )? + + # Last two components, the characters that initiate these machines are + # not supported in any previous components, therefore there are no + # ambiguities introduced by these parts. + ( '?' [^#]* %query %/query)? + ( '#' any* %/last )?; +}%% |