diff options
author | Adrian Thurston <thurston@complang.org> | 2013-03-22 23:39:49 -0400 |
---|---|---|
committer | Adrian Thurston <thurston@complang.org> | 2013-03-22 23:39:49 -0400 |
commit | 9ddc931cbacf4d270a7113d07092203bec8aaaae (patch) | |
tree | b5591d9c8632de3f67e18aebc02216b4cd9a4663 /test | |
parent | 56a51921c90303d65fd8a4b3fee99240676d43dd (diff) | |
download | colm-9ddc931cbacf4d270a7113d07092203bec8aaaae.tar.gz |
tilde strings, neq, and unsigned numbers in lexical defs
Diffstat (limited to 'test')
-rw-r--r-- | test/argv1.args | 1 | ||||
-rw-r--r-- | test/argv1.exp | 1 | ||||
-rw-r--r-- | test/argv1.lm | 6 | ||||
-rw-r--r-- | test/argv2.args | 1 | ||||
-rw-r--r-- | test/argv2.exp | 10 | ||||
-rw-r--r-- | test/argv2.lm | 14 | ||||
-rw-r--r-- | test/exit1.exp | 1 | ||||
-rw-r--r-- | test/exit1.lm | 3 | ||||
-rw-r--r-- | test/exit2.exp | 0 | ||||
-rw-r--r-- | test/exit2.lm | 2 | ||||
-rw-r--r-- | test/exit3.exp | 0 | ||||
-rw-r--r-- | test/exit3.lm | 2 | ||||
-rw-r--r-- | test/repeat2.exp | 3552 | ||||
-rw-r--r-- | test/repeat2.in | 3307 | ||||
-rw-r--r-- | test/repeat2.lm | 6862 |
15 files changed, 6889 insertions, 6873 deletions
diff --git a/test/argv1.args b/test/argv1.args deleted file mode 100644 index 3a4295fc..00000000 --- a/test/argv1.args +++ /dev/null @@ -1 +0,0 @@ -a b c 1 2 3 diff --git a/test/argv1.exp b/test/argv1.exp deleted file mode 100644 index 6011e46d..00000000 --- a/test/argv1.exp +++ /dev/null @@ -1 +0,0 @@ -<__list0><str>./argv1.bin</str><str>a</str><str>b</str><str>c</str><str>1</str><str>2</str><str>3</str></__list0>
\ No newline at end of file diff --git a/test/argv1.lm b/test/argv1.lm index aa77d524..9ffd4467 100644 --- a/test/argv1.lm +++ b/test/argv1.lm @@ -1,2 +1,8 @@ +##### LM ##### print_xml( argv ) +print( '\n' ) +##### ARGS ##### +a b c 1 2 3 +##### EXP ##### +<__list0><str>./working/argv1.bin</str><str>a</str><str>b</str><str>c</str><str>1</str><str>2</str><str>3</str></__list0> diff --git a/test/argv2.args b/test/argv2.args deleted file mode 100644 index 60d8bc0b..00000000 --- a/test/argv2.args +++ /dev/null @@ -1 +0,0 @@ --qv -h -o output -iinput file --input=foo --input bar --help --verbose diff --git a/test/argv2.exp b/test/argv2.exp deleted file mode 100644 index 038294b0..00000000 --- a/test/argv2.exp +++ /dev/null @@ -1,10 +0,0 @@ -single: q -single: v -single: h -with opt: o -> output -with opt: i -> input -file: file -long: input -> foo -long: input -> bar -long: help -long: verbose diff --git a/test/argv2.lm b/test/argv2.lm index 9d7792ab..eb1ba790 100644 --- a/test/argv2.lm +++ b/test/argv2.lm @@ -1,3 +1,4 @@ +##### LM ##### lex literal '-' token file /^('-'|0)(^0)*/ @@ -74,3 +75,16 @@ for Item: item in Args { } } +##### ARGS ##### +-qv -h -o output -iinput file --input=foo --input bar --help --verbose +##### EXP ##### +single: q +single: v +single: h +with opt: o -> output +with opt: i -> input +file: file +long: input -> foo +long: input -> bar +long: help +long: verbose diff --git a/test/exit1.exp b/test/exit1.exp deleted file mode 100644 index 90be1f30..00000000 --- a/test/exit1.exp +++ /dev/null @@ -1 +0,0 @@ -before diff --git a/test/exit1.lm b/test/exit1.lm index a41a71d5..1a4f82aa 100644 --- a/test/exit1.lm +++ b/test/exit1.lm @@ -1,4 +1,7 @@ +##### LM ##### print( 'before\n' ) exit( 0 ) print( 'after\n' ) +##### EXP ##### +before diff --git a/test/exit2.exp b/test/exit2.exp deleted file mode 100644 index e69de29b..00000000 --- a/test/exit2.exp +++ /dev/null diff --git a/test/exit2.lm b/test/exit2.lm index 48284832..2105f987 100644 --- a/test/exit2.lm +++ b/test/exit2.lm @@ -1,3 +1,4 @@ +##### LM ##### int f3() { @@ -20,3 +21,4 @@ int f1() I: int = 1 f1() +##### EXP ##### diff --git a/test/exit3.exp b/test/exit3.exp deleted file mode 100644 index e69de29b..00000000 --- a/test/exit3.exp +++ /dev/null diff --git a/test/exit3.lm b/test/exit3.lm index 8c7a325f..4cf46862 100644 --- a/test/exit3.lm +++ b/test/exit3.lm @@ -1,3 +1,4 @@ +##### LM ##### int f3() { @@ -20,3 +21,4 @@ int f1() I: int = 1 exit( 0 ) +##### EXP ##### diff --git a/test/repeat2.exp b/test/repeat2.exp deleted file mode 100644 index 0bc0f8e7..00000000 --- a/test/repeat2.exp +++ /dev/null @@ -1,3552 +0,0 @@ -\documentclass[letterpaper,11pt,oneside]{book} -\usepackage{graphicx} -\usepackage{comment} -\usepackage{multicol} -\usepackage[ - colorlinks=true, - linkcolor=black, - citecolor=green, - filecolor=black, - urlcolor=black]{hyperref} - -\topmargin -0.20in -\oddsidemargin 0in -\textwidth 6.5in -\textheight 9in - -\setlength{\parskip}{0pt} -\setlength{\topsep}{0pt} -\setlength{\partopsep}{0pt} -\setlength{\itemsep}{0pt} - -\input{version} - -\newcommand{\verbspace}{\vspace{10pt}} -\newcommand{\graphspace}{\vspace{10pt}} - -\renewcommand\floatpagefraction{.99} -\renewcommand\topfraction{.99} -\renewcommand\bottomfraction{.99} -\renewcommand\textfraction{.01} -\setcounter{totalnumber}{50} -\setcounter{topnumber}{50} -\setcounter{bottomnumber}{50} - -\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{} - -\begin{document} - -\thispagestyle{empty} -\begin{center} -\vspace*{3in} -{\huge Ragel State Machine Compiler}\\ -\vspace*{12pt} -{\Large User Guide}\\ -\vspace{1in} -by\\ -\vspace{12pt} -{\large Adrian Thurston}\\ -\end{center} -\clearpage - -\pagenumbering{roman} - -\chapter*{License} -Ragel version \version, \pubdate\\ -Copyright \copyright\ 2003-2012 Adrian D. Thurston -\vspace{6mm} - -{\bf\it\noindent This document is part of Ragel, and as such, this document is -released under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 2 of the License, or (at your option) -any later version. -} - -\vspace{5pt} - -{\bf\it\noindent Ragel is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -details. -} - -\vspace{5pt} - -{\bf\it\noindent You should have received a copy of the GNU General Public -License along with Ragel; if not, write to the Free Software Foundation, Inc., -59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} - -\clearpage -\tableofcontents -\clearpage - -\pagenumbering{arabic} -\chapter{Introduction} - -\section{Abstract} - -Regular expressions are used heavily in practice for the purpose of specifying -parsers. They are normally used as black boxes linked together with program -logic. User actions are executed in between invocations of the regular -expression engine. Adding actions before a pattern terminates requires patterns -to be broken and pasted back together with program logic. The more user actions -are needed, the less the advantages of regular expressions are seen. - -Ragel is a software development tool that allows user actions to be -embedded into the transitions of a regular expression's corresponding state -machine, eliminating the need to switch from the regular expression engine and -user code execution environment and back again. As a result, expressions can be -maximally continuous. One is free to specify an entire parser using a single -regular expression. The single-expression model affords concise and elegant -descriptions of languages and the generation of very simple, fast and robust -code. Ragel compiles executable finite state machines from a high level regular language -notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby. - -In addition to building state machines from regular expressions, Ragel allows -the programmer to directly specify state machines with state charts. These two -notations may be freely combined. There are also facilities for controlling -nondeterminism in the resulting machines and building scanners using patterns -that themselves have embedded actions. Ragel can produce code that is small and -runs very fast. Ragel can handle integer-sized alphabets and can compile very -large state machines. - -\section{Motivation} - -When a programmer is faced with the task of producing a parser for a -context-free language there are many tools to choose from. It is quite common -to generate useful and efficient parsers for programming languages from a -formal grammar. It is also quite common for programmers to avoid such tools -when making parsers for simple computer languages, such as file formats and -communication protocols. Such languages are often regular and tools for -processing the context-free languages are viewed as too heavyweight for the -purpose of parsing regular languages. The extra run-time effort required for -supporting the recursive nature of context-free languages is wasted. - -When we turn to the regular expression-based parsing tools, such as Lex, Re2C, -and scripting languages such as Sed, Awk and Perl we find that they are split -into two levels: a regular expression matching engine and some kind of program -logic for linking patterns together. For example, a Lex program is composed of -sets of regular expressions. The implied program logic repeatedly attempts to -match a pattern in the current set. When a match is found the associated user -code executed. It requires the user to consider a language as a sequence of -independent tokens. Scripting languages and regular expression libraries allow -one to link patterns together using arbitrary program code. This is very -flexible and powerful, however we can be more concise and clear if we avoid -gluing together regular expressions with if statements and while loops. - -This model of execution, where the runtime alternates between regular -expression matching and user code exectution places restrictions on when -action code may be executed. Since action code can only be associated with -complete patterns, any action code that must be executed before an entire -pattern is matched requires that the pattern be broken into smaller units. -Instead of being forced to disrupt the regular expression syntax and write -smaller expressions, it is desirable to retain a single expression and embed -code for performing actions directly into the transitions that move over the -characters. After all, capable programmers are astutely aware of the machinery -underlying their programs, so why not provide them with access to that -machinery? To achieve this we require an action execution model for associating -code with the sub-expressions of a regular expression in a way that does not -disrupt its syntax. - -The primary goal of Ragel is to provide developers with an ability to embed -actions into the transitions and states of a regular expression's state machine -in support of the definition of entire parsers or large sections of parsers -using a single regular expression. From the regular expression we gain a clear -and concise statement of our language. From the state machine we obtain a very -fast and robust executable that lends itself to many kinds of analysis and -visualization. - -\section{Overview} - -Ragel is a language for specifying state machines. The Ragel program is a -compiler that assembles a state machine definition to executable code. Ragel -is based on the principle that any regular language can be converted to a -deterministic finite state automaton. Since every regular language has a state -machine representation and vice versa, the terms regular language and state -machine (or just machine) will be used interchangeably in this document. - -Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is -designed to be generic and is not bound to any particular input or processing -method. A Ragel machine expects to have data passed to it in buffer blocks. -When there is no more input, the machine can be queried for acceptance. In -this way, a Ragel machine can be used to simply recognize a regular language -like a regular expression library. By embedding code into the regular language, -a Ragel machine can also be used to parse input. - -The Ragel language has many operators for constructing and manipulating -machines. Machines are built up from smaller machines, to bigger ones, to the -final machine representing the language that needs to be recognized or parsed. - -The core state machine construction operators are those found in most theory -of computation textbooks. They date back to the 1950s and are widely studied. -They are based on set operations and permit one to think of languages as a set -of strings. They are Union, Intersection, Difference, Concatenation and Kleene -Star. Put together, these operators make up what most people know as regular -expressions. Ragel also provides a scanner construction operator -and provides operators for explicitly constructing machines -using a state chart method. In the state chart method, one joins machines -together without any implied transitions and then explicitly specifies where -epsilon transitions should be drawn. - -The state machine manipulation operators are specific to Ragel. They allow the -programmer to access the states and transitions of regular language's -corresponding machine. There are two uses of the manipulation operators. The -first and primary use is to embed code into transitions and states, allowing -the programmer to specify the actions of the state machine. - -Ragel attempts to make the action embedding facility as intuitive as possible. -To do so, a number of issues need to be addressed. For example, when making a -nondeterministic specification into a DFA using machines that have embedded -actions, new transitions are often made that have the combined actions of -several source transitions. Ragel ensures that multiple actions associated with -a single transition are ordered consistently with respect to the order of -reference and the natural ordering implied by the construction operators. - -The second use of the manipulation operators is to assign priorities to -transitions. Priorities provide a convenient way of controlling any -nondeterminism introduced by the construction operators. Suppose two -transitions leave from the same state and go to distinct target states on the -same character. If these transitions are assigned conflicting priorities, then -during the determinization process the transition with the higher priority will -take precedence over the transition with the lower priority. The lower priority -transition gets abandoned. The transitions would otherwise be combined into a new -transition that goes to a new state that is a combination of the original -target states. Priorities are often required for segmenting machines. The most -common uses of priorities have been encoded into a set of simple operators -that should be used instead of priority embeddings whenever possible. - -For the purposes of embedding, Ragel divides transitions and states into -different classes. There are four operators for embedding actions and -priorities into the transitions of a state machine. It is possible to embed -into entering transitions, finishing transitions, all transitions and leaving -transitions. The embedding into leaving transitions is a special case. -These transition embeddings get stored in the final states of a machine. They -are transferred to any transitions that are made going out of the machine by -future concatenation or kleene star operations. - -There are several more operators for embedding actions into states. Like the -transition embeddings, there are various different classes of states that the -embedding operators access. For example, one can access start states, final -states or all states, among others. Unlike the transition embeddings, there are -several different types of state action embeddings. These are executed at -various different times during the processing of input. It is possible to embed -actions that are exectued on transitions into a state, on transitions out of a -state, on transitions taken on the error event, or on transitions taken on the -EOF event. - -Within actions, it is possible to influence the behaviour of the state machine. -The user can write action code that jumps or calls to another portion of the -machine, changes the current character being processed, or breaks out of the -processing loop. With the state machine calling feature Ragel can be used to -parse languages that are not regular. For example, one can parse balanced -parentheses by calling into a parser when an open parenthesis character is seen -and returning to the state on the top of the stack when the corresponding -closing parenthesis character is seen. More complicated context-free languages -such as expressions in C are out of the scope of Ragel. - -Ragel also provides a scanner construction operator that can be used to build -scanners much the same way that Lex is used. The Ragel generated code, which -relies on user-defined variables for backtracking, repeatedly tries to match -patterns to the input, favouring longer patterns over shorter ones and patterns -that appear ahead of others when the lengths of the possible matches are -identical. When a pattern is matched the associated action is executed. - -The key distinguishing feature between scanners in Ragel and scanners in Lex is -that Ragel patterns may be arbitrary Ragel expressions and can therefore -contain embedded code. With a Ragel-based scanner the user need not wait until -the end of a pattern before user code can be executed. - -Scanners do take Ragel out of the domain of pure state machines and require the -user to maintain the backtracking related variables. However, scanners -integrate well with regular state machine instantiations. They can be called to -or jumped to only when needed, or they can be called out of or jumped out of -when a simpler, pure state machine model is appropriate. - -Two types of output code style are available. Ragel can produce a table-driven -machine or a directly executable machine. The directly executable machine is -much faster than the table-driven. On the other hand, the table-driven machine -is more compact and less demanding on the host language compiler. It is better -suited to compiling large state machines. - -\section{Related Work} - -Lex is perhaps the best-known tool for constructing parsers from regular -expressions. In the Lex processing model, generated code attempts to match one -of the user's regular expression patterns, favouring longer matches over -shorter ones. Once a match is made it then executes the code associated with -the pattern and consumes the matching string. This process is repeated until -the input is fully consumed. - -Through the use of start conditions, related sets of patterns may be defined. -The active set may be changed at any time. This allows the user to define -different lexical regions. It also allows the user to link patterns together by -requiring that some patterns come before others. This is quite like a -concatenation operation. However, use of Lex for languages that require a -considerable amount of pattern concatenation is inappropriate. In such cases a -Lex program deteriorates into a manually specified state machine, where start -conditions define the states and pattern actions define the transitions. Lex -is therefore best suited to parsing tasks where the language to be parsed can -be described in terms of regions of tokens. - -Lex is useful in many scenarios and has undoubtedly stood the test of time. -There are, however, several drawbacks to using Lex. Lex can impose too much -overhead for parsing applications where buffering is not required because all -the characters are available in a single string. In these cases there is -structure to the language to be parsed and a parser specification tool can -help, but employing a heavyweight processing loop that imposes a stream -``pull'' model and dynamic input buffer allocation is inappropriate. An -example of this kind of scenario is the conversion of floating point numbers -contained in a string to their corresponding numerical values. - -Another drawback is the very issue that Ragel attempts to solve. -It is not possible to execute a user action while -matching a character contained inside a pattern. For example, if scanning a -programming language and string literals can contain newlines which must be -counted, a Lex user must break up a string literal pattern so as to associate -an action with newlines. This forces the definition of a new start condition. -Alternatively the user can reprocess the text of the matched string literal to -count newlines. - - -The Re2C program defines an input processing model similar to that of Lex. -Re2C focuses on making generated state machines run very fast and -integrate easily into any program, free of dependencies. Re2C generates -directly executable code and is able to claim that generated parsers run nearly -as fast as their hand-coded equivalents. This is very important for user -adoption, as programmers are reluctant to use a tool when a faster alternative -exists. A consideration to ease of use is also important because developers -need the freedom to integrate the generated code as they see fit. - -Many scripting languages provide ways of composing parsers by linking regular -expressions using program logic. For example, Sed and Awk are two established -Unix scripting tools that allow the programmer to exploit regular expressions -for the purpose of locating and extracting text of interest. High-level -programming languages such as Perl, Python, PHP and Ruby all provide regular -expression libraries that allow the user to combine regular expressions with -arbitrary code. - -In addition to supporting the linking of regular expressions with arbitrary -program logic, the Perl programming language permits the embedding of code into -regular expressions. Perl embeddings do not translate into the embedding of -code into deterministic state machines. Perl regular expressions are in fact -not fully compiled to deterministic machines when embedded code is involved. -They are instead interpreted and involve backtracking. This is shown by the -following Perl program. When it is fed the input \verb|abcd| the interpretor -attempts to match the first alternative, printing \verb|a1 b1|. When this -possibility fails it backtracks and tries the second possibility, printing -\verb|a2 b2|, at which point it succeeds. - -\begin{inline_code} -\begin{verbatim} -print "YES\n" if ( <STDIN> =~ - /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) | - ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x ) -\end{verbatim} -\end{inline_code} -\verbspace - -In Ragel there is no regular expression interpretor. Aside from the scanner -operator, all Ragel expressions are made into deterministic machines and the -run time simply moves from state to state as it consumes input. An equivalent -parser expressed in Ragel would attempt both of the alternatives concurrently, -printing \verb|a1 a2 b1 b2|. - -\section{Development Status} - -Ragel is a relatively new tool and is under continuous development. As a rough -release guide, minor revision number changes are for implementation -improvements and feature additions. Major revision number changes are for -implementation and language changes that do not preserve backwards -compatibility. Though in the past this has not always held true: changes that -break code have crept into minor version number changes. Typically, the -documentation lags behind the development in the interest of documenting only -the lasting features. The latest changes are always documented in the ChangeLog -file. - -\chapter{Constructing State Machines} - -\section{Ragel State Machine Specifications} - -A Ragel input file consists of a program in the host language that contains embedded machine -specifications. Ragel normally passes input straight to output. When it sees -a machine specification it stops to read the Ragel statements and possibly generate -code in place of the specification. -Afterwards it continues to pass input through. There -can be any number of FSM specifications in an input file. A multi-line FSM spec -starts with \verb|%%{| and ends with \verb|}%%|. A single-line FSM spec starts -with \verb|%%| and ends at the first newline. - -While Ragel is looking for FSM specifications it does basic lexical analysis on -the surrounding input. It interprets literal strings and comments so a -\verb|%%| sequence in either of those will not trigger the parsing of an FSM -specification. Ragel does not pass the input through any preprocessor nor does it -interpret preprocessor directives itself so includes, defines and ifdef logic -cannot be used to alter the parse of a Ragel input file. It is therefore not -possible to use an \verb|#if 0| directive to comment out a machine as is -commonly done in C code. As an alternative, a machine can be prevented from -causing any generated output by commenting out write statements. - -In Figure \ref{cmd-line-parsing}, a multi-line specification is used to define the -machine and single line specifications are used to trigger the writing of the machine -data and execution code. - -\begin{figure} -\small -\begin{multicols}{2} -\begin{verbatim} -#include <string.h> -#include <stdio.h> - -%%{ - machine foo; - main := - ( 'foo' | 'bar' ) - 0 @{ res = 1; }; -}%% - -%% write data; -\end{verbatim} -\verbspace -\columnbreak -\begin{verbatim} -int main( int argc, char **argv ) -{ - int cs, res = 0; - if ( argc > 1 ) { - char *p = argv[1]; - char *pe = p + strlen(p) + 1; - %% write init; - %% write exec; - } - printf("result = %i\n", res ); - return 0; -} -\end{verbatim} -\verbspace -\end{multicols} -\caption{Parsing a command line argument. -} -\label{cmd-line-parsing} -\end{figure} - -\subsection{Naming Ragel Blocks} - -\begin{verbatim} -machine fsm_name; -\end{verbatim} -\verbspace - -The \verb|machine| statement gives the name of the FSM. If present in a -specification, this statement must appear first. If a machine specification -does not have a name then Ragel uses the previous specification name. If no -previous specification name exists then this is an error. Because FSM -specifications persist in memory, a machine's statements can be spread across -multiple machine specifications. This allows one to break up a machine across -several files or draw in statements that are common to multiple machines using -the \verb|include| statement. - -\subsection{Machine Definition} -\label{definition} - -\begin{verbatim} -<name> = <expression>; -\end{verbatim} -\verbspace - -The machine definition statement associates an FSM expression with a name. Machine -expressions assigned to names can later be referenced in other expressions. A -definition statement on its own does not cause any states to be generated. It is simply a -description of a machine to be used later. States are generated only when a definition is -instantiated, which happens when a definition is referenced in an instantiated -expression. - -\subsection{Machine Instantiation} -\label{instantiation} - -\begin{verbatim} -<name> := <expression>; -\end{verbatim} -\verbspace - -The machine instantiation statement generates a set of states representing an -expression. Each instantiation generates a distinct set of states. The starting -state of the instantiation is written in the data section of the generated code -using the instantiation name. If a machine named -\verb|main| is instantiated, its start state is used as the -specification's start state and is assigned to the \verb|cs| variable by the -\verb|write init| command. If no \verb|main| machine is given, the start state -of the last machine instantiation to appear is used as the specification's -start state. - -From outside the execution loop, control may be passed to any machine by -assigning the entry point to the \verb|cs| variable. From inside the execution -loop, control may be passed to any machine instantiation using \verb|fcall|, -\verb|fgoto| or \verb|fnext| statements. - -\subsection{Including Ragel Code} - -\begin{verbatim} -include FsmName "inputfile.rl"; -\end{verbatim} -\verbspace - -The \verb|include| statement can be used to draw in the statements of another FSM -specification. Both the name and input file are optional, however at least one -must be given. Without an FSM name, the given input file is searched for an FSM -of the same name as the current specification. Without an input file the -current file is searched for a machine of the given name. If both are present, -the given input file is searched for a machine of the given name. - -Ragel searches for included files from the location of the current file. -Additional directories can be added to the search path using the \verb|-I| -option. - -\subsection{Importing Definitions} -\label{import} - -\begin{verbatim} -import "inputfile.h"; -\end{verbatim} -\verbspace - -The \verb|import| statement scrapes a file for sequences of tokens that match -the following forms. Ragel treats these forms as state machine definitions. - -\noindent\hspace*{24pt}\verb|name '=' number|\\ -\noindent\hspace*{24pt}\verb|name '=' lit_string|\\ -\noindent\hspace*{24pt}\verb|'define' name number|\\ -\noindent\hspace*{24pt}\verb|'define' name lit_string| -\vspace{12pt} - -If the input file is a Ragel program then tokens inside any Ragel -specifications are ignored. See Section \ref{export} for a description of -exporting machine definitions. - -Ragel searches for imported files from the location of the current file. -Additional directories can be added to the search path using the \verb|-I| -option. - -\section{Lexical Analysis of a Ragel Block} -\label{lexing} - -Within a machine specification the following lexical rules apply to the input. - -\begin{itemize} - -\item The \verb|#| symbol begins a comment that terminates at the next newline. - -\item The symbols \verb|""|, \verb|''|, \verb|//|, \verb|[]| behave as the -delimiters of literal strings. Within them, the following escape sequences -are interpreted: - -\verb| \0 \a \b \t \n \v \f \r| - -A backslash at the end of a line joins the following line onto the current. A -backslash preceding any other character removes special meaning. This applies -to terminating characters and to special characters in regular expression -literals. As an exception, regular expression literals do not support escape -sequences as the operands of a range within a list. See the bullet on regular -expressions in Section \ref{basic}. - -\item The symbols \verb|{}| delimit a block of host language code that will be -embedded into the machine as an action. Within the block of host language -code, basic lexical analysis of comments and strings is done in order to -correctly find the closing brace of the block. With the exception of FSM -commands embedded in code blocks, the entire block is preserved as is for -identical reproduction in the output code. - -\item The pattern \verb|[+-]?[0-9]+| denotes an integer in decimal format. -Integers used for specifying machines may be negative only if the alphabet type -is signed. Integers used for specifying priorities may be positive or negative. - -\item The pattern \verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal -format. - -\item The keywords are \verb|access|, \verb|action|, \verb|alphtype|, -\verb|getkey|, \verb|write|, \verb|machine| and \verb|include|. - -\item The pattern \verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier. - - -\item Any amount of whitespace may separate tokens. - -\end{itemize} - - -\section{Basic Machines} -\label{basic} - -The basic machines are the base operands of regular language expressions. They -are the smallest unit to which machine construction and manipulation operators -can be applied. - -\begin{itemize} - -\item \verb|'hello'| -- Concatenation Literal. Produces a machine that matches -the sequence of characters in the quoted string. If there are 5 characters -there will be 6 states chained together with the characters in the string. See -Section \ref{lexing} for information on valid escape sequences. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmconcat} -\end{center} -\graphspace - -It is possible -to make a concatenation literal case-insensitive by appending an \verb|i| to -the string, for example \verb|'cmd'i|. - -\item \verb|"hello"| -- Identical to the single quoted version. - -\item \verb|[hello]| -- Or Expression. Produces a union of characters. There -will be two states with a transition for each unique character between the two states. -The \verb|[]| delimiters behave like the quotes of a literal string. For example, -\verb|[ \t]| means tab or space. The \verb|or| expression supports character ranges -with the \verb|-| symbol as a separator. The meaning of the union can be negated -using an initial \verb|^| character as in standard regular expressions. -See Section \ref{lexing} for information on valid escape sequences -in \verb|or| expressions. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmor} -\end{center} -\graphspace - -\item \verb|''|, \verb|""|, and \verb|[]| -- Zero Length Machine. Produces a machine -that matches the zero length string. Zero length machines have one state that is both -a start state and a final state. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmnull} -\end{center} -\graphspace - -% FIXME: More on the range of values here. -\item \verb|42| -- Numerical Literal. Produces a two state machine with one -transition on the given number. The number may be in decimal or hexadecimal -format and should be in the range allowed by the alphabet type. The minimum and -maximum values permitted are defined by the host machine that Ragel is compiled -on. For example, numbers in a \verb|short| alphabet on an i386 machine should -be in the range \verb|-32768| to \verb|32767|. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmnum} -\end{center} -\graphspace - -\item \verb|/simple_regex/| -- Regular Expression. Regular expressions are -parsed as a series of expressions that are concatenated together. Each -concatenated expression -may be a literal character, the ``any'' character specified by the \verb|.| -symbol, or a union of characters specified by the \verb|[]| delimiters. If the -first character of a union is \verb|^| then it matches any character not in the -list. Within a union, a range of characters can be given by separating the first -and last characters of the range with the \verb|-| symbol. Each -concatenated machine may have repetition specified by following it with the -\verb|*| symbol. The standard escape sequences described in Section -\ref{lexing} are supported everywhere in regular expressions except as the -operands of a range within in a list. This notation also supports the \verb|i| -trailing option. Use it to produce case-insensitive machines, as in \verb|/GET/i|. - -Ragel does not support very complex regular expressions because the desired -results can always be achieved using the more general machine construction -operators listed in Section \ref{machconst}. The following diagram shows the -result of compiling \verb|/ab*[c-z].*[123]/|. \verb|DEF| represents the default -transition, which is taken if no other transition can be taken. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmregex} -\end{center} -\graphspace - -\item \verb|'a' .. 'z'| -- Range. Produces a machine that matches any -characters in the specified range. Allowable upper and lower bounds of the -range are concatenation literals of length one and numerical literals. For -example, \verb|0x10..0x20|, \verb|0..63|, and \verb|'a'..'z'| are valid ranges. -The bounds should be in the range allowed by the alphabet type. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmrange} -\end{center} -\graphspace - -\item \verb|variable_name| -- Lookup the machine definition assigned to the -variable name given and use an instance of it. See Section \ref{definition} for -an important note on what it means to reference a variable name. - -\item \verb|builtin_machine| -- There are several built-in machines available -for use. They are all two state machines for the purpose of matching common -classes of characters. They are: - -\begin{itemize} - -\item \verb|any | -- Any character in the alphabet. - -\item \verb|ascii | -- Ascii characters. \verb|0..127| - -\item \verb|extend| -- Ascii extended characters. This is the range -\verb|-128..127| for signed alphabets and the range \verb|0..255| for unsigned -alphabets. - -\item \verb|alpha | -- Alphabetic characters. \verb|[A-Za-z]| - -\item \verb|digit | -- Digits. \verb|[0-9]| - -\item \verb|alnum | -- Alpha numerics. \verb|[0-9A-Za-z]| - -\item \verb|lower | -- Lowercase characters. \verb|[a-z]| - -\item \verb|upper | -- Uppercase characters. \verb|[A-Z]| - -\item \verb|xdigit| -- Hexadecimal digits. \verb|[0-9A-Fa-f]| - -\item \verb|cntrl | -- Control characters. \verb|0..31| - -\item \verb|graph | -- Graphical characters. \verb|[!-~]| - -\item \verb|print | -- Printable characters. \verb|[ -~]| - -\item \verb|punct | -- Punctuation. Graphical characters that are not alphanumerics. -\verb|[!-/:-@[-`{-~]| - -\item \verb|space | -- Whitespace. \verb|[\t\v\f\n\r ]| - -\item \verb|zlen | -- Zero length string. \verb|""| - -\item \verb|empty | -- Empty set. Matches nothing. \verb|^any| - -\end{itemize} -\end{itemize} - -\section{Operator Precedence} -The following table shows operator precedence from lowest to highest. Operators -in the same precedence group are evaluated from left to right. - -\begin{tabular}{|c|c|c|} -\hline -1&\verb| , |&Join\\ -\hline -2&\verb/ | & - --/&Union, Intersection and Subtraction\\ -\hline -3&\verb| . <: :> :>> |&Concatenation\\ -\hline -4&\verb| : |&Label\\ -\hline -5&\verb| -> |&Epsilon Transition\\ -\hline -6&\verb| > @ $ % |&Transitions Actions and Priorities\\ -\hline -6&\verb| >/ $/ %/ </ @/ <>/ |&EOF Actions\\ -\hline -6&\verb| >! $! %! <! @! <>! |&Global Error Actions\\ -\hline -6&\verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions\\ -\hline -6&\verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions\\ -\hline -6&\verb| >* $* %* <* @* <>* |&From-State Action\\ -\hline -7&\verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition\\ -\hline -8&\verb| ! ^ |&Negation and Character-Level Negation\\ -\hline -9&\verb| ( <expr> ) |&Grouping\\ -\hline -\end{tabular} - -\section{Regular Language Operators} -\label{machconst} - -When using Ragel it is helpful to have a sense of how it constructs machines. -The determinization process can produce results that seem unusual to someone -not familiar with the NFA to DFA conversion algorithm. In this section we -describe Ragel's state machine operators. Though the operators are defined -using epsilon transitions, it should be noted that this is for discussion only. -The epsilon transitions described in this section do not persist, but are -immediately removed by the determinization process which is executed at every -operation. Ragel does not make use of any nondeterministic intermediate state -machines. - -To create an epsilon transition between two states \verb|x| and \verb|y| is to -copy all of the properties of \verb|y| into \verb|x|. This involves drawing in -all of \verb|y|'s to-state actions, EOF actions, etc., in addition to its -transitions. If \verb|x| and \verb|y| both have a transition out on the same -character, then the transitions must be combined. During transition -combination a new transition is made that goes to a new state that is the -combination of both target states. The new combination state is created using -the same epsilon transition method. The new state has an epsilon transition -drawn to all the states that compose it. Since the creation of new epsilon -transitions may be triggered every time an epsilon transition is drawn, the -process of drawing epsilon transitions is repeated until there are no more -epsilon transitions to be made. - -A very common error that is made when using Ragel is to make machines that do -too much. That is, to create machines that have unintentional -nondetermistic properties. This usually results from being unaware of the common strings -between machines that are combined together using the regular language -operators. This can involve never leaving a machine, causing its actions to be -propagated through all the following states. Or it can involve an alternation -where both branches are unintentionally taken simultaneously. - -This problem forces one to think hard about the language that needs to be -matched. To guard against this kind of problem one must ensure that the machine -specification is divided up using boundaries that do not allow ambiguities from -one portion of the machine to the next. See Chapter -\ref{controlling-nondeterminism} for more on this problem and how to solve it. - -The Graphviz tool is an immense help when debugging improperly compiled -machines or otherwise learning how to use Ragel. Graphviz Dot files can be -generated from Ragel programs using the \verb|-V| option. See Section -\ref{visualization} for more information. - - -\subsection{Union} - -\verb/expr | expr/ - -The union operation produces a machine that matches any string in machine one -or machine two. The operation first creates a new start state. Epsilon -transitions are drawn from the new start state to the start states of both -input machines. The resulting machine has a final state set equivalent to the -union of the final state sets of both input machines. In this operation, there -is the opportunity for nondeterminism among both branches. If there are -strings, or prefixes of strings that are matched by both machines then the new -machine will follow both parts of the alternation at once. The union operation is -shown below. - -\graphspace -\begin{center} -\includegraphics[scale=1.0]{opor} -\end{center} -\graphspace - -The following example demonstrates the union of three machines representing -common tokens. - -% GENERATE: exor -% OPT: -p -% %%{ -% machine exor; -\begin{inline_code} -\begin{verbatim} -# Hex digits, decimal digits, or identifiers -main := '0x' xdigit+ | digit+ | alpha alnum*; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exor} -\end{center} -\graphspace - -\subsection{Intersection} - -\verb|expr & expr| - -Intersection produces a machine that matches any -string that is in both machine one and machine two. To achieve intersection, a -union is performed on the two machines. After the result has been made -deterministic, any final state that is not a combination of final states from -both machines has its final state status revoked. To complete the operation, -paths that do not lead to a final state are pruned from the machine. Therefore, -if there are any such paths in either of the expressions they will be removed -by the intersection operator. Intersection can be used to require that two -independent patterns be simultaneously satisfied as in the following example. - -% GENERATE: exinter -% OPT: -p -% %%{ -% machine exinter; -\begin{inline_code} -\begin{verbatim} -# Match lines four characters wide that contain -# words separated by whitespace. -main := - /[^\n][^\n][^\n][^\n]\n/* & - (/[a-z][a-z]*/ | [ \n])**; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exinter} -\end{center} -\graphspace - -\subsection{Difference} - -\verb|expr - expr| - -The difference operation produces a machine that matches -strings that are in machine one but are not in machine two. To achieve subtraction, -a union is performed on the two machines. After the result has been made -deterministic, any final state that came from machine two or is a combination -of states involving a final state from machine two has its final state status -revoked. As with intersection, the operation is completed by pruning any path -that does not lead to a final state. The following example demonstrates the -use of subtraction to exclude specific cases from a set. - -% GENERATE: exsubtr -% OPT: -p -% %%{ -% machine exsubtr; -\begin{inline_code} -\begin{verbatim} -# Subtract keywords from identifiers. -main := /[a-z][a-z]*/ - ( 'for' | 'int' ); -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exsubtr} -\end{center} -\graphspace - -\subsection{Strong Difference} -\label{strong_difference} - -\verb|expr -- expr| - -Strong difference produces a machine that matches any string of the first -machine that does not have any string of the second machine as a substring. In -the following example, strong subtraction is used to excluded \verb|CRLF| from -a sequence. In the corresponding visualization, the label \verb|DEF| is short -for default. The default transition is taken if no other transition can be -taken. - -% GENERATE: exstrongsubtr -% OPT: -p -% %%{ -% machine exstrongsubtr; -\begin{inline_code} -\begin{verbatim} -crlf = '\r\n'; -main := [a-z]+ ':' ( any* -- crlf ) crlf; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exstrongsubtr} -\end{center} -\graphspace - -This operator is equivalent to the following. - -\begin{verbatim} -expr - ( any* expr any* ) -\end{verbatim} -\verbspace - -\subsection{Concatenation} - -\verb|expr . expr| - -Concatenation produces a machine that matches all the strings in machine one followed by all -the strings in machine two. Concatenation draws epsilon transitions from the -final states of the first machine to the start state of the second machine. The -final states of the first machine lose their final state status, unless the -start state of the second machine is final as well. -Concatenation is the default operator. Two machines next to each other with no -operator between them results in concatenation. - -\graphspace -\begin{center} -\includegraphics[scale=1.0]{opconcat} -\end{center} -\graphspace - -The opportunity for nondeterministic behaviour results from the possibility of -the final states of the first machine accepting a string that is also accepted -by the start state of the second machine. -The most common scenario in which this happens is the -concatenation of a machine that repeats some pattern with a machine that gives -a terminating string, but the repetition machine does not exclude the -terminating string. The example in Section \ref{strong_difference} -guards against this. Another example is the expression \verb|("'" any* "'")|. -When executed the thread of control will -never leave the \verb|any*| machine. This is a problem especially if actions -are embedded to process the characters of the \verb|any*| component. - -In the following example, the first machine is always active due to the -nondeterministic nature of concatenation. This particular nondeterminism is intended -however because we wish to permit EOF strings before the end of the input. - -% GENERATE: exconcat -% OPT: -p -% %%{ -% machine exconcat; -\begin{inline_code} -\begin{verbatim} -# Require an eof marker on the last line. -main := /[^\n]*\n/* . 'EOF\n'; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exconcat} -\end{center} -\graphspace - -There is a language -ambiguity involving concatenation and subtraction. Because concatenation is the -default operator for two -adjacent machines there is an ambiguity between subtraction of -a positive numerical literal and concatenation of a negative numerical literal. -For example, \verb|(x-7)| could be interpreted as \verb|(x . -7)| or -\verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence -over concatenation of a negative literal. We adhere to the rule that the default -concatenation operator takes effect only when there are no other operators between -two machines. Beware of writing machines such as \verb|(any -1)| when what is -desired is a concatenation of \verb|any| and \verb|-1|. Instead write -\verb|(any . -1)| or \verb|(any (-1))|. If in doubt of the meaning of your program do not -rely on the default concatenation operator; always use the \verb|.| symbol. - - -\subsection{Kleene Star} - -\verb|expr*| - -The machine resulting from the Kleene Star operator will match zero or more -repetitions of the machine it is applied to. -It creates a new start state and an additional final -state. Epsilon transitions are drawn between the new start state and the old start -state, between the new start state and the new final state, and -between the final states of the machine and the new start state. After the -machine is made deterministic the effect is of the final states getting all the -transitions of the start state. - -\graphspace -\begin{center} -\includegraphics[scale=1.0]{opstar} -\end{center} -\graphspace - -The possibility for nondeterministic behaviour arises if the final states have -transitions on any of the same characters as the start state. This is common -when applying kleene star to an alternation of tokens. Like the other problems -arising from nondeterministic behavior, this is discussed in more detail in Chapter -\ref{controlling-nondeterminism}. This particular problem can also be solved -by using the longest-match construction discussed in Section -\ref{generating-scanners} on scanners. - -In this -example, there is no nondeterminism introduced by the exterior kleene star due to -the newline at the end of the regular expression. Without the newline the -exterior kleene star would be redundant and there would be ambiguity between -repeating the inner range of the regular expression and the entire regular -expression. Though it would not cause a problem in this case, unnecessary -nondeterminism in the kleene star operator often causes undesired results for -new Ragel users and must be guarded against. - -% GENERATE: exstar -% OPT: -p -% %%{ -% machine exstar; -\begin{inline_code} -\begin{verbatim} -# Match any number of lines with only lowercase letters. -main := /[a-z]*\n/*; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exstar} -\end{center} -\graphspace - -\subsection{One Or More Repetition} - -\verb|expr+| - -This operator produces the concatenation of the machine with the kleene star of -itself. The result will match one or more repetitions of the machine. The plus -operator is equivalent to \verb|(expr . expr*)|. - -% GENERATE: explus -% OPT: -p -% %%{ -% machine explus; -\begin{inline_code} -\begin{verbatim} -# Match alpha-numeric words. -main := alnum+; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{explus} -\end{center} -\graphspace - -\subsection{Optional} - -\verb|expr?| - -The {\em optional} operator produces a machine that accepts the machine -given or the zero length string. The optional operator is equivalent to -\verb/(expr | '' )/. In the following example the optional operator is used to -possibly extend a token. - -% GENERATE: exoption -% OPT: -p -% %%{ -% machine exoption; -\begin{inline_code} -\begin{verbatim} -# Match integers or floats. -main := digit+ ('.' digit+)?; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exoption} -\end{center} -\graphspace - -\subsection{Repetition} - -\noindent\hspace*{24pt}\verb|expr {n}| -- Exactly N copies of expr.\\ -\noindent\hspace*{24pt}\verb|expr {,n}| -- Zero to N copies of expr.\\ -\noindent\hspace*{24pt}\verb|expr {n,}| -- N or more copies of expr.\\ -\noindent\hspace*{24pt}\verb|expr {n,m}| -- N to M copies of expr. -\vspace{12pt} - -\subsection{Negation} - -\verb|!expr| - -Negation produces a machine that matches any string not matched by the given -machine. Negation is equivalent to \verb|(any* - expr)|. - -% GENERATE: exnegate -% OPT: -p -% %%{ -% machine exnegate; -\begin{inline_code} -\begin{verbatim} -# Accept anything but a string beginning with a digit. -main := ! ( digit any* ); -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exnegate} -\end{center} -\graphspace - -\subsection{Character-Level Negation} - -\verb|^expr| - -Character-level negation produces a machine that matches any single character -not matched by the given machine. Character-Level Negation is equivalent to -\verb|(any - expr)|. It must be applied only to machines that match strings of -length one. - -\section{State Machine Minimization} - -State machine minimization is the process of finding the minimal equivalent FSM accepting -the language. Minimization reduces the number of states in machines -by merging equivalent states. It does not change the behaviour of the machine -in any way. It will cause some states to be merged into one because they are -functionally equivalent. State minimization is on by default. It can be turned -off with the \verb|-n| option. - -The algorithm implemented is similar to Hopcroft's state minimization -algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in -memory, whereas Ragel supports arbitrary integer alphabets that cannot be -listed in memory. Though exact analysis is very difficult, Ragel minimization -runs close to O(n * log(n)) and requires O(n) temporary storage where -$n$ is the number of states. - -\section{Visualization} -\label{visualization} - -%In many cases, practical -%parsing programs will be too large to completely visualize with Graphviz. The -%proper approach is to reduce the language to the smallest subset possible that -%still exhibits the characteristics that one wishes to learn about or to fix. -%This can be done without modifying the source code using the \verb|-M| and -%\verb|-S| options. If a machine cannot be easily reduced, -%embeddings of unique actions can be very useful for tracing a -%particular component of a larger machine specification, since action names are -%written out on transition labels. - -Ragel is able to emit compiled state machines in Graphviz's Dot file format. -This is done using the \verb|-V| option. -Graphviz support allows users to perform -incremental visualization of their parsers. User actions are displayed on -transition labels of the graph. - -If the final graph is too large to be -meaningful, or even drawn, the user is able to inspect portions of the parser -by naming particular regular expression definitions with the \verb|-S| and -\verb|-M| options to the \verb|ragel| program. Use of Graphviz greatly -improves the Ragel programming experience. It allows users to learn Ragel by -experimentation and also to track down bugs caused by unintended -nondeterminism. - -Ragel has another option to help debugging. The \verb|-x| option causes Ragel -to emit the compiled machine in an XML format. - -\chapter{User Actions} - -Ragel permits the user to embed actions into the transitions of a regular -expression's corresponding state machine. These actions are executed when the -generated code moves over a transition. Like the regular expression operators, -the action embedding operators are fully compositional. They take a state -machine and an action as input, embed the action and yield a new state machine -that can be used in the construction of other machines. Due to the -compositional nature of embeddings, the user has complete freedom in the -placement of actions. - -A machine's transitions are categorized into four classes. The action embedding -operators access the transitions defined by these classes. The {\em entering -transition} operator \verb|>| isolates the start state, then embeds an action -into all transitions leaving it. The {\em finishing transition} operator -\verb|@| embeds an action into all transitions going into a final state. The -{\em all transition} operator \verb|$| embeds an action into all transitions of -an expression. The {\em leaving transition} operator \verb|%| provides access -to the yet-unmade transitions moving out of the machine via the final states. - -\section{Embedding Actions} - -\begin{verbatim} -action ActionName { - /* Code an action here. */ - count += 1; -} -\end{verbatim} -\verbspace - -The action statement defines a block of code that can be embedded into an FSM. -Action names can be referenced by the action embedding operators in -expressions. Though actions need not be named in this way (literal blocks -of code can be embedded directly when building machines), defining reusable -blocks of code whenever possible is good practice because it potentially increases the -degree to which the machine can be minimized. - -Within an action some Ragel expressions and statements are parsed and -translated. These allow the user to interact with the machine from action code. -See Section \ref{vals} for a complete list of statements and values available -in code blocks. - -\subsection{Entering Action} - -\verb|expr > action| - -The entering action operator embeds an action into all transitions -that enter into the machine from the start state. If the start state is final, -then the action is also embedded into the start state as a leaving action. This -means that if a machine accepts the zero-length string and control passes -through the start state then the entering action is executed. Note -that this can happen on both a following character and on the EOF event. - -In some machines the start state has transtions coming in from within the -machine. In these cases the start state is first isolated from the rest of the -machine ensuring that the entering actions are exected once only. - -% GENERATE: exstact -% OPT: -p -% %%{ -% machine exstact; -\begin{inline_code} -\begin{verbatim} -# Execute A at the beginning of a string of alpha. -action A {} -main := ( lower* >A ) . ' '; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exstact} -\end{center} -\graphspace - -\subsection{Finishing Action} - -\verb|expr @ action| - -The finishing action operator embeds an action into any transitions that move -the machine into a final state. Further input may move the machine out of the -final state, but keep it in the machine. Therefore finishing actions may be -executed more than once if a machine has any internal transitions out of a -final state. In the following example the final state has no transitions out -and the finishing action is executed only once. - -% GENERATE: exdoneact -% OPT: -p -% %%{ -% machine exdoneact; -% action A {} -\begin{inline_code} -\begin{verbatim} -# Execute A when the trailing space is seen. -main := ( lower* ' ' ) @A; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exdoneact} -\end{center} -\graphspace - -\subsection{All Transition Action} - -\verb|expr $ action| - -The all transition operator embeds an action into all transitions of a machine. -The action is executed whenever a transition of the machine is taken. In the -following example, A is executed on every character matched. - -% GENERATE: exallact -% OPT: -p -% %%{ -% machine exallact; -% action A {} -\begin{inline_code} -\begin{verbatim} -# Execute A on any characters of the machine. -main := ( 'm1' | 'm2' ) $A; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exallact} -\end{center} -\graphspace - -\subsection{Leaving Actions} -\label{out-actions} - -\verb|expr % action| - -The leaving action operator queues an action for embedding into the transitions -that go out of a machine via a final state. The action is first stored in -the machine's final states and is later transferred to any transitions that are -made going out of the machine by a kleene star or concatenation operation. - -If a final state of the machine is still final when compilation is complete -then the leaving action is also embedded as an EOF action. Therefore, leaving -the machine is defined as either leaving on a character or as state machine -acceptance. - -This operator allows one to associate an action with the termination of a -sequence, without being concerned about what particular character terminates -the sequence. In the following example, A is executed when leaving the alpha -machine on the newline character. - -% GENERATE: exoutact1 -% OPT: -p -% %%{ -% machine exoutact1; -% action A {} -\begin{inline_code} -\begin{verbatim} -# Match a word followed by a newline. Execute A when -# finishing the word. -main := ( lower+ %A ) . '\n'; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exoutact1} -\end{center} -\graphspace - -In the following example, the \verb|term_word| action could be used to register -the appearance of a word and to clear the buffer that the \verb|lower| action used -to store the text of it. - -% GENERATE: exoutact2 -% OPT: -p -% %%{ -% machine exoutact2; -% action lower {} -% action space {} -% action term_word {} -% action newline {} -\begin{inline_code} -\begin{verbatim} -word = ( [a-z] @lower )+ %term_word; -main := word ( ' ' @space word )* '\n' @newline; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exoutact2} -\end{center} -\graphspace - -In this final example of the action embedding operators, A is executed upon entering -the alpha machine, B is executed on all transitions of the -alpha machine, C is executed when the alpha machine is exited by moving into the -newline machine and N is executed when the newline machine moves into a final -state. - -% GENERATE: exaction -% OPT: -p -% %%{ -% machine exaction; -% action A {} -% action B {} -% action C {} -% action N {} -\begin{inline_code} -\begin{verbatim} -# Execute A on starting the alpha machine, B on every transition -# moving through it and C upon finishing. Execute N on the newline. -main := ( lower* >A $B %C ) . '\n' @N; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exaction} -\end{center} -\graphspace - - -\section{State Action Embedding Operators} - -The state embedding operators allow one to embed actions into states. Like the -transition embedding operators, there are several different classes of states -that the operators access. The meanings of the symbols are similar to the -meanings of the symbols used for the transition embedding operators. The design -of the state selections was driven by a need to cover the states of an -expression with exactly one error action. - -Unlike the transition embedding operators, the state embedding operators are -also distinguished by the different kinds of events that embedded actions can -be associated with. Therefore the state embedding operators have two -components. The first, which is the first one or two characters, specifies the -class of states that the action will be embedded into. The second component -specifies the type of event the action will be executed on. The symbols of the -second component also have equivalent kewords. - -\begin{multicols}{2} -The different classes of states are: - -\noindent\hspace*{24pt}\verb|> | -- the start state\\ -\noindent\hspace*{24pt}\verb|< | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$ | -- all states\\ -\noindent\hspace*{24pt}\verb|% | -- final states\\ -\noindent\hspace*{24pt}\verb|@ | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>| -- any except start and final (middle) -\vspace{12pt} - -\columnbreak - -The different kinds of embeddings are: - -\noindent\hspace*{24pt}\verb|~| -- to-state actions (\verb|to|)\\ -\noindent\hspace*{24pt}\verb|*| -- from-state actions (\verb|from|)\\ -\noindent\hspace*{24pt}\verb|/| -- EOF actions (\verb|eof|)\\ -\noindent\hspace*{24pt}\verb|!| -- error actions (\verb|err|)\\ -\noindent\hspace*{24pt}\verb|^| -- local error actions (\verb|lerr|) -\vspace{12pt} - -\end{multicols} - -\subsection{To-State and From-State Actions} - -\subsubsection{To-State Actions} - -\noindent\hspace*{24pt}\verb|>~action >to(name) >to{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|<~action <to(name) <to{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$~action $to(name) $to{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%~action %to(name) %to{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@~action @to(name) @to{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle) -\vspace{12pt} - - -To-state actions are executed whenever the state machine moves into the -specified state, either by a natural movement over a transition or by an -action-based transfer of control such as \verb|fgoto|. They are executed after the -in-transition's actions but before the current character is advanced and -tested against the end of the input block. To-state embeddings stay with the -state. They are irrespective of the state's current set of transitions and any -future transitions that may be added in or out of the state. - -Note that the setting of the current state variable \verb|cs| outside of the -execute code is not considered by Ragel as moving into a state and consequently -the to-state actions of the new current state are not executed. This includes -the initialization of the current state when the machine begins. This is -because the entry point into the machine execution code is after the execution -of to-state actions. - -\subsubsection{From-State Actions} - -\noindent\hspace*{24pt}\verb|>*action >from(name) >from{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|<*action <from(name) <from{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$*action $from(name) $from{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%*action %from(name) %from{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@*action @from(name) @from{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle) -\vspace{12pt} - -From-state actions are executed whenever the state machine takes a transition from a -state, either to itself or to some other state. These actions are executed -immediately after the current character is tested against the input block end -marker and before the transition to take is sought based on the current -character. From-state actions are therefore executed even if a transition -cannot be found and the machine moves into the error state. Like to-state -embeddings, from-state embeddings stay with the state. - -\subsection{EOF Actions} - -\noindent\hspace*{24pt}\verb|>/action >eof(name) >eof{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|</action <eof(name) <eof{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$/action $eof(name) $eof{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%/action %eof(name) %eof{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@/action @eof(name) @eof{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle) -\vspace{12pt} - -The EOF action embedding operators enable the user to embed actions that are -executed at the end of the input stream. EOF actions are stored in states and -generated in the \verb|write exec| block. They are run when \verb|p == pe == eof| -as the execute block is finishing. EOF actions are free to adjust \verb|p| and -jump to another part of the machine to restart execution. - -\subsection{Handling Errors} - -In many applications it is useful to be able to react to parsing errors. The -user may wish to print an error message that depends on the context. It -may also be desirable to consume input in an attempt to return the input stream -to some known state and resume parsing. To support error handling and recovery, -Ragel provides error action embedding operators. There are two kinds of error -actions: global error actions and local error actions. -Error actions can be used to simply report errors, or by jumping to a machine -instantiation that consumes input, can attempt to recover from errors. - -\subsubsection{Global Error Actions} - -\noindent\hspace*{24pt}\verb|>!action >err(name) >err{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|<!action <err(name) <err{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$!action $err(name) $err{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%!action %err(name) %err{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@!action @err(name) @err{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle) -\vspace{12pt} - -Global error actions are stored in the states they are embedded into until -compilation is complete. They are then transferred to the transitions that move -into the error state. These transitions are taken on all input characters that -are not already covered by the state's transitions. If a state with an error -action is not final when compilation is complete, then the action is also -embedded as an EOF action. - -Error actions can be used to recover from errors by jumping back into the -machine with \verb|fgoto| and optionally altering \verb|p|. - -\subsubsection{Local Error Actions} - -\noindent\hspace*{24pt}\verb|>^action >lerr(name) >lerr{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|<^action <lerr(name) <lerr{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$^action $lerr(name) $lerr{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%^action %lerr(name) %lerr{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@^action @lerr(name) @lerr{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle) -\vspace{12pt} - -Like global error actions, local error actions are also stored in the states -they are embedded into until a transfer point. The transfer point is different -however. Each local error action embedding is associated with a name. When a -machine definition has been fully constructed, all local error action -embeddings associated with the same name as the machine definition are -transferred to the error transitions. At this time they are also embedded as -EOF actions in the case of non-final states. - -Local error actions can be used to specify an action to take when a particular -section of a larger state machine fails to match. A particular machine -definition's ``thread'' may die and the local error actions executed, however -the machine as a whole may continue to match input. - -There are two forms of local error action embeddings. In the first form the -name defaults to the current machine. In the second form the machine name can -be specified. This is useful when it is more convenient to specify the local -error action in a sub-definition that is used to construct the machine -definition that the local error action is associated with. To embed local -error actions and -explicitly state the machine definition on which the transfer is to happen use -\verb|(name, action)| as the action. - -\subsubsection{Example} - -The following example uses error actions to report an error and jump to a -machine that consumes the remainder of the line when parsing fails. After -consuming the line, the error recovery machine returns to the main loop. - -% GENERATE: erract -% %%{ -% machine erract; -% ws = ' '; -% address = 'foo AT bar..com'; -% date = 'Monday May 12'; -\begin{inline_code} -\begin{verbatim} -action cmd_err { - printf( "command error\n" ); - fhold; fgoto line; -} -action from_err { - printf( "from error\n" ); - fhold; fgoto line; -} -action to_err { - printf( "to error\n" ); - fhold; fgoto line; -} - -line := [^\n]* '\n' @{ fgoto main; }; - -main := ( - ( - 'from' @err(cmd_err) - ( ws+ address ws+ date '\n' ) $err(from_err) | - 'to' @err(cmd_err) - ( ws+ address '\n' ) $err(to_err) - ) -)*; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% %% write data; -% void f() -% { -% %% write init; -% %% write exec; -% } -% END GENERATE - - - -\section{Action Ordering and Duplicates} - -When combining expressions that have embedded actions it is often the case that -a number of actions must be executed on a single input character. For example, -following a concatenation the leaving action of the left expression and the -entering action of the right expression will be embedded into one transition. -This requires a method of ordering actions that is intuitive and -predictable for the user, and repeatable for the compiler. - -We associate with the embedding of each action a unique timestamp that is -used to order actions that appear together on a single transition in the final -state machine. To accomplish this we recursively traverse the parse tree of -regular expressions and assign timestamps to action embeddings. References to -machine definitions are followed in the traversal. When we visit a -parse tree node we assign timestamps to all {\em entering} action embeddings, -recurse on the parse tree, then assign timestamps to the remaining {\em all}, -{\em finishing}, and {\em leaving} embeddings in the order in which they -appear. - -By default Ragel does not permit a single action to appear multiple times in an action -list. When the final machine has been created, actions that appear more than -once in a single transition, to-state, from-state or EOF action list have their -duplicates removed. -The first appearance of the action is preserved. This is useful in a number of -scenarios. First, it allows us to union machines with common prefixes without -worrying about the action embeddings in the prefix being duplicated. Second, it -prevents leaving actions from being transferred multiple times. This can -happen when a machine is repeated, then followed with another machine that -begins with a common character. For example: - -\begin{verbatim} -word = [a-z]+ %act; -main := word ( '\n' word )* '\n\n'; -\end{verbatim} -\verbspace - -Note that Ragel does not compare action bodies to determine if they have -identical program text. It simply checks for duplicates using each action -block's unique location in the program. - -The removal of duplicates can be turned off using the \verb|-d| option. - -\section{Values and Statements Available in Code Blocks} -\label{vals} - -The following values are available in code blocks: - -\begin{itemize} -\item \verb|fpc| -- A pointer to the current character. This is equivalent to -accessing the \verb|p| variable. - -\item \verb|fc| -- The current character. This is equivalent to the expression \verb|(*p)|. - -\item \verb|fcurs| -- An integer value representing the current state. This -value should only be read from. To move to a different place in the machine -from action code use the \verb|fgoto|, \verb|fnext| or \verb|fcall| statements. -Outside of the machine execution code the \verb|cs| variable may be modified. - -\item \verb|ftargs| -- An integer value representing the target state. This -value should only be read from. Again, \verb|fgoto|, \verb|fnext| and -\verb|fcall| can be used to move to a specific entry point. - -\item \verb|fentry(<label>)| -- Retrieve an integer value representing the -entry point \verb|label|. The integer value returned will be a compile time -constant. This number is suitable for later use in control flow transfer -statements that take an expression. This value should not be compared against -the current state because any given label can have multiple states representing -it. The value returned by \verb|fentry| can be any one of the multiple states that -it represents. -\end{itemize} - -The following statements are available in code blocks: - -\begin{itemize} - -\item \verb|fhold;| -- Do not advance over the current character. If processing -data in multiple buffer blocks, the \verb|fhold| statement should only be used -once in the set of actions executed on a character. Multiple calls may result -in backing up over the beginning of the buffer block. The \verb|fhold| -statement does not imply any transfer of control. It is equivalent to the -\verb|p--;| statement. - -\item \verb|fexec <expr>;| -- Set the next character to process. This can be -used to backtrack to previous input or advance ahead. -Unlike \verb|fhold|, which can be used -anywhere, \verb|fexec| requires the user to ensure that the target of the -backtrack is in the current buffer block or is known to be somewhere ahead of -it. The machine will continue iterating forward until \verb|pe| is arrived at, -\verb|fbreak| is called or the machine moves into the error state. In actions -embedded into transitions, the \verb|fexec| statement is equivalent to setting -\verb|p| to one position ahead of the next character to process. If the user -also modifies \verb|pe|, it is possible to change the buffer block entirely. - -\item \verb|fgoto <label>;| -- Jump to an entry point defined by -\verb|<label>|. The \verb|fgoto| statement immediately transfers control to -the destination state. - -\item \verb|fgoto *<expr>;| -- Jump to an entry point given by \verb|<expr>|. -The expression must evaluate to an integer value representing a state. - -\item \verb|fnext <label>;| -- Set the next state to be the entry point defined -by \verb|label|. The \verb|fnext| statement does not immediately jump to the -specified state. Any action code following the statement is executed. - -\item \verb|fnext *<expr>;| -- Set the next state to be the entry point given -by \verb|<expr>|. The expression must evaluate to an integer value representing -a state. - -\item \verb|fcall <label>;| -- Push the target state and jump to the entry -point defined by \verb|<label>|. The next \verb|fret| will jump to the target -of the transition on which the call was made. Use of \verb|fcall| requires -the declaration of a call stack. An array of integers named \verb|stack| and a -single integer named \verb|top| must be declared. With the \verb|fcall| -construct, control is immediately transferred to the destination state. -See section \ref{modularization} for more information. - -\item \verb|fcall *<expr>;| -- Push the current state and jump to the entry -point given by \verb|<expr>|. The expression must evaluate to an integer value -representing a state. - -\item \verb|fret;| -- Return to the target state of the transition on which the -last \verb|fcall| was made. Use of \verb|fret| requires the declaration of a -call stack. Control is immediately transferred to the destination state. - -\item \verb|fbreak;| -- Advance \verb|p|, save the target state to \verb|cs| -and immediately break out of the execute loop. This statement is useful -in conjunction with the \verb|noend| write option. Rather than process input -until \verb|pe| is arrived at, the fbreak statement -can be used to stop processing from an action. After an \verb|fbreak| -statement the \verb|p| variable will point to the next character in the input. The -current state will be the target of the current transition. Note that \verb|fbreak| -causes the target state's to-state actions to be skipped. - -\end{itemize} - -Once actions with control-flow commands are embedded into a -machine, the user must exercise caution when using the machine as the operand -to other machine construction operators. If an action jumps to another state -then unioning any transition that executes that action with another transition -that follows some other path will cause that other path to be lost. Using -commands that manually jump around a machine takes us out of the domain of -regular languages because transitions that the -machine construction operators are not aware of are introduced. These -commands should therefore be used with caution. - - -\chapter{Controlling Nondeterminism} -\label{controlling-nondeterminism} - -Along with the flexibility of arbitrary action embeddings comes a need to -control nondeterminism in regular expressions. If a regular expression is -ambiguous, then sub-components of a parser other than the intended parts may become -active. This means that actions that are irrelevant to the -current subset of the parser may be executed, causing problems for the -programmer. - -Tools that are based on regular expression engines and that are used for -recognition tasks will usually function as intended regardless of the presence -of ambiguities. It is quite common for users of scripting languages to write -regular expressions that are heavily ambiguous and it generally does not -matter. As long as one of the potential matches is recognized, there can be any -number of other matches present. In some parsing systems the run-time engine -can employ a strategy for resolving ambiguities, for example always pursuing -the longest possible match and discarding others. - -In Ragel, there is no regular expression run-time engine, just a simple state -machine execution model. When we begin to embed actions and face the -possibility of spurious action execution, it becomes clear that controlling -nondeterminism at the machine construction level is very important. Consider -the following example. - -% GENERATE: lines1 -% OPT: -p -% %%{ -% machine lines1; -% action first {} -% action tail {} -% word = [a-z]+; -\begin{inline_code} -\begin{verbatim} -ws = [\n\t ]; -line = word $first ( ws word $tail )* '\n'; -lines = line*; -\end{verbatim} -\end{inline_code} -\verbspace -% main := lines; -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.53]{lines1} -\end{center} -\graphspace - -Since the \verb|ws| expression includes the newline character, we will -not finish the \verb|line| expression when a newline character is seen. We will -simultaneously pursue the possibility of matching further words on the same -line and the possibility of matching a second line. Evidence of this fact is -in the state tables. On several transitions both the \verb|first| and -\verb|tail| actions are executed. The solution here is simple: exclude -the newline character from the \verb|ws| expression. - -% GENERATE: lines2 -% OPT: -p -% %%{ -% machine lines2; -% action first {} -% action tail {} -% word = [a-z]+; -\begin{inline_code} -\begin{verbatim} -ws = [\t ]; -line = word $first ( ws word $tail )* '\n'; -lines = line*; -\end{verbatim} -\end{inline_code} -\verbspace -% main := lines; -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{lines2} -\end{center} -\graphspace - -Solving this kind of problem is straightforward when the ambiguity is created -by strings that are a single character long. When the ambiguity is created by -strings that are multiple characters long we have a more difficult problem. -The following example is an incorrect attempt at a regular expression for C -language comments. - -% GENERATE: comments1 -% OPT: -p -% %%{ -% machine comments1; -% action comm {} -\begin{inline_code} -\begin{verbatim} -comment = '/*' ( any @comm )* '*/'; -main := comment ' '; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{comments1} -\end{center} -\graphspace - -Using standard concatenation, we will never leave the \verb|any*| expression. -We will forever entertain the possibility that a \verb|'*/'| string that we see -is contained in a longer comment and that, simultaneously, the comment has -ended. The concatenation of the \verb|comment| machine with \verb|SP| is done -to show this. When we match space, we are also still matching the comment body. - -One way to approach the problem is to exclude the terminating string -from the \verb|any*| expression using set difference. We must be careful to -exclude not just the terminating string, but any string that contains it as a -substring. A verbose, but proper specification of a C comment parser is given -by the following regular expression. - -% GENERATE: comments2 -% OPT: -p -% %%{ -% machine comments2; -% action comm {} -\begin{inline_code} -\begin{verbatim} -comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/'; -\end{verbatim} -\end{inline_code} -\verbspace -% main := comment; -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{comments2} -\end{center} -\graphspace - -Note that Ragel's strong subtraction operator \verb|--| can also be used here. -In doing this subtraction we have phrased the problem of controlling non-determinism in -terms of excluding strings common to two expressions that interact when -combined. -We can also phrase the problem in terms of the transitions of the state -machines that implement these expressions. During the concatenation of -\verb|any*| and \verb|'*/'| we will be making transitions that are composed of -both the loop of the first expression and the final character of the second. -At this time we want the transition on the \verb|'/'| character to take precedence -over and disallow the transition that originated in the \verb|any*| loop. - -In another parsing problem, we wish to implement a lightweight tokenizer that we can -utilize in the composition of a larger machine. For example, some HTTP headers -have a token stream as a sub-language. The following example is an attempt -at a regular expression-based tokenizer that does not function correctly due to -unintended nondeterminism. - -% GENERATE: smallscanner -% OPT: -p -% %%{ -% machine smallscanner; -% action start_str {} -% action on_char {} -% action finish_str {} -\begin{inline_code} -\begin{verbatim} -header_contents = ( - lower+ >start_str $on_char %finish_str | - ' ' -)*; -\end{verbatim} -\end{inline_code} -\verbspace -% main := header_contents; -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{smallscanner} -\end{center} -\graphspace - -In this case, the problem with using a standard kleene star operation is that -there is an ambiguity between extending a token and wrapping around the machine -to begin a new token. Using the standard operator, we get an undesirable -nondeterministic behaviour. Evidence of this can be seen on the transition out -of state one to itself. The transition extends the string, and simultaneously, -finishes the string only to immediately begin a new one. What is required is -for the -transitions that represent an extension of a token to take precedence over the -transitions that represent the beginning of a new token. For this problem -there is no simple solution that uses standard regular expression operators. - -\section{Priorities} - -A priority mechanism was devised and built into the determinization -process, specifically for the purpose of allowing the user to control -nondeterminism. Priorities are integer values embedded into transitions. When -the determinization process is combining transitions that have different -priorities, the transition with the higher priority is preserved and the -transition with the lower priority is dropped. - -Unfortunately, priorities can have unintended side effects because their -operation requires that they linger in transitions indefinitely. They must linger -because the Ragel program cannot know when the user is finished with a priority -embedding. A solution whereby they are explicitly deleted after use is -conceivable; however this is not very user-friendly. Priorities were therefore -made into named entities. Only priorities with the same name are allowed to -interact. This allows any number of priorities to coexist in one machine for -the purpose of controlling various different regular expression operations and -eliminates the need to ever delete them. Such a scheme allows the user to -choose a unique name, embed two different priority values using that name -and be confident that the priority embedding will be free of any side effects. - -In the first form of priority embedding the name defaults to the name of the machine -definition that the priority is assigned in. In this sense priorities are by -default local to the current machine definition or instantiation. Beware of -using this form in a longest-match machine, since there is only one name for -the entire set of longest match patterns. In the second form the priority's -name can be specified, allowing priority interaction across machine definition -boundaries. - -\begin{itemize} -\item \verb|expr > int| -- Sets starting transitions to have priority int. -\item \verb|expr @ int| -- Sets transitions that go into a final state to have priority int. -\item \verb|expr $ int| -- Sets all transitions to have priority int. -\item \verb|expr % int| -- Sets leaving transitions to -have priority int. When a transition is made going out of the machine (either -by concatenation or kleene star) its priority is immediately set to the -leaving priority. -\end{itemize} - -The second form of priority assignment allows the programmer to specify the name -to which the priority is assigned. - -\begin{itemize} -\item \verb|expr > (name, int)| -- Starting transitions. -\item \verb|expr @ (name, int)| -- Finishing transitions (into a final state). -\item \verb|expr $ (name, int)| -- All transitions. -\item \verb|expr % (name, int)| -- Leaving transitions. -\end{itemize} - -\section{Guarded Operators that Encapsulate Priorities} - -Priority embeddings are a very expressive mechanism. At the same time they -can be very confusing for the user. They force the user to imagine -the transitions inside two interacting expressions and work out the precise -effects of the operations between them. When we consider -that this problem is worsened by the -potential for side effects caused by unintended priority name collisions, we -see that exposing the user to priorities is undesirable. - -Fortunately, in practice the use of priorities has been necessary only in a -small number of scenarios. This allows us to encapsulate their functionality -into a small set of operators and fully hide them from the user. This is -advantageous from a language design point of view because it greatly simplifies -the design. - -Going back to the C comment example, we can now properly specify -it using a guarded concatenation operator which we call {\em finish-guarded -concatenation}. From the user's point of view, this operator terminates the -first machine when the second machine moves into a final state. It chooses a -unique name and uses it to embed a low priority into all -transitions of the first machine. A higher priority is then embedded into the -transitions of the second machine that enter into a final state. The following -example yields a machine identical to the example in Section -\ref{controlling-nondeterminism}. - -\begin{inline_code} -\begin{verbatim} -comment = '/*' ( any @comm )* :>> '*/'; -\end{verbatim} -\end{inline_code} -\verbspace - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{comments2} -\end{center} -\graphspace - -Another guarded operator is {\em left-guarded concatenation}, given by the -\verb|<:| compound symbol. This operator places a higher priority on all -transitions of the first machine. This is useful if one must forcibly separate -two lists that contain common elements. For example, one may need to tokenize a -stream, but first consume leading whitespace. - -Ragel also includes a {\em longest-match kleene star} operator, given by the -\verb|**| compound symbol. This -guarded operator embeds a high -priority into all transitions of the machine. -A lower priority is then embedded into the leaving transitions. When the -kleene star operator makes the epsilon transitions from -the final states into the new start state, the lower priority will be transferred -to the epsilon transitions. In cases where following an epsilon transition -out of a final state conflicts with an existing transition out of a final -state, the epsilon transition will be dropped. - -Other guarded operators are conceivable, such as guards on union that cause one -alternative to take precedence over another. These may be implemented when it -is clear they constitute a frequently used operation. -In the next section we discuss the explicit specification of state machines -using state charts. - -\subsection{Entry-Guarded Concatenation} - -\verb|expr :> expr| - -This operator concatenates two machines, but first assigns a low -priority to all transitions -of the first machine and a high priority to the starting transitions of the -second machine. This operator is useful if from the final states of the first -machine it is possible to accept the characters in the entering transitions of -the second machine. This operator effectively terminates the first machine -immediately upon starting the second machine, where otherwise they would be -pursued concurrently. In the following example, entry-guarded concatenation is -used to move out of a machine that matches everything at the first sign of an -end-of-input marker. - -% GENERATE: entryguard -% OPT: -p -% %%{ -% machine entryguard; -\begin{inline_code} -\begin{verbatim} -# Leave the catch-all machine on the first character of FIN. -main := any* :> 'FIN'; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{entryguard} -\end{center} -\graphspace - -Entry-guarded concatenation is equivalent to the following: - -\begin{verbatim} -expr $(unique_name,0) . expr >(unique_name,1) -\end{verbatim} -\verbspace - -\subsection{Finish-Guarded Concatenation} - -\verb|expr :>> expr| - -This operator is -like the previous operator, except the higher priority is placed on the final -transitions of the second machine. This is useful if one wishes to entertain -the possibility of continuing to match the first machine right up until the -second machine enters a final state. In other words it terminates the first -machine only when the second accepts. In the following example, finish-guarded -concatenation causes the move out of the machine that matches everything to be -delayed until the full end-of-input marker has been matched. - -% GENERATE: finguard -% OPT: -p -% %%{ -% machine finguard; -\begin{inline_code} -\begin{verbatim} -# Leave the catch-all machine on the last character of FIN. -main := any* :>> 'FIN'; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{finguard} -\end{center} -\graphspace - -Finish-guarded concatenation is equivalent to the following, with one -exception. If the right machine's start state is final, the higher priority is -also embedded into it as a leaving priority. This prevents the left machine -from persisting via the zero-length string. - -\begin{verbatim} -expr $(unique_name,0) . expr @(unique_name,1) -\end{verbatim} -\verbspace - -\subsection{Left-Guarded Concatenation} - -\verb|expr <: expr| - -This operator places -a higher priority on the left expression. It is useful if you want to prefix a -sequence with another sequence composed of some of the same characters. For -example, one can consume leading whitespace before tokenizing a sequence of -whitespace-separated words as in: - -% GENERATE: leftguard -% OPT: -p -% %%{ -% machine leftguard; -% action alpha {} -% action ws {} -% action start {} -% action fin {} -\begin{inline_code} -\begin{verbatim} -main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{leftguard} -\end{center} -\graphspace - -Left-guarded concatenation is equivalent to the following: - -\begin{verbatim} -expr $(unique_name,1) . expr >(unique_name,0) -\end{verbatim} -\verbspace - -\subsection{Longest-Match Kleene Star} -\label{longest_match_kleene_star} - -\verb|expr**| - -This version of kleene star puts a higher priority on staying in the -machine versus wrapping around and starting over. The LM kleene star is useful -when writing simple tokenizers. These machines are built by applying the -longest-match kleene star to an alternation of token patterns, as in the -following. - -% GENERATE: lmkleene -% OPT: -p -% %%{ -% machine exfinpri; -% action A {} -% action B {} -\begin{inline_code} -\begin{verbatim} -# Repeat tokens, but make sure to get the longest match. -main := ( - lower ( lower | digit )* %A | - digit+ %B | - ' ' -)**; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{lmkleene} -\end{center} -\graphspace - -If a regular kleene star were used the machine above would not be able to -distinguish between extending a word and beginning a new one. This operator is -equivalent to: - -\begin{verbatim} -( expr $(unique_name,1) %(unique_name,0) )* -\end{verbatim} -\verbspace - -When the kleene star is applied, transitions that go out of the machine and -back into it are made. These are assigned a priority of zero by the leaving -transition mechanism. This is less than the priority of one assigned to the -transitions leaving the final states but not leaving the machine. When -these transitions clash on the same character, the -transition that stays in the machine takes precedence. The transition -that wraps around is dropped. - -Note that this operator does not build a scanner in the traditional sense -because there is never any backtracking. To build a scanner with backtracking -use the Longest-Match machine construction described in Section -\ref{generating-scanners}. - -\chapter{Interface to Host Program} - -The Ragel code generator is very flexible. The generated code has no -dependencies and can be inserted in any function, perhaps inside a loop if -desired. The user is responsible for declaring and initializing a number of -required variables, including the current state and the pointer to the input -stream. These can live in any scope. Control of the input processing loop is -also possible: the user may break out of the processing loop and return to it -at any time. - -In the case of the C, D, and Go host languages, Ragel is able to generate very -fast-running code that implements state machines as directly executable code. -Since very large files strain the host language compiler, table-based code -generation is also supported. In the future we hope to provide a partitioned, -directly executable format that is able to reduce the burden on the host -compiler by splitting large machines across multiple functions. - -In the case of Java and Ruby, table-based code generation is the only code -style supported. In the future this may be expanded to include other code -styles. - -Ragel can be used to parse input in one block, or it can be used to parse input -in a sequence of blocks as it arrives from a file or socket. Parsing the input -in a sequence of blocks brings with it a few responsibilities. If the parser -utilizes a scanner, care must be taken to not break the input stream anywhere -but token boundaries. If pointers to the input stream are taken during -parsing, care must be taken to not use a pointer that has been invalidated by -movement to a subsequent block. If the current input data pointer is moved -backwards it must not be moved past the beginning of the current block. - -Figure \ref{basic-example} shows a simple Ragel program that does not have any -actions. The example tests the first argument of the program against a number -pattern and then prints the machine's acceptance status. - -\begin{figure} -\small -\begin{verbatim} -#include <stdio.h> -#include <string.h> -%%{ - machine foo; - write data; -}%% -int main( int argc, char **argv ) -{ - int cs; - if ( argc > 1 ) { - char *p = argv[1]; - char *pe = p + strlen( p ); - %%{ - main := [0-9]+ ( '.' [0-9]+ )?; - - write init; - write exec; - }%% - } - printf("result = %i\n", cs >= foo_first_final ); - return 0; -} -\end{verbatim} -\verbspace -\caption{A basic Ragel example without any actions. -} -\label{basic-example} -\end{figure} - -\section{Variables Used by Ragel} - -There are a number of variables that Ragel expects the user to declare. At a -very minimum the \verb|cs|, \verb|p| and \verb|pe| variables must be declared. -In Go, Java and Ruby code the \verb|data| variable must also be declared. If -EOF actions are used then the \verb|eof| variable is required. If -stack-based state machine control flow statements are used then the -\verb|stack| and \verb|top| variables are required. If a scanner is declared -then the \verb|act|, \verb|ts| and \verb|te| variables must be -declared. - -\begin{itemize} - -\item \verb|cs| - Current state. This must be an integer and it should persist -across invocations of the machine when the data is broken into blocks that are -processed independently. This variable may be modified from outside the -execution loop, but not from within. - -\item \verb|p| - Data pointer. In C/D code this variable is expected to be a -pointer to the character data to process. It should be initialized to the -beginning of the data block on every run of the machine. In Go, Java and Ruby it is -used as an offset to \verb|data| and must be an integer. In this case it should -be initialized to zero on every run of the machine. - -\item \verb|pe| - Data end pointer. This should be initialized to \verb|p| plus -the data length on every run of the machine. In Go, Java and Ruby code this should -be initialized to the data length. - -\item \verb|eof| - End of file pointer. This should be set to \verb|pe| when -the buffer block being processed is the last one, otherwise it should be set to -null. In Go, Java and Ruby code \verb|-1| must be used instead of null. If the EOF -event can be known only after the final buffer block has been processed, then -it is possible to set \verb|p = pe = eof| and run the execute block. - -\item \verb|data| - This variable is only required in Go, Java and Ruby code. It -must be an array containting the data to process. - -\item \verb|stack| - This must be an array of integers. It is used to store -integer values representing states. If the stack must resize dynamically the -Pre-push and Post-Pop statements can be used to do this (Sections -\ref{prepush} and \ref{postpop}). - -\item \verb|top| - This must be an integer value and will be used as an offset -to \verb|stack|, giving the next available spot on the top of the stack. - -\item \verb|act| - This must be an integer value. It is a variable sometimes -used by scanner code to keep track of the most recent successful pattern match. - -\item \verb|ts| - This must be a pointer to character data. In Go, Java and -Ruby code this must be an integer. See Section \ref{generating-scanners} for -more information. - -\item \verb|te| - Also a pointer to character data. - -\end{itemize} - -\section{Alphtype Statement} - -\begin{verbatim} -alphtype unsigned int; -\end{verbatim} -\verbspace - -The alphtype statement specifies the alphabet data type that the machine -operates on. During the compilation of the machine, integer literals are -expected to be in the range of possible values of the alphtype. The default -is \verb|char| for all languages except Go where the default is \verb|byte|. - -\begin{multicols}{2} -C/C++/Objective-C: -\begin{verbatim} - char unsigned char - short unsigned short - int unsigned int - long unsigned long -\end{verbatim} -\verbspace - -Go: -\begin{verbatim} - byte - int8 uint8 - int16 uint16 - int32 uint32 - int -\end{verbatim} -\verbspace - -Ruby: -\begin{verbatim} - char - int -\end{verbatim} -\verbspace - -\columnbreak - -Java: -\begin{verbatim} - char - byte - short - int -\end{verbatim} -\verbspace - -D: -\begin{verbatim} - char - byte ubyte - short ushort - wchar - int uint - dchar -\end{verbatim} -\verbspace - -\end{multicols} - -\section{Getkey Statement} - -\begin{verbatim} -getkey fpc->id; -\end{verbatim} -\verbspace - -This statement specifies to Ragel how to retrieve the current character from -from the pointer to the current element (\verb|p|). Any expression that returns -a value of the alphabet type -may be used. The getkey statement may be used for looking into element -structures or for translating the character to process. The getkey expression -defaults to \verb|(*p)|. In goto-driven machines the getkey expression may be -evaluated more than once per element processed, therefore it should not incur a -large cost nor preclude optimization. - -\section{Access Statement} - -\begin{verbatim} -access fsm->; -\end{verbatim} -\verbspace - -The access statement specifies how the generated code should -access the machine data that is persistent across processing buffer blocks. -This applies to all variables except \verb|p|, \verb|pe| and \verb|eof|. This includes -\verb|cs|, \verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|. -The access statement is useful if a machine is to be encapsulated inside a -structure in C code. It can be used to give the name of -a pointer to the structure. - -\section{Variable Statement} - -\begin{verbatim} -variable p fsm->p; -\end{verbatim} -\verbspace - -The variable statement specifies how to access a specific -variable. All of the variables that are declared by the user and -used by Ragel can be changed. This includes \verb|p|, \verb|pe|, \verb|eof|, \verb|cs|, -\verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|. -In Go, Ruby and Java code generation the \verb|data| variable can also be changed. - -\section{Pre-Push Statement} -\label{prepush} - -\begin{verbatim} -prepush { - /* stack growing code */ -} -\end{verbatim} -\verbspace - -The prepush statement allows the user to supply stack management code that is -written out during the generation of fcall, immediately before the current -state is pushed to the stack. This statement can be used to test the number of -available spaces and dynamically grow the stack if necessary. - -\section{Post-Pop Statement} -\label{postpop} - -\begin{verbatim} -postpop { - /* stack shrinking code */ -} -\end{verbatim} -\verbspace - -The postpop statement allows the user to supply stack management code that is -written out during the generation of fret, immediately after the next state is -popped from the stack. This statement can be used to dynamically shrink the -stack. - -\section{Write Statement} -\label{write-statement} - -\begin{verbatim} -write <component> [options]; -\end{verbatim} -\verbspace - -The write statement is used to generate parts of the machine. -There are seven -components that can be generated by a write statement. These components make up the -state machine's data, initialization code, execution code, and export definitions. -A write statement may appear before a machine is fully defined. -This allows one to write out the data first then later define the machine where -it is used. An example of this is shown in Figure \ref{fbreak-example}. - -\subsection{Write Data} -\begin{verbatim} -write data [options]; -\end{verbatim} -\verbspace - -The write data statement causes Ragel to emit the constant static data needed -by the machine. In table-driven output styles (see Section \ref{genout}) this -is a collection of arrays that represent the states and transitions of the -machine. In goto-driven machines much less data is emitted. At the very -minimum a start state \verb|name_start| is generated. All variables written -out in machine data have both the \verb|static| and \verb|const| properties and -are prefixed with the name of the machine and an -underscore. The data can be placed inside a class, inside a function, or it can -be defined as global data. - -Two variables are written that may be used to test the state of the machine -after a buffer block has been processed. The \verb|name_error| variable gives -the id of the state that the machine moves into when it cannot find a valid -transition to take. The machine immediately breaks out of the processing loop when -it finds itself in the error state. The error variable can be compared to the -current state to determine if the machine has failed to parse the input. If the -machine is complete, that is from every state there is a transition to a proper -state on every possible character of the alphabet, then no error state is required -and this variable will be set to -1. - -The \verb|name_first_final| variable stores the id of the first final state. -All of the machine's states are sorted by their final state status before -having their ids assigned. Checking if the machine has accepted its input can -then be done by checking if the current state is greater-than or equal to the -first final state. - -Data generation has several options: - -\noindent\hspace*{24pt}\verb|noerror | - Do not generate the integer variable that gives the id of the error state.\\ -\noindent\hspace*{24pt}\verb|nofinal | - Do not generate the integer variable that gives the id of the first final state.\\ -\noindent\hspace*{24pt}\verb|noprefix | - Do not prefix the variable names with the name of the machine. -\vspace{12pt} - -\begin{figure} -\small -\begin{verbatim} -#include <stdio.h> -%% machine foo; -%% write data; -int main( int argc, char **argv ) -{ - int cs, res = 0; - if ( argc > 1 ) { - char *p = argv[1]; - %%{ - main := - [a-z]+ - 0 @{ res = 1; fbreak; }; - write init; - write exec noend; - }%% - } - printf("execute = %i\n", res ); - return 0; -} -\end{verbatim} -\verbspace -\caption{Use of {\tt noend} write option and the {\tt fbreak} statement for -processing a string. -} -\label{fbreak-example} -\end{figure} - -\subsection{Write Start, First Final and Error} - -\begin{verbatim} -write start; -write first_final; -write error; -\end{verbatim} -\verbspace - -These three write statements provide an alternative means of accessing the -\verb|start|, \verb|first_final| and \verb|error| states. If there are many -different machine specifications in one file it is easy to get the prefix for -these wrong. This is especially true if the state machine boilerplate is -frequently made by a copy-paste-edit process. These write statements allow the -problem to be avoided. They can be used as follows: - -\begin{verbatim} -/* Did parsing succeed? */ -if ( cs < %%{ write first_final; }%% ) { - result = ERR_PARSE_ERROR; - goto fail; -} -\end{verbatim} -\verbspace - -\subsection{Write Init} -\begin{verbatim} -write init [options]; -\end{verbatim} -\verbspace - -The write init statement causes Ragel to emit initialization code. This should -be executed once before the machine is started. At a very minimum this sets the -current state to the start state. If other variables are needed by the -generated code, such as call stack variables or scanner management -variables, they are also initialized here. - -The \verb|nocs| option to the write init statement will cause ragel to skip -intialization of the cs variable. This is useful if the user wishes to use -custom logic to decide which state the specification should start in. - -\subsection{Write Exec} -\begin{verbatim} -write exec [options]; -\end{verbatim} -\verbspace - -The write exec statement causes Ragel to emit the state machine's execution code. -Ragel expects several variables to be available to this code. At a very minimum, the -generated code needs access to the current character position \verb|p|, the ending -position \verb|pe| and the current state \verb|cs| (though \verb|pe| -can be omitted using the \verb|noend| write option). -The \verb|p| variable is the cursor that the execute code will -used to traverse the input. The \verb|pe| variable should be set up to point to one -position past the last valid character in the buffer. - -Other variables are needed when certain features are used. For example using -the \verb|fcall| or \verb|fret| statements requires \verb|stack| and -\verb|top| variables to be defined. If a longest-match construction is used, -variables for managing backtracking are required. - -The write exec statement has one option. The \verb|noend| option tells Ragel -to generate code that ignores the end position \verb|pe|. In this -case the user must explicitly break out of the processing loop using -\verb|fbreak|, otherwise the machine will continue to process characters until -it moves into the error state. This option is useful if one wishes to process a -null terminated string. Rather than traverse the string to discover then length -before processing the input, the user can break out when the null character is -seen. The example in Figure \ref{fbreak-example} shows the use of the -\verb|noend| write option and the \verb|fbreak| statement for processing a string. - -\subsection{Write Exports} -\label{export} - -\begin{verbatim} -write exports; -\end{verbatim} -\verbspace - -The export feature can be used to export simple machine definitions. Machine definitions -are marked for export using the \verb|export| keyword. - -\begin{verbatim} -export machine_to_export = 0x44; -\end{verbatim} -\verbspace - -When the write exports statement is used these machines are -written out in the generated code. Defines are used for C and constant integers -are used for D, Java and Ruby. See Section \ref{import} for a description of the -import statement. - -\section{Maintaining Pointers to Input Data} - -In the creation of any parser it is not uncommon to require the collection of -the data being parsed. It is always possible to collect data into a growable -buffer as the machine moves over it, however the copying of data is a somewhat -wasteful use of processor cycles. The most efficient way to collect data from -the parser is to set pointers into the input then later reference them. This -poses a problem for uses of Ragel where the input data arrives in blocks, such -as over a socket or from a file. If a pointer is set in one buffer block but -must be used while parsing a following buffer block, some extra consideration -to correctness must be made. - -The scanner constructions exhibit this problem, requiring the maintenance -code described in Section \ref{generating-scanners}. If a longest-match -construction has been used somewhere in the machine then it is possible to -take advantage of the required prefix maintenance code in the driver program to -ensure pointers to the input are always valid. If laying down a pointer one can -set \verb|ts| at the same spot or ahead of it. When data is shifted in -between loops the user must also shift the pointer. In this way it is possible -to maintain pointers to the input that will always be consistent. - -\begin{figure} -\small -\begin{verbatim} - int have = 0; - while ( 1 ) { - char *p, *pe, *data = buf + have; - int len, space = BUFSIZE - have; - - if ( space == 0 ) { - fprintf(stderr, "BUFFER OUT OF SPACE\n"); - exit(1); - } - - len = fread( data, 1, space, stdin ); - if ( len == 0 ) - break; - - /* Find the last newline by searching backwards. */ - p = buf; - pe = data + len - 1; - while ( *pe != '\n' && pe >= buf ) - pe--; - pe += 1; - - %% write exec; - - /* How much is still in the buffer? */ - have = data + len - pe; - if ( have > 0 ) - memmove( buf, pe, have ); - - if ( len < space ) - break; - } -\end{verbatim} -\verbspace -\caption{An example of line-oriented processing. -} -\label{line-oriented} -\end{figure} - -In general, there are two approaches for guaranteeing the consistency of -pointers to input data. The first approach is the one just described; -lay down a marker from an action, -then later ensure that the data the marker points to is preserved ahead of -the buffer on the next execute invocation. This approach is good because it -allows the parser to decide on the pointer-use boundaries, which can be -arbitrarily complex parsing conditions. A downside is that it requires any -pointers that are set to be corrected in between execute invocations. - -The alternative is to find the pointer-use boundaries before invoking the execute -routine, then pass in the data using these boundaries. For example, if the -program must perform line-oriented processing, the user can scan backwards from -the end of an input block that has just been read in and process only up to the -first found newline. On the next input read, the new data is placed after the -partially read line and processing continues from the beginning of the line. -An example of line-oriented processing is given in Figure \ref{line-oriented}. - -\section{Specifying the Host Language} - -The \verb|ragel| program has a number of options for specifying the host -language. The host-language options are: - -\begin{itemize} -\item \verb|-C | for C/C++/Objective-C code (default) -\item \verb|-D | for D code. -\item \verb|-Z | for Go code. -\item \verb|-J | for Java code. -\item \verb|-R | for Ruby code. -\item \verb|-A | for C\# code. -\end{itemize} - -\section{Choosing a Generated Code Style} -\label{genout} - -There are three styles of code output to choose from. Code style affects the -size and speed of the compiled binary. Changing code style does not require any -change to the Ragel program. There are two table-driven formats and a goto -driven format. - -In addition to choosing a style to emit, there are various levels of action -code reuse to choose from. The maximum reuse levels (\verb|-T0|, \verb|-F0| -and \verb|-G0|) ensure that no FSM action code is ever duplicated by encoding -each transition's action list as static data and iterating -through the lists on every transition. This will normally result in a smaller -binary. The less action reuse options (\verb|-T1|, \verb|-F1| and \verb|-G1|) -will usually produce faster running code by expanding each transition's action -list into a single block of code, eliminating the need to iterate through the -lists. This duplicates action code instead of generating the logic necessary -for reuse. Consequently the binary will be larger. However, this tradeoff applies to -machines with moderate to dense action lists only. If a machine's transitions -frequently have less than two actions then the less reuse options will actually -produce both a smaller and a faster running binary due to less action sharing -overhead. The best way to choose the appropriate code style for your -application is to perform your own tests. - -The table-driven FSM represents the state machine as constant static data. There are -tables of states, transitions, indices and actions. The current state is -stored in a variable. The execution is simply a loop that looks up the current -state, looks up the transition to take, executes any actions and moves to the -target state. In general, the table-driven FSM can handle any machine, produces -a smaller binary and requires a less expensive host language compile, but -results in slower running code. Since the table-driven format is the most -flexible it is the default code style. - -The flat table-driven machine is a table-based machine that is optimized for -small alphabets. Where the regular table machine uses the current character as -the key in a binary search for the transition to take, the flat table machine -uses the current character as an index into an array of transitions. This is -faster in general, however is only suitable if the span of possible characters -is small. - -The goto-driven FSM represents the state machine using goto and switch -statements. The execution is a flat code block where the transition to take is -computed using switch statements and directly executable binary searches. In -general, the goto FSM produces faster code but results in a larger binary and a -more expensive host language compile. - -The goto-driven format has an additional action reuse level (\verb|-G2|) that -writes actions directly into the state transitioning logic rather than putting -all the actions together into a single switch. Generally this produces faster -running code because it allows the machine to encode the current state using -the processor's instruction pointer. Again, sparse machines may actually -compile to smaller binaries when \verb|-G2| is used due to less state and -action management overhead. For many parsing applications \verb|-G2| is the -preferred output format. - -\begin{center} - -Code Output Style Options - -\begin{tabular}{|c|c|c|} -\hline -\verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#\\ -\hline -\verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#\\ -\hline -\verb|-F0|&flat table-driven&C/D/Ruby/C\#\\ -\hline -\verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#\\ -\hline -\verb|-G0|&goto-driven&C/D/C\#\\ -\hline -\verb|-G1|&goto, expanded actions&C/D/C\#\\ -\hline -\verb|-G2|&goto, in-place actions&C/D/Go\\ -\hline -\end{tabular} -\end{center} - -\chapter{Beyond the Basic Model} - -\section{Parser Modularization} -\label{modularization} - -It is possible to use Ragel's machine construction and action embedding -operators to specify an entire parser using a single regular expression. In -many cases this is the desired way to specify a parser in Ragel. However, in -some scenarios the language to parse may be so large that it is difficult to -think about it as a single regular expression. It may also shift between distinct -parsing strategies, in which case modularization into several coherent blocks -of the language may be appropriate. - -It may also be the case that patterns that compile to a large number of states -must be used in a number of different contexts and referencing them in each -context results in a very large state machine. In this case, an ability to reuse -parsers would reduce code size. - -To address this, distinct regular expressions may be instantiated and linked -together by means of a jumping and calling mechanism. This mechanism is -analogous to the jumping to and calling of processor instructions. A jump -command, given in action code, causes control to be immediately passed to -another portion of the machine by way of setting the current state variable. A -call command causes the target state of the current transition to be pushed to -a state stack before control is transferred. Later on, the original location -may be returned to with a return statement. In the following example, distinct -state machines are used to handle the parsing of two types of headers. - -% GENERATE: call -% %%{ -% machine call; -\begin{inline_code} -\begin{verbatim} -action return { fret; } -action call_date { fcall date; } -action call_name { fcall name; } - -# A parser for date strings. -date := [0-9][0-9] '/' - [0-9][0-9] '/' - [0-9][0-9][0-9][0-9] '\n' @return; - -# A parser for name strings. -name := ( [a-zA-Z]+ | ' ' )** '\n' @return; - -# The main parser. -headers = - ( 'from' | 'to' ) ':' @call_name | - ( 'departed' | 'arrived' ) ':' @call_date; - -main := headers*; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% %% write data; -% void f() -% { -% %% write init; -% %% write exec; -% } -% END GENERATE - -Calling and jumping should be used carefully as they are operations that take -one out of the domain of regular languages. A machine that contains a call or -jump statement in one of its actions should be used as an argument to a machine -construction operator only with considerable care. Since DFA transitions may -actually represent several NFA transitions, a call or jump embedded in one -machine can inadvertently terminate another machine that it shares prefixes -with. Despite this danger, theses statements have proven useful for tying -together sub-parsers of a language into a parser for the full language, -especially for the purpose of modularizing code and reducing the number of -states when the machine contains frequently recurring patterns. - -Section \ref{vals} describes the jump and call statements that are used to -transfer control. These statements make use of two variables that must be -declared by the user, \verb|stack| and \verb|top|. The \verb|stack| variable -must be an array of integers and \verb|top| must be a single integer, which -will point to the next available space in \verb|stack|. Sections \ref{prepush} -and \ref{postpop} describe the Pre-Push and Post-Pop statements which can be -used to implement a dynamically resizable array. - -\section{Referencing Names} -\label{labels} - -This section describes how to reference names in epsilon transitions (Section -\ref{state-charts}) and -action-based control-flow statements such as \verb|fgoto|. There is a hierarchy -of names implied in a Ragel specification. At the top level are the machine -instantiations. Beneath the instantiations are labels and references to machine -definitions. Beneath those are more labels and references to definitions, and -so on. - -Any name reference may contain multiple components separated with the \verb|::| -compound symbol. The search for the first component of a name reference is -rooted at the join expression that the epsilon transition or action embedding -is contained in. If the name reference is not contained in a join, -the search is rooted at the machine definition that the epsilon transition or -action embedding is contained in. Each component after the first is searched -for beginning at the location in the name tree that the previous reference -component refers to. - -In the case of action-based references, if the action is embedded more than -once, the local search is performed for each embedding and the result is the -union of all the searches. If no result is found for action-based references then -the search is repeated at the root of the name tree. Any action-based name -search may be forced into a strictly global search by prefixing the name -reference with \verb|::|. - -The final component of the name reference must resolve to a unique entry point. -If a name is unique in the entire name tree it can be referenced as is. If it -is not unique it can be specified by qualifying it with names above it in the -name tree. However, it can always be renamed. - -% FIXME: Should fit this in somewhere. -% Some kinds of name references are illegal. Cannot call into longest-match -% machine, can only call its start state. Cannot make a call to anywhere from -% any part of a longest-match machine except a rule's action. This would result -% in an eventual return to some point inside a longest-match other than the -% start state. This is banned for the same reason a call into the LM machine is -% banned. - - -\section{Scanners} -\label{generating-scanners} - -Scanners are very much intertwined with regular-languages and their -corresponding processors. For this reason Ragel supports the definition of -scanners. The generated code will repeatedly attempt to match patterns from a -list, favouring longer patterns over shorter patterns. In the case of -equal-length matches, the generated code will favour patterns that appear ahead -of others. When a scanner makes a match it executes the user code associated -with the match, consumes the input then resumes scanning. - -\begin{verbatim} -<machine_name> := |* - pattern1 => action1; - pattern2 => action2; - ... - *|; -\end{verbatim} -\verbspace - -On the surface, Ragel scanners are similar to those defined by Lex. Though -there is a key distinguishing feature: patterns may be arbitrary Ragel -expressions and can therefore contain embedded code. With a Ragel-based scanner -the user need not wait until the end of a pattern before user code can be -executed. - -Scanners can be used to process sub-languages, as well as for tokenizing -programming languages. In the following example a scanner is used to tokenize -the contents of a header field. - -\begin{inline_code} -\begin{verbatim} -word = [a-z]+; -head_name = 'Header'; - -header := |* - word; - ' '; - '\n' => { fret; }; -*|; - -main := ( head_name ':' @{ fcall header; } )*; -\end{verbatim} -\end{inline_code} -\verbspace - -The scanner construction has a purpose similar to the longest-match kleene star -operator \verb|**|. The key -difference is that a scanner is able to backtrack to match a previously matched -shorter string when the pursuit of a longer string fails. For this reason the -scanner construction operator is not a pure state machine construction -operator. It relies on several variables that enable it to backtrack and make -pointers to the matched input text available to the user. For this reason -scanners must be immediately instantiated. They cannot be defined inline or -referenced by another expression. Scanners must be jumped to or called. - -Scanners rely on the \verb|ts|, \verb|te| and \verb|act| -variables to be present so that they can backtrack and make pointers to the -matched text available to the user. If input is processed using multiple calls -to the execute code then the user must ensure that when a token is only -partially matched that the prefix is preserved on the subsequent invocation of -the execute code. - -The \verb|ts| variable must be defined as a pointer to the input data. -It is used for recording where the current token match begins. This variable -may be used in action code for retrieving the text of the current match. Ragel -ensures that in between tokens and outside of the longest-match machines that -this pointer is set to null. In between calls to the execute code the user must -check if \verb|ts| is set and if so, ensure that the data it points to is -preserved ahead of the next buffer block. This is described in more detail -below. - -The \verb|te| variable must also be defined as a pointer to the input data. -It is used for recording where a match ends and where scanning of the next -token should begin. This can also be used in action code for retrieving the -text of the current match. - -The \verb|act| variable must be defined as an integer type. It is used for -recording the identity of the last pattern matched when the scanner must go -past a matched pattern in an attempt to make a longer match. If the longer -match fails it may need to consult the \verb|act| variable. In some cases, use -of the \verb|act| -variable can be avoided because the value of the current state is enough -information to determine which token to accept, however in other cases this is -not enough and so the \verb|act| variable is used. - -When the longest-match operator is in use, the user's driver code must take on -some buffer management functions. The following algorithm gives an overview of -the steps that should be taken to properly use the longest-match operator. - -\begin{itemize} -\item Read a block of input data. -\item Run the execute code. -\item If \verb|ts| is set, the execute code will expect the incomplete -token to be preserved ahead of the buffer on the next invocation of the execute -code. -\begin{itemize} -\item Shift the data beginning at \verb|ts| and ending at \verb|pe| to the -beginning of the input buffer. -\item Reset \verb|ts| to the beginning of the buffer. -\item Shift \verb|te| by the distance from the old value of \verb|ts| -to the new value. The \verb|te| variable may or may not be valid. There is -no way to know if it holds a meaningful value because it is not kept at null -when it is not in use. It can be shifted regardless. -\end{itemize} -\item Read another block of data into the buffer, immediately following any -preserved data. -\item Run the scanner on the new data. -\end{itemize} - -Figure \ref{preserve_example} shows the required handling of an input stream in -which a token is broken by the input block boundaries. After processing up to -and including the ``t'' of ``characters'', the prefix of the string token must be -retained and processing should resume at the ``e'' on the next iteration of -the execute code. - -If one uses a large input buffer for collecting input then the number of times -the shifting must be done will be small. Furthermore, if one takes care not to -define tokens that are allowed to be very long and instead processes these -items using pure state machines or sub-scanners, then only a small amount of -data will ever need to be shifted. - -\begin{figure} -\small -\begin{verbatim} - a) A stream "of characters" to be scanned. - | | | - p ts pe - - b) "of characters" to be scanned. - | | | - ts p pe -\end{verbatim} -\verbspace -\caption{Following an invocation of the execute code there may be a partially -matched token (a). The data of the partially matched token -must be preserved ahead of the new data on the next invocation (b). -} -\label{preserve_example} -\end{figure} - -Since scanners attempt to make the longest possible match of input, patterns -such as identifiers require one character of lookahead in order to trigger a -match. In the case of the last token in the input stream the user must ensure -that the \verb|eof| variable is set so that the final token is flushed out. - -An example scanner processing loop is given in Figure \ref{scanner-loop}. - -\begin{figure} -\small -\begin{verbatim} - int have = 0; - bool done = false; - while ( !done ) { - /* How much space is in the buffer? */ - int space = BUFSIZE - have; - if ( space == 0 ) { - /* Buffer is full. */ - cerr << "TOKEN TOO BIG" << endl; - exit(1); - } - - /* Read in a block after any data we already have. */ - char *p = inbuf + have; - cin.read( p, space ); - int len = cin.gcount(); - - char *pe = p + len; - char *eof = 0; - - /* If no data was read indicate EOF. */ - if ( len == 0 ) { - eof = pe; - done = true; - } - - %% write exec; - - if ( cs == Scanner_error ) { - /* Machine failed before finding a token. */ - cerr << "PARSE ERROR" << endl; - exit(1); - } - - if ( ts == 0 ) - have = 0; - else { - /* There is a prefix to preserve, shift it over. */ - have = pe - ts; - memmove( inbuf, ts, have ); - te = inbuf + (te-ts); - ts = inbuf; - } - } -\end{verbatim} -\verbspace -\caption{A processing loop for a scanner. -} -\label{scanner-loop} -\end{figure} - -\section{State Charts} -\label{state-charts} - -In addition to supporting the construction of state machines using regular -languages, Ragel provides a way to manually specify state machines using -state charts. The comma operator combines machines together without any -implied transitions. The user can then manually link machines by specifying -epsilon transitions with the \verb|->| operator. Epsilon transitions are drawn -between the final states of a machine and entry points defined by labels. This -makes it possible to build machines using the explicit state-chart method while -making minimal changes to the Ragel language. - -An interesting feature of Ragel's state chart construction method is that it -can be mixed freely with regular expression constructions. A state chart may be -referenced from within a regular expression, or a regular expression may be -used in the definition of a state chart transition. - -\subsection{Join} - -\verb|expr , expr , ...| - -Join a list of machines together without -drawing any transitions, without setting up a start state, and without -designating any final states. Transitions between the machines may be specified -using labels and epsilon transitions. The start state must be explicity -specified with the ``start'' label. Final states may be specified with an -epsilon transition to the implicitly created ``final'' state. The join -operation allows one to build machines using a state chart model. - -\subsection{Label} - -\verb|label: expr| - -Attaches a label to an expression. Labels can be -used as the target of epsilon transitions and explicit control transfer -statements such as \verb|fgoto| and \verb|fnext| in action -code. - -\subsection{Epsilon} - -\verb|expr -> label| - -Draws an epsilon transition to the state defined -by \verb|label|. Epsilon transitions are made deterministic when join -operators are evaluated. Epsilon transitions that are not in a join operation -are made deterministic when the machine definition that contains the epsilon is -complete. See Section \ref{labels} for information on referencing labels. - -\subsection{Simplifying State Charts} - -There are two benefits to providing state charts in Ragel. The first is that it -allows us to take a state chart with a full listing of states and transitions -and simplify it in selective places using regular expressions. - -The state chart method of specifying parsers is very common. It is an -effective programming technique for producing robust code. The key disadvantage -becomes clear when one attempts to comprehend a large parser specified in this -way. These programs usually require many lines, causing logic to be spread out -over large distances in the source file. Remembering the function of a large -number of states can be difficult and organizing the parser in a sensible way -requires discipline because branches and repetition present many file layout -options. This kind of programming takes a specification with inherent -structure such as looping, alternation and concatenation and expresses it in a -flat form. - -If we could take an isolated component of a manually programmed state chart, -that is, a subset of states that has only one entry point, and implement it -using regular language operators then we could eliminate all the explicit -naming of the states contained in it. By eliminating explicitly named states -and replacing them with higher-level specifications we simplify a state machine -specification. - -For example, sometimes chains of states are needed, with only a small number of -possible characters appearing along the chain. These can easily be replaced -with a concatenation of characters. Sometimes a group of common states -implement a loop back to another single portion of the machine. Rather than -manually duplicate all the transitions that loop back, we may be able to -express the loop using a kleene star operator. - -Ragel allows one to take this state map simplification approach. We can build -state machines using a state map model and implement portions of the state map -using regular languages. In place of any transition in the state machine, -entire sub-machines can be given. These can encapsulate functionality -defined elsewhere. An important aspect of the Ragel approach is that when we -wrap up a collection of states using a regular expression we do not lose -access to the states and transitions. We can still execute code on the -transitions that we have encapsulated. - -\subsection{Dropping Down One Level of Abstraction} -\label{down} - -The second benefit of incorporating state charts into Ragel is that it permits -us to bypass the regular language abstraction if we need to. Ragel's action -embedding operators are sometimes insufficient for expressing certain parsing -tasks. In the same way that is useful for C language programmers to drop down -to assembly language programming using embedded assembler, it is sometimes -useful for the Ragel programmer to drop down to programming with state charts. - -In the following example, we wish to buffer the characters of an XML CDATA -sequence. The sequence is terminated by the string \verb|]]>|. The challenge -in our application is that we do not wish the terminating characters to be -buffered. An expression of the form \verb|any* @buffer :>> ']]>'| will not work -because the buffer will always contain the characters \verb|]]| on the end. -Instead, what we need is to delay the buffering of \verb|]| -characters until a time when we -abandon the terminating sequence and go back into the main loop. There is no -easy way to express this using Ragel's regular expression and action embedding -operators, and so an ability to drop down to the state chart method is useful. - -% GENERATE: dropdown -% OPT: -p -% %%{ -% machine dropdown; -\begin{inline_code} -\begin{verbatim} -action bchar { buff( fpc ); } # Buffer the current character. -action bbrack1 { buff( "]" ); } -action bbrack2 { buff( "]]" ); } - -CDATA_body = -start: ( - ']' -> one | - (any-']') @bchar ->start -), -one: ( - ']' -> two | - [^\]] @bbrack1 @bchar ->start -), -two: ( - '>' -> final | - ']' @bbrack1 -> two | - [^>\]] @bbrack2 @bchar ->start -); -\end{verbatim} -\end{inline_code} -\verbspace -% main := CDATA_body; -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{dropdown} -\end{center} -\graphspace - - -\section{Semantic Conditions} -\label{semantic} - -Many communication protocols contain variable-length fields, where the length -of the field is given ahead of the field as a value. This -problem cannot be expressed using regular languages because of its -context-dependent nature. The prevalence of variable-length fields in -communication protocols motivated us to introduce semantic conditions into -the Ragel language. - -A semantic condition is a block of user code that is interpreted as an -expression and evaluated immediately -before a transition is taken. If the code returns a value of true, the -transition may be taken. We can now embed code that extracts the length of a -field, then proceed to match $n$ data values. - -% GENERATE: conds1 -% OPT: -p -% %%{ -% machine conds1; -% number = digit+; -\begin{inline_code} -\begin{verbatim} -action rec_num { i = 0; n = getnumber(); } -action test_len { i++ < n } -data_fields = ( - 'd' - [0-9]+ %rec_num - ':' - ( [a-z] when test_len )* -)**; -\end{verbatim} -\end{inline_code} -\verbspace -% main := data_fields; -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{conds1} -\end{center} -\graphspace - -The Ragel implementation of semantic conditions does not force us to give up the -compositional property of Ragel definitions. For example, a machine that tests -the length of a field using conditions can be unioned with another machine -that accepts some of the same strings, without the two machines interfering with -one another. The user need not be concerned about whether or not the result of the -semantic condition will affect the matching of the second machine. - -To see this, first consider that when a user associates a condition with an -existing transition, the transition's label is translated from the base character -to its corresponding value in the space that represents ``condition $c$ true''. Should -the determinization process combine a state that has a conditional transition -with another state that has a transition on the same input character but -without a condition, then the condition-less transition first has its label -translated into two values, one to its corresponding value in the space that -represents ``condition $c$ true'' and another to its corresponding value in the -space that represents ``condition $c$ false''. It -is then safe to combine the two transitions. This is shown in the following -example. Two intersecting patterns are unioned, one with a condition and one -without. The condition embedded in the first pattern does not affect the second -pattern. - -% GENERATE: conds2 -% OPT: -p -% %%{ -% machine conds2; -% number = digit+; -\begin{inline_code} -\begin{verbatim} -action test_len { i++ < n } -action one { /* accept pattern one */ } -action two { /* accept pattern two */ } -patterns = - ( [a-z] when test_len )+ %one | - [a-z][a-z0-9]* %two; -main := patterns '\n'; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{conds2} -\end{center} -\graphspace - -There are many more potential uses for semantic conditions. The user is free to -use arbitrary code and may therefore perform actions such as looking up names -in dictionaries, validating input using external parsing mechanisms or -performing checks on the semantic structure of input seen so far. In the next -section we describe how Ragel accommodates several common parser engineering -problems. - -The semantic condition feature works only with alphabet types that are smaller -in width than the \verb|long| type. To implement semantic conditions Ragel -needs to be able to allocate characters from the alphabet space. Ragel uses -these allocated characters to express "character C with condition P true" or "C -with P false." Since internally Ragel uses longs to store characters there is -no room left in the alphabet space unless an alphabet type smaller than long is -used. - -\section{Implementing Lookahead} - -There are a few strategies for implementing lookahead in Ragel programs. -Leaving actions, which are described in Section \ref{out-actions}, can be -used as a form of lookahead. Ragel also provides the \verb|fhold| directive -which can be used in actions to prevent the machine from advancing over the -current character. It is also possible to manually adjust the current character -position by shifting it backwards using \verb|fexec|, however when this is -done, care must be taken not to overstep the beginning of the current buffer -block. In both the use of \verb|fhold| and \verb|fexec| the user must be -cautious of combining the resulting machine with another in such a way that the -transition on which the current position is adjusted is not combined with a -transition from the other machine. - -\section{Parsing Recursive Language Structures} - -In general Ragel cannot handle recursive structures because the grammar is -interpreted as a regular language. However, depending on what needs to be -parsed it is sometimes practical to implement the recursive parts using manual -coding techniques. This often works in cases where the recursive structures are -simple and easy to recognize, such as in the balancing of parentheses - -One approach to parsing recursive structures is to use actions that increment -and decrement counters or otherwise recognize the entry to and exit from -recursive structures and then jump to the appropriate machine defnition using -\verb|fcall| and \verb|fret|. Alternatively, semantic conditions can be used to -test counter variables. - -A more traditional approach is to call a separate parsing function (expressed -in the host language) when a recursive structure is entered, then later return -when the end is recognized. - -\end{document} diff --git a/test/repeat2.in b/test/repeat2.in deleted file mode 100644 index 7ebfdda3..00000000 --- a/test/repeat2.in +++ /dev/null @@ -1,3307 +0,0 @@ -.title Ragel State Machine Compiler - -.subtitle User Guide - -.author Adrian Thurston - -.license - -This document is part of Ragel, and as such, this document is -released under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 2 of the License, or (at your option) -any later version. - -Ragel is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -details. - -You should have received a copy of the GNU General Public -License along with Ragel; if not, write to the Free Software Foundation, Inc., -59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -.chapter Introduction - -.section Abstract - -Regular expressions are used heavily in practice for the purpose of specifying -parsers. They are normally used as black boxes linked together with program -logic. User actions are executed in between invocations of the regular -expression engine. Adding actions before a pattern terminates requires patterns -to be broken and pasted back together with program logic. The more user actions -are needed, the less the advantages of regular expressions are seen. - -Ragel is a software development tool that allows user actions to be -embedded into the transitions of a regular expression's corresponding state -machine, eliminating the need to switch from the regular expression engine and -user code execution environment and back again. As a result, expressions can be -maximally continuous. One is free to specify an entire parser using a single -regular expression. The single-expression model affords concise and elegant -descriptions of languages and the generation of very simple, fast and robust -code. Ragel compiles executable finite state machines from a high level regular language -notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby. - -In addition to building state machines from regular expressions, Ragel allows -the programmer to directly specify state machines with state charts. These two -notations may be freely combined. There are also facilities for controlling -nondeterminism in the resulting machines and building scanners using patterns -that themselves have embedded actions. Ragel can produce code that is small and -runs very fast. Ragel can handle integer-sized alphabets and can compile very -large state machines. - -.section Motivation - -When a programmer is faced with the task of producing a parser for a -context-free language there are many tools to choose from. It is quite common -to generate useful and efficient parsers for programming languages from a -formal grammar. It is also quite common for programmers to avoid such tools -when making parsers for simple computer languages, such as file formats and -communication protocols. Such languages are often regular and tools for -processing the context-free languages are viewed as too heavyweight for the -purpose of parsing regular languages. The extra run-time effort required for -supporting the recursive nature of context-free languages is wasted. - -When we turn to the regular expression-based parsing tools, such as Lex, Re2C, -and scripting languages such as Sed, Awk and Perl we find that they are split -into two levels: a regular expression matching engine and some kind of program -logic for linking patterns together. For example, a Lex program is composed of -sets of regular expressions. The implied program logic repeatedly attempts to -match a pattern in the current set. When a match is found the associated user -code executed. It requires the user to consider a language as a sequence of -independent tokens. Scripting languages and regular expression libraries allow -one to link patterns together using arbitrary program code. This is very -flexible and powerful, however we can be more concise and clear if we avoid -gluing together regular expressions with if statements and while loops. - -This model of execution, where the runtime alternates between regular -expression matching and user code exectution places restrictions on when -action code may be executed. Since action code can only be associated with -complete patterns, any action code that must be executed before an entire -pattern is matched requires that the pattern be broken into smaller units. -Instead of being forced to disrupt the regular expression syntax and write -smaller expressions, it is desirable to retain a single expression and embed -code for performing actions directly into the transitions that move over the -characters. After all, capable programmers are astutely aware of the machinery -underlying their programs, so why not provide them with access to that -machinery? To achieve this we require an action execution model for associating -code with the sub-expressions of a regular expression in a way that does not -disrupt its syntax. - -The primary goal of Ragel is to provide developers with an ability to embed -actions into the transitions and states of a regular expression's state machine -in support of the definition of entire parsers or large sections of parsers -using a single regular expression. From the regular expression we gain a clear -and concise statement of our language. From the state machine we obtain a very -fast and robust executable that lends itself to many kinds of analysis and -visualization. - -.section Overview - -Ragel is a language for specifying state machines. The Ragel program is a -compiler that assembles a state machine definition to executable code. Ragel -is based on the principle that any regular language can be converted to a -deterministic finite state automaton. Since every regular language has a state -machine representation and vice versa, the terms regular language and state -machine (or just machine) will be used interchangeably in this document. - -Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is -designed to be generic and is not bound to any particular input or processing -method. A Ragel machine expects to have data passed to it in buffer blocks. -When there is no more input, the machine can be queried for acceptance. In -this way, a Ragel machine can be used to simply recognize a regular language -like a regular expression library. By embedding code into the regular language, -a Ragel machine can also be used to parse input. - -The Ragel language has many operators for constructing and manipulating -machines. Machines are built up from smaller machines, to bigger ones, to the -final machine representing the language that needs to be recognized or parsed. - -The core state machine construction operators are those found in most theory -of computation textbooks. They date back to the 1950s and are widely studied. -They are based on set operations and permit one to think of languages as a set -of strings. They are Union, Intersection, Difference, Concatenation and Kleene -Star. Put together, these operators make up what most people know as regular -expressions. Ragel also provides a scanner construction operator -and provides operators for explicitly constructing machines -using a state chart method. In the state chart method, one joins machines -together without any implied transitions and then explicitly specifies where -epsilon transitions should be drawn. - -The state machine manipulation operators are specific to Ragel. They allow the -programmer to access the states and transitions of regular language's -corresponding machine. There are two uses of the manipulation operators. The -first and primary use is to embed code into transitions and states, allowing -the programmer to specify the actions of the state machine. - -Ragel attempts to make the action embedding facility as intuitive as possible. -To do so, a number of issues need to be addressed. For example, when making a -nondeterministic specification into a DFA using machines that have embedded -actions, new transitions are often made that have the combined actions of -several source transitions. Ragel ensures that multiple actions associated with -a single transition are ordered consistently with respect to the order of -reference and the natural ordering implied by the construction operators. - -The second use of the manipulation operators is to assign priorities to -transitions. Priorities provide a convenient way of controlling any -nondeterminism introduced by the construction operators. Suppose two -transitions leave from the same state and go to distinct target states on the -same character. If these transitions are assigned conflicting priorities, then -during the determinization process the transition with the higher priority will -take precedence over the transition with the lower priority. The lower priority -transition gets abandoned. The transitions would otherwise be combined into a new -transition that goes to a new state that is a combination of the original -target states. Priorities are often required for segmenting machines. The most -common uses of priorities have been encoded into a set of simple operators -that should be used instead of priority embeddings whenever possible. - -For the purposes of embedding, Ragel divides transitions and states into -different classes. There are four operators for embedding actions and -priorities into the transitions of a state machine. It is possible to embed -into entering transitions, finishing transitions, all transitions and leaving -transitions. The embedding into leaving transitions is a special case. -These transition embeddings get stored in the final states of a machine. They -are transferred to any transitions that are made going out of the machine by -future concatenation or kleene star operations. - -There are several more operators for embedding actions into states. Like the -transition embeddings, there are various different classes of states that the -embedding operators access. For example, one can access start states, final -states or all states, among others. Unlike the transition embeddings, there are -several different types of state action embeddings. These are executed at -various different times during the processing of input. It is possible to embed -actions that are exectued on transitions into a state, on transitions out of a -state, on transitions taken on the error event, or on transitions taken on the -EOF event. - -Within actions, it is possible to influence the behaviour of the state machine. -The user can write action code that jumps or calls to another portion of the -machine, changes the current character being processed, or breaks out of the -processing loop. With the state machine calling feature Ragel can be used to -parse languages that are not regular. For example, one can parse balanced -parentheses by calling into a parser when an open parenthesis character is seen -and returning to the state on the top of the stack when the corresponding -closing parenthesis character is seen. More complicated context-free languages -such as expressions in C are out of the scope of Ragel. - -Ragel also provides a scanner construction operator that can be used to build -scanners much the same way that Lex is used. The Ragel generated code, which -relies on user-defined variables for backtracking, repeatedly tries to match -patterns to the input, favouring longer patterns over shorter ones and patterns -that appear ahead of others when the lengths of the possible matches are -identical. When a pattern is matched the associated action is executed. - -The key distinguishing feature between scanners in Ragel and scanners in Lex is -that Ragel patterns may be arbitrary Ragel expressions and can therefore -contain embedded code. With a Ragel-based scanner the user need not wait until -the end of a pattern before user code can be executed. - -Scanners do take Ragel out of the domain of pure state machines and require the -user to maintain the backtracking related variables. However, scanners -integrate well with regular state machine instantiations. They can be called to -or jumped to only when needed, or they can be called out of or jumped out of -when a simpler, pure state machine model is appropriate. - -Two types of output code style are available. Ragel can produce a table-driven -machine or a directly executable machine. The directly executable machine is -much faster than the table-driven. On the other hand, the table-driven machine -is more compact and less demanding on the host language compiler. It is better -suited to compiling large state machines. - -.section Related Work - -Lex is perhaps the best-known tool for constructing parsers from regular -expressions. In the Lex processing model, generated code attempts to match one -of the user's regular expression patterns, favouring longer matches over -shorter ones. Once a match is made it then executes the code associated with -the pattern and consumes the matching string. This process is repeated until -the input is fully consumed. - -Through the use of start conditions, related sets of patterns may be defined. -The active set may be changed at any time. This allows the user to define -different lexical regions. It also allows the user to link patterns together by -requiring that some patterns come before others. This is quite like a -concatenation operation. However, use of Lex for languages that require a -considerable amount of pattern concatenation is inappropriate. In such cases a -Lex program deteriorates into a manually specified state machine, where start -conditions define the states and pattern actions define the transitions. Lex -is therefore best suited to parsing tasks where the language to be parsed can -be described in terms of regions of tokens. - -Lex is useful in many scenarios and has undoubtedly stood the test of time. -There are, however, several drawbacks to using Lex. Lex can impose too much -overhead for parsing applications where buffering is not required because all -the characters are available in a single string. In these cases there is -structure to the language to be parsed and a parser specification tool can -help, but employing a heavyweight processing loop that imposes a stream -``pull'' model and dynamic input buffer allocation is inappropriate. An -example of this kind of scenario is the conversion of floating point numbers -contained in a string to their corresponding numerical values. - -Another drawback is the very issue that Ragel attempts to solve. -It is not possible to execute a user action while -matching a character contained inside a pattern. For example, if scanning a -programming language and string literals can contain newlines which must be -counted, a Lex user must break up a string literal pattern so as to associate -an action with newlines. This forces the definition of a new start condition. -Alternatively the user can reprocess the text of the matched string literal to -count newlines. - -.comment - -How ragel is different from Lex. - -Like Re2c, Ragel provides a simple execution model that does not make any -assumptions as to how the input is collected. Also, Ragel does not do any -buffering in the generated code. Consequently there are no dependencies on -external functions such as .verb|malloc|. - -If buffering is required it can be manually implemented by embedding actions -that copy the current character to a buffer, or data can be passed to the -parser using known block boundaries. If the longest-match operator is used, -Ragel requires the user to ensure that the ending portion of the input buffer -is preserved when the buffer is exhaused before a token is fully matched. The -user should move the token prefix to a new memory location, such as back to the -beginning of the input buffer, then place the subsequently read input -immediately after the prefix. - -These properties of Ragel make it more work to write a program that requires -the longest-match operator or buffering of input, however they make Ragel a -more flexible tool that can produce very simple and fast-running programs under -a variety of input acquisition arrangements. - -In Ragel, it is not necessary -to introduce start conditions to concatenate tokens and retain action -execution. Ragel allows one to structure a parser as a series of tokens, but -does not require it. - -Like Lex and Re2C, Ragel is able to process input using a longest-match -execution model, however the core of the Ragel language specifies parsers at a -much lower level. This core is built around a pure state machine model. When -building basic machines there is no implied algorithm for processing input -other than to move from state to state on the transitions of the machine. This -core of pure state machine operations makes Ragel well suited to handling -parsing problems not based on token scanning. Should one need to use a -longest-match model, the functionality is available and the lower level state -machine construction facilities can be used to specify the patterns of a -longest-match machine. - -This is not possible in Ragel. One can only program -a longest-match instantiation with a fixed set of rules. One can jump to -another longest-match machine that employs the same machine definitions in the -construction of its rules, however no states will be shared. - -In Ragel, input may be re-parsed using a -different machine, but since the action to be executed is associated with -transitions of the compiled state machine, the longest-match construction does -not permit a single rule to be excluded from the active set. It cannot be done -ahead of time nor in the excluded rule's action. - -.end comment - -The Re2C program defines an input processing model similar to that of Lex. -Re2C focuses on making generated state machines run very fast and -integrate easily into any program, free of dependencies. Re2C generates -directly executable code and is able to claim that generated parsers run nearly -as fast as their hand-coded equivalents. This is very important for user -adoption, as programmers are reluctant to use a tool when a faster alternative -exists. A consideration to ease of use is also important because developers -need the freedom to integrate the generated code as they see fit. - -Many scripting languages provide ways of composing parsers by linking regular -expressions using program logic. For example, Sed and Awk are two established -Unix scripting tools that allow the programmer to exploit regular expressions -for the purpose of locating and extracting text of interest. High-level -programming languages such as Perl, Python, PHP and Ruby all provide regular -expression libraries that allow the user to combine regular expressions with -arbitrary code. - -In addition to supporting the linking of regular expressions with arbitrary -program logic, the Perl programming language permits the embedding of code into -regular expressions. Perl embeddings do not translate into the embedding of -code into deterministic state machines. Perl regular expressions are in fact -not fully compiled to deterministic machines when embedded code is involved. -They are instead interpreted and involve backtracking. This is shown by the -following Perl program. When it is fed the input .verb|abcd| the interpretor -attempts to match the first alternative, printing .verb|a1 b1|. When this -possibility fails it backtracks and tries the second possibility, printing -.verb|a2 b2|, at which point it succeeds. - -.code -print "YES\n" if ( <STDIN> =~ - /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) | - ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x ) -.end code - -In Ragel there is no regular expression interpretor. Aside from the scanner -operator, all Ragel expressions are made into deterministic machines and the -run time simply moves from state to state as it consumes input. An equivalent -parser expressed in Ragel would attempt both of the alternatives concurrently, -printing .verb|a1 a2 b1 b2|. - -.section Development Status - -Ragel is a relatively new tool and is under continuous development. As a rough -release guide, minor revision number changes are for implementation -improvements and feature additions. Major revision number changes are for -implementation and language changes that do not preserve backwards -compatibility. Though in the past this has not always held true: changes that -break code have crept into minor version number changes. Typically, the -documentation lags behind the development in the interest of documenting only -the lasting features. The latest changes are always documented in the ChangeLog -file. - -.chapter Constructing State Machines - -.section Ragel State Machine Specifications - -A Ragel input file consists of a program in the host language that contains embedded machine -specifications. Ragel normally passes input straight to output. When it sees -a machine specification it stops to read the Ragel statements and possibly generate -code in place of the specification. -Afterwards it continues to pass input through. There -can be any number of FSM specifications in an input file. A multi-line FSM spec -starts with .verb|%%{| and ends with .verb|}%%|. A single-line FSM spec starts -with .verb|%%| and ends at the first newline. - -While Ragel is looking for FSM specifications it does basic lexical analysis on -the surrounding input. It interprets literal strings and comments so a -.verb|%%| sequence in either of those will not trigger the parsing of an FSM -specification. Ragel does not pass the input through any preprocessor nor does it -interpret preprocessor directives itself so includes, defines and ifdef logic -cannot be used to alter the parse of a Ragel input file. It is therefore not -possible to use an .verb|#if 0| directive to comment out a machine as is -commonly done in C code. As an alternative, a machine can be prevented from -causing any generated output by commenting out write statements. - -In Figure .ref{cmd-line-parsing}, a multi-line specification is used to define the -machine and single line specifications are used to trigger the writing of the machine -data and execution code. - -.figure cmd-line-parsing -.multicols -.verbatim -#include <string.h> -#include <stdio.h> - -%%{ - machine foo; - main := - ( 'foo' | 'bar' ) - 0 @{ res = 1; }; -}%% - -%% write data; -.end verbatim -.columnbreak -.verbatim -int main( int argc, char **argv ) -{ - int cs, res = 0; - if ( argc > 1 ) { - char *p = argv[1]; - char *pe = p + strlen(p) + 1; - %% write init; - %% write exec; - } - printf("result = %i\n", res ); - return 0; -} -.end verbatim -.end multicols -.caption Parsing a command line argument. -.end figure - -.subsection Naming Ragel Blocks - -.verbatim -machine fsm_name; -.end verbatim - -The .verb|machine| statement gives the name of the FSM. If present in a -specification, this statement must appear first. If a machine specification -does not have a name then Ragel uses the previous specification name. If no -previous specification name exists then this is an error. Because FSM -specifications persist in memory, a machine's statements can be spread across -multiple machine specifications. This allows one to break up a machine across -several files or draw in statements that are common to multiple machines using -the .verb|include| statement. - -.subsection Machine Definition -.label{definition} - -.verbatim -<name> = <expression>; -.end verbatim - -The machine definition statement associates an FSM expression with a name. Machine -expressions assigned to names can later be referenced in other expressions. A -definition statement on its own does not cause any states to be generated. It is simply a -description of a machine to be used later. States are generated only when a definition is -instantiated, which happens when a definition is referenced in an instantiated -expression. - -.subsection Machine Instantiation -.label{instantiation} - -.verbatim -<name> := <expression>; -.end verbatim - -The machine instantiation statement generates a set of states representing an -expression. Each instantiation generates a distinct set of states. The starting -state of the instantiation is written in the data section of the generated code -using the instantiation name. If a machine named -.verb|main| is instantiated, its start state is used as the -specification's start state and is assigned to the .verb|cs| variable by the -.verb|write init| command. If no .verb|main| machine is given, the start state -of the last machine instantiation to appear is used as the specification's -start state. - -From outside the execution loop, control may be passed to any machine by -assigning the entry point to the .verb|cs| variable. From inside the execution -loop, control may be passed to any machine instantiation using .verb|fcall|, -.verb|fgoto| or .verb|fnext| statements. - -.subsection Including Ragel Code - -.verbatim -include FsmName "inputfile.rl"; -.end verbatim - -The .verb|include| statement can be used to draw in the statements of another FSM -specification. Both the name and input file are optional, however at least one -must be given. Without an FSM name, the given input file is searched for an FSM -of the same name as the current specification. Without an input file the -current file is searched for a machine of the given name. If both are present, -the given input file is searched for a machine of the given name. - -Ragel searches for included files from the location of the current file. -Additional directories can be added to the search path using the .verb|-I| -option. - -.subsection Importing Definitions -.label{import} - -.verbatim -import "inputfile.h"; -.end verbatim - -The .verb|import| statement scrapes a file for sequences of tokens that match -the following forms. Ragel treats these forms as state machine definitions. - -.list -.li .verb|name '=' number| -.li .verb|name '=' lit_string| -.li .verb|'define' name number| -.li .verb|'define' name lit_string| -.end list - -If the input file is a Ragel program then tokens inside any Ragel -specifications are ignored. See Section .ref{export} for a description of -exporting machine definitions. - -Ragel searches for imported files from the location of the current file. -Additional directories can be added to the search path using the .verb|-I| -option. - -.section Lexical Analysis of a Ragel Block -.label{lexing} - -Within a machine specification the following lexical rules apply to the input. - -.itemize - -.item The .verb|#| symbol begins a comment that terminates at the next newline. - -.item The symbols .verb|""|, .verb|''|, .verb|//|, .verb|[]| behave as the -delimiters of literal strings. Within them, the following escape sequences -are interpreted: - -.verb| \0 \a \b \t \n \v \f \r| - -A backslash at the end of a line joins the following line onto the current. A -backslash preceding any other character removes special meaning. This applies -to terminating characters and to special characters in regular expression -literals. As an exception, regular expression literals do not support escape -sequences as the operands of a range within a list. See the bullet on regular -expressions in Section .ref{basic}. - -.item The symbols .verb|{}| delimit a block of host language code that will be -embedded into the machine as an action. Within the block of host language -code, basic lexical analysis of comments and strings is done in order to -correctly find the closing brace of the block. With the exception of FSM -commands embedded in code blocks, the entire block is preserved as is for -identical reproduction in the output code. - -.item The pattern .verb|[+-]?[0-9]+| denotes an integer in decimal format. -Integers used for specifying machines may be negative only if the alphabet type -is signed. Integers used for specifying priorities may be positive or negative. - -.item The pattern .verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal -format. - -.item The keywords are .verb|access|, .verb|action|, .verb|alphtype|, -.verb|getkey|, .verb|write|, .verb|machine| and .verb|include|. - -.item The pattern .verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier. - -.comment -.item The allowable symbols are: - -.verb/ ( ) ! ^ * ? + : -> - | & . , := = ; > @ $ % /\\ -.verb| >/ $/ %/ </ @/ <>/ >! $! %! <! @! <>!|\\ -.verb| >^ $^ %^ <^ @^ <>^ >~ $~ %~ <~ @~ <>~|\\ -.verb| >* $* %* <* @* <>*| -.end comment - -.item Any amount of whitespace may separate tokens. - -.end itemize - -.comment -.section Parse of an FSM Specification - -The following statements are possible within an FSM specification. The -requirements for trailing semicolons loosely follow that of C. -A block -specifying code does not require a trailing semicolon. An expression -statement does require a trailing semicolon. -.end comment - -.section Basic Machines -.label{basic} - -The basic machines are the base operands of regular language expressions. They -are the smallest unit to which machine construction and manipulation operators -can be applied. - -.itemize - -.item .verb|'hello'| -- Concatenation Literal. Produces a machine that matches -the sequence of characters in the quoted string. If there are 5 characters -there will be 6 states chained together with the characters in the string. See -Section .ref{lexing} for information on valid escape sequences. - -.comment -% GENERATE: bmconcat -% OPT: -p -% %%{ -% machine bmconcat; -.verbatim -main := 'hello'; -.end verbatim -% }%% -% END GENERATE -.end comment - -.graphic bmconcat - -It is possible -to make a concatenation literal case-insensitive by appending an .verb|i| to -the string, for example .verb|'cmd'i|. - -.item .verb|"hello"| -- Identical to the single quoted version. - -.item .verb|[hello]| -- Or Expression. Produces a union of characters. There -will be two states with a transition for each unique character between the two states. -The .verb|[]| delimiters behave like the quotes of a literal string. For example, -.verb|[ \t]| means tab or space. The .verb|or| expression supports character ranges -with the .verb|-| symbol as a separator. The meaning of the union can be negated -using an initial .verb|^| character as in standard regular expressions. -See Section .ref{lexing} for information on valid escape sequences -in .verb|or| expressions. - -.comment -% GENERATE: bmor -% OPT: -p -% %%{ -% machine bmor; -.verbatim -main := [hello]; -.end verbatim -% }%% -% END GENERATE -.end comment - -.graphic bmor - -.item .verb|''|, .verb|""|, and .verb|[]| -- Zero Length Machine. Produces a machine -that matches the zero length string. Zero length machines have one state that is both -a start state and a final state. - -.comment -% GENERATE: bmnull -% OPT: -p -% %%{ -% machine bmnull; -.verbatim -main := ''; -.end verbatim -% }%% -% END GENERATE -.end comment - -.graphic bmnull - -% FIXME: More on the range of values here. -.item .verb|42| -- Numerical Literal. Produces a two state machine with one -transition on the given number. The number may be in decimal or hexadecimal -format and should be in the range allowed by the alphabet type. The minimum and -maximum values permitted are defined by the host machine that Ragel is compiled -on. For example, numbers in a .verb|short| alphabet on an i386 machine should -be in the range .verb|-32768| to .verb|32767|. - -.comment -% GENERATE: bmnum -% %%{ -% machine bmnum; -.verbatim -main := 42; -.end verbatim -% }%% -% END GENERATE -.end comment - -.graphic bmnum - -.item .verb|/simple_regex/| -- Regular Expression. Regular expressions are -parsed as a series of expressions that are concatenated together. Each -concatenated expression -may be a literal character, the ``any'' character specified by the .verb|.| -symbol, or a union of characters specified by the .verb|[]| delimiters. If the -first character of a union is .verb|^| then it matches any character not in the -list. Within a union, a range of characters can be given by separating the first -and last characters of the range with the .verb|-| symbol. Each -concatenated machine may have repetition specified by following it with the -.verb|*| symbol. The standard escape sequences described in Section -.ref{lexing} are supported everywhere in regular expressions except as the -operands of a range within in a list. This notation also supports the .verb|i| -trailing option. Use it to produce case-insensitive machines, as in .verb|/GET/i|. - -Ragel does not support very complex regular expressions because the desired -results can always be achieved using the more general machine construction -operators listed in Section .ref{machconst}. The following diagram shows the -result of compiling .verb|/ab*[c-z].*[123]/|. .verb|DEF| represents the default -transition, which is taken if no other transition can be taken. - -.comment -% GENERATE: bmregex -% OPT: -p -% %%{ -% machine bmregex; -.verbatim -main := /ab*[c-z].*[123]/; -.end verbatim -% }%% -% END GENERATE -.end comment - -.graphic bmregex - -.item .verb|'a' .. 'z'| -- Range. Produces a machine that matches any -characters in the specified range. Allowable upper and lower bounds of the -range are concatenation literals of length one and numerical literals. For -example, .verb|0x10..0x20|, .verb|0..63|, and .verb|'a'..'z'| are valid ranges. -The bounds should be in the range allowed by the alphabet type. - -.comment -% GENERATE: bmrange -% OPT: -p -% %%{ -% machine bmrange; -.verbatim -main := 'a' .. 'z'; -.end verbatim -% }%% -% END GENERATE -.end comment - -.graphic bmrange - -.item .verb|variable_name| -- Lookup the machine definition assigned to the -variable name given and use an instance of it. See Section .ref{definition} for -an important note on what it means to reference a variable name. - -.item .verb|builtin_machine| -- There are several built-in machines available -for use. They are all two state machines for the purpose of matching common -classes of characters. They are: - -.itemize - -.item .verb|any | -- Any character in the alphabet. - -.item .verb|ascii | -- Ascii characters. .verb|0..127| - -.item .verb|extend| -- Ascii extended characters. This is the range -.verb|-128..127| for signed alphabets and the range .verb|0..255| for unsigned -alphabets. - -.item .verb|alpha | -- Alphabetic characters. .verb|[A-Za-z]| - -.item .verb|digit | -- Digits. .verb|[0-9]| - -.item .verb|alnum | -- Alpha numerics. .verb|[0-9A-Za-z]| - -.item .verb|lower | -- Lowercase characters. .verb|[a-z]| - -.item .verb|upper | -- Uppercase characters. .verb|[A-Z]| - -.item .verb|xdigit| -- Hexadecimal digits. .verb|[0-9A-Fa-f]| - -.item .verb|cntrl | -- Control characters. .verb|0..31| - -.item .verb|graph | -- Graphical characters. .verb|[!-~]| - -.item .verb|print | -- Printable characters. .verb|[ -~]| - -.item .verb|punct | -- Punctuation. Graphical characters that are not alphanumerics. -.verb|[!-/:-@[-`{-~]| - -.item .verb|space | -- Whitespace. .verb|[\t\v\f\n\r ]| - -.item .verb|zlen | -- Zero length string. .verb|""| - -.item .verb|empty | -- Empty set. Matches nothing. .verb|^any| - -.end itemize -.end itemize - -.section Operator Precedence -The following table shows operator precedence from lowest to highest. Operators -in the same precedence group are evaluated from left to right. - -.tabular -.row 1&.verb| , |&Join -.row 2&.verb/ | & - --/&Union, Intersection and Subtraction -.row 3&.verb| . <: :> :>> |&Concatenation -.row 4&.verb| : |&Label -.row 5&.verb| -> |&Epsilon Transition -.row 6&.verb| > @ $ % |&Transitions Actions and Priorities -.row 6&.verb| >/ $/ %/ </ @/ <>/ |&EOF Actions -.row 6&.verb| >! $! %! <! @! <>! |&Global Error Actions -.row 6&.verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions -.row 6&.verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions -.row 6&.verb| >* $* %* <* @* <>* |&From-State Action -.row 7&.verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition -.row 8&.verb| ! ^ |&Negation and Character-Level Negation -.row 9&.verb| ( <expr> ) |&Grouping -.end tabular - -.section Regular Language Operators -.label{machconst} - -When using Ragel it is helpful to have a sense of how it constructs machines. -The determinization process can produce results that seem unusual to someone -not familiar with the NFA to DFA conversion algorithm. In this section we -describe Ragel's state machine operators. Though the operators are defined -using epsilon transitions, it should be noted that this is for discussion only. -The epsilon transitions described in this section do not persist, but are -immediately removed by the determinization process which is executed at every -operation. Ragel does not make use of any nondeterministic intermediate state -machines. - -To create an epsilon transition between two states .verb|x| and .verb|y| is to -copy all of the properties of .verb|y| into .verb|x|. This involves drawing in -all of .verb|y|'s to-state actions, EOF actions, etc., in addition to its -transitions. If .verb|x| and .verb|y| both have a transition out on the same -character, then the transitions must be combined. During transition -combination a new transition is made that goes to a new state that is the -combination of both target states. The new combination state is created using -the same epsilon transition method. The new state has an epsilon transition -drawn to all the states that compose it. Since the creation of new epsilon -transitions may be triggered every time an epsilon transition is drawn, the -process of drawing epsilon transitions is repeated until there are no more -epsilon transitions to be made. - -A very common error that is made when using Ragel is to make machines that do -too much. That is, to create machines that have unintentional -nondetermistic properties. This usually results from being unaware of the common strings -between machines that are combined together using the regular language -operators. This can involve never leaving a machine, causing its actions to be -propagated through all the following states. Or it can involve an alternation -where both branches are unintentionally taken simultaneously. - -This problem forces one to think hard about the language that needs to be -matched. To guard against this kind of problem one must ensure that the machine -specification is divided up using boundaries that do not allow ambiguities from -one portion of the machine to the next. See Chapter -.ref{controlling-nondeterminism} for more on this problem and how to solve it. - -The Graphviz tool is an immense help when debugging improperly compiled -machines or otherwise learning how to use Ragel. Graphviz Dot files can be -generated from Ragel programs using the .verb|-V| option. See Section -.ref{visualization} for more information. - - -.subsection Union - -.verb/expr | expr/ - -The union operation produces a machine that matches any string in machine one -or machine two. The operation first creates a new start state. Epsilon -transitions are drawn from the new start state to the start states of both -input machines. The resulting machine has a final state set equivalent to the -union of the final state sets of both input machines. In this operation, there -is the opportunity for nondeterminism among both branches. If there are -strings, or prefixes of strings that are matched by both machines then the new -machine will follow both parts of the alternation at once. The union operation is -shown below. - -.graphic opor 1.0 - -The following example demonstrates the union of three machines representing -common tokens. - -% GENERATE: exor -% OPT: -p -% %%{ -% machine exor; -.code -# Hex digits, decimal digits, or identifiers -main := '0x' xdigit+ | digit+ | alpha alnum*; -.end code -% }%% -% END GENERATE - -.graphic exor - -.subsection Intersection - -.verb|expr & expr| - -Intersection produces a machine that matches any -string that is in both machine one and machine two. To achieve intersection, a -union is performed on the two machines. After the result has been made -deterministic, any final state that is not a combination of final states from -both machines has its final state status revoked. To complete the operation, -paths that do not lead to a final state are pruned from the machine. Therefore, -if there are any such paths in either of the expressions they will be removed -by the intersection operator. Intersection can be used to require that two -independent patterns be simultaneously satisfied as in the following example. - -% GENERATE: exinter -% OPT: -p -% %%{ -% machine exinter; -.code -# Match lines four characters wide that contain -# words separated by whitespace. -main := - /[^\n][^\n][^\n][^\n]\n/* & - (/[a-z][a-z]*/ | [ \n])**; -.end code -% }%% -% END GENERATE - -.graphic exinter - -.subsection Difference - -.verb|expr - expr| - -The difference operation produces a machine that matches -strings that are in machine one but are not in machine two. To achieve subtraction, -a union is performed on the two machines. After the result has been made -deterministic, any final state that came from machine two or is a combination -of states involving a final state from machine two has its final state status -revoked. As with intersection, the operation is completed by pruning any path -that does not lead to a final state. The following example demonstrates the -use of subtraction to exclude specific cases from a set. - -% GENERATE: exsubtr -% OPT: -p -% %%{ -% machine exsubtr; -.code -# Subtract keywords from identifiers. -main := /[a-z][a-z]*/ - ( 'for' | 'int' ); -.end code -% }%% -% END GENERATE - -.graphic exsubtr - -.subsection Strong Difference -.label{strong_difference} - -.verb|expr -- expr| - -Strong difference produces a machine that matches any string of the first -machine that does not have any string of the second machine as a substring. In -the following example, strong subtraction is used to excluded .verb|CRLF| from -a sequence. In the corresponding visualization, the label .verb|DEF| is short -for default. The default transition is taken if no other transition can be -taken. - -% GENERATE: exstrongsubtr -% OPT: -p -% %%{ -% machine exstrongsubtr; -.code -crlf = '\r\n'; -main := [a-z]+ ':' ( any* -- crlf ) crlf; -.end code -% }%% -% END GENERATE - -.graphic exstrongsubtr - -This operator is equivalent to the following. - -.verbatim -expr - ( any* expr any* ) -.end verbatim - -.subsection Concatenation - -.verb|expr . expr| - -Concatenation produces a machine that matches all the strings in machine one followed by all -the strings in machine two. Concatenation draws epsilon transitions from the -final states of the first machine to the start state of the second machine. The -final states of the first machine lose their final state status, unless the -start state of the second machine is final as well. -Concatenation is the default operator. Two machines next to each other with no -operator between them results in concatenation. - -.graphic opconcat 1.0 - -The opportunity for nondeterministic behaviour results from the possibility of -the final states of the first machine accepting a string that is also accepted -by the start state of the second machine. -The most common scenario in which this happens is the -concatenation of a machine that repeats some pattern with a machine that gives -a terminating string, but the repetition machine does not exclude the -terminating string. The example in Section .ref{strong_difference} -guards against this. Another example is the expression .verb|("'" any* "'")|. -When executed the thread of control will -never leave the .verb|any*| machine. This is a problem especially if actions -are embedded to process the characters of the .verb|any*| component. - -In the following example, the first machine is always active due to the -nondeterministic nature of concatenation. This particular nondeterminism is intended -however because we wish to permit EOF strings before the end of the input. - -% GENERATE: exconcat -% OPT: -p -% %%{ -% machine exconcat; -.code -# Require an eof marker on the last line. -main := /[^\n]*\n/* . 'EOF\n'; -.end code -% }%% -% END GENERATE - -.graphic exconcat - -There is a language -ambiguity involving concatenation and subtraction. Because concatenation is the -default operator for two -adjacent machines there is an ambiguity between subtraction of -a positive numerical literal and concatenation of a negative numerical literal. -For example, .verb|(x-7)| could be interpreted as .verb|(x . -7)| or -.verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence -over concatenation of a negative literal. We adhere to the rule that the default -concatenation operator takes effect only when there are no other operators between -two machines. Beware of writing machines such as .verb|(any -1)| when what is -desired is a concatenation of .verb|any| and .verb|-1|. Instead write -.verb|(any . -1)| or .verb|(any (-1))|. If in doubt of the meaning of your program do not -rely on the default concatenation operator; always use the .verb|.| symbol. - - -.subsection Kleene Star - -.verb|expr*| - -The machine resulting from the Kleene Star operator will match zero or more -repetitions of the machine it is applied to. -It creates a new start state and an additional final -state. Epsilon transitions are drawn between the new start state and the old start -state, between the new start state and the new final state, and -between the final states of the machine and the new start state. After the -machine is made deterministic the effect is of the final states getting all the -transitions of the start state. - -.graphic opstar 1.0 - -The possibility for nondeterministic behaviour arises if the final states have -transitions on any of the same characters as the start state. This is common -when applying kleene star to an alternation of tokens. Like the other problems -arising from nondeterministic behavior, this is discussed in more detail in Chapter -.ref{controlling-nondeterminism}. This particular problem can also be solved -by using the longest-match construction discussed in Section -.ref{generating-scanners} on scanners. - -In this -example, there is no nondeterminism introduced by the exterior kleene star due to -the newline at the end of the regular expression. Without the newline the -exterior kleene star would be redundant and there would be ambiguity between -repeating the inner range of the regular expression and the entire regular -expression. Though it would not cause a problem in this case, unnecessary -nondeterminism in the kleene star operator often causes undesired results for -new Ragel users and must be guarded against. - -% GENERATE: exstar -% OPT: -p -% %%{ -% machine exstar; -.code -# Match any number of lines with only lowercase letters. -main := /[a-z]*\n/*; -.end code -% }%% -% END GENERATE - -.graphic exstar - -.subsection One Or More Repetition - -.verb|expr+| - -This operator produces the concatenation of the machine with the kleene star of -itself. The result will match one or more repetitions of the machine. The plus -operator is equivalent to .verb|(expr . expr*)|. - -% GENERATE: explus -% OPT: -p -% %%{ -% machine explus; -.code -# Match alpha-numeric words. -main := alnum+; -.end code -% }%% -% END GENERATE - -.graphic explus - -.subsection Optional - -.verb|expr?| - -The .em{optional} operator produces a machine that accepts the machine -given or the zero length string. The optional operator is equivalent to -.verb/(expr | '' )/. In the following example the optional operator is used to -possibly extend a token. - -% GENERATE: exoption -% OPT: -p -% %%{ -% machine exoption; -.code -# Match integers or floats. -main := digit+ ('.' digit+)?; -.end code -% }%% -% END GENERATE - -.graphic exoption - -.subsection Repetition - -.list -.li .verb|expr {n}| -- Exactly N copies of expr. -.li .verb|expr {,n}| -- Zero to N copies of expr. -.li .verb|expr {n,}| -- N or more copies of expr. -.li .verb|expr {n,m}| -- N to M copies of expr. -.end list - -.subsection Negation - -.verb|!expr| - -Negation produces a machine that matches any string not matched by the given -machine. Negation is equivalent to .verb|(any* - expr)|. - -% GENERATE: exnegate -% OPT: -p -% %%{ -% machine exnegate; -.code -# Accept anything but a string beginning with a digit. -main := ! ( digit any* ); -.end code -% }%% -% END GENERATE - -.graphic exnegate - -.subsection Character-Level Negation - -.verb|^expr| - -Character-level negation produces a machine that matches any single character -not matched by the given machine. Character-Level Negation is equivalent to -.verb|(any - expr)|. It must be applied only to machines that match strings of -length one. - -.section State Machine Minimization - -State machine minimization is the process of finding the minimal equivalent FSM accepting -the language. Minimization reduces the number of states in machines -by merging equivalent states. It does not change the behaviour of the machine -in any way. It will cause some states to be merged into one because they are -functionally equivalent. State minimization is on by default. It can be turned -off with the .verb|-n| option. - -The algorithm implemented is similar to Hopcroft's state minimization -algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in -memory, whereas Ragel supports arbitrary integer alphabets that cannot be -listed in memory. Though exact analysis is very difficult, Ragel minimization -runs close to O(n * log(n)) and requires O(n) temporary storage where -$n$ is the number of states. - -.section Visualization -.label{visualization} - -%In many cases, practical -%parsing programs will be too large to completely visualize with Graphviz. The -%proper approach is to reduce the language to the smallest subset possible that -%still exhibits the characteristics that one wishes to learn about or to fix. -%This can be done without modifying the source code using the .verb|-M| and -%.verb|-S| options. If a machine cannot be easily reduced, -%embeddings of unique actions can be very useful for tracing a -%particular component of a larger machine specification, since action names are -%written out on transition labels. - -Ragel is able to emit compiled state machines in Graphviz's Dot file format. -This is done using the .verb|-V| option. -Graphviz support allows users to perform -incremental visualization of their parsers. User actions are displayed on -transition labels of the graph. - -If the final graph is too large to be -meaningful, or even drawn, the user is able to inspect portions of the parser -by naming particular regular expression definitions with the .verb|-S| and -.verb|-M| options to the .verb|ragel| program. Use of Graphviz greatly -improves the Ragel programming experience. It allows users to learn Ragel by -experimentation and also to track down bugs caused by unintended -nondeterminism. - -Ragel has another option to help debugging. The .verb|-x| option causes Ragel -to emit the compiled machine in an XML format. - -.chapter User Actions - -Ragel permits the user to embed actions into the transitions of a regular -expression's corresponding state machine. These actions are executed when the -generated code moves over a transition. Like the regular expression operators, -the action embedding operators are fully compositional. They take a state -machine and an action as input, embed the action and yield a new state machine -that can be used in the construction of other machines. Due to the -compositional nature of embeddings, the user has complete freedom in the -placement of actions. - -A machine's transitions are categorized into four classes. The action embedding -operators access the transitions defined by these classes. The .em{entering -transition} operator .verb|>| isolates the start state, then embeds an action -into all transitions leaving it. The .em{finishing transition} operator -.verb|@| embeds an action into all transitions going into a final state. The -.em{all transition} operator .verb|$| embeds an action into all transitions of -an expression. The .em{leaving transition} operator .verb|%| provides access -to the yet-unmade transitions moving out of the machine via the final states. - -.section Embedding Actions - -.verbatim -action ActionName { - /* Code an action here. */ - count += 1; -} -.end verbatim - -The action statement defines a block of code that can be embedded into an FSM. -Action names can be referenced by the action embedding operators in -expressions. Though actions need not be named in this way (literal blocks -of code can be embedded directly when building machines), defining reusable -blocks of code whenever possible is good practice because it potentially increases the -degree to which the machine can be minimized. - -Within an action some Ragel expressions and statements are parsed and -translated. These allow the user to interact with the machine from action code. -See Section .ref{vals} for a complete list of statements and values available -in code blocks. - -.subsection Entering Action - -.verb|expr > action| - -The entering action operator embeds an action into all transitions -that enter into the machine from the start state. If the start state is final, -then the action is also embedded into the start state as a leaving action. This -means that if a machine accepts the zero-length string and control passes -through the start state then the entering action is executed. Note -that this can happen on both a following character and on the EOF event. - -In some machines the start state has transtions coming in from within the -machine. In these cases the start state is first isolated from the rest of the -machine ensuring that the entering actions are exected once only. - -% GENERATE: exstact -% OPT: -p -% %%{ -% machine exstact; -.code -# Execute A at the beginning of a string of alpha. -action A {} -main := ( lower* >A ) . ' '; -.end code -% }%% -% END GENERATE - -.graphic exstact - -.subsection Finishing Action - -.verb|expr @ action| - -The finishing action operator embeds an action into any transitions that move -the machine into a final state. Further input may move the machine out of the -final state, but keep it in the machine. Therefore finishing actions may be -executed more than once if a machine has any internal transitions out of a -final state. In the following example the final state has no transitions out -and the finishing action is executed only once. - -% GENERATE: exdoneact -% OPT: -p -% %%{ -% machine exdoneact; -% action A {} -.code -# Execute A when the trailing space is seen. -main := ( lower* ' ' ) @A; -.end code -% }%% -% END GENERATE - -.graphic exdoneact - -.subsection All Transition Action - -.verb|expr $ action| - -The all transition operator embeds an action into all transitions of a machine. -The action is executed whenever a transition of the machine is taken. In the -following example, A is executed on every character matched. - -% GENERATE: exallact -% OPT: -p -% %%{ -% machine exallact; -% action A {} -.code -# Execute A on any characters of the machine. -main := ( 'm1' | 'm2' ) $A; -.end code -% }%% -% END GENERATE - -.graphic exallact - -.subsection Leaving Actions -.label{out-actions} - -.verb|expr % action| - -The leaving action operator queues an action for embedding into the transitions -that go out of a machine via a final state. The action is first stored in -the machine's final states and is later transferred to any transitions that are -made going out of the machine by a kleene star or concatenation operation. - -If a final state of the machine is still final when compilation is complete -then the leaving action is also embedded as an EOF action. Therefore, leaving -the machine is defined as either leaving on a character or as state machine -acceptance. - -This operator allows one to associate an action with the termination of a -sequence, without being concerned about what particular character terminates -the sequence. In the following example, A is executed when leaving the alpha -machine on the newline character. - -% GENERATE: exoutact1 -% OPT: -p -% %%{ -% machine exoutact1; -% action A {} -.code -# Match a word followed by a newline. Execute A when -# finishing the word. -main := ( lower+ %A ) . '\n'; -.end code -% }%% -% END GENERATE - -.graphic exoutact1 - -In the following example, the .verb|term_word| action could be used to register -the appearance of a word and to clear the buffer that the .verb|lower| action used -to store the text of it. - -% GENERATE: exoutact2 -% OPT: -p -% %%{ -% machine exoutact2; -% action lower {} -% action space {} -% action term_word {} -% action newline {} -.code -word = ( [a-z] @lower )+ %term_word; -main := word ( ' ' @space word )* '\n' @newline; -.end code -% }%% -% END GENERATE - -.graphic exoutact2 - -In this final example of the action embedding operators, A is executed upon entering -the alpha machine, B is executed on all transitions of the -alpha machine, C is executed when the alpha machine is exited by moving into the -newline machine and N is executed when the newline machine moves into a final -state. - -% GENERATE: exaction -% OPT: -p -% %%{ -% machine exaction; -% action A {} -% action B {} -% action C {} -% action N {} -.code -# Execute A on starting the alpha machine, B on every transition -# moving through it and C upon finishing. Execute N on the newline. -main := ( lower* >A $B %C ) . '\n' @N; -.end code -% }%% -% END GENERATE - -.graphic exaction - - -.section State Action Embedding Operators - -The state embedding operators allow one to embed actions into states. Like the -transition embedding operators, there are several different classes of states -that the operators access. The meanings of the symbols are similar to the -meanings of the symbols used for the transition embedding operators. The design -of the state selections was driven by a need to cover the states of an -expression with exactly one error action. - -Unlike the transition embedding operators, the state embedding operators are -also distinguished by the different kinds of events that embedded actions can -be associated with. Therefore the state embedding operators have two -components. The first, which is the first one or two characters, specifies the -class of states that the action will be embedded into. The second component -specifies the type of event the action will be executed on. The symbols of the -second component also have equivalent kewords. - -.multicols -The different classes of states are: - -.list -.li .verb|> | -- the start state -.li .verb|< | -- any state except the start state -.li .verb|$ | -- all states -.li .verb|% | -- final states -.li .verb|@ | -- any state except final states -.li .verb|<>| -- any except start and final (middle) -.end list - -.columnbreak - -The different kinds of embeddings are: - -.list -.li .verb|~| -- to-state actions (.verb|to|) -.li .verb|*| -- from-state actions (.verb|from|) -.li .verb|/| -- EOF actions (.verb|eof|) -.li .verb|!| -- error actions (.verb|err|) -.li .verb|^| -- local error actions (.verb|lerr|) -.end list - -.end multicols - -.subsection To-State and From-State Actions - -.subsubsection To-State Actions - -.list -.li .verb|>~action >to(name) >to{...} | -- the start state -.li .verb|<~action <to(name) <to{...} | -- any state except the start state -.li .verb|$~action $to(name) $to{...} | -- all states -.li .verb|%~action %to(name) %to{...} | -- final states -.li .verb|@~action @to(name) @to{...} | -- any state except final states -.li .verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle) -.end list - - -To-state actions are executed whenever the state machine moves into the -specified state, either by a natural movement over a transition or by an -action-based transfer of control such as .verb|fgoto|. They are executed after the -in-transition's actions but before the current character is advanced and -tested against the end of the input block. To-state embeddings stay with the -state. They are irrespective of the state's current set of transitions and any -future transitions that may be added in or out of the state. - -Note that the setting of the current state variable .verb|cs| outside of the -execute code is not considered by Ragel as moving into a state and consequently -the to-state actions of the new current state are not executed. This includes -the initialization of the current state when the machine begins. This is -because the entry point into the machine execution code is after the execution -of to-state actions. - -.subsubsection From-State Actions - -.list -.li .verb|>*action >from(name) >from{...} | -- the start state -.li .verb|<*action <from(name) <from{...} | -- any state except the start state -.li .verb|$*action $from(name) $from{...} | -- all states -.li .verb|%*action %from(name) %from{...} | -- final states -.li .verb|@*action @from(name) @from{...} | -- any state except final states -.li .verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle) -.end list - -From-state actions are executed whenever the state machine takes a transition from a -state, either to itself or to some other state. These actions are executed -immediately after the current character is tested against the input block end -marker and before the transition to take is sought based on the current -character. From-state actions are therefore executed even if a transition -cannot be found and the machine moves into the error state. Like to-state -embeddings, from-state embeddings stay with the state. - -.subsection EOF Actions - -.list -.li .verb|>/action >eof(name) >eof{...} | -- the start state -.li .verb|</action <eof(name) <eof{...} | -- any state except the start state -.li .verb|$/action $eof(name) $eof{...} | -- all states -.li .verb|%/action %eof(name) %eof{...} | -- final states -.li .verb|@/action @eof(name) @eof{...} | -- any state except final states -.li .verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle) -.end list - -The EOF action embedding operators enable the user to embed actions that are -executed at the end of the input stream. EOF actions are stored in states and -generated in the .verb|write exec| block. They are run when .verb|p == pe == eof| -as the execute block is finishing. EOF actions are free to adjust .verb|p| and -jump to another part of the machine to restart execution. - -.subsection Handling Errors - -In many applications it is useful to be able to react to parsing errors. The -user may wish to print an error message that depends on the context. It -may also be desirable to consume input in an attempt to return the input stream -to some known state and resume parsing. To support error handling and recovery, -Ragel provides error action embedding operators. There are two kinds of error -actions: global error actions and local error actions. -Error actions can be used to simply report errors, or by jumping to a machine -instantiation that consumes input, can attempt to recover from errors. - -.subsubsection Global Error Actions - -.list -.li .verb|>!action >err(name) >err{...} | -- the start state -.li .verb|<!action <err(name) <err{...} | -- any state except the start state -.li .verb|$!action $err(name) $err{...} | -- all states -.li .verb|%!action %err(name) %err{...} | -- final states -.li .verb|@!action @err(name) @err{...} | -- any state except final states -.li .verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle) -.end list - -Global error actions are stored in the states they are embedded into until -compilation is complete. They are then transferred to the transitions that move -into the error state. These transitions are taken on all input characters that -are not already covered by the state's transitions. If a state with an error -action is not final when compilation is complete, then the action is also -embedded as an EOF action. - -Error actions can be used to recover from errors by jumping back into the -machine with .verb|fgoto| and optionally altering .verb|p|. - -.subsubsection Local Error Actions - -.list -.li .verb|>^action >lerr(name) >lerr{...} | -- the start state -.li .verb|<^action <lerr(name) <lerr{...} | -- any state except the start state -.li .verb|$^action $lerr(name) $lerr{...} | -- all states -.li .verb|%^action %lerr(name) %lerr{...} | -- final states -.li .verb|@^action @lerr(name) @lerr{...} | -- any state except final states -.li .verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle) -.end list - -Like global error actions, local error actions are also stored in the states -they are embedded into until a transfer point. The transfer point is different -however. Each local error action embedding is associated with a name. When a -machine definition has been fully constructed, all local error action -embeddings associated with the same name as the machine definition are -transferred to the error transitions. At this time they are also embedded as -EOF actions in the case of non-final states. - -Local error actions can be used to specify an action to take when a particular -section of a larger state machine fails to match. A particular machine -definition's ``thread'' may die and the local error actions executed, however -the machine as a whole may continue to match input. - -There are two forms of local error action embeddings. In the first form the -name defaults to the current machine. In the second form the machine name can -be specified. This is useful when it is more convenient to specify the local -error action in a sub-definition that is used to construct the machine -definition that the local error action is associated with. To embed local -error actions and -explicitly state the machine definition on which the transfer is to happen use -.verb|(name, action)| as the action. - -.subsubsection Example - -The following example uses error actions to report an error and jump to a -machine that consumes the remainder of the line when parsing fails. After -consuming the line, the error recovery machine returns to the main loop. - -% GENERATE: erract -% %%{ -% machine erract; -% ws = ' '; -% address = 'foo AT bar..com'; -% date = 'Monday May 12'; -.code -action cmd_err { - printf( "command error\n" ); - fhold; fgoto line; -} -action from_err { - printf( "from error\n" ); - fhold; fgoto line; -} -action to_err { - printf( "to error\n" ); - fhold; fgoto line; -} - -line := [^\n]* '\n' @{ fgoto main; }; - -main := ( - ( - 'from' @err(cmd_err) - ( ws+ address ws+ date '\n' ) $err(from_err) | - 'to' @err(cmd_err) - ( ws+ address '\n' ) $err(to_err) - ) -)*; -.end code -% }%% -% %% write data; -% void f() -% { -% %% write init; -% %% write exec; -% } -% END GENERATE - - - -.section Action Ordering and Duplicates - -When combining expressions that have embedded actions it is often the case that -a number of actions must be executed on a single input character. For example, -following a concatenation the leaving action of the left expression and the -entering action of the right expression will be embedded into one transition. -This requires a method of ordering actions that is intuitive and -predictable for the user, and repeatable for the compiler. - -We associate with the embedding of each action a unique timestamp that is -used to order actions that appear together on a single transition in the final -state machine. To accomplish this we recursively traverse the parse tree of -regular expressions and assign timestamps to action embeddings. References to -machine definitions are followed in the traversal. When we visit a -parse tree node we assign timestamps to all .em{entering} action embeddings, -recurse on the parse tree, then assign timestamps to the remaining .em{all}, -.em{finishing}, and .em{leaving} embeddings in the order in which they -appear. - -By default Ragel does not permit a single action to appear multiple times in an action -list. When the final machine has been created, actions that appear more than -once in a single transition, to-state, from-state or EOF action list have their -duplicates removed. -The first appearance of the action is preserved. This is useful in a number of -scenarios. First, it allows us to union machines with common prefixes without -worrying about the action embeddings in the prefix being duplicated. Second, it -prevents leaving actions from being transferred multiple times. This can -happen when a machine is repeated, then followed with another machine that -begins with a common character. For example: - -.verbatim -word = [a-z]+ %act; -main := word ( '\n' word )* '\n\n'; -.end verbatim - -Note that Ragel does not compare action bodies to determine if they have -identical program text. It simply checks for duplicates using each action -block's unique location in the program. - -The removal of duplicates can be turned off using the .verb|-d| option. - -.section Values and Statements Available in Code Blocks -.label{vals} - -The following values are available in code blocks: - -.itemize -.item .verb|fpc| -- A pointer to the current character. This is equivalent to -accessing the .verb|p| variable. - -.item .verb|fc| -- The current character. This is equivalent to the expression .verb|(*p)|. - -.item .verb|fcurs| -- An integer value representing the current state. This -value should only be read from. To move to a different place in the machine -from action code use the .verb|fgoto|, .verb|fnext| or .verb|fcall| statements. -Outside of the machine execution code the .verb|cs| variable may be modified. - -.item .verb|ftargs| -- An integer value representing the target state. This -value should only be read from. Again, .verb|fgoto|, .verb|fnext| and -.verb|fcall| can be used to move to a specific entry point. - -.item .verb|fentry(<label>)| -- Retrieve an integer value representing the -entry point .verb|label|. The integer value returned will be a compile time -constant. This number is suitable for later use in control flow transfer -statements that take an expression. This value should not be compared against -the current state because any given label can have multiple states representing -it. The value returned by .verb|fentry| can be any one of the multiple states that -it represents. -.end itemize - -The following statements are available in code blocks: - -.itemize - -.item .verb|fhold;| -- Do not advance over the current character. If processing -data in multiple buffer blocks, the .verb|fhold| statement should only be used -once in the set of actions executed on a character. Multiple calls may result -in backing up over the beginning of the buffer block. The .verb|fhold| -statement does not imply any transfer of control. It is equivalent to the -.verb|p--;| statement. - -.item .verb|fexec <expr>;| -- Set the next character to process. This can be -used to backtrack to previous input or advance ahead. -Unlike .verb|fhold|, which can be used -anywhere, .verb|fexec| requires the user to ensure that the target of the -backtrack is in the current buffer block or is known to be somewhere ahead of -it. The machine will continue iterating forward until .verb|pe| is arrived at, -.verb|fbreak| is called or the machine moves into the error state. In actions -embedded into transitions, the .verb|fexec| statement is equivalent to setting -.verb|p| to one position ahead of the next character to process. If the user -also modifies .verb|pe|, it is possible to change the buffer block entirely. - -.item .verb|fgoto <label>;| -- Jump to an entry point defined by -.verb|<label>|. The .verb|fgoto| statement immediately transfers control to -the destination state. - -.item .verb|fgoto *<expr>;| -- Jump to an entry point given by .verb|<expr>|. -The expression must evaluate to an integer value representing a state. - -.item .verb|fnext <label>;| -- Set the next state to be the entry point defined -by .verb|label|. The .verb|fnext| statement does not immediately jump to the -specified state. Any action code following the statement is executed. - -.item .verb|fnext *<expr>;| -- Set the next state to be the entry point given -by .verb|<expr>|. The expression must evaluate to an integer value representing -a state. - -.item .verb|fcall <label>;| -- Push the target state and jump to the entry -point defined by .verb|<label>|. The next .verb|fret| will jump to the target -of the transition on which the call was made. Use of .verb|fcall| requires -the declaration of a call stack. An array of integers named .verb|stack| and a -single integer named .verb|top| must be declared. With the .verb|fcall| -construct, control is immediately transferred to the destination state. -See section .ref{modularization} for more information. - -.item .verb|fcall *<expr>;| -- Push the current state and jump to the entry -point given by .verb|<expr>|. The expression must evaluate to an integer value -representing a state. - -.item .verb|fret;| -- Return to the target state of the transition on which the -last .verb|fcall| was made. Use of .verb|fret| requires the declaration of a -call stack. Control is immediately transferred to the destination state. - -.item .verb|fbreak;| -- Advance .verb|p|, save the target state to .verb|cs| -and immediately break out of the execute loop. This statement is useful -in conjunction with the .verb|noend| write option. Rather than process input -until .verb|pe| is arrived at, the fbreak statement -can be used to stop processing from an action. After an .verb|fbreak| -statement the .verb|p| variable will point to the next character in the input. The -current state will be the target of the current transition. Note that .verb|fbreak| -causes the target state's to-state actions to be skipped. - -.end itemize - -Once actions with control-flow commands are embedded into a -machine, the user must exercise caution when using the machine as the operand -to other machine construction operators. If an action jumps to another state -then unioning any transition that executes that action with another transition -that follows some other path will cause that other path to be lost. Using -commands that manually jump around a machine takes us out of the domain of -regular languages because transitions that the -machine construction operators are not aware of are introduced. These -commands should therefore be used with caution. - - -.chapter Controlling Nondeterminism -.label{controlling-nondeterminism} - -Along with the flexibility of arbitrary action embeddings comes a need to -control nondeterminism in regular expressions. If a regular expression is -ambiguous, then sub-components of a parser other than the intended parts may become -active. This means that actions that are irrelevant to the -current subset of the parser may be executed, causing problems for the -programmer. - -Tools that are based on regular expression engines and that are used for -recognition tasks will usually function as intended regardless of the presence -of ambiguities. It is quite common for users of scripting languages to write -regular expressions that are heavily ambiguous and it generally does not -matter. As long as one of the potential matches is recognized, there can be any -number of other matches present. In some parsing systems the run-time engine -can employ a strategy for resolving ambiguities, for example always pursuing -the longest possible match and discarding others. - -In Ragel, there is no regular expression run-time engine, just a simple state -machine execution model. When we begin to embed actions and face the -possibility of spurious action execution, it becomes clear that controlling -nondeterminism at the machine construction level is very important. Consider -the following example. - -% GENERATE: lines1 -% OPT: -p -% %%{ -% machine lines1; -% action first {} -% action tail {} -% word = [a-z]+; -.code -ws = [\n\t ]; -line = word $first ( ws word $tail )* '\n'; -lines = line*; -.end code -% main := lines; -% }%% -% END GENERATE - -.graphic lines1 0.53 - -Since the .verb|ws| expression includes the newline character, we will -not finish the .verb|line| expression when a newline character is seen. We will -simultaneously pursue the possibility of matching further words on the same -line and the possibility of matching a second line. Evidence of this fact is -in the state tables. On several transitions both the .verb|first| and -.verb|tail| actions are executed. The solution here is simple: exclude -the newline character from the .verb|ws| expression. - -% GENERATE: lines2 -% OPT: -p -% %%{ -% machine lines2; -% action first {} -% action tail {} -% word = [a-z]+; -.code -ws = [\t ]; -line = word $first ( ws word $tail )* '\n'; -lines = line*; -.end code -% main := lines; -% }%% -% END GENERATE - -.graphic lines2 - -Solving this kind of problem is straightforward when the ambiguity is created -by strings that are a single character long. When the ambiguity is created by -strings that are multiple characters long we have a more difficult problem. -The following example is an incorrect attempt at a regular expression for C -language comments. - -% GENERATE: comments1 -% OPT: -p -% %%{ -% machine comments1; -% action comm {} -.code -comment = '/*' ( any @comm )* '*/'; -main := comment ' '; -.end code -% }%% -% END GENERATE - -.graphic comments1 - -Using standard concatenation, we will never leave the .verb|any*| expression. -We will forever entertain the possibility that a .verb|'*/'| string that we see -is contained in a longer comment and that, simultaneously, the comment has -ended. The concatenation of the .verb|comment| machine with .verb|SP| is done -to show this. When we match space, we are also still matching the comment body. - -One way to approach the problem is to exclude the terminating string -from the .verb|any*| expression using set difference. We must be careful to -exclude not just the terminating string, but any string that contains it as a -substring. A verbose, but proper specification of a C comment parser is given -by the following regular expression. - -% GENERATE: comments2 -% OPT: -p -% %%{ -% machine comments2; -% action comm {} -.code -comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/'; -.end code -% main := comment; -% }%% -% END GENERATE - -.graphic comments2 - -Note that Ragel's strong subtraction operator .verb|--| can also be used here. -In doing this subtraction we have phrased the problem of controlling non-determinism in -terms of excluding strings common to two expressions that interact when -combined. -We can also phrase the problem in terms of the transitions of the state -machines that implement these expressions. During the concatenation of -.verb|any*| and .verb|'*/'| we will be making transitions that are composed of -both the loop of the first expression and the final character of the second. -At this time we want the transition on the .verb|'/'| character to take precedence -over and disallow the transition that originated in the .verb|any*| loop. - -In another parsing problem, we wish to implement a lightweight tokenizer that we can -utilize in the composition of a larger machine. For example, some HTTP headers -have a token stream as a sub-language. The following example is an attempt -at a regular expression-based tokenizer that does not function correctly due to -unintended nondeterminism. - -% GENERATE: smallscanner -% OPT: -p -% %%{ -% machine smallscanner; -% action start_str {} -% action on_char {} -% action finish_str {} -.code -header_contents = ( - lower+ >start_str $on_char %finish_str | - ' ' -)*; -.end code -% main := header_contents; -% }%% -% END GENERATE - -.graphic smallscanner - -In this case, the problem with using a standard kleene star operation is that -there is an ambiguity between extending a token and wrapping around the machine -to begin a new token. Using the standard operator, we get an undesirable -nondeterministic behaviour. Evidence of this can be seen on the transition out -of state one to itself. The transition extends the string, and simultaneously, -finishes the string only to immediately begin a new one. What is required is -for the -transitions that represent an extension of a token to take precedence over the -transitions that represent the beginning of a new token. For this problem -there is no simple solution that uses standard regular expression operators. - -.section Priorities - -A priority mechanism was devised and built into the determinization -process, specifically for the purpose of allowing the user to control -nondeterminism. Priorities are integer values embedded into transitions. When -the determinization process is combining transitions that have different -priorities, the transition with the higher priority is preserved and the -transition with the lower priority is dropped. - -Unfortunately, priorities can have unintended side effects because their -operation requires that they linger in transitions indefinitely. They must linger -because the Ragel program cannot know when the user is finished with a priority -embedding. A solution whereby they are explicitly deleted after use is -conceivable; however this is not very user-friendly. Priorities were therefore -made into named entities. Only priorities with the same name are allowed to -interact. This allows any number of priorities to coexist in one machine for -the purpose of controlling various different regular expression operations and -eliminates the need to ever delete them. Such a scheme allows the user to -choose a unique name, embed two different priority values using that name -and be confident that the priority embedding will be free of any side effects. - -In the first form of priority embedding the name defaults to the name of the machine -definition that the priority is assigned in. In this sense priorities are by -default local to the current machine definition or instantiation. Beware of -using this form in a longest-match machine, since there is only one name for -the entire set of longest match patterns. In the second form the priority's -name can be specified, allowing priority interaction across machine definition -boundaries. - -.itemize -.item .verb|expr > int| -- Sets starting transitions to have priority int. -.item .verb|expr @ int| -- Sets transitions that go into a final state to have priority int. -.item .verb|expr $ int| -- Sets all transitions to have priority int. -.item .verb|expr % int| -- Sets leaving transitions to -have priority int. When a transition is made going out of the machine (either -by concatenation or kleene star) its priority is immediately set to the -leaving priority. -.end itemize - -The second form of priority assignment allows the programmer to specify the name -to which the priority is assigned. - -.itemize -.item .verb|expr > (name, int)| -- Starting transitions. -.item .verb|expr @ (name, int)| -- Finishing transitions (into a final state). -.item .verb|expr $ (name, int)| -- All transitions. -.item .verb|expr % (name, int)| -- Leaving transitions. -.end itemize - -.section Guarded Operators that Encapsulate Priorities - -Priority embeddings are a very expressive mechanism. At the same time they -can be very confusing for the user. They force the user to imagine -the transitions inside two interacting expressions and work out the precise -effects of the operations between them. When we consider -that this problem is worsened by the -potential for side effects caused by unintended priority name collisions, we -see that exposing the user to priorities is undesirable. - -Fortunately, in practice the use of priorities has been necessary only in a -small number of scenarios. This allows us to encapsulate their functionality -into a small set of operators and fully hide them from the user. This is -advantageous from a language design point of view because it greatly simplifies -the design. - -Going back to the C comment example, we can now properly specify -it using a guarded concatenation operator which we call .em{finish-guarded -concatenation}. From the user's point of view, this operator terminates the -first machine when the second machine moves into a final state. It chooses a -unique name and uses it to embed a low priority into all -transitions of the first machine. A higher priority is then embedded into the -transitions of the second machine that enter into a final state. The following -example yields a machine identical to the example in Section -.ref{controlling-nondeterminism}. - -.code -comment = '/*' ( any @comm )* :>> '*/'; -.end code - -.graphic comments2 - -Another guarded operator is .em{left-guarded concatenation}, given by the -.verb|<:| compound symbol. This operator places a higher priority on all -transitions of the first machine. This is useful if one must forcibly separate -two lists that contain common elements. For example, one may need to tokenize a -stream, but first consume leading whitespace. - -Ragel also includes a .em{longest-match kleene star} operator, given by the -.verb|**| compound symbol. This -guarded operator embeds a high -priority into all transitions of the machine. -A lower priority is then embedded into the leaving transitions. When the -kleene star operator makes the epsilon transitions from -the final states into the new start state, the lower priority will be transferred -to the epsilon transitions. In cases where following an epsilon transition -out of a final state conflicts with an existing transition out of a final -state, the epsilon transition will be dropped. - -Other guarded operators are conceivable, such as guards on union that cause one -alternative to take precedence over another. These may be implemented when it -is clear they constitute a frequently used operation. -In the next section we discuss the explicit specification of state machines -using state charts. - -.subsection Entry-Guarded Concatenation - -.verb|expr :> expr| - -This operator concatenates two machines, but first assigns a low -priority to all transitions -of the first machine and a high priority to the starting transitions of the -second machine. This operator is useful if from the final states of the first -machine it is possible to accept the characters in the entering transitions of -the second machine. This operator effectively terminates the first machine -immediately upon starting the second machine, where otherwise they would be -pursued concurrently. In the following example, entry-guarded concatenation is -used to move out of a machine that matches everything at the first sign of an -end-of-input marker. - -% GENERATE: entryguard -% OPT: -p -% %%{ -% machine entryguard; -.code -# Leave the catch-all machine on the first character of FIN. -main := any* :> 'FIN'; -.end code -% }%% -% END GENERATE - -.graphic entryguard - -Entry-guarded concatenation is equivalent to the following: - -.verbatim -expr $(unique_name,0) . expr >(unique_name,1) -.end verbatim - -.subsection Finish-Guarded Concatenation - -.verb|expr :>> expr| - -This operator is -like the previous operator, except the higher priority is placed on the final -transitions of the second machine. This is useful if one wishes to entertain -the possibility of continuing to match the first machine right up until the -second machine enters a final state. In other words it terminates the first -machine only when the second accepts. In the following example, finish-guarded -concatenation causes the move out of the machine that matches everything to be -delayed until the full end-of-input marker has been matched. - -% GENERATE: finguard -% OPT: -p -% %%{ -% machine finguard; -.code -# Leave the catch-all machine on the last character of FIN. -main := any* :>> 'FIN'; -.end code -% }%% -% END GENERATE - -.graphic finguard - -Finish-guarded concatenation is equivalent to the following, with one -exception. If the right machine's start state is final, the higher priority is -also embedded into it as a leaving priority. This prevents the left machine -from persisting via the zero-length string. - -.verbatim -expr $(unique_name,0) . expr @(unique_name,1) -.end verbatim - -.subsection Left-Guarded Concatenation - -.verb|expr <: expr| - -This operator places -a higher priority on the left expression. It is useful if you want to prefix a -sequence with another sequence composed of some of the same characters. For -example, one can consume leading whitespace before tokenizing a sequence of -whitespace-separated words as in: - -% GENERATE: leftguard -% OPT: -p -% %%{ -% machine leftguard; -% action alpha {} -% action ws {} -% action start {} -% action fin {} -.code -main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*; -.end code -% }%% -% END GENERATE - -.graphic leftguard - -Left-guarded concatenation is equivalent to the following: - -.verbatim -expr $(unique_name,1) . expr >(unique_name,0) -.end verbatim - -.subsection Longest-Match Kleene Star -.label{longest_match_kleene_star} - -.verb|expr**| - -This version of kleene star puts a higher priority on staying in the -machine versus wrapping around and starting over. The LM kleene star is useful -when writing simple tokenizers. These machines are built by applying the -longest-match kleene star to an alternation of token patterns, as in the -following. - -% GENERATE: lmkleene -% OPT: -p -% %%{ -% machine exfinpri; -% action A {} -% action B {} -.code -# Repeat tokens, but make sure to get the longest match. -main := ( - lower ( lower | digit )* %A | - digit+ %B | - ' ' -)**; -.end code -% }%% -% END GENERATE - -.graphic lmkleene - -If a regular kleene star were used the machine above would not be able to -distinguish between extending a word and beginning a new one. This operator is -equivalent to: - -.verbatim -( expr $(unique_name,1) %(unique_name,0) )* -.end verbatim - -When the kleene star is applied, transitions that go out of the machine and -back into it are made. These are assigned a priority of zero by the leaving -transition mechanism. This is less than the priority of one assigned to the -transitions leaving the final states but not leaving the machine. When -these transitions clash on the same character, the -transition that stays in the machine takes precedence. The transition -that wraps around is dropped. - -Note that this operator does not build a scanner in the traditional sense -because there is never any backtracking. To build a scanner with backtracking -use the Longest-Match machine construction described in Section -.ref{generating-scanners}. - -.chapter Interface to Host Program - -The Ragel code generator is very flexible. The generated code has no -dependencies and can be inserted in any function, perhaps inside a loop if -desired. The user is responsible for declaring and initializing a number of -required variables, including the current state and the pointer to the input -stream. These can live in any scope. Control of the input processing loop is -also possible: the user may break out of the processing loop and return to it -at any time. - -In the case of the C, D, and Go host languages, Ragel is able to generate very -fast-running code that implements state machines as directly executable code. -Since very large files strain the host language compiler, table-based code -generation is also supported. In the future we hope to provide a partitioned, -directly executable format that is able to reduce the burden on the host -compiler by splitting large machines across multiple functions. - -In the case of Java and Ruby, table-based code generation is the only code -style supported. In the future this may be expanded to include other code -styles. - -Ragel can be used to parse input in one block, or it can be used to parse input -in a sequence of blocks as it arrives from a file or socket. Parsing the input -in a sequence of blocks brings with it a few responsibilities. If the parser -utilizes a scanner, care must be taken to not break the input stream anywhere -but token boundaries. If pointers to the input stream are taken during -parsing, care must be taken to not use a pointer that has been invalidated by -movement to a subsequent block. If the current input data pointer is moved -backwards it must not be moved past the beginning of the current block. - -Figure .ref{basic-example} shows a simple Ragel program that does not have any -actions. The example tests the first argument of the program against a number -pattern and then prints the machine's acceptance status. - -.figure basic-example -.verbatim -#include <stdio.h> -#include <string.h> -%%{ - machine foo; - write data; -}%% -int main( int argc, char **argv ) -{ - int cs; - if ( argc > 1 ) { - char *p = argv[1]; - char *pe = p + strlen( p ); - %%{ - main := [0-9]+ ( '.' [0-9]+ )?; - - write init; - write exec; - }%% - } - printf("result = %i\n", cs >= foo_first_final ); - return 0; -} -.end verbatim -.caption A basic Ragel example without any actions. -.end figure - -.section Variables Used by Ragel - -There are a number of variables that Ragel expects the user to declare. At a -very minimum the .verb|cs|, .verb|p| and .verb|pe| variables must be declared. -In Go, Java and Ruby code the .verb|data| variable must also be declared. If -EOF actions are used then the .verb|eof| variable is required. If -stack-based state machine control flow statements are used then the -.verb|stack| and .verb|top| variables are required. If a scanner is declared -then the .verb|act|, .verb|ts| and .verb|te| variables must be -declared. - -.itemize - -.item .verb|cs| - Current state. This must be an integer and it should persist -across invocations of the machine when the data is broken into blocks that are -processed independently. This variable may be modified from outside the -execution loop, but not from within. - -.item .verb|p| - Data pointer. In C/D code this variable is expected to be a -pointer to the character data to process. It should be initialized to the -beginning of the data block on every run of the machine. In Go, Java and Ruby it is -used as an offset to .verb|data| and must be an integer. In this case it should -be initialized to zero on every run of the machine. - -.item .verb|pe| - Data end pointer. This should be initialized to .verb|p| plus -the data length on every run of the machine. In Go, Java and Ruby code this should -be initialized to the data length. - -.item .verb|eof| - End of file pointer. This should be set to .verb|pe| when -the buffer block being processed is the last one, otherwise it should be set to -null. In Go, Java and Ruby code .verb|-1| must be used instead of null. If the EOF -event can be known only after the final buffer block has been processed, then -it is possible to set .verb|p = pe = eof| and run the execute block. - -.item .verb|data| - This variable is only required in Go, Java and Ruby code. It -must be an array containting the data to process. - -.item .verb|stack| - This must be an array of integers. It is used to store -integer values representing states. If the stack must resize dynamically the -Pre-push and Post-Pop statements can be used to do this (Sections -.ref{prepush} and .ref{postpop}). - -.item .verb|top| - This must be an integer value and will be used as an offset -to .verb|stack|, giving the next available spot on the top of the stack. - -.item .verb|act| - This must be an integer value. It is a variable sometimes -used by scanner code to keep track of the most recent successful pattern match. - -.item .verb|ts| - This must be a pointer to character data. In Go, Java and -Ruby code this must be an integer. See Section .ref{generating-scanners} for -more information. - -.item .verb|te| - Also a pointer to character data. - -.end itemize - -.section Alphtype Statement - -.verbatim -alphtype unsigned int; -.end verbatim - -The alphtype statement specifies the alphabet data type that the machine -operates on. During the compilation of the machine, integer literals are -expected to be in the range of possible values of the alphtype. The default -is .verb|char| for all languages except Go where the default is .verb|byte|. - -.multicols -C/C++/Objective-C: -.verbatim - char unsigned char - short unsigned short - int unsigned int - long unsigned long -.end verbatim - -Go: -.verbatim - byte - int8 uint8 - int16 uint16 - int32 uint32 - int -.end verbatim - -Ruby: -.verbatim - char - int -.end verbatim - -.columnbreak - -Java: -.verbatim - char - byte - short - int -.end verbatim - -D: -.verbatim - char - byte ubyte - short ushort - wchar - int uint - dchar -.end verbatim - -.end multicols - -.section Getkey Statement - -.verbatim -getkey fpc->id; -.end verbatim - -This statement specifies to Ragel how to retrieve the current character from -from the pointer to the current element (.verb|p|). Any expression that returns -a value of the alphabet type -may be used. The getkey statement may be used for looking into element -structures or for translating the character to process. The getkey expression -defaults to .verb|(*p)|. In goto-driven machines the getkey expression may be -evaluated more than once per element processed, therefore it should not incur a -large cost nor preclude optimization. - -.section Access Statement - -.verbatim -access fsm->; -.end verbatim - -The access statement specifies how the generated code should -access the machine data that is persistent across processing buffer blocks. -This applies to all variables except .verb|p|, .verb|pe| and .verb|eof|. This includes -.verb|cs|, .verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|. -The access statement is useful if a machine is to be encapsulated inside a -structure in C code. It can be used to give the name of -a pointer to the structure. - -.section Variable Statement - -.verbatim -variable p fsm->p; -.end verbatim - -The variable statement specifies how to access a specific -variable. All of the variables that are declared by the user and -used by Ragel can be changed. This includes .verb|p|, .verb|pe|, .verb|eof|, .verb|cs|, -.verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|. -In Go, Ruby and Java code generation the .verb|data| variable can also be changed. - -.section Pre-Push Statement -.label{prepush} - -.verbatim -prepush { - /* stack growing code */ -} -.end verbatim - -The prepush statement allows the user to supply stack management code that is -written out during the generation of fcall, immediately before the current -state is pushed to the stack. This statement can be used to test the number of -available spaces and dynamically grow the stack if necessary. - -.section Post-Pop Statement -.label{postpop} - -.verbatim -postpop { - /* stack shrinking code */ -} -.end verbatim - -The postpop statement allows the user to supply stack management code that is -written out during the generation of fret, immediately after the next state is -popped from the stack. This statement can be used to dynamically shrink the -stack. - -.section Write Statement -.label{write-statement} - -.verbatim -write <component> [options]; -.end verbatim - -The write statement is used to generate parts of the machine. -There are seven -components that can be generated by a write statement. These components make up the -state machine's data, initialization code, execution code, and export definitions. -A write statement may appear before a machine is fully defined. -This allows one to write out the data first then later define the machine where -it is used. An example of this is shown in Figure .ref{fbreak-example}. - -.subsection Write Data -.verbatim -write data [options]; -.end verbatim - -The write data statement causes Ragel to emit the constant static data needed -by the machine. In table-driven output styles (see Section .ref{genout}) this -is a collection of arrays that represent the states and transitions of the -machine. In goto-driven machines much less data is emitted. At the very -minimum a start state .verb|name_start| is generated. All variables written -out in machine data have both the .verb|static| and .verb|const| properties and -are prefixed with the name of the machine and an -underscore. The data can be placed inside a class, inside a function, or it can -be defined as global data. - -Two variables are written that may be used to test the state of the machine -after a buffer block has been processed. The .verb|name_error| variable gives -the id of the state that the machine moves into when it cannot find a valid -transition to take. The machine immediately breaks out of the processing loop when -it finds itself in the error state. The error variable can be compared to the -current state to determine if the machine has failed to parse the input. If the -machine is complete, that is from every state there is a transition to a proper -state on every possible character of the alphabet, then no error state is required -and this variable will be set to -1. - -The .verb|name_first_final| variable stores the id of the first final state. -All of the machine's states are sorted by their final state status before -having their ids assigned. Checking if the machine has accepted its input can -then be done by checking if the current state is greater-than or equal to the -first final state. - -Data generation has several options: - -.list -.li .verb|noerror | - Do not generate the integer variable that gives the id of the error state. -.li .verb|nofinal | - Do not generate the integer variable that gives the id of the first final state. -.li .verb|noprefix | - Do not prefix the variable names with the name of the machine. -.end list - -.figure fbreak-example -.verbatim -#include <stdio.h> -%% machine foo; -%% write data; -int main( int argc, char **argv ) -{ - int cs, res = 0; - if ( argc > 1 ) { - char *p = argv[1]; - %%{ - main := - [a-z]+ - 0 @{ res = 1; fbreak; }; - write init; - write exec noend; - }%% - } - printf("execute = %i\n", res ); - return 0; -} -.end verbatim -.caption Use of .tt{noend} write option and the .tt{fbreak} statement for -processing a string. -.end figure - -.subsection Write Start, First Final and Error - -.verbatim -write start; -write first_final; -write error; -.end verbatim - -These three write statements provide an alternative means of accessing the -.verb|start|, .verb|first_final| and .verb|error| states. If there are many -different machine specifications in one file it is easy to get the prefix for -these wrong. This is especially true if the state machine boilerplate is -frequently made by a copy-paste-edit process. These write statements allow the -problem to be avoided. They can be used as follows: - -.verbatim -/* Did parsing succeed? */ -if ( cs < %%{ write first_final; }%% ) { - result = ERR_PARSE_ERROR; - goto fail; -} -.end verbatim - -.subsection Write Init -.verbatim -write init [options]; -.end verbatim - -The write init statement causes Ragel to emit initialization code. This should -be executed once before the machine is started. At a very minimum this sets the -current state to the start state. If other variables are needed by the -generated code, such as call stack variables or scanner management -variables, they are also initialized here. - -The .verb|nocs| option to the write init statement will cause ragel to skip -intialization of the cs variable. This is useful if the user wishes to use -custom logic to decide which state the specification should start in. - -.subsection Write Exec -.verbatim -write exec [options]; -.end verbatim - -The write exec statement causes Ragel to emit the state machine's execution code. -Ragel expects several variables to be available to this code. At a very minimum, the -generated code needs access to the current character position .verb|p|, the ending -position .verb|pe| and the current state .verb|cs| (though .verb|pe| -can be omitted using the .verb|noend| write option). -The .verb|p| variable is the cursor that the execute code will -used to traverse the input. The .verb|pe| variable should be set up to point to one -position past the last valid character in the buffer. - -Other variables are needed when certain features are used. For example using -the .verb|fcall| or .verb|fret| statements requires .verb|stack| and -.verb|top| variables to be defined. If a longest-match construction is used, -variables for managing backtracking are required. - -The write exec statement has one option. The .verb|noend| option tells Ragel -to generate code that ignores the end position .verb|pe|. In this -case the user must explicitly break out of the processing loop using -.verb|fbreak|, otherwise the machine will continue to process characters until -it moves into the error state. This option is useful if one wishes to process a -null terminated string. Rather than traverse the string to discover then length -before processing the input, the user can break out when the null character is -seen. The example in Figure .ref{fbreak-example} shows the use of the -.verb|noend| write option and the .verb|fbreak| statement for processing a string. - -.subsection Write Exports -.label{export} - -.verbatim -write exports; -.end verbatim - -The export feature can be used to export simple machine definitions. Machine definitions -are marked for export using the .verb|export| keyword. - -.verbatim -export machine_to_export = 0x44; -.end verbatim - -When the write exports statement is used these machines are -written out in the generated code. Defines are used for C and constant integers -are used for D, Java and Ruby. See Section .ref{import} for a description of the -import statement. - -.section Maintaining Pointers to Input Data - -In the creation of any parser it is not uncommon to require the collection of -the data being parsed. It is always possible to collect data into a growable -buffer as the machine moves over it, however the copying of data is a somewhat -wasteful use of processor cycles. The most efficient way to collect data from -the parser is to set pointers into the input then later reference them. This -poses a problem for uses of Ragel where the input data arrives in blocks, such -as over a socket or from a file. If a pointer is set in one buffer block but -must be used while parsing a following buffer block, some extra consideration -to correctness must be made. - -The scanner constructions exhibit this problem, requiring the maintenance -code described in Section .ref{generating-scanners}. If a longest-match -construction has been used somewhere in the machine then it is possible to -take advantage of the required prefix maintenance code in the driver program to -ensure pointers to the input are always valid. If laying down a pointer one can -set .verb|ts| at the same spot or ahead of it. When data is shifted in -between loops the user must also shift the pointer. In this way it is possible -to maintain pointers to the input that will always be consistent. - -.figure line-oriented -.verbatim - int have = 0; - while ( 1 ) { - char *p, *pe, *data = buf + have; - int len, space = BUFSIZE - have; - - if ( space == 0 ) { - fprintf(stderr, "BUFFER OUT OF SPACE\n"); - exit(1); - } - - len = fread( data, 1, space, stdin ); - if ( len == 0 ) - break; - - /* Find the last newline by searching backwards. */ - p = buf; - pe = data + len - 1; - while ( *pe != '\n' && pe >= buf ) - pe--; - pe += 1; - - %% write exec; - - /* How much is still in the buffer? */ - have = data + len - pe; - if ( have > 0 ) - memmove( buf, pe, have ); - - if ( len < space ) - break; - } -.end verbatim -.caption An example of line-oriented processing. -.end figure - -In general, there are two approaches for guaranteeing the consistency of -pointers to input data. The first approach is the one just described; -lay down a marker from an action, -then later ensure that the data the marker points to is preserved ahead of -the buffer on the next execute invocation. This approach is good because it -allows the parser to decide on the pointer-use boundaries, which can be -arbitrarily complex parsing conditions. A downside is that it requires any -pointers that are set to be corrected in between execute invocations. - -The alternative is to find the pointer-use boundaries before invoking the execute -routine, then pass in the data using these boundaries. For example, if the -program must perform line-oriented processing, the user can scan backwards from -the end of an input block that has just been read in and process only up to the -first found newline. On the next input read, the new data is placed after the -partially read line and processing continues from the beginning of the line. -An example of line-oriented processing is given in Figure .ref{line-oriented}. - -.section Specifying the Host Language - -The .verb|ragel| program has a number of options for specifying the host -language. The host-language options are: - -.itemize -.item .verb|-C | for C/C++/Objective-C code (default) -.item .verb|-D | for D code. -.item .verb|-Z | for Go code. -.item .verb|-J | for Java code. -.item .verb|-R | for Ruby code. -.item .verb|-A | for C\# code. -.end itemize - -.section Choosing a Generated Code Style -.label{genout} - -There are three styles of code output to choose from. Code style affects the -size and speed of the compiled binary. Changing code style does not require any -change to the Ragel program. There are two table-driven formats and a goto -driven format. - -In addition to choosing a style to emit, there are various levels of action -code reuse to choose from. The maximum reuse levels (.verb|-T0|, .verb|-F0| -and .verb|-G0|) ensure that no FSM action code is ever duplicated by encoding -each transition's action list as static data and iterating -through the lists on every transition. This will normally result in a smaller -binary. The less action reuse options (.verb|-T1|, .verb|-F1| and .verb|-G1|) -will usually produce faster running code by expanding each transition's action -list into a single block of code, eliminating the need to iterate through the -lists. This duplicates action code instead of generating the logic necessary -for reuse. Consequently the binary will be larger. However, this tradeoff applies to -machines with moderate to dense action lists only. If a machine's transitions -frequently have less than two actions then the less reuse options will actually -produce both a smaller and a faster running binary due to less action sharing -overhead. The best way to choose the appropriate code style for your -application is to perform your own tests. - -The table-driven FSM represents the state machine as constant static data. There are -tables of states, transitions, indices and actions. The current state is -stored in a variable. The execution is simply a loop that looks up the current -state, looks up the transition to take, executes any actions and moves to the -target state. In general, the table-driven FSM can handle any machine, produces -a smaller binary and requires a less expensive host language compile, but -results in slower running code. Since the table-driven format is the most -flexible it is the default code style. - -The flat table-driven machine is a table-based machine that is optimized for -small alphabets. Where the regular table machine uses the current character as -the key in a binary search for the transition to take, the flat table machine -uses the current character as an index into an array of transitions. This is -faster in general, however is only suitable if the span of possible characters -is small. - -The goto-driven FSM represents the state machine using goto and switch -statements. The execution is a flat code block where the transition to take is -computed using switch statements and directly executable binary searches. In -general, the goto FSM produces faster code but results in a larger binary and a -more expensive host language compile. - -The goto-driven format has an additional action reuse level (.verb|-G2|) that -writes actions directly into the state transitioning logic rather than putting -all the actions together into a single switch. Generally this produces faster -running code because it allows the machine to encode the current state using -the processor's instruction pointer. Again, sparse machines may actually -compile to smaller binaries when .verb|-G2| is used due to less state and -action management overhead. For many parsing applications .verb|-G2| is the -preferred output format. - -.center - -Code Output Style Options - -.tabular -.row .verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\# -.row .verb|-T1|&binary search, expanded actions&C/D/Ruby/C\# -.row .verb|-F0|&flat table-driven&C/D/Ruby/C\# -.row .verb|-F1|&flat table, expanded actions&C/D/Ruby/C\# -.row .verb|-G0|&goto-driven&C/D/C\# -.row .verb|-G1|&goto, expanded actions&C/D/C\# -.row .verb|-G2|&goto, in-place actions&C/D/Go -.end tabular -.end center - -.chapter Beyond the Basic Model - -.section Parser Modularization -.label{modularization} - -It is possible to use Ragel's machine construction and action embedding -operators to specify an entire parser using a single regular expression. In -many cases this is the desired way to specify a parser in Ragel. However, in -some scenarios the language to parse may be so large that it is difficult to -think about it as a single regular expression. It may also shift between distinct -parsing strategies, in which case modularization into several coherent blocks -of the language may be appropriate. - -It may also be the case that patterns that compile to a large number of states -must be used in a number of different contexts and referencing them in each -context results in a very large state machine. In this case, an ability to reuse -parsers would reduce code size. - -To address this, distinct regular expressions may be instantiated and linked -together by means of a jumping and calling mechanism. This mechanism is -analogous to the jumping to and calling of processor instructions. A jump -command, given in action code, causes control to be immediately passed to -another portion of the machine by way of setting the current state variable. A -call command causes the target state of the current transition to be pushed to -a state stack before control is transferred. Later on, the original location -may be returned to with a return statement. In the following example, distinct -state machines are used to handle the parsing of two types of headers. - -% GENERATE: call -% %%{ -% machine call; -.code -action return { fret; } -action call_date { fcall date; } -action call_name { fcall name; } - -# A parser for date strings. -date := [0-9][0-9] '/' - [0-9][0-9] '/' - [0-9][0-9][0-9][0-9] '\n' @return; - -# A parser for name strings. -name := ( [a-zA-Z]+ | ' ' )** '\n' @return; - -# The main parser. -headers = - ( 'from' | 'to' ) ':' @call_name | - ( 'departed' | 'arrived' ) ':' @call_date; - -main := headers*; -.end code -% }%% -% %% write data; -% void f() -% { -% %% write init; -% %% write exec; -% } -% END GENERATE - -Calling and jumping should be used carefully as they are operations that take -one out of the domain of regular languages. A machine that contains a call or -jump statement in one of its actions should be used as an argument to a machine -construction operator only with considerable care. Since DFA transitions may -actually represent several NFA transitions, a call or jump embedded in one -machine can inadvertently terminate another machine that it shares prefixes -with. Despite this danger, theses statements have proven useful for tying -together sub-parsers of a language into a parser for the full language, -especially for the purpose of modularizing code and reducing the number of -states when the machine contains frequently recurring patterns. - -Section .ref{vals} describes the jump and call statements that are used to -transfer control. These statements make use of two variables that must be -declared by the user, .verb|stack| and .verb|top|. The .verb|stack| variable -must be an array of integers and .verb|top| must be a single integer, which -will point to the next available space in .verb|stack|. Sections .ref{prepush} -and .ref{postpop} describe the Pre-Push and Post-Pop statements which can be -used to implement a dynamically resizable array. - -.section Referencing Names -.label{labels} - -This section describes how to reference names in epsilon transitions (Section -.ref{state-charts}) and -action-based control-flow statements such as .verb|fgoto|. There is a hierarchy -of names implied in a Ragel specification. At the top level are the machine -instantiations. Beneath the instantiations are labels and references to machine -definitions. Beneath those are more labels and references to definitions, and -so on. - -Any name reference may contain multiple components separated with the .verb|::| -compound symbol. The search for the first component of a name reference is -rooted at the join expression that the epsilon transition or action embedding -is contained in. If the name reference is not contained in a join, -the search is rooted at the machine definition that the epsilon transition or -action embedding is contained in. Each component after the first is searched -for beginning at the location in the name tree that the previous reference -component refers to. - -In the case of action-based references, if the action is embedded more than -once, the local search is performed for each embedding and the result is the -union of all the searches. If no result is found for action-based references then -the search is repeated at the root of the name tree. Any action-based name -search may be forced into a strictly global search by prefixing the name -reference with .verb|::|. - -The final component of the name reference must resolve to a unique entry point. -If a name is unique in the entire name tree it can be referenced as is. If it -is not unique it can be specified by qualifying it with names above it in the -name tree. However, it can always be renamed. - -% FIXME: Should fit this in somewhere. -% Some kinds of name references are illegal. Cannot call into longest-match -% machine, can only call its start state. Cannot make a call to anywhere from -% any part of a longest-match machine except a rule's action. This would result -% in an eventual return to some point inside a longest-match other than the -% start state. This is banned for the same reason a call into the LM machine is -% banned. - - -.section Scanners -.label{generating-scanners} - -Scanners are very much intertwined with regular-languages and their -corresponding processors. For this reason Ragel supports the definition of -scanners. The generated code will repeatedly attempt to match patterns from a -list, favouring longer patterns over shorter patterns. In the case of -equal-length matches, the generated code will favour patterns that appear ahead -of others. When a scanner makes a match it executes the user code associated -with the match, consumes the input then resumes scanning. - -.verbatim -<machine_name> := |* - pattern1 => action1; - pattern2 => action2; - ... - *|; -.end verbatim - -On the surface, Ragel scanners are similar to those defined by Lex. Though -there is a key distinguishing feature: patterns may be arbitrary Ragel -expressions and can therefore contain embedded code. With a Ragel-based scanner -the user need not wait until the end of a pattern before user code can be -executed. - -Scanners can be used to process sub-languages, as well as for tokenizing -programming languages. In the following example a scanner is used to tokenize -the contents of a header field. - -.code -word = [a-z]+; -head_name = 'Header'; - -header := |* - word; - ' '; - '\n' => { fret; }; -*|; - -main := ( head_name ':' @{ fcall header; } )*; -.end code - -The scanner construction has a purpose similar to the longest-match kleene star -operator .verb|**|. The key -difference is that a scanner is able to backtrack to match a previously matched -shorter string when the pursuit of a longer string fails. For this reason the -scanner construction operator is not a pure state machine construction -operator. It relies on several variables that enable it to backtrack and make -pointers to the matched input text available to the user. For this reason -scanners must be immediately instantiated. They cannot be defined inline or -referenced by another expression. Scanners must be jumped to or called. - -Scanners rely on the .verb|ts|, .verb|te| and .verb|act| -variables to be present so that they can backtrack and make pointers to the -matched text available to the user. If input is processed using multiple calls -to the execute code then the user must ensure that when a token is only -partially matched that the prefix is preserved on the subsequent invocation of -the execute code. - -The .verb|ts| variable must be defined as a pointer to the input data. -It is used for recording where the current token match begins. This variable -may be used in action code for retrieving the text of the current match. Ragel -ensures that in between tokens and outside of the longest-match machines that -this pointer is set to null. In between calls to the execute code the user must -check if .verb|ts| is set and if so, ensure that the data it points to is -preserved ahead of the next buffer block. This is described in more detail -below. - -The .verb|te| variable must also be defined as a pointer to the input data. -It is used for recording where a match ends and where scanning of the next -token should begin. This can also be used in action code for retrieving the -text of the current match. - -The .verb|act| variable must be defined as an integer type. It is used for -recording the identity of the last pattern matched when the scanner must go -past a matched pattern in an attempt to make a longer match. If the longer -match fails it may need to consult the .verb|act| variable. In some cases, use -of the .verb|act| -variable can be avoided because the value of the current state is enough -information to determine which token to accept, however in other cases this is -not enough and so the .verb|act| variable is used. - -When the longest-match operator is in use, the user's driver code must take on -some buffer management functions. The following algorithm gives an overview of -the steps that should be taken to properly use the longest-match operator. - -.itemize -.item Read a block of input data. -.item Run the execute code. -.item If .verb|ts| is set, the execute code will expect the incomplete -token to be preserved ahead of the buffer on the next invocation of the execute -code. -.itemize -.item Shift the data beginning at .verb|ts| and ending at .verb|pe| to the -beginning of the input buffer. -.item Reset .verb|ts| to the beginning of the buffer. -.item Shift .verb|te| by the distance from the old value of .verb|ts| -to the new value. The .verb|te| variable may or may not be valid. There is -no way to know if it holds a meaningful value because it is not kept at null -when it is not in use. It can be shifted regardless. -.end itemize -.item Read another block of data into the buffer, immediately following any -preserved data. -.item Run the scanner on the new data. -.end itemize - -Figure .ref{preserve_example} shows the required handling of an input stream in -which a token is broken by the input block boundaries. After processing up to -and including the ``t'' of ``characters'', the prefix of the string token must be -retained and processing should resume at the ``e'' on the next iteration of -the execute code. - -If one uses a large input buffer for collecting input then the number of times -the shifting must be done will be small. Furthermore, if one takes care not to -define tokens that are allowed to be very long and instead processes these -items using pure state machines or sub-scanners, then only a small amount of -data will ever need to be shifted. - -.figure preserve_example -.verbatim - a) A stream "of characters" to be scanned. - | | | - p ts pe - - b) "of characters" to be scanned. - | | | - ts p pe -.end verbatim -.caption Following an invocation of the execute code there may be a partially -matched token (a). The data of the partially matched token -must be preserved ahead of the new data on the next invocation (b). -.end figure - -Since scanners attempt to make the longest possible match of input, patterns -such as identifiers require one character of lookahead in order to trigger a -match. In the case of the last token in the input stream the user must ensure -that the .verb|eof| variable is set so that the final token is flushed out. - -An example scanner processing loop is given in Figure .ref{scanner-loop}. - -.figure scanner-loop -.verbatim - int have = 0; - bool done = false; - while ( !done ) { - /* How much space is in the buffer? */ - int space = BUFSIZE - have; - if ( space == 0 ) { - /* Buffer is full. */ - cerr << "TOKEN TOO BIG" << endl; - exit(1); - } - - /* Read in a block after any data we already have. */ - char *p = inbuf + have; - cin.read( p, space ); - int len = cin.gcount(); - - char *pe = p + len; - char *eof = 0; - - /* If no data was read indicate EOF. */ - if ( len == 0 ) { - eof = pe; - done = true; - } - - %% write exec; - - if ( cs == Scanner_error ) { - /* Machine failed before finding a token. */ - cerr << "PARSE ERROR" << endl; - exit(1); - } - - if ( ts == 0 ) - have = 0; - else { - /* There is a prefix to preserve, shift it over. */ - have = pe - ts; - memmove( inbuf, ts, have ); - te = inbuf + (te-ts); - ts = inbuf; - } - } -.end verbatim -.caption A processing loop for a scanner. -.end figure - -.section State Charts -.label{state-charts} - -In addition to supporting the construction of state machines using regular -languages, Ragel provides a way to manually specify state machines using -state charts. The comma operator combines machines together without any -implied transitions. The user can then manually link machines by specifying -epsilon transitions with the .verb|->| operator. Epsilon transitions are drawn -between the final states of a machine and entry points defined by labels. This -makes it possible to build machines using the explicit state-chart method while -making minimal changes to the Ragel language. - -An interesting feature of Ragel's state chart construction method is that it -can be mixed freely with regular expression constructions. A state chart may be -referenced from within a regular expression, or a regular expression may be -used in the definition of a state chart transition. - -.subsection Join - -.verb|expr , expr , ...| - -Join a list of machines together without -drawing any transitions, without setting up a start state, and without -designating any final states. Transitions between the machines may be specified -using labels and epsilon transitions. The start state must be explicity -specified with the ``start'' label. Final states may be specified with an -epsilon transition to the implicitly created ``final'' state. The join -operation allows one to build machines using a state chart model. - -.subsection Label - -.verb|label: expr| - -Attaches a label to an expression. Labels can be -used as the target of epsilon transitions and explicit control transfer -statements such as .verb|fgoto| and .verb|fnext| in action -code. - -.subsection Epsilon - -.verb|expr -> label| - -Draws an epsilon transition to the state defined -by .verb|label|. Epsilon transitions are made deterministic when join -operators are evaluated. Epsilon transitions that are not in a join operation -are made deterministic when the machine definition that contains the epsilon is -complete. See Section .ref{labels} for information on referencing labels. - -.subsection Simplifying State Charts - -There are two benefits to providing state charts in Ragel. The first is that it -allows us to take a state chart with a full listing of states and transitions -and simplify it in selective places using regular expressions. - -The state chart method of specifying parsers is very common. It is an -effective programming technique for producing robust code. The key disadvantage -becomes clear when one attempts to comprehend a large parser specified in this -way. These programs usually require many lines, causing logic to be spread out -over large distances in the source file. Remembering the function of a large -number of states can be difficult and organizing the parser in a sensible way -requires discipline because branches and repetition present many file layout -options. This kind of programming takes a specification with inherent -structure such as looping, alternation and concatenation and expresses it in a -flat form. - -If we could take an isolated component of a manually programmed state chart, -that is, a subset of states that has only one entry point, and implement it -using regular language operators then we could eliminate all the explicit -naming of the states contained in it. By eliminating explicitly named states -and replacing them with higher-level specifications we simplify a state machine -specification. - -For example, sometimes chains of states are needed, with only a small number of -possible characters appearing along the chain. These can easily be replaced -with a concatenation of characters. Sometimes a group of common states -implement a loop back to another single portion of the machine. Rather than -manually duplicate all the transitions that loop back, we may be able to -express the loop using a kleene star operator. - -Ragel allows one to take this state map simplification approach. We can build -state machines using a state map model and implement portions of the state map -using regular languages. In place of any transition in the state machine, -entire sub-machines can be given. These can encapsulate functionality -defined elsewhere. An important aspect of the Ragel approach is that when we -wrap up a collection of states using a regular expression we do not lose -access to the states and transitions. We can still execute code on the -transitions that we have encapsulated. - -.subsection Dropping Down One Level of Abstraction -.label{down} - -The second benefit of incorporating state charts into Ragel is that it permits -us to bypass the regular language abstraction if we need to. Ragel's action -embedding operators are sometimes insufficient for expressing certain parsing -tasks. In the same way that is useful for C language programmers to drop down -to assembly language programming using embedded assembler, it is sometimes -useful for the Ragel programmer to drop down to programming with state charts. - -In the following example, we wish to buffer the characters of an XML CDATA -sequence. The sequence is terminated by the string .verb|]]>|. The challenge -in our application is that we do not wish the terminating characters to be -buffered. An expression of the form .verb|any* @buffer :>> ']]>'| will not work -because the buffer will always contain the characters .verb|]]| on the end. -Instead, what we need is to delay the buffering of .verb|]| -characters until a time when we -abandon the terminating sequence and go back into the main loop. There is no -easy way to express this using Ragel's regular expression and action embedding -operators, and so an ability to drop down to the state chart method is useful. - -% GENERATE: dropdown -% OPT: -p -% %%{ -% machine dropdown; -.code -action bchar { buff( fpc ); } # Buffer the current character. -action bbrack1 { buff( "]" ); } -action bbrack2 { buff( "]]" ); } - -CDATA_body = -start: ( - ']' -> one | - (any-']') @bchar ->start -), -one: ( - ']' -> two | - [^\]] @bbrack1 @bchar ->start -), -two: ( - '>' -> final | - ']' @bbrack1 -> two | - [^>\]] @bbrack2 @bchar ->start -); -.end code -% main := CDATA_body; -% }%% -% END GENERATE - -.graphic dropdown - - -.section Semantic Conditions -.label{semantic} - -Many communication protocols contain variable-length fields, where the length -of the field is given ahead of the field as a value. This -problem cannot be expressed using regular languages because of its -context-dependent nature. The prevalence of variable-length fields in -communication protocols motivated us to introduce semantic conditions into -the Ragel language. - -A semantic condition is a block of user code that is interpreted as an -expression and evaluated immediately -before a transition is taken. If the code returns a value of true, the -transition may be taken. We can now embed code that extracts the length of a -field, then proceed to match $n$ data values. - -% GENERATE: conds1 -% OPT: -p -% %%{ -% machine conds1; -% number = digit+; -.code -action rec_num { i = 0; n = getnumber(); } -action test_len { i++ < n } -data_fields = ( - 'd' - [0-9]+ %rec_num - ':' - ( [a-z] when test_len )* -)**; -.end code -% main := data_fields; -% }%% -% END GENERATE - -.graphic conds1 - -The Ragel implementation of semantic conditions does not force us to give up the -compositional property of Ragel definitions. For example, a machine that tests -the length of a field using conditions can be unioned with another machine -that accepts some of the same strings, without the two machines interfering with -one another. The user need not be concerned about whether or not the result of the -semantic condition will affect the matching of the second machine. - -To see this, first consider that when a user associates a condition with an -existing transition, the transition's label is translated from the base character -to its corresponding value in the space that represents ``condition $c$ true''. Should -the determinization process combine a state that has a conditional transition -with another state that has a transition on the same input character but -without a condition, then the condition-less transition first has its label -translated into two values, one to its corresponding value in the space that -represents ``condition $c$ true'' and another to its corresponding value in the -space that represents ``condition $c$ false''. It -is then safe to combine the two transitions. This is shown in the following -example. Two intersecting patterns are unioned, one with a condition and one -without. The condition embedded in the first pattern does not affect the second -pattern. - -% GENERATE: conds2 -% OPT: -p -% %%{ -% machine conds2; -% number = digit+; -.code -action test_len { i++ < n } -action one { /* accept pattern one */ } -action two { /* accept pattern two */ } -patterns = - ( [a-z] when test_len )+ %one | - [a-z][a-z0-9]* %two; -main := patterns '\n'; -.end code -% }%% -% END GENERATE - -.graphic conds2 - -There are many more potential uses for semantic conditions. The user is free to -use arbitrary code and may therefore perform actions such as looking up names -in dictionaries, validating input using external parsing mechanisms or -performing checks on the semantic structure of input seen so far. In the next -section we describe how Ragel accommodates several common parser engineering -problems. - -The semantic condition feature works only with alphabet types that are smaller -in width than the .verb|long| type. To implement semantic conditions Ragel -needs to be able to allocate characters from the alphabet space. Ragel uses -these allocated characters to express "character C with condition P true" or "C -with P false." Since internally Ragel uses longs to store characters there is -no room left in the alphabet space unless an alphabet type smaller than long is -used. - -.section Implementing Lookahead - -There are a few strategies for implementing lookahead in Ragel programs. -Leaving actions, which are described in Section .ref{out-actions}, can be -used as a form of lookahead. Ragel also provides the .verb|fhold| directive -which can be used in actions to prevent the machine from advancing over the -current character. It is also possible to manually adjust the current character -position by shifting it backwards using .verb|fexec|, however when this is -done, care must be taken not to overstep the beginning of the current buffer -block. In both the use of .verb|fhold| and .verb|fexec| the user must be -cautious of combining the resulting machine with another in such a way that the -transition on which the current position is adjusted is not combined with a -transition from the other machine. - -.section Parsing Recursive Language Structures - -In general Ragel cannot handle recursive structures because the grammar is -interpreted as a regular language. However, depending on what needs to be -parsed it is sometimes practical to implement the recursive parts using manual -coding techniques. This often works in cases where the recursive structures are -simple and easy to recognize, such as in the balancing of parentheses - -One approach to parsing recursive structures is to use actions that increment -and decrement counters or otherwise recognize the entry to and exit from -recursive structures and then jump to the appropriate machine defnition using -.verb|fcall| and .verb|fret|. Alternatively, semantic conditions can be used to -test counter variables. - -A more traditional approach is to call a separate parsing function (expressed -in the host language) when a recursive structure is entered, then later return -when the end is recognized. diff --git a/test/repeat2.lm b/test/repeat2.lm index d9104492..39ffe53b 100644 --- a/test/repeat2.lm +++ b/test/repeat2.lm @@ -1,3 +1,4 @@ +##### LM ##### # # Copyright 2012 Adrian Thurston <thurston@complang.org> # @@ -544,3 +545,6864 @@ print( ~ ~\end{document} ) +##### IN ##### +.title Ragel State Machine Compiler + +.subtitle User Guide + +.author Adrian Thurston + +.license + +This document is part of Ragel, and as such, this document is +released under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2 of the License, or (at your option) +any later version. + +Ragel is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public +License along with Ragel; if not, write to the Free Software Foundation, Inc., +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +.chapter Introduction + +.section Abstract + +Regular expressions are used heavily in practice for the purpose of specifying +parsers. They are normally used as black boxes linked together with program +logic. User actions are executed in between invocations of the regular +expression engine. Adding actions before a pattern terminates requires patterns +to be broken and pasted back together with program logic. The more user actions +are needed, the less the advantages of regular expressions are seen. + +Ragel is a software development tool that allows user actions to be +embedded into the transitions of a regular expression's corresponding state +machine, eliminating the need to switch from the regular expression engine and +user code execution environment and back again. As a result, expressions can be +maximally continuous. One is free to specify an entire parser using a single +regular expression. The single-expression model affords concise and elegant +descriptions of languages and the generation of very simple, fast and robust +code. Ragel compiles executable finite state machines from a high level regular language +notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby. + +In addition to building state machines from regular expressions, Ragel allows +the programmer to directly specify state machines with state charts. These two +notations may be freely combined. There are also facilities for controlling +nondeterminism in the resulting machines and building scanners using patterns +that themselves have embedded actions. Ragel can produce code that is small and +runs very fast. Ragel can handle integer-sized alphabets and can compile very +large state machines. + +.section Motivation + +When a programmer is faced with the task of producing a parser for a +context-free language there are many tools to choose from. It is quite common +to generate useful and efficient parsers for programming languages from a +formal grammar. It is also quite common for programmers to avoid such tools +when making parsers for simple computer languages, such as file formats and +communication protocols. Such languages are often regular and tools for +processing the context-free languages are viewed as too heavyweight for the +purpose of parsing regular languages. The extra run-time effort required for +supporting the recursive nature of context-free languages is wasted. + +When we turn to the regular expression-based parsing tools, such as Lex, Re2C, +and scripting languages such as Sed, Awk and Perl we find that they are split +into two levels: a regular expression matching engine and some kind of program +logic for linking patterns together. For example, a Lex program is composed of +sets of regular expressions. The implied program logic repeatedly attempts to +match a pattern in the current set. When a match is found the associated user +code executed. It requires the user to consider a language as a sequence of +independent tokens. Scripting languages and regular expression libraries allow +one to link patterns together using arbitrary program code. This is very +flexible and powerful, however we can be more concise and clear if we avoid +gluing together regular expressions with if statements and while loops. + +This model of execution, where the runtime alternates between regular +expression matching and user code exectution places restrictions on when +action code may be executed. Since action code can only be associated with +complete patterns, any action code that must be executed before an entire +pattern is matched requires that the pattern be broken into smaller units. +Instead of being forced to disrupt the regular expression syntax and write +smaller expressions, it is desirable to retain a single expression and embed +code for performing actions directly into the transitions that move over the +characters. After all, capable programmers are astutely aware of the machinery +underlying their programs, so why not provide them with access to that +machinery? To achieve this we require an action execution model for associating +code with the sub-expressions of a regular expression in a way that does not +disrupt its syntax. + +The primary goal of Ragel is to provide developers with an ability to embed +actions into the transitions and states of a regular expression's state machine +in support of the definition of entire parsers or large sections of parsers +using a single regular expression. From the regular expression we gain a clear +and concise statement of our language. From the state machine we obtain a very +fast and robust executable that lends itself to many kinds of analysis and +visualization. + +.section Overview + +Ragel is a language for specifying state machines. The Ragel program is a +compiler that assembles a state machine definition to executable code. Ragel +is based on the principle that any regular language can be converted to a +deterministic finite state automaton. Since every regular language has a state +machine representation and vice versa, the terms regular language and state +machine (or just machine) will be used interchangeably in this document. + +Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is +designed to be generic and is not bound to any particular input or processing +method. A Ragel machine expects to have data passed to it in buffer blocks. +When there is no more input, the machine can be queried for acceptance. In +this way, a Ragel machine can be used to simply recognize a regular language +like a regular expression library. By embedding code into the regular language, +a Ragel machine can also be used to parse input. + +The Ragel language has many operators for constructing and manipulating +machines. Machines are built up from smaller machines, to bigger ones, to the +final machine representing the language that needs to be recognized or parsed. + +The core state machine construction operators are those found in most theory +of computation textbooks. They date back to the 1950s and are widely studied. +They are based on set operations and permit one to think of languages as a set +of strings. They are Union, Intersection, Difference, Concatenation and Kleene +Star. Put together, these operators make up what most people know as regular +expressions. Ragel also provides a scanner construction operator +and provides operators for explicitly constructing machines +using a state chart method. In the state chart method, one joins machines +together without any implied transitions and then explicitly specifies where +epsilon transitions should be drawn. + +The state machine manipulation operators are specific to Ragel. They allow the +programmer to access the states and transitions of regular language's +corresponding machine. There are two uses of the manipulation operators. The +first and primary use is to embed code into transitions and states, allowing +the programmer to specify the actions of the state machine. + +Ragel attempts to make the action embedding facility as intuitive as possible. +To do so, a number of issues need to be addressed. For example, when making a +nondeterministic specification into a DFA using machines that have embedded +actions, new transitions are often made that have the combined actions of +several source transitions. Ragel ensures that multiple actions associated with +a single transition are ordered consistently with respect to the order of +reference and the natural ordering implied by the construction operators. + +The second use of the manipulation operators is to assign priorities to +transitions. Priorities provide a convenient way of controlling any +nondeterminism introduced by the construction operators. Suppose two +transitions leave from the same state and go to distinct target states on the +same character. If these transitions are assigned conflicting priorities, then +during the determinization process the transition with the higher priority will +take precedence over the transition with the lower priority. The lower priority +transition gets abandoned. The transitions would otherwise be combined into a new +transition that goes to a new state that is a combination of the original +target states. Priorities are often required for segmenting machines. The most +common uses of priorities have been encoded into a set of simple operators +that should be used instead of priority embeddings whenever possible. + +For the purposes of embedding, Ragel divides transitions and states into +different classes. There are four operators for embedding actions and +priorities into the transitions of a state machine. It is possible to embed +into entering transitions, finishing transitions, all transitions and leaving +transitions. The embedding into leaving transitions is a special case. +These transition embeddings get stored in the final states of a machine. They +are transferred to any transitions that are made going out of the machine by +future concatenation or kleene star operations. + +There are several more operators for embedding actions into states. Like the +transition embeddings, there are various different classes of states that the +embedding operators access. For example, one can access start states, final +states or all states, among others. Unlike the transition embeddings, there are +several different types of state action embeddings. These are executed at +various different times during the processing of input. It is possible to embed +actions that are exectued on transitions into a state, on transitions out of a +state, on transitions taken on the error event, or on transitions taken on the +EOF event. + +Within actions, it is possible to influence the behaviour of the state machine. +The user can write action code that jumps or calls to another portion of the +machine, changes the current character being processed, or breaks out of the +processing loop. With the state machine calling feature Ragel can be used to +parse languages that are not regular. For example, one can parse balanced +parentheses by calling into a parser when an open parenthesis character is seen +and returning to the state on the top of the stack when the corresponding +closing parenthesis character is seen. More complicated context-free languages +such as expressions in C are out of the scope of Ragel. + +Ragel also provides a scanner construction operator that can be used to build +scanners much the same way that Lex is used. The Ragel generated code, which +relies on user-defined variables for backtracking, repeatedly tries to match +patterns to the input, favouring longer patterns over shorter ones and patterns +that appear ahead of others when the lengths of the possible matches are +identical. When a pattern is matched the associated action is executed. + +The key distinguishing feature between scanners in Ragel and scanners in Lex is +that Ragel patterns may be arbitrary Ragel expressions and can therefore +contain embedded code. With a Ragel-based scanner the user need not wait until +the end of a pattern before user code can be executed. + +Scanners do take Ragel out of the domain of pure state machines and require the +user to maintain the backtracking related variables. However, scanners +integrate well with regular state machine instantiations. They can be called to +or jumped to only when needed, or they can be called out of or jumped out of +when a simpler, pure state machine model is appropriate. + +Two types of output code style are available. Ragel can produce a table-driven +machine or a directly executable machine. The directly executable machine is +much faster than the table-driven. On the other hand, the table-driven machine +is more compact and less demanding on the host language compiler. It is better +suited to compiling large state machines. + +.section Related Work + +Lex is perhaps the best-known tool for constructing parsers from regular +expressions. In the Lex processing model, generated code attempts to match one +of the user's regular expression patterns, favouring longer matches over +shorter ones. Once a match is made it then executes the code associated with +the pattern and consumes the matching string. This process is repeated until +the input is fully consumed. + +Through the use of start conditions, related sets of patterns may be defined. +The active set may be changed at any time. This allows the user to define +different lexical regions. It also allows the user to link patterns together by +requiring that some patterns come before others. This is quite like a +concatenation operation. However, use of Lex for languages that require a +considerable amount of pattern concatenation is inappropriate. In such cases a +Lex program deteriorates into a manually specified state machine, where start +conditions define the states and pattern actions define the transitions. Lex +is therefore best suited to parsing tasks where the language to be parsed can +be described in terms of regions of tokens. + +Lex is useful in many scenarios and has undoubtedly stood the test of time. +There are, however, several drawbacks to using Lex. Lex can impose too much +overhead for parsing applications where buffering is not required because all +the characters are available in a single string. In these cases there is +structure to the language to be parsed and a parser specification tool can +help, but employing a heavyweight processing loop that imposes a stream +``pull'' model and dynamic input buffer allocation is inappropriate. An +example of this kind of scenario is the conversion of floating point numbers +contained in a string to their corresponding numerical values. + +Another drawback is the very issue that Ragel attempts to solve. +It is not possible to execute a user action while +matching a character contained inside a pattern. For example, if scanning a +programming language and string literals can contain newlines which must be +counted, a Lex user must break up a string literal pattern so as to associate +an action with newlines. This forces the definition of a new start condition. +Alternatively the user can reprocess the text of the matched string literal to +count newlines. + +.comment + +How ragel is different from Lex. + +Like Re2c, Ragel provides a simple execution model that does not make any +assumptions as to how the input is collected. Also, Ragel does not do any +buffering in the generated code. Consequently there are no dependencies on +external functions such as .verb|malloc|. + +If buffering is required it can be manually implemented by embedding actions +that copy the current character to a buffer, or data can be passed to the +parser using known block boundaries. If the longest-match operator is used, +Ragel requires the user to ensure that the ending portion of the input buffer +is preserved when the buffer is exhaused before a token is fully matched. The +user should move the token prefix to a new memory location, such as back to the +beginning of the input buffer, then place the subsequently read input +immediately after the prefix. + +These properties of Ragel make it more work to write a program that requires +the longest-match operator or buffering of input, however they make Ragel a +more flexible tool that can produce very simple and fast-running programs under +a variety of input acquisition arrangements. + +In Ragel, it is not necessary +to introduce start conditions to concatenate tokens and retain action +execution. Ragel allows one to structure a parser as a series of tokens, but +does not require it. + +Like Lex and Re2C, Ragel is able to process input using a longest-match +execution model, however the core of the Ragel language specifies parsers at a +much lower level. This core is built around a pure state machine model. When +building basic machines there is no implied algorithm for processing input +other than to move from state to state on the transitions of the machine. This +core of pure state machine operations makes Ragel well suited to handling +parsing problems not based on token scanning. Should one need to use a +longest-match model, the functionality is available and the lower level state +machine construction facilities can be used to specify the patterns of a +longest-match machine. + +This is not possible in Ragel. One can only program +a longest-match instantiation with a fixed set of rules. One can jump to +another longest-match machine that employs the same machine definitions in the +construction of its rules, however no states will be shared. + +In Ragel, input may be re-parsed using a +different machine, but since the action to be executed is associated with +transitions of the compiled state machine, the longest-match construction does +not permit a single rule to be excluded from the active set. It cannot be done +ahead of time nor in the excluded rule's action. + +.end comment + +The Re2C program defines an input processing model similar to that of Lex. +Re2C focuses on making generated state machines run very fast and +integrate easily into any program, free of dependencies. Re2C generates +directly executable code and is able to claim that generated parsers run nearly +as fast as their hand-coded equivalents. This is very important for user +adoption, as programmers are reluctant to use a tool when a faster alternative +exists. A consideration to ease of use is also important because developers +need the freedom to integrate the generated code as they see fit. + +Many scripting languages provide ways of composing parsers by linking regular +expressions using program logic. For example, Sed and Awk are two established +Unix scripting tools that allow the programmer to exploit regular expressions +for the purpose of locating and extracting text of interest. High-level +programming languages such as Perl, Python, PHP and Ruby all provide regular +expression libraries that allow the user to combine regular expressions with +arbitrary code. + +In addition to supporting the linking of regular expressions with arbitrary +program logic, the Perl programming language permits the embedding of code into +regular expressions. Perl embeddings do not translate into the embedding of +code into deterministic state machines. Perl regular expressions are in fact +not fully compiled to deterministic machines when embedded code is involved. +They are instead interpreted and involve backtracking. This is shown by the +following Perl program. When it is fed the input .verb|abcd| the interpretor +attempts to match the first alternative, printing .verb|a1 b1|. When this +possibility fails it backtracks and tries the second possibility, printing +.verb|a2 b2|, at which point it succeeds. + +.code +print "YES\n" if ( <STDIN> =~ + /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) | + ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x ) +.end code + +In Ragel there is no regular expression interpretor. Aside from the scanner +operator, all Ragel expressions are made into deterministic machines and the +run time simply moves from state to state as it consumes input. An equivalent +parser expressed in Ragel would attempt both of the alternatives concurrently, +printing .verb|a1 a2 b1 b2|. + +.section Development Status + +Ragel is a relatively new tool and is under continuous development. As a rough +release guide, minor revision number changes are for implementation +improvements and feature additions. Major revision number changes are for +implementation and language changes that do not preserve backwards +compatibility. Though in the past this has not always held true: changes that +break code have crept into minor version number changes. Typically, the +documentation lags behind the development in the interest of documenting only +the lasting features. The latest changes are always documented in the ChangeLog +file. + +.chapter Constructing State Machines + +.section Ragel State Machine Specifications + +A Ragel input file consists of a program in the host language that contains embedded machine +specifications. Ragel normally passes input straight to output. When it sees +a machine specification it stops to read the Ragel statements and possibly generate +code in place of the specification. +Afterwards it continues to pass input through. There +can be any number of FSM specifications in an input file. A multi-line FSM spec +starts with .verb|%%{| and ends with .verb|}%%|. A single-line FSM spec starts +with .verb|%%| and ends at the first newline. + +While Ragel is looking for FSM specifications it does basic lexical analysis on +the surrounding input. It interprets literal strings and comments so a +.verb|%%| sequence in either of those will not trigger the parsing of an FSM +specification. Ragel does not pass the input through any preprocessor nor does it +interpret preprocessor directives itself so includes, defines and ifdef logic +cannot be used to alter the parse of a Ragel input file. It is therefore not +possible to use an .verb|#if 0| directive to comment out a machine as is +commonly done in C code. As an alternative, a machine can be prevented from +causing any generated output by commenting out write statements. + +In Figure .ref{cmd-line-parsing}, a multi-line specification is used to define the +machine and single line specifications are used to trigger the writing of the machine +data and execution code. + +.figure cmd-line-parsing +.multicols +.verbatim +#include <string.h> +#include <stdio.h> + +%%{ + machine foo; + main := + ( 'foo' | 'bar' ) + 0 @{ res = 1; }; +}%% + +%% write data; +.end verbatim +.columnbreak +.verbatim +int main( int argc, char **argv ) +{ + int cs, res = 0; + if ( argc > 1 ) { + char *p = argv[1]; + char *pe = p + strlen(p) + 1; + %% write init; + %% write exec; + } + printf("result = %i\n", res ); + return 0; +} +.end verbatim +.end multicols +.caption Parsing a command line argument. +.end figure + +.subsection Naming Ragel Blocks + +.verbatim +machine fsm_name; +.end verbatim + +The .verb|machine| statement gives the name of the FSM. If present in a +specification, this statement must appear first. If a machine specification +does not have a name then Ragel uses the previous specification name. If no +previous specification name exists then this is an error. Because FSM +specifications persist in memory, a machine's statements can be spread across +multiple machine specifications. This allows one to break up a machine across +several files or draw in statements that are common to multiple machines using +the .verb|include| statement. + +.subsection Machine Definition +.label{definition} + +.verbatim +<name> = <expression>; +.end verbatim + +The machine definition statement associates an FSM expression with a name. Machine +expressions assigned to names can later be referenced in other expressions. A +definition statement on its own does not cause any states to be generated. It is simply a +description of a machine to be used later. States are generated only when a definition is +instantiated, which happens when a definition is referenced in an instantiated +expression. + +.subsection Machine Instantiation +.label{instantiation} + +.verbatim +<name> := <expression>; +.end verbatim + +The machine instantiation statement generates a set of states representing an +expression. Each instantiation generates a distinct set of states. The starting +state of the instantiation is written in the data section of the generated code +using the instantiation name. If a machine named +.verb|main| is instantiated, its start state is used as the +specification's start state and is assigned to the .verb|cs| variable by the +.verb|write init| command. If no .verb|main| machine is given, the start state +of the last machine instantiation to appear is used as the specification's +start state. + +From outside the execution loop, control may be passed to any machine by +assigning the entry point to the .verb|cs| variable. From inside the execution +loop, control may be passed to any machine instantiation using .verb|fcall|, +.verb|fgoto| or .verb|fnext| statements. + +.subsection Including Ragel Code + +.verbatim +include FsmName "inputfile.rl"; +.end verbatim + +The .verb|include| statement can be used to draw in the statements of another FSM +specification. Both the name and input file are optional, however at least one +must be given. Without an FSM name, the given input file is searched for an FSM +of the same name as the current specification. Without an input file the +current file is searched for a machine of the given name. If both are present, +the given input file is searched for a machine of the given name. + +Ragel searches for included files from the location of the current file. +Additional directories can be added to the search path using the .verb|-I| +option. + +.subsection Importing Definitions +.label{import} + +.verbatim +import "inputfile.h"; +.end verbatim + +The .verb|import| statement scrapes a file for sequences of tokens that match +the following forms. Ragel treats these forms as state machine definitions. + +.list +.li .verb|name '=' number| +.li .verb|name '=' lit_string| +.li .verb|'define' name number| +.li .verb|'define' name lit_string| +.end list + +If the input file is a Ragel program then tokens inside any Ragel +specifications are ignored. See Section .ref{export} for a description of +exporting machine definitions. + +Ragel searches for imported files from the location of the current file. +Additional directories can be added to the search path using the .verb|-I| +option. + +.section Lexical Analysis of a Ragel Block +.label{lexing} + +Within a machine specification the following lexical rules apply to the input. + +.itemize + +.item The .verb|#| symbol begins a comment that terminates at the next newline. + +.item The symbols .verb|""|, .verb|''|, .verb|//|, .verb|[]| behave as the +delimiters of literal strings. Within them, the following escape sequences +are interpreted: + +.verb| \0 \a \b \t \n \v \f \r| + +A backslash at the end of a line joins the following line onto the current. A +backslash preceding any other character removes special meaning. This applies +to terminating characters and to special characters in regular expression +literals. As an exception, regular expression literals do not support escape +sequences as the operands of a range within a list. See the bullet on regular +expressions in Section .ref{basic}. + +.item The symbols .verb|{}| delimit a block of host language code that will be +embedded into the machine as an action. Within the block of host language +code, basic lexical analysis of comments and strings is done in order to +correctly find the closing brace of the block. With the exception of FSM +commands embedded in code blocks, the entire block is preserved as is for +identical reproduction in the output code. + +.item The pattern .verb|[+-]?[0-9]+| denotes an integer in decimal format. +Integers used for specifying machines may be negative only if the alphabet type +is signed. Integers used for specifying priorities may be positive or negative. + +.item The pattern .verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal +format. + +.item The keywords are .verb|access|, .verb|action|, .verb|alphtype|, +.verb|getkey|, .verb|write|, .verb|machine| and .verb|include|. + +.item The pattern .verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier. + +.comment +.item The allowable symbols are: + +.verb/ ( ) ! ^ * ? + : -> - | & . , := = ; > @ $ % /\\ +.verb| >/ $/ %/ </ @/ <>/ >! $! %! <! @! <>!|\\ +.verb| >^ $^ %^ <^ @^ <>^ >~ $~ %~ <~ @~ <>~|\\ +.verb| >* $* %* <* @* <>*| +.end comment + +.item Any amount of whitespace may separate tokens. + +.end itemize + +.comment +.section Parse of an FSM Specification + +The following statements are possible within an FSM specification. The +requirements for trailing semicolons loosely follow that of C. +A block +specifying code does not require a trailing semicolon. An expression +statement does require a trailing semicolon. +.end comment + +.section Basic Machines +.label{basic} + +The basic machines are the base operands of regular language expressions. They +are the smallest unit to which machine construction and manipulation operators +can be applied. + +.itemize + +.item .verb|'hello'| -- Concatenation Literal. Produces a machine that matches +the sequence of characters in the quoted string. If there are 5 characters +there will be 6 states chained together with the characters in the string. See +Section .ref{lexing} for information on valid escape sequences. + +.comment +% GENERATE: bmconcat +% OPT: -p +% %%{ +% machine bmconcat; +.verbatim +main := 'hello'; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmconcat + +It is possible +to make a concatenation literal case-insensitive by appending an .verb|i| to +the string, for example .verb|'cmd'i|. + +.item .verb|"hello"| -- Identical to the single quoted version. + +.item .verb|[hello]| -- Or Expression. Produces a union of characters. There +will be two states with a transition for each unique character between the two states. +The .verb|[]| delimiters behave like the quotes of a literal string. For example, +.verb|[ \t]| means tab or space. The .verb|or| expression supports character ranges +with the .verb|-| symbol as a separator. The meaning of the union can be negated +using an initial .verb|^| character as in standard regular expressions. +See Section .ref{lexing} for information on valid escape sequences +in .verb|or| expressions. + +.comment +% GENERATE: bmor +% OPT: -p +% %%{ +% machine bmor; +.verbatim +main := [hello]; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmor + +.item .verb|''|, .verb|""|, and .verb|[]| -- Zero Length Machine. Produces a machine +that matches the zero length string. Zero length machines have one state that is both +a start state and a final state. + +.comment +% GENERATE: bmnull +% OPT: -p +% %%{ +% machine bmnull; +.verbatim +main := ''; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmnull + +% FIXME: More on the range of values here. +.item .verb|42| -- Numerical Literal. Produces a two state machine with one +transition on the given number. The number may be in decimal or hexadecimal +format and should be in the range allowed by the alphabet type. The minimum and +maximum values permitted are defined by the host machine that Ragel is compiled +on. For example, numbers in a .verb|short| alphabet on an i386 machine should +be in the range .verb|-32768| to .verb|32767|. + +.comment +% GENERATE: bmnum +% %%{ +% machine bmnum; +.verbatim +main := 42; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmnum + +.item .verb|/simple_regex/| -- Regular Expression. Regular expressions are +parsed as a series of expressions that are concatenated together. Each +concatenated expression +may be a literal character, the ``any'' character specified by the .verb|.| +symbol, or a union of characters specified by the .verb|[]| delimiters. If the +first character of a union is .verb|^| then it matches any character not in the +list. Within a union, a range of characters can be given by separating the first +and last characters of the range with the .verb|-| symbol. Each +concatenated machine may have repetition specified by following it with the +.verb|*| symbol. The standard escape sequences described in Section +.ref{lexing} are supported everywhere in regular expressions except as the +operands of a range within in a list. This notation also supports the .verb|i| +trailing option. Use it to produce case-insensitive machines, as in .verb|/GET/i|. + +Ragel does not support very complex regular expressions because the desired +results can always be achieved using the more general machine construction +operators listed in Section .ref{machconst}. The following diagram shows the +result of compiling .verb|/ab*[c-z].*[123]/|. .verb|DEF| represents the default +transition, which is taken if no other transition can be taken. + +.comment +% GENERATE: bmregex +% OPT: -p +% %%{ +% machine bmregex; +.verbatim +main := /ab*[c-z].*[123]/; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmregex + +.item .verb|'a' .. 'z'| -- Range. Produces a machine that matches any +characters in the specified range. Allowable upper and lower bounds of the +range are concatenation literals of length one and numerical literals. For +example, .verb|0x10..0x20|, .verb|0..63|, and .verb|'a'..'z'| are valid ranges. +The bounds should be in the range allowed by the alphabet type. + +.comment +% GENERATE: bmrange +% OPT: -p +% %%{ +% machine bmrange; +.verbatim +main := 'a' .. 'z'; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmrange + +.item .verb|variable_name| -- Lookup the machine definition assigned to the +variable name given and use an instance of it. See Section .ref{definition} for +an important note on what it means to reference a variable name. + +.item .verb|builtin_machine| -- There are several built-in machines available +for use. They are all two state machines for the purpose of matching common +classes of characters. They are: + +.itemize + +.item .verb|any | -- Any character in the alphabet. + +.item .verb|ascii | -- Ascii characters. .verb|0..127| + +.item .verb|extend| -- Ascii extended characters. This is the range +.verb|-128..127| for signed alphabets and the range .verb|0..255| for unsigned +alphabets. + +.item .verb|alpha | -- Alphabetic characters. .verb|[A-Za-z]| + +.item .verb|digit | -- Digits. .verb|[0-9]| + +.item .verb|alnum | -- Alpha numerics. .verb|[0-9A-Za-z]| + +.item .verb|lower | -- Lowercase characters. .verb|[a-z]| + +.item .verb|upper | -- Uppercase characters. .verb|[A-Z]| + +.item .verb|xdigit| -- Hexadecimal digits. .verb|[0-9A-Fa-f]| + +.item .verb|cntrl | -- Control characters. .verb|0..31| + +.item .verb|graph | -- Graphical characters. .verb|[!-~]| + +.item .verb|print | -- Printable characters. .verb|[ -~]| + +.item .verb|punct | -- Punctuation. Graphical characters that are not alphanumerics. +.verb|[!-/:-@[-`{-~]| + +.item .verb|space | -- Whitespace. .verb|[\t\v\f\n\r ]| + +.item .verb|zlen | -- Zero length string. .verb|""| + +.item .verb|empty | -- Empty set. Matches nothing. .verb|^any| + +.end itemize +.end itemize + +.section Operator Precedence +The following table shows operator precedence from lowest to highest. Operators +in the same precedence group are evaluated from left to right. + +.tabular +.row 1&.verb| , |&Join +.row 2&.verb/ | & - --/&Union, Intersection and Subtraction +.row 3&.verb| . <: :> :>> |&Concatenation +.row 4&.verb| : |&Label +.row 5&.verb| -> |&Epsilon Transition +.row 6&.verb| > @ $ % |&Transitions Actions and Priorities +.row 6&.verb| >/ $/ %/ </ @/ <>/ |&EOF Actions +.row 6&.verb| >! $! %! <! @! <>! |&Global Error Actions +.row 6&.verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions +.row 6&.verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions +.row 6&.verb| >* $* %* <* @* <>* |&From-State Action +.row 7&.verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition +.row 8&.verb| ! ^ |&Negation and Character-Level Negation +.row 9&.verb| ( <expr> ) |&Grouping +.end tabular + +.section Regular Language Operators +.label{machconst} + +When using Ragel it is helpful to have a sense of how it constructs machines. +The determinization process can produce results that seem unusual to someone +not familiar with the NFA to DFA conversion algorithm. In this section we +describe Ragel's state machine operators. Though the operators are defined +using epsilon transitions, it should be noted that this is for discussion only. +The epsilon transitions described in this section do not persist, but are +immediately removed by the determinization process which is executed at every +operation. Ragel does not make use of any nondeterministic intermediate state +machines. + +To create an epsilon transition between two states .verb|x| and .verb|y| is to +copy all of the properties of .verb|y| into .verb|x|. This involves drawing in +all of .verb|y|'s to-state actions, EOF actions, etc., in addition to its +transitions. If .verb|x| and .verb|y| both have a transition out on the same +character, then the transitions must be combined. During transition +combination a new transition is made that goes to a new state that is the +combination of both target states. The new combination state is created using +the same epsilon transition method. The new state has an epsilon transition +drawn to all the states that compose it. Since the creation of new epsilon +transitions may be triggered every time an epsilon transition is drawn, the +process of drawing epsilon transitions is repeated until there are no more +epsilon transitions to be made. + +A very common error that is made when using Ragel is to make machines that do +too much. That is, to create machines that have unintentional +nondetermistic properties. This usually results from being unaware of the common strings +between machines that are combined together using the regular language +operators. This can involve never leaving a machine, causing its actions to be +propagated through all the following states. Or it can involve an alternation +where both branches are unintentionally taken simultaneously. + +This problem forces one to think hard about the language that needs to be +matched. To guard against this kind of problem one must ensure that the machine +specification is divided up using boundaries that do not allow ambiguities from +one portion of the machine to the next. See Chapter +.ref{controlling-nondeterminism} for more on this problem and how to solve it. + +The Graphviz tool is an immense help when debugging improperly compiled +machines or otherwise learning how to use Ragel. Graphviz Dot files can be +generated from Ragel programs using the .verb|-V| option. See Section +.ref{visualization} for more information. + + +.subsection Union + +.verb/expr | expr/ + +The union operation produces a machine that matches any string in machine one +or machine two. The operation first creates a new start state. Epsilon +transitions are drawn from the new start state to the start states of both +input machines. The resulting machine has a final state set equivalent to the +union of the final state sets of both input machines. In this operation, there +is the opportunity for nondeterminism among both branches. If there are +strings, or prefixes of strings that are matched by both machines then the new +machine will follow both parts of the alternation at once. The union operation is +shown below. + +.graphic opor 1.0 + +The following example demonstrates the union of three machines representing +common tokens. + +% GENERATE: exor +% OPT: -p +% %%{ +% machine exor; +.code +# Hex digits, decimal digits, or identifiers +main := '0x' xdigit+ | digit+ | alpha alnum*; +.end code +% }%% +% END GENERATE + +.graphic exor + +.subsection Intersection + +.verb|expr & expr| + +Intersection produces a machine that matches any +string that is in both machine one and machine two. To achieve intersection, a +union is performed on the two machines. After the result has been made +deterministic, any final state that is not a combination of final states from +both machines has its final state status revoked. To complete the operation, +paths that do not lead to a final state are pruned from the machine. Therefore, +if there are any such paths in either of the expressions they will be removed +by the intersection operator. Intersection can be used to require that two +independent patterns be simultaneously satisfied as in the following example. + +% GENERATE: exinter +% OPT: -p +% %%{ +% machine exinter; +.code +# Match lines four characters wide that contain +# words separated by whitespace. +main := + /[^\n][^\n][^\n][^\n]\n/* & + (/[a-z][a-z]*/ | [ \n])**; +.end code +% }%% +% END GENERATE + +.graphic exinter + +.subsection Difference + +.verb|expr - expr| + +The difference operation produces a machine that matches +strings that are in machine one but are not in machine two. To achieve subtraction, +a union is performed on the two machines. After the result has been made +deterministic, any final state that came from machine two or is a combination +of states involving a final state from machine two has its final state status +revoked. As with intersection, the operation is completed by pruning any path +that does not lead to a final state. The following example demonstrates the +use of subtraction to exclude specific cases from a set. + +% GENERATE: exsubtr +% OPT: -p +% %%{ +% machine exsubtr; +.code +# Subtract keywords from identifiers. +main := /[a-z][a-z]*/ - ( 'for' | 'int' ); +.end code +% }%% +% END GENERATE + +.graphic exsubtr + +.subsection Strong Difference +.label{strong_difference} + +.verb|expr -- expr| + +Strong difference produces a machine that matches any string of the first +machine that does not have any string of the second machine as a substring. In +the following example, strong subtraction is used to excluded .verb|CRLF| from +a sequence. In the corresponding visualization, the label .verb|DEF| is short +for default. The default transition is taken if no other transition can be +taken. + +% GENERATE: exstrongsubtr +% OPT: -p +% %%{ +% machine exstrongsubtr; +.code +crlf = '\r\n'; +main := [a-z]+ ':' ( any* -- crlf ) crlf; +.end code +% }%% +% END GENERATE + +.graphic exstrongsubtr + +This operator is equivalent to the following. + +.verbatim +expr - ( any* expr any* ) +.end verbatim + +.subsection Concatenation + +.verb|expr . expr| + +Concatenation produces a machine that matches all the strings in machine one followed by all +the strings in machine two. Concatenation draws epsilon transitions from the +final states of the first machine to the start state of the second machine. The +final states of the first machine lose their final state status, unless the +start state of the second machine is final as well. +Concatenation is the default operator. Two machines next to each other with no +operator between them results in concatenation. + +.graphic opconcat 1.0 + +The opportunity for nondeterministic behaviour results from the possibility of +the final states of the first machine accepting a string that is also accepted +by the start state of the second machine. +The most common scenario in which this happens is the +concatenation of a machine that repeats some pattern with a machine that gives +a terminating string, but the repetition machine does not exclude the +terminating string. The example in Section .ref{strong_difference} +guards against this. Another example is the expression .verb|("'" any* "'")|. +When executed the thread of control will +never leave the .verb|any*| machine. This is a problem especially if actions +are embedded to process the characters of the .verb|any*| component. + +In the following example, the first machine is always active due to the +nondeterministic nature of concatenation. This particular nondeterminism is intended +however because we wish to permit EOF strings before the end of the input. + +% GENERATE: exconcat +% OPT: -p +% %%{ +% machine exconcat; +.code +# Require an eof marker on the last line. +main := /[^\n]*\n/* . 'EOF\n'; +.end code +% }%% +% END GENERATE + +.graphic exconcat + +There is a language +ambiguity involving concatenation and subtraction. Because concatenation is the +default operator for two +adjacent machines there is an ambiguity between subtraction of +a positive numerical literal and concatenation of a negative numerical literal. +For example, .verb|(x-7)| could be interpreted as .verb|(x . -7)| or +.verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence +over concatenation of a negative literal. We adhere to the rule that the default +concatenation operator takes effect only when there are no other operators between +two machines. Beware of writing machines such as .verb|(any -1)| when what is +desired is a concatenation of .verb|any| and .verb|-1|. Instead write +.verb|(any . -1)| or .verb|(any (-1))|. If in doubt of the meaning of your program do not +rely on the default concatenation operator; always use the .verb|.| symbol. + + +.subsection Kleene Star + +.verb|expr*| + +The machine resulting from the Kleene Star operator will match zero or more +repetitions of the machine it is applied to. +It creates a new start state and an additional final +state. Epsilon transitions are drawn between the new start state and the old start +state, between the new start state and the new final state, and +between the final states of the machine and the new start state. After the +machine is made deterministic the effect is of the final states getting all the +transitions of the start state. + +.graphic opstar 1.0 + +The possibility for nondeterministic behaviour arises if the final states have +transitions on any of the same characters as the start state. This is common +when applying kleene star to an alternation of tokens. Like the other problems +arising from nondeterministic behavior, this is discussed in more detail in Chapter +.ref{controlling-nondeterminism}. This particular problem can also be solved +by using the longest-match construction discussed in Section +.ref{generating-scanners} on scanners. + +In this +example, there is no nondeterminism introduced by the exterior kleene star due to +the newline at the end of the regular expression. Without the newline the +exterior kleene star would be redundant and there would be ambiguity between +repeating the inner range of the regular expression and the entire regular +expression. Though it would not cause a problem in this case, unnecessary +nondeterminism in the kleene star operator often causes undesired results for +new Ragel users and must be guarded against. + +% GENERATE: exstar +% OPT: -p +% %%{ +% machine exstar; +.code +# Match any number of lines with only lowercase letters. +main := /[a-z]*\n/*; +.end code +% }%% +% END GENERATE + +.graphic exstar + +.subsection One Or More Repetition + +.verb|expr+| + +This operator produces the concatenation of the machine with the kleene star of +itself. The result will match one or more repetitions of the machine. The plus +operator is equivalent to .verb|(expr . expr*)|. + +% GENERATE: explus +% OPT: -p +% %%{ +% machine explus; +.code +# Match alpha-numeric words. +main := alnum+; +.end code +% }%% +% END GENERATE + +.graphic explus + +.subsection Optional + +.verb|expr?| + +The .em{optional} operator produces a machine that accepts the machine +given or the zero length string. The optional operator is equivalent to +.verb/(expr | '' )/. In the following example the optional operator is used to +possibly extend a token. + +% GENERATE: exoption +% OPT: -p +% %%{ +% machine exoption; +.code +# Match integers or floats. +main := digit+ ('.' digit+)?; +.end code +% }%% +% END GENERATE + +.graphic exoption + +.subsection Repetition + +.list +.li .verb|expr {n}| -- Exactly N copies of expr. +.li .verb|expr {,n}| -- Zero to N copies of expr. +.li .verb|expr {n,}| -- N or more copies of expr. +.li .verb|expr {n,m}| -- N to M copies of expr. +.end list + +.subsection Negation + +.verb|!expr| + +Negation produces a machine that matches any string not matched by the given +machine. Negation is equivalent to .verb|(any* - expr)|. + +% GENERATE: exnegate +% OPT: -p +% %%{ +% machine exnegate; +.code +# Accept anything but a string beginning with a digit. +main := ! ( digit any* ); +.end code +% }%% +% END GENERATE + +.graphic exnegate + +.subsection Character-Level Negation + +.verb|^expr| + +Character-level negation produces a machine that matches any single character +not matched by the given machine. Character-Level Negation is equivalent to +.verb|(any - expr)|. It must be applied only to machines that match strings of +length one. + +.section State Machine Minimization + +State machine minimization is the process of finding the minimal equivalent FSM accepting +the language. Minimization reduces the number of states in machines +by merging equivalent states. It does not change the behaviour of the machine +in any way. It will cause some states to be merged into one because they are +functionally equivalent. State minimization is on by default. It can be turned +off with the .verb|-n| option. + +The algorithm implemented is similar to Hopcroft's state minimization +algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in +memory, whereas Ragel supports arbitrary integer alphabets that cannot be +listed in memory. Though exact analysis is very difficult, Ragel minimization +runs close to O(n * log(n)) and requires O(n) temporary storage where +$n$ is the number of states. + +.section Visualization +.label{visualization} + +%In many cases, practical +%parsing programs will be too large to completely visualize with Graphviz. The +%proper approach is to reduce the language to the smallest subset possible that +%still exhibits the characteristics that one wishes to learn about or to fix. +%This can be done without modifying the source code using the .verb|-M| and +%.verb|-S| options. If a machine cannot be easily reduced, +%embeddings of unique actions can be very useful for tracing a +%particular component of a larger machine specification, since action names are +%written out on transition labels. + +Ragel is able to emit compiled state machines in Graphviz's Dot file format. +This is done using the .verb|-V| option. +Graphviz support allows users to perform +incremental visualization of their parsers. User actions are displayed on +transition labels of the graph. + +If the final graph is too large to be +meaningful, or even drawn, the user is able to inspect portions of the parser +by naming particular regular expression definitions with the .verb|-S| and +.verb|-M| options to the .verb|ragel| program. Use of Graphviz greatly +improves the Ragel programming experience. It allows users to learn Ragel by +experimentation and also to track down bugs caused by unintended +nondeterminism. + +Ragel has another option to help debugging. The .verb|-x| option causes Ragel +to emit the compiled machine in an XML format. + +.chapter User Actions + +Ragel permits the user to embed actions into the transitions of a regular +expression's corresponding state machine. These actions are executed when the +generated code moves over a transition. Like the regular expression operators, +the action embedding operators are fully compositional. They take a state +machine and an action as input, embed the action and yield a new state machine +that can be used in the construction of other machines. Due to the +compositional nature of embeddings, the user has complete freedom in the +placement of actions. + +A machine's transitions are categorized into four classes. The action embedding +operators access the transitions defined by these classes. The .em{entering +transition} operator .verb|>| isolates the start state, then embeds an action +into all transitions leaving it. The .em{finishing transition} operator +.verb|@| embeds an action into all transitions going into a final state. The +.em{all transition} operator .verb|$| embeds an action into all transitions of +an expression. The .em{leaving transition} operator .verb|%| provides access +to the yet-unmade transitions moving out of the machine via the final states. + +.section Embedding Actions + +.verbatim +action ActionName { + /* Code an action here. */ + count += 1; +} +.end verbatim + +The action statement defines a block of code that can be embedded into an FSM. +Action names can be referenced by the action embedding operators in +expressions. Though actions need not be named in this way (literal blocks +of code can be embedded directly when building machines), defining reusable +blocks of code whenever possible is good practice because it potentially increases the +degree to which the machine can be minimized. + +Within an action some Ragel expressions and statements are parsed and +translated. These allow the user to interact with the machine from action code. +See Section .ref{vals} for a complete list of statements and values available +in code blocks. + +.subsection Entering Action + +.verb|expr > action| + +The entering action operator embeds an action into all transitions +that enter into the machine from the start state. If the start state is final, +then the action is also embedded into the start state as a leaving action. This +means that if a machine accepts the zero-length string and control passes +through the start state then the entering action is executed. Note +that this can happen on both a following character and on the EOF event. + +In some machines the start state has transtions coming in from within the +machine. In these cases the start state is first isolated from the rest of the +machine ensuring that the entering actions are exected once only. + +% GENERATE: exstact +% OPT: -p +% %%{ +% machine exstact; +.code +# Execute A at the beginning of a string of alpha. +action A {} +main := ( lower* >A ) . ' '; +.end code +% }%% +% END GENERATE + +.graphic exstact + +.subsection Finishing Action + +.verb|expr @ action| + +The finishing action operator embeds an action into any transitions that move +the machine into a final state. Further input may move the machine out of the +final state, but keep it in the machine. Therefore finishing actions may be +executed more than once if a machine has any internal transitions out of a +final state. In the following example the final state has no transitions out +and the finishing action is executed only once. + +% GENERATE: exdoneact +% OPT: -p +% %%{ +% machine exdoneact; +% action A {} +.code +# Execute A when the trailing space is seen. +main := ( lower* ' ' ) @A; +.end code +% }%% +% END GENERATE + +.graphic exdoneact + +.subsection All Transition Action + +.verb|expr $ action| + +The all transition operator embeds an action into all transitions of a machine. +The action is executed whenever a transition of the machine is taken. In the +following example, A is executed on every character matched. + +% GENERATE: exallact +% OPT: -p +% %%{ +% machine exallact; +% action A {} +.code +# Execute A on any characters of the machine. +main := ( 'm1' | 'm2' ) $A; +.end code +% }%% +% END GENERATE + +.graphic exallact + +.subsection Leaving Actions +.label{out-actions} + +.verb|expr % action| + +The leaving action operator queues an action for embedding into the transitions +that go out of a machine via a final state. The action is first stored in +the machine's final states and is later transferred to any transitions that are +made going out of the machine by a kleene star or concatenation operation. + +If a final state of the machine is still final when compilation is complete +then the leaving action is also embedded as an EOF action. Therefore, leaving +the machine is defined as either leaving on a character or as state machine +acceptance. + +This operator allows one to associate an action with the termination of a +sequence, without being concerned about what particular character terminates +the sequence. In the following example, A is executed when leaving the alpha +machine on the newline character. + +% GENERATE: exoutact1 +% OPT: -p +% %%{ +% machine exoutact1; +% action A {} +.code +# Match a word followed by a newline. Execute A when +# finishing the word. +main := ( lower+ %A ) . '\n'; +.end code +% }%% +% END GENERATE + +.graphic exoutact1 + +In the following example, the .verb|term_word| action could be used to register +the appearance of a word and to clear the buffer that the .verb|lower| action used +to store the text of it. + +% GENERATE: exoutact2 +% OPT: -p +% %%{ +% machine exoutact2; +% action lower {} +% action space {} +% action term_word {} +% action newline {} +.code +word = ( [a-z] @lower )+ %term_word; +main := word ( ' ' @space word )* '\n' @newline; +.end code +% }%% +% END GENERATE + +.graphic exoutact2 + +In this final example of the action embedding operators, A is executed upon entering +the alpha machine, B is executed on all transitions of the +alpha machine, C is executed when the alpha machine is exited by moving into the +newline machine and N is executed when the newline machine moves into a final +state. + +% GENERATE: exaction +% OPT: -p +% %%{ +% machine exaction; +% action A {} +% action B {} +% action C {} +% action N {} +.code +# Execute A on starting the alpha machine, B on every transition +# moving through it and C upon finishing. Execute N on the newline. +main := ( lower* >A $B %C ) . '\n' @N; +.end code +% }%% +% END GENERATE + +.graphic exaction + + +.section State Action Embedding Operators + +The state embedding operators allow one to embed actions into states. Like the +transition embedding operators, there are several different classes of states +that the operators access. The meanings of the symbols are similar to the +meanings of the symbols used for the transition embedding operators. The design +of the state selections was driven by a need to cover the states of an +expression with exactly one error action. + +Unlike the transition embedding operators, the state embedding operators are +also distinguished by the different kinds of events that embedded actions can +be associated with. Therefore the state embedding operators have two +components. The first, which is the first one or two characters, specifies the +class of states that the action will be embedded into. The second component +specifies the type of event the action will be executed on. The symbols of the +second component also have equivalent kewords. + +.multicols +The different classes of states are: + +.list +.li .verb|> | -- the start state +.li .verb|< | -- any state except the start state +.li .verb|$ | -- all states +.li .verb|% | -- final states +.li .verb|@ | -- any state except final states +.li .verb|<>| -- any except start and final (middle) +.end list + +.columnbreak + +The different kinds of embeddings are: + +.list +.li .verb|~| -- to-state actions (.verb|to|) +.li .verb|*| -- from-state actions (.verb|from|) +.li .verb|/| -- EOF actions (.verb|eof|) +.li .verb|!| -- error actions (.verb|err|) +.li .verb|^| -- local error actions (.verb|lerr|) +.end list + +.end multicols + +.subsection To-State and From-State Actions + +.subsubsection To-State Actions + +.list +.li .verb|>~action >to(name) >to{...} | -- the start state +.li .verb|<~action <to(name) <to{...} | -- any state except the start state +.li .verb|$~action $to(name) $to{...} | -- all states +.li .verb|%~action %to(name) %to{...} | -- final states +.li .verb|@~action @to(name) @to{...} | -- any state except final states +.li .verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle) +.end list + + +To-state actions are executed whenever the state machine moves into the +specified state, either by a natural movement over a transition or by an +action-based transfer of control such as .verb|fgoto|. They are executed after the +in-transition's actions but before the current character is advanced and +tested against the end of the input block. To-state embeddings stay with the +state. They are irrespective of the state's current set of transitions and any +future transitions that may be added in or out of the state. + +Note that the setting of the current state variable .verb|cs| outside of the +execute code is not considered by Ragel as moving into a state and consequently +the to-state actions of the new current state are not executed. This includes +the initialization of the current state when the machine begins. This is +because the entry point into the machine execution code is after the execution +of to-state actions. + +.subsubsection From-State Actions + +.list +.li .verb|>*action >from(name) >from{...} | -- the start state +.li .verb|<*action <from(name) <from{...} | -- any state except the start state +.li .verb|$*action $from(name) $from{...} | -- all states +.li .verb|%*action %from(name) %from{...} | -- final states +.li .verb|@*action @from(name) @from{...} | -- any state except final states +.li .verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle) +.end list + +From-state actions are executed whenever the state machine takes a transition from a +state, either to itself or to some other state. These actions are executed +immediately after the current character is tested against the input block end +marker and before the transition to take is sought based on the current +character. From-state actions are therefore executed even if a transition +cannot be found and the machine moves into the error state. Like to-state +embeddings, from-state embeddings stay with the state. + +.subsection EOF Actions + +.list +.li .verb|>/action >eof(name) >eof{...} | -- the start state +.li .verb|</action <eof(name) <eof{...} | -- any state except the start state +.li .verb|$/action $eof(name) $eof{...} | -- all states +.li .verb|%/action %eof(name) %eof{...} | -- final states +.li .verb|@/action @eof(name) @eof{...} | -- any state except final states +.li .verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle) +.end list + +The EOF action embedding operators enable the user to embed actions that are +executed at the end of the input stream. EOF actions are stored in states and +generated in the .verb|write exec| block. They are run when .verb|p == pe == eof| +as the execute block is finishing. EOF actions are free to adjust .verb|p| and +jump to another part of the machine to restart execution. + +.subsection Handling Errors + +In many applications it is useful to be able to react to parsing errors. The +user may wish to print an error message that depends on the context. It +may also be desirable to consume input in an attempt to return the input stream +to some known state and resume parsing. To support error handling and recovery, +Ragel provides error action embedding operators. There are two kinds of error +actions: global error actions and local error actions. +Error actions can be used to simply report errors, or by jumping to a machine +instantiation that consumes input, can attempt to recover from errors. + +.subsubsection Global Error Actions + +.list +.li .verb|>!action >err(name) >err{...} | -- the start state +.li .verb|<!action <err(name) <err{...} | -- any state except the start state +.li .verb|$!action $err(name) $err{...} | -- all states +.li .verb|%!action %err(name) %err{...} | -- final states +.li .verb|@!action @err(name) @err{...} | -- any state except final states +.li .verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle) +.end list + +Global error actions are stored in the states they are embedded into until +compilation is complete. They are then transferred to the transitions that move +into the error state. These transitions are taken on all input characters that +are not already covered by the state's transitions. If a state with an error +action is not final when compilation is complete, then the action is also +embedded as an EOF action. + +Error actions can be used to recover from errors by jumping back into the +machine with .verb|fgoto| and optionally altering .verb|p|. + +.subsubsection Local Error Actions + +.list +.li .verb|>^action >lerr(name) >lerr{...} | -- the start state +.li .verb|<^action <lerr(name) <lerr{...} | -- any state except the start state +.li .verb|$^action $lerr(name) $lerr{...} | -- all states +.li .verb|%^action %lerr(name) %lerr{...} | -- final states +.li .verb|@^action @lerr(name) @lerr{...} | -- any state except final states +.li .verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle) +.end list + +Like global error actions, local error actions are also stored in the states +they are embedded into until a transfer point. The transfer point is different +however. Each local error action embedding is associated with a name. When a +machine definition has been fully constructed, all local error action +embeddings associated with the same name as the machine definition are +transferred to the error transitions. At this time they are also embedded as +EOF actions in the case of non-final states. + +Local error actions can be used to specify an action to take when a particular +section of a larger state machine fails to match. A particular machine +definition's ``thread'' may die and the local error actions executed, however +the machine as a whole may continue to match input. + +There are two forms of local error action embeddings. In the first form the +name defaults to the current machine. In the second form the machine name can +be specified. This is useful when it is more convenient to specify the local +error action in a sub-definition that is used to construct the machine +definition that the local error action is associated with. To embed local +error actions and +explicitly state the machine definition on which the transfer is to happen use +.verb|(name, action)| as the action. + +.subsubsection Example + +The following example uses error actions to report an error and jump to a +machine that consumes the remainder of the line when parsing fails. After +consuming the line, the error recovery machine returns to the main loop. + +% GENERATE: erract +% %%{ +% machine erract; +% ws = ' '; +% address = 'foo AT bar..com'; +% date = 'Monday May 12'; +.code +action cmd_err { + printf( "command error\n" ); + fhold; fgoto line; +} +action from_err { + printf( "from error\n" ); + fhold; fgoto line; +} +action to_err { + printf( "to error\n" ); + fhold; fgoto line; +} + +line := [^\n]* '\n' @{ fgoto main; }; + +main := ( + ( + 'from' @err(cmd_err) + ( ws+ address ws+ date '\n' ) $err(from_err) | + 'to' @err(cmd_err) + ( ws+ address '\n' ) $err(to_err) + ) +)*; +.end code +% }%% +% %% write data; +% void f() +% { +% %% write init; +% %% write exec; +% } +% END GENERATE + + + +.section Action Ordering and Duplicates + +When combining expressions that have embedded actions it is often the case that +a number of actions must be executed on a single input character. For example, +following a concatenation the leaving action of the left expression and the +entering action of the right expression will be embedded into one transition. +This requires a method of ordering actions that is intuitive and +predictable for the user, and repeatable for the compiler. + +We associate with the embedding of each action a unique timestamp that is +used to order actions that appear together on a single transition in the final +state machine. To accomplish this we recursively traverse the parse tree of +regular expressions and assign timestamps to action embeddings. References to +machine definitions are followed in the traversal. When we visit a +parse tree node we assign timestamps to all .em{entering} action embeddings, +recurse on the parse tree, then assign timestamps to the remaining .em{all}, +.em{finishing}, and .em{leaving} embeddings in the order in which they +appear. + +By default Ragel does not permit a single action to appear multiple times in an action +list. When the final machine has been created, actions that appear more than +once in a single transition, to-state, from-state or EOF action list have their +duplicates removed. +The first appearance of the action is preserved. This is useful in a number of +scenarios. First, it allows us to union machines with common prefixes without +worrying about the action embeddings in the prefix being duplicated. Second, it +prevents leaving actions from being transferred multiple times. This can +happen when a machine is repeated, then followed with another machine that +begins with a common character. For example: + +.verbatim +word = [a-z]+ %act; +main := word ( '\n' word )* '\n\n'; +.end verbatim + +Note that Ragel does not compare action bodies to determine if they have +identical program text. It simply checks for duplicates using each action +block's unique location in the program. + +The removal of duplicates can be turned off using the .verb|-d| option. + +.section Values and Statements Available in Code Blocks +.label{vals} + +The following values are available in code blocks: + +.itemize +.item .verb|fpc| -- A pointer to the current character. This is equivalent to +accessing the .verb|p| variable. + +.item .verb|fc| -- The current character. This is equivalent to the expression .verb|(*p)|. + +.item .verb|fcurs| -- An integer value representing the current state. This +value should only be read from. To move to a different place in the machine +from action code use the .verb|fgoto|, .verb|fnext| or .verb|fcall| statements. +Outside of the machine execution code the .verb|cs| variable may be modified. + +.item .verb|ftargs| -- An integer value representing the target state. This +value should only be read from. Again, .verb|fgoto|, .verb|fnext| and +.verb|fcall| can be used to move to a specific entry point. + +.item .verb|fentry(<label>)| -- Retrieve an integer value representing the +entry point .verb|label|. The integer value returned will be a compile time +constant. This number is suitable for later use in control flow transfer +statements that take an expression. This value should not be compared against +the current state because any given label can have multiple states representing +it. The value returned by .verb|fentry| can be any one of the multiple states that +it represents. +.end itemize + +The following statements are available in code blocks: + +.itemize + +.item .verb|fhold;| -- Do not advance over the current character. If processing +data in multiple buffer blocks, the .verb|fhold| statement should only be used +once in the set of actions executed on a character. Multiple calls may result +in backing up over the beginning of the buffer block. The .verb|fhold| +statement does not imply any transfer of control. It is equivalent to the +.verb|p--;| statement. + +.item .verb|fexec <expr>;| -- Set the next character to process. This can be +used to backtrack to previous input or advance ahead. +Unlike .verb|fhold|, which can be used +anywhere, .verb|fexec| requires the user to ensure that the target of the +backtrack is in the current buffer block or is known to be somewhere ahead of +it. The machine will continue iterating forward until .verb|pe| is arrived at, +.verb|fbreak| is called or the machine moves into the error state. In actions +embedded into transitions, the .verb|fexec| statement is equivalent to setting +.verb|p| to one position ahead of the next character to process. If the user +also modifies .verb|pe|, it is possible to change the buffer block entirely. + +.item .verb|fgoto <label>;| -- Jump to an entry point defined by +.verb|<label>|. The .verb|fgoto| statement immediately transfers control to +the destination state. + +.item .verb|fgoto *<expr>;| -- Jump to an entry point given by .verb|<expr>|. +The expression must evaluate to an integer value representing a state. + +.item .verb|fnext <label>;| -- Set the next state to be the entry point defined +by .verb|label|. The .verb|fnext| statement does not immediately jump to the +specified state. Any action code following the statement is executed. + +.item .verb|fnext *<expr>;| -- Set the next state to be the entry point given +by .verb|<expr>|. The expression must evaluate to an integer value representing +a state. + +.item .verb|fcall <label>;| -- Push the target state and jump to the entry +point defined by .verb|<label>|. The next .verb|fret| will jump to the target +of the transition on which the call was made. Use of .verb|fcall| requires +the declaration of a call stack. An array of integers named .verb|stack| and a +single integer named .verb|top| must be declared. With the .verb|fcall| +construct, control is immediately transferred to the destination state. +See section .ref{modularization} for more information. + +.item .verb|fcall *<expr>;| -- Push the current state and jump to the entry +point given by .verb|<expr>|. The expression must evaluate to an integer value +representing a state. + +.item .verb|fret;| -- Return to the target state of the transition on which the +last .verb|fcall| was made. Use of .verb|fret| requires the declaration of a +call stack. Control is immediately transferred to the destination state. + +.item .verb|fbreak;| -- Advance .verb|p|, save the target state to .verb|cs| +and immediately break out of the execute loop. This statement is useful +in conjunction with the .verb|noend| write option. Rather than process input +until .verb|pe| is arrived at, the fbreak statement +can be used to stop processing from an action. After an .verb|fbreak| +statement the .verb|p| variable will point to the next character in the input. The +current state will be the target of the current transition. Note that .verb|fbreak| +causes the target state's to-state actions to be skipped. + +.end itemize + +Once actions with control-flow commands are embedded into a +machine, the user must exercise caution when using the machine as the operand +to other machine construction operators. If an action jumps to another state +then unioning any transition that executes that action with another transition +that follows some other path will cause that other path to be lost. Using +commands that manually jump around a machine takes us out of the domain of +regular languages because transitions that the +machine construction operators are not aware of are introduced. These +commands should therefore be used with caution. + + +.chapter Controlling Nondeterminism +.label{controlling-nondeterminism} + +Along with the flexibility of arbitrary action embeddings comes a need to +control nondeterminism in regular expressions. If a regular expression is +ambiguous, then sub-components of a parser other than the intended parts may become +active. This means that actions that are irrelevant to the +current subset of the parser may be executed, causing problems for the +programmer. + +Tools that are based on regular expression engines and that are used for +recognition tasks will usually function as intended regardless of the presence +of ambiguities. It is quite common for users of scripting languages to write +regular expressions that are heavily ambiguous and it generally does not +matter. As long as one of the potential matches is recognized, there can be any +number of other matches present. In some parsing systems the run-time engine +can employ a strategy for resolving ambiguities, for example always pursuing +the longest possible match and discarding others. + +In Ragel, there is no regular expression run-time engine, just a simple state +machine execution model. When we begin to embed actions and face the +possibility of spurious action execution, it becomes clear that controlling +nondeterminism at the machine construction level is very important. Consider +the following example. + +% GENERATE: lines1 +% OPT: -p +% %%{ +% machine lines1; +% action first {} +% action tail {} +% word = [a-z]+; +.code +ws = [\n\t ]; +line = word $first ( ws word $tail )* '\n'; +lines = line*; +.end code +% main := lines; +% }%% +% END GENERATE + +.graphic lines1 0.53 + +Since the .verb|ws| expression includes the newline character, we will +not finish the .verb|line| expression when a newline character is seen. We will +simultaneously pursue the possibility of matching further words on the same +line and the possibility of matching a second line. Evidence of this fact is +in the state tables. On several transitions both the .verb|first| and +.verb|tail| actions are executed. The solution here is simple: exclude +the newline character from the .verb|ws| expression. + +% GENERATE: lines2 +% OPT: -p +% %%{ +% machine lines2; +% action first {} +% action tail {} +% word = [a-z]+; +.code +ws = [\t ]; +line = word $first ( ws word $tail )* '\n'; +lines = line*; +.end code +% main := lines; +% }%% +% END GENERATE + +.graphic lines2 + +Solving this kind of problem is straightforward when the ambiguity is created +by strings that are a single character long. When the ambiguity is created by +strings that are multiple characters long we have a more difficult problem. +The following example is an incorrect attempt at a regular expression for C +language comments. + +% GENERATE: comments1 +% OPT: -p +% %%{ +% machine comments1; +% action comm {} +.code +comment = '/*' ( any @comm )* '*/'; +main := comment ' '; +.end code +% }%% +% END GENERATE + +.graphic comments1 + +Using standard concatenation, we will never leave the .verb|any*| expression. +We will forever entertain the possibility that a .verb|'*/'| string that we see +is contained in a longer comment and that, simultaneously, the comment has +ended. The concatenation of the .verb|comment| machine with .verb|SP| is done +to show this. When we match space, we are also still matching the comment body. + +One way to approach the problem is to exclude the terminating string +from the .verb|any*| expression using set difference. We must be careful to +exclude not just the terminating string, but any string that contains it as a +substring. A verbose, but proper specification of a C comment parser is given +by the following regular expression. + +% GENERATE: comments2 +% OPT: -p +% %%{ +% machine comments2; +% action comm {} +.code +comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/'; +.end code +% main := comment; +% }%% +% END GENERATE + +.graphic comments2 + +Note that Ragel's strong subtraction operator .verb|--| can also be used here. +In doing this subtraction we have phrased the problem of controlling non-determinism in +terms of excluding strings common to two expressions that interact when +combined. +We can also phrase the problem in terms of the transitions of the state +machines that implement these expressions. During the concatenation of +.verb|any*| and .verb|'*/'| we will be making transitions that are composed of +both the loop of the first expression and the final character of the second. +At this time we want the transition on the .verb|'/'| character to take precedence +over and disallow the transition that originated in the .verb|any*| loop. + +In another parsing problem, we wish to implement a lightweight tokenizer that we can +utilize in the composition of a larger machine. For example, some HTTP headers +have a token stream as a sub-language. The following example is an attempt +at a regular expression-based tokenizer that does not function correctly due to +unintended nondeterminism. + +% GENERATE: smallscanner +% OPT: -p +% %%{ +% machine smallscanner; +% action start_str {} +% action on_char {} +% action finish_str {} +.code +header_contents = ( + lower+ >start_str $on_char %finish_str | + ' ' +)*; +.end code +% main := header_contents; +% }%% +% END GENERATE + +.graphic smallscanner + +In this case, the problem with using a standard kleene star operation is that +there is an ambiguity between extending a token and wrapping around the machine +to begin a new token. Using the standard operator, we get an undesirable +nondeterministic behaviour. Evidence of this can be seen on the transition out +of state one to itself. The transition extends the string, and simultaneously, +finishes the string only to immediately begin a new one. What is required is +for the +transitions that represent an extension of a token to take precedence over the +transitions that represent the beginning of a new token. For this problem +there is no simple solution that uses standard regular expression operators. + +.section Priorities + +A priority mechanism was devised and built into the determinization +process, specifically for the purpose of allowing the user to control +nondeterminism. Priorities are integer values embedded into transitions. When +the determinization process is combining transitions that have different +priorities, the transition with the higher priority is preserved and the +transition with the lower priority is dropped. + +Unfortunately, priorities can have unintended side effects because their +operation requires that they linger in transitions indefinitely. They must linger +because the Ragel program cannot know when the user is finished with a priority +embedding. A solution whereby they are explicitly deleted after use is +conceivable; however this is not very user-friendly. Priorities were therefore +made into named entities. Only priorities with the same name are allowed to +interact. This allows any number of priorities to coexist in one machine for +the purpose of controlling various different regular expression operations and +eliminates the need to ever delete them. Such a scheme allows the user to +choose a unique name, embed two different priority values using that name +and be confident that the priority embedding will be free of any side effects. + +In the first form of priority embedding the name defaults to the name of the machine +definition that the priority is assigned in. In this sense priorities are by +default local to the current machine definition or instantiation. Beware of +using this form in a longest-match machine, since there is only one name for +the entire set of longest match patterns. In the second form the priority's +name can be specified, allowing priority interaction across machine definition +boundaries. + +.itemize +.item .verb|expr > int| -- Sets starting transitions to have priority int. +.item .verb|expr @ int| -- Sets transitions that go into a final state to have priority int. +.item .verb|expr $ int| -- Sets all transitions to have priority int. +.item .verb|expr % int| -- Sets leaving transitions to +have priority int. When a transition is made going out of the machine (either +by concatenation or kleene star) its priority is immediately set to the +leaving priority. +.end itemize + +The second form of priority assignment allows the programmer to specify the name +to which the priority is assigned. + +.itemize +.item .verb|expr > (name, int)| -- Starting transitions. +.item .verb|expr @ (name, int)| -- Finishing transitions (into a final state). +.item .verb|expr $ (name, int)| -- All transitions. +.item .verb|expr % (name, int)| -- Leaving transitions. +.end itemize + +.section Guarded Operators that Encapsulate Priorities + +Priority embeddings are a very expressive mechanism. At the same time they +can be very confusing for the user. They force the user to imagine +the transitions inside two interacting expressions and work out the precise +effects of the operations between them. When we consider +that this problem is worsened by the +potential for side effects caused by unintended priority name collisions, we +see that exposing the user to priorities is undesirable. + +Fortunately, in practice the use of priorities has been necessary only in a +small number of scenarios. This allows us to encapsulate their functionality +into a small set of operators and fully hide them from the user. This is +advantageous from a language design point of view because it greatly simplifies +the design. + +Going back to the C comment example, we can now properly specify +it using a guarded concatenation operator which we call .em{finish-guarded +concatenation}. From the user's point of view, this operator terminates the +first machine when the second machine moves into a final state. It chooses a +unique name and uses it to embed a low priority into all +transitions of the first machine. A higher priority is then embedded into the +transitions of the second machine that enter into a final state. The following +example yields a machine identical to the example in Section +.ref{controlling-nondeterminism}. + +.code +comment = '/*' ( any @comm )* :>> '*/'; +.end code + +.graphic comments2 + +Another guarded operator is .em{left-guarded concatenation}, given by the +.verb|<:| compound symbol. This operator places a higher priority on all +transitions of the first machine. This is useful if one must forcibly separate +two lists that contain common elements. For example, one may need to tokenize a +stream, but first consume leading whitespace. + +Ragel also includes a .em{longest-match kleene star} operator, given by the +.verb|**| compound symbol. This +guarded operator embeds a high +priority into all transitions of the machine. +A lower priority is then embedded into the leaving transitions. When the +kleene star operator makes the epsilon transitions from +the final states into the new start state, the lower priority will be transferred +to the epsilon transitions. In cases where following an epsilon transition +out of a final state conflicts with an existing transition out of a final +state, the epsilon transition will be dropped. + +Other guarded operators are conceivable, such as guards on union that cause one +alternative to take precedence over another. These may be implemented when it +is clear they constitute a frequently used operation. +In the next section we discuss the explicit specification of state machines +using state charts. + +.subsection Entry-Guarded Concatenation + +.verb|expr :> expr| + +This operator concatenates two machines, but first assigns a low +priority to all transitions +of the first machine and a high priority to the starting transitions of the +second machine. This operator is useful if from the final states of the first +machine it is possible to accept the characters in the entering transitions of +the second machine. This operator effectively terminates the first machine +immediately upon starting the second machine, where otherwise they would be +pursued concurrently. In the following example, entry-guarded concatenation is +used to move out of a machine that matches everything at the first sign of an +end-of-input marker. + +% GENERATE: entryguard +% OPT: -p +% %%{ +% machine entryguard; +.code +# Leave the catch-all machine on the first character of FIN. +main := any* :> 'FIN'; +.end code +% }%% +% END GENERATE + +.graphic entryguard + +Entry-guarded concatenation is equivalent to the following: + +.verbatim +expr $(unique_name,0) . expr >(unique_name,1) +.end verbatim + +.subsection Finish-Guarded Concatenation + +.verb|expr :>> expr| + +This operator is +like the previous operator, except the higher priority is placed on the final +transitions of the second machine. This is useful if one wishes to entertain +the possibility of continuing to match the first machine right up until the +second machine enters a final state. In other words it terminates the first +machine only when the second accepts. In the following example, finish-guarded +concatenation causes the move out of the machine that matches everything to be +delayed until the full end-of-input marker has been matched. + +% GENERATE: finguard +% OPT: -p +% %%{ +% machine finguard; +.code +# Leave the catch-all machine on the last character of FIN. +main := any* :>> 'FIN'; +.end code +% }%% +% END GENERATE + +.graphic finguard + +Finish-guarded concatenation is equivalent to the following, with one +exception. If the right machine's start state is final, the higher priority is +also embedded into it as a leaving priority. This prevents the left machine +from persisting via the zero-length string. + +.verbatim +expr $(unique_name,0) . expr @(unique_name,1) +.end verbatim + +.subsection Left-Guarded Concatenation + +.verb|expr <: expr| + +This operator places +a higher priority on the left expression. It is useful if you want to prefix a +sequence with another sequence composed of some of the same characters. For +example, one can consume leading whitespace before tokenizing a sequence of +whitespace-separated words as in: + +% GENERATE: leftguard +% OPT: -p +% %%{ +% machine leftguard; +% action alpha {} +% action ws {} +% action start {} +% action fin {} +.code +main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*; +.end code +% }%% +% END GENERATE + +.graphic leftguard + +Left-guarded concatenation is equivalent to the following: + +.verbatim +expr $(unique_name,1) . expr >(unique_name,0) +.end verbatim + +.subsection Longest-Match Kleene Star +.label{longest_match_kleene_star} + +.verb|expr**| + +This version of kleene star puts a higher priority on staying in the +machine versus wrapping around and starting over. The LM kleene star is useful +when writing simple tokenizers. These machines are built by applying the +longest-match kleene star to an alternation of token patterns, as in the +following. + +% GENERATE: lmkleene +% OPT: -p +% %%{ +% machine exfinpri; +% action A {} +% action B {} +.code +# Repeat tokens, but make sure to get the longest match. +main := ( + lower ( lower | digit )* %A | + digit+ %B | + ' ' +)**; +.end code +% }%% +% END GENERATE + +.graphic lmkleene + +If a regular kleene star were used the machine above would not be able to +distinguish between extending a word and beginning a new one. This operator is +equivalent to: + +.verbatim +( expr $(unique_name,1) %(unique_name,0) )* +.end verbatim + +When the kleene star is applied, transitions that go out of the machine and +back into it are made. These are assigned a priority of zero by the leaving +transition mechanism. This is less than the priority of one assigned to the +transitions leaving the final states but not leaving the machine. When +these transitions clash on the same character, the +transition that stays in the machine takes precedence. The transition +that wraps around is dropped. + +Note that this operator does not build a scanner in the traditional sense +because there is never any backtracking. To build a scanner with backtracking +use the Longest-Match machine construction described in Section +.ref{generating-scanners}. + +.chapter Interface to Host Program + +The Ragel code generator is very flexible. The generated code has no +dependencies and can be inserted in any function, perhaps inside a loop if +desired. The user is responsible for declaring and initializing a number of +required variables, including the current state and the pointer to the input +stream. These can live in any scope. Control of the input processing loop is +also possible: the user may break out of the processing loop and return to it +at any time. + +In the case of the C, D, and Go host languages, Ragel is able to generate very +fast-running code that implements state machines as directly executable code. +Since very large files strain the host language compiler, table-based code +generation is also supported. In the future we hope to provide a partitioned, +directly executable format that is able to reduce the burden on the host +compiler by splitting large machines across multiple functions. + +In the case of Java and Ruby, table-based code generation is the only code +style supported. In the future this may be expanded to include other code +styles. + +Ragel can be used to parse input in one block, or it can be used to parse input +in a sequence of blocks as it arrives from a file or socket. Parsing the input +in a sequence of blocks brings with it a few responsibilities. If the parser +utilizes a scanner, care must be taken to not break the input stream anywhere +but token boundaries. If pointers to the input stream are taken during +parsing, care must be taken to not use a pointer that has been invalidated by +movement to a subsequent block. If the current input data pointer is moved +backwards it must not be moved past the beginning of the current block. + +Figure .ref{basic-example} shows a simple Ragel program that does not have any +actions. The example tests the first argument of the program against a number +pattern and then prints the machine's acceptance status. + +.figure basic-example +.verbatim +#include <stdio.h> +#include <string.h> +%%{ + machine foo; + write data; +}%% +int main( int argc, char **argv ) +{ + int cs; + if ( argc > 1 ) { + char *p = argv[1]; + char *pe = p + strlen( p ); + %%{ + main := [0-9]+ ( '.' [0-9]+ )?; + + write init; + write exec; + }%% + } + printf("result = %i\n", cs >= foo_first_final ); + return 0; +} +.end verbatim +.caption A basic Ragel example without any actions. +.end figure + +.section Variables Used by Ragel + +There are a number of variables that Ragel expects the user to declare. At a +very minimum the .verb|cs|, .verb|p| and .verb|pe| variables must be declared. +In Go, Java and Ruby code the .verb|data| variable must also be declared. If +EOF actions are used then the .verb|eof| variable is required. If +stack-based state machine control flow statements are used then the +.verb|stack| and .verb|top| variables are required. If a scanner is declared +then the .verb|act|, .verb|ts| and .verb|te| variables must be +declared. + +.itemize + +.item .verb|cs| - Current state. This must be an integer and it should persist +across invocations of the machine when the data is broken into blocks that are +processed independently. This variable may be modified from outside the +execution loop, but not from within. + +.item .verb|p| - Data pointer. In C/D code this variable is expected to be a +pointer to the character data to process. It should be initialized to the +beginning of the data block on every run of the machine. In Go, Java and Ruby it is +used as an offset to .verb|data| and must be an integer. In this case it should +be initialized to zero on every run of the machine. + +.item .verb|pe| - Data end pointer. This should be initialized to .verb|p| plus +the data length on every run of the machine. In Go, Java and Ruby code this should +be initialized to the data length. + +.item .verb|eof| - End of file pointer. This should be set to .verb|pe| when +the buffer block being processed is the last one, otherwise it should be set to +null. In Go, Java and Ruby code .verb|-1| must be used instead of null. If the EOF +event can be known only after the final buffer block has been processed, then +it is possible to set .verb|p = pe = eof| and run the execute block. + +.item .verb|data| - This variable is only required in Go, Java and Ruby code. It +must be an array containting the data to process. + +.item .verb|stack| - This must be an array of integers. It is used to store +integer values representing states. If the stack must resize dynamically the +Pre-push and Post-Pop statements can be used to do this (Sections +.ref{prepush} and .ref{postpop}). + +.item .verb|top| - This must be an integer value and will be used as an offset +to .verb|stack|, giving the next available spot on the top of the stack. + +.item .verb|act| - This must be an integer value. It is a variable sometimes +used by scanner code to keep track of the most recent successful pattern match. + +.item .verb|ts| - This must be a pointer to character data. In Go, Java and +Ruby code this must be an integer. See Section .ref{generating-scanners} for +more information. + +.item .verb|te| - Also a pointer to character data. + +.end itemize + +.section Alphtype Statement + +.verbatim +alphtype unsigned int; +.end verbatim + +The alphtype statement specifies the alphabet data type that the machine +operates on. During the compilation of the machine, integer literals are +expected to be in the range of possible values of the alphtype. The default +is .verb|char| for all languages except Go where the default is .verb|byte|. + +.multicols +C/C++/Objective-C: +.verbatim + char unsigned char + short unsigned short + int unsigned int + long unsigned long +.end verbatim + +Go: +.verbatim + byte + int8 uint8 + int16 uint16 + int32 uint32 + int +.end verbatim + +Ruby: +.verbatim + char + int +.end verbatim + +.columnbreak + +Java: +.verbatim + char + byte + short + int +.end verbatim + +D: +.verbatim + char + byte ubyte + short ushort + wchar + int uint + dchar +.end verbatim + +.end multicols + +.section Getkey Statement + +.verbatim +getkey fpc->id; +.end verbatim + +This statement specifies to Ragel how to retrieve the current character from +from the pointer to the current element (.verb|p|). Any expression that returns +a value of the alphabet type +may be used. The getkey statement may be used for looking into element +structures or for translating the character to process. The getkey expression +defaults to .verb|(*p)|. In goto-driven machines the getkey expression may be +evaluated more than once per element processed, therefore it should not incur a +large cost nor preclude optimization. + +.section Access Statement + +.verbatim +access fsm->; +.end verbatim + +The access statement specifies how the generated code should +access the machine data that is persistent across processing buffer blocks. +This applies to all variables except .verb|p|, .verb|pe| and .verb|eof|. This includes +.verb|cs|, .verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|. +The access statement is useful if a machine is to be encapsulated inside a +structure in C code. It can be used to give the name of +a pointer to the structure. + +.section Variable Statement + +.verbatim +variable p fsm->p; +.end verbatim + +The variable statement specifies how to access a specific +variable. All of the variables that are declared by the user and +used by Ragel can be changed. This includes .verb|p|, .verb|pe|, .verb|eof|, .verb|cs|, +.verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|. +In Go, Ruby and Java code generation the .verb|data| variable can also be changed. + +.section Pre-Push Statement +.label{prepush} + +.verbatim +prepush { + /* stack growing code */ +} +.end verbatim + +The prepush statement allows the user to supply stack management code that is +written out during the generation of fcall, immediately before the current +state is pushed to the stack. This statement can be used to test the number of +available spaces and dynamically grow the stack if necessary. + +.section Post-Pop Statement +.label{postpop} + +.verbatim +postpop { + /* stack shrinking code */ +} +.end verbatim + +The postpop statement allows the user to supply stack management code that is +written out during the generation of fret, immediately after the next state is +popped from the stack. This statement can be used to dynamically shrink the +stack. + +.section Write Statement +.label{write-statement} + +.verbatim +write <component> [options]; +.end verbatim + +The write statement is used to generate parts of the machine. +There are seven +components that can be generated by a write statement. These components make up the +state machine's data, initialization code, execution code, and export definitions. +A write statement may appear before a machine is fully defined. +This allows one to write out the data first then later define the machine where +it is used. An example of this is shown in Figure .ref{fbreak-example}. + +.subsection Write Data +.verbatim +write data [options]; +.end verbatim + +The write data statement causes Ragel to emit the constant static data needed +by the machine. In table-driven output styles (see Section .ref{genout}) this +is a collection of arrays that represent the states and transitions of the +machine. In goto-driven machines much less data is emitted. At the very +minimum a start state .verb|name_start| is generated. All variables written +out in machine data have both the .verb|static| and .verb|const| properties and +are prefixed with the name of the machine and an +underscore. The data can be placed inside a class, inside a function, or it can +be defined as global data. + +Two variables are written that may be used to test the state of the machine +after a buffer block has been processed. The .verb|name_error| variable gives +the id of the state that the machine moves into when it cannot find a valid +transition to take. The machine immediately breaks out of the processing loop when +it finds itself in the error state. The error variable can be compared to the +current state to determine if the machine has failed to parse the input. If the +machine is complete, that is from every state there is a transition to a proper +state on every possible character of the alphabet, then no error state is required +and this variable will be set to -1. + +The .verb|name_first_final| variable stores the id of the first final state. +All of the machine's states are sorted by their final state status before +having their ids assigned. Checking if the machine has accepted its input can +then be done by checking if the current state is greater-than or equal to the +first final state. + +Data generation has several options: + +.list +.li .verb|noerror | - Do not generate the integer variable that gives the id of the error state. +.li .verb|nofinal | - Do not generate the integer variable that gives the id of the first final state. +.li .verb|noprefix | - Do not prefix the variable names with the name of the machine. +.end list + +.figure fbreak-example +.verbatim +#include <stdio.h> +%% machine foo; +%% write data; +int main( int argc, char **argv ) +{ + int cs, res = 0; + if ( argc > 1 ) { + char *p = argv[1]; + %%{ + main := + [a-z]+ + 0 @{ res = 1; fbreak; }; + write init; + write exec noend; + }%% + } + printf("execute = %i\n", res ); + return 0; +} +.end verbatim +.caption Use of .tt{noend} write option and the .tt{fbreak} statement for +processing a string. +.end figure + +.subsection Write Start, First Final and Error + +.verbatim +write start; +write first_final; +write error; +.end verbatim + +These three write statements provide an alternative means of accessing the +.verb|start|, .verb|first_final| and .verb|error| states. If there are many +different machine specifications in one file it is easy to get the prefix for +these wrong. This is especially true if the state machine boilerplate is +frequently made by a copy-paste-edit process. These write statements allow the +problem to be avoided. They can be used as follows: + +.verbatim +/* Did parsing succeed? */ +if ( cs < %%{ write first_final; }%% ) { + result = ERR_PARSE_ERROR; + goto fail; +} +.end verbatim + +.subsection Write Init +.verbatim +write init [options]; +.end verbatim + +The write init statement causes Ragel to emit initialization code. This should +be executed once before the machine is started. At a very minimum this sets the +current state to the start state. If other variables are needed by the +generated code, such as call stack variables or scanner management +variables, they are also initialized here. + +The .verb|nocs| option to the write init statement will cause ragel to skip +intialization of the cs variable. This is useful if the user wishes to use +custom logic to decide which state the specification should start in. + +.subsection Write Exec +.verbatim +write exec [options]; +.end verbatim + +The write exec statement causes Ragel to emit the state machine's execution code. +Ragel expects several variables to be available to this code. At a very minimum, the +generated code needs access to the current character position .verb|p|, the ending +position .verb|pe| and the current state .verb|cs| (though .verb|pe| +can be omitted using the .verb|noend| write option). +The .verb|p| variable is the cursor that the execute code will +used to traverse the input. The .verb|pe| variable should be set up to point to one +position past the last valid character in the buffer. + +Other variables are needed when certain features are used. For example using +the .verb|fcall| or .verb|fret| statements requires .verb|stack| and +.verb|top| variables to be defined. If a longest-match construction is used, +variables for managing backtracking are required. + +The write exec statement has one option. The .verb|noend| option tells Ragel +to generate code that ignores the end position .verb|pe|. In this +case the user must explicitly break out of the processing loop using +.verb|fbreak|, otherwise the machine will continue to process characters until +it moves into the error state. This option is useful if one wishes to process a +null terminated string. Rather than traverse the string to discover then length +before processing the input, the user can break out when the null character is +seen. The example in Figure .ref{fbreak-example} shows the use of the +.verb|noend| write option and the .verb|fbreak| statement for processing a string. + +.subsection Write Exports +.label{export} + +.verbatim +write exports; +.end verbatim + +The export feature can be used to export simple machine definitions. Machine definitions +are marked for export using the .verb|export| keyword. + +.verbatim +export machine_to_export = 0x44; +.end verbatim + +When the write exports statement is used these machines are +written out in the generated code. Defines are used for C and constant integers +are used for D, Java and Ruby. See Section .ref{import} for a description of the +import statement. + +.section Maintaining Pointers to Input Data + +In the creation of any parser it is not uncommon to require the collection of +the data being parsed. It is always possible to collect data into a growable +buffer as the machine moves over it, however the copying of data is a somewhat +wasteful use of processor cycles. The most efficient way to collect data from +the parser is to set pointers into the input then later reference them. This +poses a problem for uses of Ragel where the input data arrives in blocks, such +as over a socket or from a file. If a pointer is set in one buffer block but +must be used while parsing a following buffer block, some extra consideration +to correctness must be made. + +The scanner constructions exhibit this problem, requiring the maintenance +code described in Section .ref{generating-scanners}. If a longest-match +construction has been used somewhere in the machine then it is possible to +take advantage of the required prefix maintenance code in the driver program to +ensure pointers to the input are always valid. If laying down a pointer one can +set .verb|ts| at the same spot or ahead of it. When data is shifted in +between loops the user must also shift the pointer. In this way it is possible +to maintain pointers to the input that will always be consistent. + +.figure line-oriented +.verbatim + int have = 0; + while ( 1 ) { + char *p, *pe, *data = buf + have; + int len, space = BUFSIZE - have; + + if ( space == 0 ) { + fprintf(stderr, "BUFFER OUT OF SPACE\n"); + exit(1); + } + + len = fread( data, 1, space, stdin ); + if ( len == 0 ) + break; + + /* Find the last newline by searching backwards. */ + p = buf; + pe = data + len - 1; + while ( *pe != '\n' && pe >= buf ) + pe--; + pe += 1; + + %% write exec; + + /* How much is still in the buffer? */ + have = data + len - pe; + if ( have > 0 ) + memmove( buf, pe, have ); + + if ( len < space ) + break; + } +.end verbatim +.caption An example of line-oriented processing. +.end figure + +In general, there are two approaches for guaranteeing the consistency of +pointers to input data. The first approach is the one just described; +lay down a marker from an action, +then later ensure that the data the marker points to is preserved ahead of +the buffer on the next execute invocation. This approach is good because it +allows the parser to decide on the pointer-use boundaries, which can be +arbitrarily complex parsing conditions. A downside is that it requires any +pointers that are set to be corrected in between execute invocations. + +The alternative is to find the pointer-use boundaries before invoking the execute +routine, then pass in the data using these boundaries. For example, if the +program must perform line-oriented processing, the user can scan backwards from +the end of an input block that has just been read in and process only up to the +first found newline. On the next input read, the new data is placed after the +partially read line and processing continues from the beginning of the line. +An example of line-oriented processing is given in Figure .ref{line-oriented}. + +.section Specifying the Host Language + +The .verb|ragel| program has a number of options for specifying the host +language. The host-language options are: + +.itemize +.item .verb|-C | for C/C++/Objective-C code (default) +.item .verb|-D | for D code. +.item .verb|-Z | for Go code. +.item .verb|-J | for Java code. +.item .verb|-R | for Ruby code. +.item .verb|-A | for C\# code. +.end itemize + +.section Choosing a Generated Code Style +.label{genout} + +There are three styles of code output to choose from. Code style affects the +size and speed of the compiled binary. Changing code style does not require any +change to the Ragel program. There are two table-driven formats and a goto +driven format. + +In addition to choosing a style to emit, there are various levels of action +code reuse to choose from. The maximum reuse levels (.verb|-T0|, .verb|-F0| +and .verb|-G0|) ensure that no FSM action code is ever duplicated by encoding +each transition's action list as static data and iterating +through the lists on every transition. This will normally result in a smaller +binary. The less action reuse options (.verb|-T1|, .verb|-F1| and .verb|-G1|) +will usually produce faster running code by expanding each transition's action +list into a single block of code, eliminating the need to iterate through the +lists. This duplicates action code instead of generating the logic necessary +for reuse. Consequently the binary will be larger. However, this tradeoff applies to +machines with moderate to dense action lists only. If a machine's transitions +frequently have less than two actions then the less reuse options will actually +produce both a smaller and a faster running binary due to less action sharing +overhead. The best way to choose the appropriate code style for your +application is to perform your own tests. + +The table-driven FSM represents the state machine as constant static data. There are +tables of states, transitions, indices and actions. The current state is +stored in a variable. The execution is simply a loop that looks up the current +state, looks up the transition to take, executes any actions and moves to the +target state. In general, the table-driven FSM can handle any machine, produces +a smaller binary and requires a less expensive host language compile, but +results in slower running code. Since the table-driven format is the most +flexible it is the default code style. + +The flat table-driven machine is a table-based machine that is optimized for +small alphabets. Where the regular table machine uses the current character as +the key in a binary search for the transition to take, the flat table machine +uses the current character as an index into an array of transitions. This is +faster in general, however is only suitable if the span of possible characters +is small. + +The goto-driven FSM represents the state machine using goto and switch +statements. The execution is a flat code block where the transition to take is +computed using switch statements and directly executable binary searches. In +general, the goto FSM produces faster code but results in a larger binary and a +more expensive host language compile. + +The goto-driven format has an additional action reuse level (.verb|-G2|) that +writes actions directly into the state transitioning logic rather than putting +all the actions together into a single switch. Generally this produces faster +running code because it allows the machine to encode the current state using +the processor's instruction pointer. Again, sparse machines may actually +compile to smaller binaries when .verb|-G2| is used due to less state and +action management overhead. For many parsing applications .verb|-G2| is the +preferred output format. + +.center + +Code Output Style Options + +.tabular +.row .verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\# +.row .verb|-T1|&binary search, expanded actions&C/D/Ruby/C\# +.row .verb|-F0|&flat table-driven&C/D/Ruby/C\# +.row .verb|-F1|&flat table, expanded actions&C/D/Ruby/C\# +.row .verb|-G0|&goto-driven&C/D/C\# +.row .verb|-G1|&goto, expanded actions&C/D/C\# +.row .verb|-G2|&goto, in-place actions&C/D/Go +.end tabular +.end center + +.chapter Beyond the Basic Model + +.section Parser Modularization +.label{modularization} + +It is possible to use Ragel's machine construction and action embedding +operators to specify an entire parser using a single regular expression. In +many cases this is the desired way to specify a parser in Ragel. However, in +some scenarios the language to parse may be so large that it is difficult to +think about it as a single regular expression. It may also shift between distinct +parsing strategies, in which case modularization into several coherent blocks +of the language may be appropriate. + +It may also be the case that patterns that compile to a large number of states +must be used in a number of different contexts and referencing them in each +context results in a very large state machine. In this case, an ability to reuse +parsers would reduce code size. + +To address this, distinct regular expressions may be instantiated and linked +together by means of a jumping and calling mechanism. This mechanism is +analogous to the jumping to and calling of processor instructions. A jump +command, given in action code, causes control to be immediately passed to +another portion of the machine by way of setting the current state variable. A +call command causes the target state of the current transition to be pushed to +a state stack before control is transferred. Later on, the original location +may be returned to with a return statement. In the following example, distinct +state machines are used to handle the parsing of two types of headers. + +% GENERATE: call +% %%{ +% machine call; +.code +action return { fret; } +action call_date { fcall date; } +action call_name { fcall name; } + +# A parser for date strings. +date := [0-9][0-9] '/' + [0-9][0-9] '/' + [0-9][0-9][0-9][0-9] '\n' @return; + +# A parser for name strings. +name := ( [a-zA-Z]+ | ' ' )** '\n' @return; + +# The main parser. +headers = + ( 'from' | 'to' ) ':' @call_name | + ( 'departed' | 'arrived' ) ':' @call_date; + +main := headers*; +.end code +% }%% +% %% write data; +% void f() +% { +% %% write init; +% %% write exec; +% } +% END GENERATE + +Calling and jumping should be used carefully as they are operations that take +one out of the domain of regular languages. A machine that contains a call or +jump statement in one of its actions should be used as an argument to a machine +construction operator only with considerable care. Since DFA transitions may +actually represent several NFA transitions, a call or jump embedded in one +machine can inadvertently terminate another machine that it shares prefixes +with. Despite this danger, theses statements have proven useful for tying +together sub-parsers of a language into a parser for the full language, +especially for the purpose of modularizing code and reducing the number of +states when the machine contains frequently recurring patterns. + +Section .ref{vals} describes the jump and call statements that are used to +transfer control. These statements make use of two variables that must be +declared by the user, .verb|stack| and .verb|top|. The .verb|stack| variable +must be an array of integers and .verb|top| must be a single integer, which +will point to the next available space in .verb|stack|. Sections .ref{prepush} +and .ref{postpop} describe the Pre-Push and Post-Pop statements which can be +used to implement a dynamically resizable array. + +.section Referencing Names +.label{labels} + +This section describes how to reference names in epsilon transitions (Section +.ref{state-charts}) and +action-based control-flow statements such as .verb|fgoto|. There is a hierarchy +of names implied in a Ragel specification. At the top level are the machine +instantiations. Beneath the instantiations are labels and references to machine +definitions. Beneath those are more labels and references to definitions, and +so on. + +Any name reference may contain multiple components separated with the .verb|::| +compound symbol. The search for the first component of a name reference is +rooted at the join expression that the epsilon transition or action embedding +is contained in. If the name reference is not contained in a join, +the search is rooted at the machine definition that the epsilon transition or +action embedding is contained in. Each component after the first is searched +for beginning at the location in the name tree that the previous reference +component refers to. + +In the case of action-based references, if the action is embedded more than +once, the local search is performed for each embedding and the result is the +union of all the searches. If no result is found for action-based references then +the search is repeated at the root of the name tree. Any action-based name +search may be forced into a strictly global search by prefixing the name +reference with .verb|::|. + +The final component of the name reference must resolve to a unique entry point. +If a name is unique in the entire name tree it can be referenced as is. If it +is not unique it can be specified by qualifying it with names above it in the +name tree. However, it can always be renamed. + +% FIXME: Should fit this in somewhere. +% Some kinds of name references are illegal. Cannot call into longest-match +% machine, can only call its start state. Cannot make a call to anywhere from +% any part of a longest-match machine except a rule's action. This would result +% in an eventual return to some point inside a longest-match other than the +% start state. This is banned for the same reason a call into the LM machine is +% banned. + + +.section Scanners +.label{generating-scanners} + +Scanners are very much intertwined with regular-languages and their +corresponding processors. For this reason Ragel supports the definition of +scanners. The generated code will repeatedly attempt to match patterns from a +list, favouring longer patterns over shorter patterns. In the case of +equal-length matches, the generated code will favour patterns that appear ahead +of others. When a scanner makes a match it executes the user code associated +with the match, consumes the input then resumes scanning. + +.verbatim +<machine_name> := |* + pattern1 => action1; + pattern2 => action2; + ... + *|; +.end verbatim + +On the surface, Ragel scanners are similar to those defined by Lex. Though +there is a key distinguishing feature: patterns may be arbitrary Ragel +expressions and can therefore contain embedded code. With a Ragel-based scanner +the user need not wait until the end of a pattern before user code can be +executed. + +Scanners can be used to process sub-languages, as well as for tokenizing +programming languages. In the following example a scanner is used to tokenize +the contents of a header field. + +.code +word = [a-z]+; +head_name = 'Header'; + +header := |* + word; + ' '; + '\n' => { fret; }; +*|; + +main := ( head_name ':' @{ fcall header; } )*; +.end code + +The scanner construction has a purpose similar to the longest-match kleene star +operator .verb|**|. The key +difference is that a scanner is able to backtrack to match a previously matched +shorter string when the pursuit of a longer string fails. For this reason the +scanner construction operator is not a pure state machine construction +operator. It relies on several variables that enable it to backtrack and make +pointers to the matched input text available to the user. For this reason +scanners must be immediately instantiated. They cannot be defined inline or +referenced by another expression. Scanners must be jumped to or called. + +Scanners rely on the .verb|ts|, .verb|te| and .verb|act| +variables to be present so that they can backtrack and make pointers to the +matched text available to the user. If input is processed using multiple calls +to the execute code then the user must ensure that when a token is only +partially matched that the prefix is preserved on the subsequent invocation of +the execute code. + +The .verb|ts| variable must be defined as a pointer to the input data. +It is used for recording where the current token match begins. This variable +may be used in action code for retrieving the text of the current match. Ragel +ensures that in between tokens and outside of the longest-match machines that +this pointer is set to null. In between calls to the execute code the user must +check if .verb|ts| is set and if so, ensure that the data it points to is +preserved ahead of the next buffer block. This is described in more detail +below. + +The .verb|te| variable must also be defined as a pointer to the input data. +It is used for recording where a match ends and where scanning of the next +token should begin. This can also be used in action code for retrieving the +text of the current match. + +The .verb|act| variable must be defined as an integer type. It is used for +recording the identity of the last pattern matched when the scanner must go +past a matched pattern in an attempt to make a longer match. If the longer +match fails it may need to consult the .verb|act| variable. In some cases, use +of the .verb|act| +variable can be avoided because the value of the current state is enough +information to determine which token to accept, however in other cases this is +not enough and so the .verb|act| variable is used. + +When the longest-match operator is in use, the user's driver code must take on +some buffer management functions. The following algorithm gives an overview of +the steps that should be taken to properly use the longest-match operator. + +.itemize +.item Read a block of input data. +.item Run the execute code. +.item If .verb|ts| is set, the execute code will expect the incomplete +token to be preserved ahead of the buffer on the next invocation of the execute +code. +.itemize +.item Shift the data beginning at .verb|ts| and ending at .verb|pe| to the +beginning of the input buffer. +.item Reset .verb|ts| to the beginning of the buffer. +.item Shift .verb|te| by the distance from the old value of .verb|ts| +to the new value. The .verb|te| variable may or may not be valid. There is +no way to know if it holds a meaningful value because it is not kept at null +when it is not in use. It can be shifted regardless. +.end itemize +.item Read another block of data into the buffer, immediately following any +preserved data. +.item Run the scanner on the new data. +.end itemize + +Figure .ref{preserve_example} shows the required handling of an input stream in +which a token is broken by the input block boundaries. After processing up to +and including the ``t'' of ``characters'', the prefix of the string token must be +retained and processing should resume at the ``e'' on the next iteration of +the execute code. + +If one uses a large input buffer for collecting input then the number of times +the shifting must be done will be small. Furthermore, if one takes care not to +define tokens that are allowed to be very long and instead processes these +items using pure state machines or sub-scanners, then only a small amount of +data will ever need to be shifted. + +.figure preserve_example +.verbatim + a) A stream "of characters" to be scanned. + | | | + p ts pe + + b) "of characters" to be scanned. + | | | + ts p pe +.end verbatim +.caption Following an invocation of the execute code there may be a partially +matched token (a). The data of the partially matched token +must be preserved ahead of the new data on the next invocation (b). +.end figure + +Since scanners attempt to make the longest possible match of input, patterns +such as identifiers require one character of lookahead in order to trigger a +match. In the case of the last token in the input stream the user must ensure +that the .verb|eof| variable is set so that the final token is flushed out. + +An example scanner processing loop is given in Figure .ref{scanner-loop}. + +.figure scanner-loop +.verbatim + int have = 0; + bool done = false; + while ( !done ) { + /* How much space is in the buffer? */ + int space = BUFSIZE - have; + if ( space == 0 ) { + /* Buffer is full. */ + cerr << "TOKEN TOO BIG" << endl; + exit(1); + } + + /* Read in a block after any data we already have. */ + char *p = inbuf + have; + cin.read( p, space ); + int len = cin.gcount(); + + char *pe = p + len; + char *eof = 0; + + /* If no data was read indicate EOF. */ + if ( len == 0 ) { + eof = pe; + done = true; + } + + %% write exec; + + if ( cs == Scanner_error ) { + /* Machine failed before finding a token. */ + cerr << "PARSE ERROR" << endl; + exit(1); + } + + if ( ts == 0 ) + have = 0; + else { + /* There is a prefix to preserve, shift it over. */ + have = pe - ts; + memmove( inbuf, ts, have ); + te = inbuf + (te-ts); + ts = inbuf; + } + } +.end verbatim +.caption A processing loop for a scanner. +.end figure + +.section State Charts +.label{state-charts} + +In addition to supporting the construction of state machines using regular +languages, Ragel provides a way to manually specify state machines using +state charts. The comma operator combines machines together without any +implied transitions. The user can then manually link machines by specifying +epsilon transitions with the .verb|->| operator. Epsilon transitions are drawn +between the final states of a machine and entry points defined by labels. This +makes it possible to build machines using the explicit state-chart method while +making minimal changes to the Ragel language. + +An interesting feature of Ragel's state chart construction method is that it +can be mixed freely with regular expression constructions. A state chart may be +referenced from within a regular expression, or a regular expression may be +used in the definition of a state chart transition. + +.subsection Join + +.verb|expr , expr , ...| + +Join a list of machines together without +drawing any transitions, without setting up a start state, and without +designating any final states. Transitions between the machines may be specified +using labels and epsilon transitions. The start state must be explicity +specified with the ``start'' label. Final states may be specified with an +epsilon transition to the implicitly created ``final'' state. The join +operation allows one to build machines using a state chart model. + +.subsection Label + +.verb|label: expr| + +Attaches a label to an expression. Labels can be +used as the target of epsilon transitions and explicit control transfer +statements such as .verb|fgoto| and .verb|fnext| in action +code. + +.subsection Epsilon + +.verb|expr -> label| + +Draws an epsilon transition to the state defined +by .verb|label|. Epsilon transitions are made deterministic when join +operators are evaluated. Epsilon transitions that are not in a join operation +are made deterministic when the machine definition that contains the epsilon is +complete. See Section .ref{labels} for information on referencing labels. + +.subsection Simplifying State Charts + +There are two benefits to providing state charts in Ragel. The first is that it +allows us to take a state chart with a full listing of states and transitions +and simplify it in selective places using regular expressions. + +The state chart method of specifying parsers is very common. It is an +effective programming technique for producing robust code. The key disadvantage +becomes clear when one attempts to comprehend a large parser specified in this +way. These programs usually require many lines, causing logic to be spread out +over large distances in the source file. Remembering the function of a large +number of states can be difficult and organizing the parser in a sensible way +requires discipline because branches and repetition present many file layout +options. This kind of programming takes a specification with inherent +structure such as looping, alternation and concatenation and expresses it in a +flat form. + +If we could take an isolated component of a manually programmed state chart, +that is, a subset of states that has only one entry point, and implement it +using regular language operators then we could eliminate all the explicit +naming of the states contained in it. By eliminating explicitly named states +and replacing them with higher-level specifications we simplify a state machine +specification. + +For example, sometimes chains of states are needed, with only a small number of +possible characters appearing along the chain. These can easily be replaced +with a concatenation of characters. Sometimes a group of common states +implement a loop back to another single portion of the machine. Rather than +manually duplicate all the transitions that loop back, we may be able to +express the loop using a kleene star operator. + +Ragel allows one to take this state map simplification approach. We can build +state machines using a state map model and implement portions of the state map +using regular languages. In place of any transition in the state machine, +entire sub-machines can be given. These can encapsulate functionality +defined elsewhere. An important aspect of the Ragel approach is that when we +wrap up a collection of states using a regular expression we do not lose +access to the states and transitions. We can still execute code on the +transitions that we have encapsulated. + +.subsection Dropping Down One Level of Abstraction +.label{down} + +The second benefit of incorporating state charts into Ragel is that it permits +us to bypass the regular language abstraction if we need to. Ragel's action +embedding operators are sometimes insufficient for expressing certain parsing +tasks. In the same way that is useful for C language programmers to drop down +to assembly language programming using embedded assembler, it is sometimes +useful for the Ragel programmer to drop down to programming with state charts. + +In the following example, we wish to buffer the characters of an XML CDATA +sequence. The sequence is terminated by the string .verb|]]>|. The challenge +in our application is that we do not wish the terminating characters to be +buffered. An expression of the form .verb|any* @buffer :>> ']]>'| will not work +because the buffer will always contain the characters .verb|]]| on the end. +Instead, what we need is to delay the buffering of .verb|]| +characters until a time when we +abandon the terminating sequence and go back into the main loop. There is no +easy way to express this using Ragel's regular expression and action embedding +operators, and so an ability to drop down to the state chart method is useful. + +% GENERATE: dropdown +% OPT: -p +% %%{ +% machine dropdown; +.code +action bchar { buff( fpc ); } # Buffer the current character. +action bbrack1 { buff( "]" ); } +action bbrack2 { buff( "]]" ); } + +CDATA_body = +start: ( + ']' -> one | + (any-']') @bchar ->start +), +one: ( + ']' -> two | + [^\]] @bbrack1 @bchar ->start +), +two: ( + '>' -> final | + ']' @bbrack1 -> two | + [^>\]] @bbrack2 @bchar ->start +); +.end code +% main := CDATA_body; +% }%% +% END GENERATE + +.graphic dropdown + + +.section Semantic Conditions +.label{semantic} + +Many communication protocols contain variable-length fields, where the length +of the field is given ahead of the field as a value. This +problem cannot be expressed using regular languages because of its +context-dependent nature. The prevalence of variable-length fields in +communication protocols motivated us to introduce semantic conditions into +the Ragel language. + +A semantic condition is a block of user code that is interpreted as an +expression and evaluated immediately +before a transition is taken. If the code returns a value of true, the +transition may be taken. We can now embed code that extracts the length of a +field, then proceed to match $n$ data values. + +% GENERATE: conds1 +% OPT: -p +% %%{ +% machine conds1; +% number = digit+; +.code +action rec_num { i = 0; n = getnumber(); } +action test_len { i++ < n } +data_fields = ( + 'd' + [0-9]+ %rec_num + ':' + ( [a-z] when test_len )* +)**; +.end code +% main := data_fields; +% }%% +% END GENERATE + +.graphic conds1 + +The Ragel implementation of semantic conditions does not force us to give up the +compositional property of Ragel definitions. For example, a machine that tests +the length of a field using conditions can be unioned with another machine +that accepts some of the same strings, without the two machines interfering with +one another. The user need not be concerned about whether or not the result of the +semantic condition will affect the matching of the second machine. + +To see this, first consider that when a user associates a condition with an +existing transition, the transition's label is translated from the base character +to its corresponding value in the space that represents ``condition $c$ true''. Should +the determinization process combine a state that has a conditional transition +with another state that has a transition on the same input character but +without a condition, then the condition-less transition first has its label +translated into two values, one to its corresponding value in the space that +represents ``condition $c$ true'' and another to its corresponding value in the +space that represents ``condition $c$ false''. It +is then safe to combine the two transitions. This is shown in the following +example. Two intersecting patterns are unioned, one with a condition and one +without. The condition embedded in the first pattern does not affect the second +pattern. + +% GENERATE: conds2 +% OPT: -p +% %%{ +% machine conds2; +% number = digit+; +.code +action test_len { i++ < n } +action one { /* accept pattern one */ } +action two { /* accept pattern two */ } +patterns = + ( [a-z] when test_len )+ %one | + [a-z][a-z0-9]* %two; +main := patterns '\n'; +.end code +% }%% +% END GENERATE + +.graphic conds2 + +There are many more potential uses for semantic conditions. The user is free to +use arbitrary code and may therefore perform actions such as looking up names +in dictionaries, validating input using external parsing mechanisms or +performing checks on the semantic structure of input seen so far. In the next +section we describe how Ragel accommodates several common parser engineering +problems. + +The semantic condition feature works only with alphabet types that are smaller +in width than the .verb|long| type. To implement semantic conditions Ragel +needs to be able to allocate characters from the alphabet space. Ragel uses +these allocated characters to express "character C with condition P true" or "C +with P false." Since internally Ragel uses longs to store characters there is +no room left in the alphabet space unless an alphabet type smaller than long is +used. + +.section Implementing Lookahead + +There are a few strategies for implementing lookahead in Ragel programs. +Leaving actions, which are described in Section .ref{out-actions}, can be +used as a form of lookahead. Ragel also provides the .verb|fhold| directive +which can be used in actions to prevent the machine from advancing over the +current character. It is also possible to manually adjust the current character +position by shifting it backwards using .verb|fexec|, however when this is +done, care must be taken not to overstep the beginning of the current buffer +block. In both the use of .verb|fhold| and .verb|fexec| the user must be +cautious of combining the resulting machine with another in such a way that the +transition on which the current position is adjusted is not combined with a +transition from the other machine. + +.section Parsing Recursive Language Structures + +In general Ragel cannot handle recursive structures because the grammar is +interpreted as a regular language. However, depending on what needs to be +parsed it is sometimes practical to implement the recursive parts using manual +coding techniques. This often works in cases where the recursive structures are +simple and easy to recognize, such as in the balancing of parentheses + +One approach to parsing recursive structures is to use actions that increment +and decrement counters or otherwise recognize the entry to and exit from +recursive structures and then jump to the appropriate machine defnition using +.verb|fcall| and .verb|fret|. Alternatively, semantic conditions can be used to +test counter variables. + +A more traditional approach is to call a separate parsing function (expressed +in the host language) when a recursive structure is entered, then later return +when the end is recognized. +##### EXP ##### +\documentclass[letterpaper,11pt,oneside]{book} +\usepackage{graphicx} +\usepackage{comment} +\usepackage{multicol} +\usepackage[ + colorlinks=true, + linkcolor=black, + citecolor=green, + filecolor=black, + urlcolor=black]{hyperref} + +\topmargin -0.20in +\oddsidemargin 0in +\textwidth 6.5in +\textheight 9in + +\setlength{\parskip}{0pt} +\setlength{\topsep}{0pt} +\setlength{\partopsep}{0pt} +\setlength{\itemsep}{0pt} + +\input{version} + +\newcommand{\verbspace}{\vspace{10pt}} +\newcommand{\graphspace}{\vspace{10pt}} + +\renewcommand\floatpagefraction{.99} +\renewcommand\topfraction{.99} +\renewcommand\bottomfraction{.99} +\renewcommand\textfraction{.01} +\setcounter{totalnumber}{50} +\setcounter{topnumber}{50} +\setcounter{bottomnumber}{50} + +\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{} + +\begin{document} + +\thispagestyle{empty} +\begin{center} +\vspace*{3in} +{\huge Ragel State Machine Compiler}\\ +\vspace*{12pt} +{\Large User Guide}\\ +\vspace{1in} +by\\ +\vspace{12pt} +{\large Adrian Thurston}\\ +\end{center} +\clearpage + +\pagenumbering{roman} + +\chapter*{License} +Ragel version \version, \pubdate\\ +Copyright \copyright\ 2003-2012 Adrian D. Thurston +\vspace{6mm} + +{\bf\it\noindent This document is part of Ragel, and as such, this document is +released under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2 of the License, or (at your option) +any later version. +} + +\vspace{5pt} + +{\bf\it\noindent Ragel is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. +} + +\vspace{5pt} + +{\bf\it\noindent You should have received a copy of the GNU General Public +License along with Ragel; if not, write to the Free Software Foundation, Inc., +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +} + +\clearpage +\tableofcontents +\clearpage + +\pagenumbering{arabic} +\chapter{Introduction} + +\section{Abstract} + +Regular expressions are used heavily in practice for the purpose of specifying +parsers. They are normally used as black boxes linked together with program +logic. User actions are executed in between invocations of the regular +expression engine. Adding actions before a pattern terminates requires patterns +to be broken and pasted back together with program logic. The more user actions +are needed, the less the advantages of regular expressions are seen. + +Ragel is a software development tool that allows user actions to be +embedded into the transitions of a regular expression's corresponding state +machine, eliminating the need to switch from the regular expression engine and +user code execution environment and back again. As a result, expressions can be +maximally continuous. One is free to specify an entire parser using a single +regular expression. The single-expression model affords concise and elegant +descriptions of languages and the generation of very simple, fast and robust +code. Ragel compiles executable finite state machines from a high level regular language +notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby. + +In addition to building state machines from regular expressions, Ragel allows +the programmer to directly specify state machines with state charts. These two +notations may be freely combined. There are also facilities for controlling +nondeterminism in the resulting machines and building scanners using patterns +that themselves have embedded actions. Ragel can produce code that is small and +runs very fast. Ragel can handle integer-sized alphabets and can compile very +large state machines. + +\section{Motivation} + +When a programmer is faced with the task of producing a parser for a +context-free language there are many tools to choose from. It is quite common +to generate useful and efficient parsers for programming languages from a +formal grammar. It is also quite common for programmers to avoid such tools +when making parsers for simple computer languages, such as file formats and +communication protocols. Such languages are often regular and tools for +processing the context-free languages are viewed as too heavyweight for the +purpose of parsing regular languages. The extra run-time effort required for +supporting the recursive nature of context-free languages is wasted. + +When we turn to the regular expression-based parsing tools, such as Lex, Re2C, +and scripting languages such as Sed, Awk and Perl we find that they are split +into two levels: a regular expression matching engine and some kind of program +logic for linking patterns together. For example, a Lex program is composed of +sets of regular expressions. The implied program logic repeatedly attempts to +match a pattern in the current set. When a match is found the associated user +code executed. It requires the user to consider a language as a sequence of +independent tokens. Scripting languages and regular expression libraries allow +one to link patterns together using arbitrary program code. This is very +flexible and powerful, however we can be more concise and clear if we avoid +gluing together regular expressions with if statements and while loops. + +This model of execution, where the runtime alternates between regular +expression matching and user code exectution places restrictions on when +action code may be executed. Since action code can only be associated with +complete patterns, any action code that must be executed before an entire +pattern is matched requires that the pattern be broken into smaller units. +Instead of being forced to disrupt the regular expression syntax and write +smaller expressions, it is desirable to retain a single expression and embed +code for performing actions directly into the transitions that move over the +characters. After all, capable programmers are astutely aware of the machinery +underlying their programs, so why not provide them with access to that +machinery? To achieve this we require an action execution model for associating +code with the sub-expressions of a regular expression in a way that does not +disrupt its syntax. + +The primary goal of Ragel is to provide developers with an ability to embed +actions into the transitions and states of a regular expression's state machine +in support of the definition of entire parsers or large sections of parsers +using a single regular expression. From the regular expression we gain a clear +and concise statement of our language. From the state machine we obtain a very +fast and robust executable that lends itself to many kinds of analysis and +visualization. + +\section{Overview} + +Ragel is a language for specifying state machines. The Ragel program is a +compiler that assembles a state machine definition to executable code. Ragel +is based on the principle that any regular language can be converted to a +deterministic finite state automaton. Since every regular language has a state +machine representation and vice versa, the terms regular language and state +machine (or just machine) will be used interchangeably in this document. + +Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is +designed to be generic and is not bound to any particular input or processing +method. A Ragel machine expects to have data passed to it in buffer blocks. +When there is no more input, the machine can be queried for acceptance. In +this way, a Ragel machine can be used to simply recognize a regular language +like a regular expression library. By embedding code into the regular language, +a Ragel machine can also be used to parse input. + +The Ragel language has many operators for constructing and manipulating +machines. Machines are built up from smaller machines, to bigger ones, to the +final machine representing the language that needs to be recognized or parsed. + +The core state machine construction operators are those found in most theory +of computation textbooks. They date back to the 1950s and are widely studied. +They are based on set operations and permit one to think of languages as a set +of strings. They are Union, Intersection, Difference, Concatenation and Kleene +Star. Put together, these operators make up what most people know as regular +expressions. Ragel also provides a scanner construction operator +and provides operators for explicitly constructing machines +using a state chart method. In the state chart method, one joins machines +together without any implied transitions and then explicitly specifies where +epsilon transitions should be drawn. + +The state machine manipulation operators are specific to Ragel. They allow the +programmer to access the states and transitions of regular language's +corresponding machine. There are two uses of the manipulation operators. The +first and primary use is to embed code into transitions and states, allowing +the programmer to specify the actions of the state machine. + +Ragel attempts to make the action embedding facility as intuitive as possible. +To do so, a number of issues need to be addressed. For example, when making a +nondeterministic specification into a DFA using machines that have embedded +actions, new transitions are often made that have the combined actions of +several source transitions. Ragel ensures that multiple actions associated with +a single transition are ordered consistently with respect to the order of +reference and the natural ordering implied by the construction operators. + +The second use of the manipulation operators is to assign priorities to +transitions. Priorities provide a convenient way of controlling any +nondeterminism introduced by the construction operators. Suppose two +transitions leave from the same state and go to distinct target states on the +same character. If these transitions are assigned conflicting priorities, then +during the determinization process the transition with the higher priority will +take precedence over the transition with the lower priority. The lower priority +transition gets abandoned. The transitions would otherwise be combined into a new +transition that goes to a new state that is a combination of the original +target states. Priorities are often required for segmenting machines. The most +common uses of priorities have been encoded into a set of simple operators +that should be used instead of priority embeddings whenever possible. + +For the purposes of embedding, Ragel divides transitions and states into +different classes. There are four operators for embedding actions and +priorities into the transitions of a state machine. It is possible to embed +into entering transitions, finishing transitions, all transitions and leaving +transitions. The embedding into leaving transitions is a special case. +These transition embeddings get stored in the final states of a machine. They +are transferred to any transitions that are made going out of the machine by +future concatenation or kleene star operations. + +There are several more operators for embedding actions into states. Like the +transition embeddings, there are various different classes of states that the +embedding operators access. For example, one can access start states, final +states or all states, among others. Unlike the transition embeddings, there are +several different types of state action embeddings. These are executed at +various different times during the processing of input. It is possible to embed +actions that are exectued on transitions into a state, on transitions out of a +state, on transitions taken on the error event, or on transitions taken on the +EOF event. + +Within actions, it is possible to influence the behaviour of the state machine. +The user can write action code that jumps or calls to another portion of the +machine, changes the current character being processed, or breaks out of the +processing loop. With the state machine calling feature Ragel can be used to +parse languages that are not regular. For example, one can parse balanced +parentheses by calling into a parser when an open parenthesis character is seen +and returning to the state on the top of the stack when the corresponding +closing parenthesis character is seen. More complicated context-free languages +such as expressions in C are out of the scope of Ragel. + +Ragel also provides a scanner construction operator that can be used to build +scanners much the same way that Lex is used. The Ragel generated code, which +relies on user-defined variables for backtracking, repeatedly tries to match +patterns to the input, favouring longer patterns over shorter ones and patterns +that appear ahead of others when the lengths of the possible matches are +identical. When a pattern is matched the associated action is executed. + +The key distinguishing feature between scanners in Ragel and scanners in Lex is +that Ragel patterns may be arbitrary Ragel expressions and can therefore +contain embedded code. With a Ragel-based scanner the user need not wait until +the end of a pattern before user code can be executed. + +Scanners do take Ragel out of the domain of pure state machines and require the +user to maintain the backtracking related variables. However, scanners +integrate well with regular state machine instantiations. They can be called to +or jumped to only when needed, or they can be called out of or jumped out of +when a simpler, pure state machine model is appropriate. + +Two types of output code style are available. Ragel can produce a table-driven +machine or a directly executable machine. The directly executable machine is +much faster than the table-driven. On the other hand, the table-driven machine +is more compact and less demanding on the host language compiler. It is better +suited to compiling large state machines. + +\section{Related Work} + +Lex is perhaps the best-known tool for constructing parsers from regular +expressions. In the Lex processing model, generated code attempts to match one +of the user's regular expression patterns, favouring longer matches over +shorter ones. Once a match is made it then executes the code associated with +the pattern and consumes the matching string. This process is repeated until +the input is fully consumed. + +Through the use of start conditions, related sets of patterns may be defined. +The active set may be changed at any time. This allows the user to define +different lexical regions. It also allows the user to link patterns together by +requiring that some patterns come before others. This is quite like a +concatenation operation. However, use of Lex for languages that require a +considerable amount of pattern concatenation is inappropriate. In such cases a +Lex program deteriorates into a manually specified state machine, where start +conditions define the states and pattern actions define the transitions. Lex +is therefore best suited to parsing tasks where the language to be parsed can +be described in terms of regions of tokens. + +Lex is useful in many scenarios and has undoubtedly stood the test of time. +There are, however, several drawbacks to using Lex. Lex can impose too much +overhead for parsing applications where buffering is not required because all +the characters are available in a single string. In these cases there is +structure to the language to be parsed and a parser specification tool can +help, but employing a heavyweight processing loop that imposes a stream +``pull'' model and dynamic input buffer allocation is inappropriate. An +example of this kind of scenario is the conversion of floating point numbers +contained in a string to their corresponding numerical values. + +Another drawback is the very issue that Ragel attempts to solve. +It is not possible to execute a user action while +matching a character contained inside a pattern. For example, if scanning a +programming language and string literals can contain newlines which must be +counted, a Lex user must break up a string literal pattern so as to associate +an action with newlines. This forces the definition of a new start condition. +Alternatively the user can reprocess the text of the matched string literal to +count newlines. + + +The Re2C program defines an input processing model similar to that of Lex. +Re2C focuses on making generated state machines run very fast and +integrate easily into any program, free of dependencies. Re2C generates +directly executable code and is able to claim that generated parsers run nearly +as fast as their hand-coded equivalents. This is very important for user +adoption, as programmers are reluctant to use a tool when a faster alternative +exists. A consideration to ease of use is also important because developers +need the freedom to integrate the generated code as they see fit. + +Many scripting languages provide ways of composing parsers by linking regular +expressions using program logic. For example, Sed and Awk are two established +Unix scripting tools that allow the programmer to exploit regular expressions +for the purpose of locating and extracting text of interest. High-level +programming languages such as Perl, Python, PHP and Ruby all provide regular +expression libraries that allow the user to combine regular expressions with +arbitrary code. + +In addition to supporting the linking of regular expressions with arbitrary +program logic, the Perl programming language permits the embedding of code into +regular expressions. Perl embeddings do not translate into the embedding of +code into deterministic state machines. Perl regular expressions are in fact +not fully compiled to deterministic machines when embedded code is involved. +They are instead interpreted and involve backtracking. This is shown by the +following Perl program. When it is fed the input \verb|abcd| the interpretor +attempts to match the first alternative, printing \verb|a1 b1|. When this +possibility fails it backtracks and tries the second possibility, printing +\verb|a2 b2|, at which point it succeeds. + +\begin{inline_code} +\begin{verbatim} +print "YES\n" if ( <STDIN> =~ + /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) | + ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x ) +\end{verbatim} +\end{inline_code} +\verbspace + +In Ragel there is no regular expression interpretor. Aside from the scanner +operator, all Ragel expressions are made into deterministic machines and the +run time simply moves from state to state as it consumes input. An equivalent +parser expressed in Ragel would attempt both of the alternatives concurrently, +printing \verb|a1 a2 b1 b2|. + +\section{Development Status} + +Ragel is a relatively new tool and is under continuous development. As a rough +release guide, minor revision number changes are for implementation +improvements and feature additions. Major revision number changes are for +implementation and language changes that do not preserve backwards +compatibility. Though in the past this has not always held true: changes that +break code have crept into minor version number changes. Typically, the +documentation lags behind the development in the interest of documenting only +the lasting features. The latest changes are always documented in the ChangeLog +file. + +\chapter{Constructing State Machines} + +\section{Ragel State Machine Specifications} + +A Ragel input file consists of a program in the host language that contains embedded machine +specifications. Ragel normally passes input straight to output. When it sees +a machine specification it stops to read the Ragel statements and possibly generate +code in place of the specification. +Afterwards it continues to pass input through. There +can be any number of FSM specifications in an input file. A multi-line FSM spec +starts with \verb|%%{| and ends with \verb|}%%|. A single-line FSM spec starts +with \verb|%%| and ends at the first newline. + +While Ragel is looking for FSM specifications it does basic lexical analysis on +the surrounding input. It interprets literal strings and comments so a +\verb|%%| sequence in either of those will not trigger the parsing of an FSM +specification. Ragel does not pass the input through any preprocessor nor does it +interpret preprocessor directives itself so includes, defines and ifdef logic +cannot be used to alter the parse of a Ragel input file. It is therefore not +possible to use an \verb|#if 0| directive to comment out a machine as is +commonly done in C code. As an alternative, a machine can be prevented from +causing any generated output by commenting out write statements. + +In Figure \ref{cmd-line-parsing}, a multi-line specification is used to define the +machine and single line specifications are used to trigger the writing of the machine +data and execution code. + +\begin{figure} +\small +\begin{multicols}{2} +\begin{verbatim} +#include <string.h> +#include <stdio.h> + +%%{ + machine foo; + main := + ( 'foo' | 'bar' ) + 0 @{ res = 1; }; +}%% + +%% write data; +\end{verbatim} +\verbspace +\columnbreak +\begin{verbatim} +int main( int argc, char **argv ) +{ + int cs, res = 0; + if ( argc > 1 ) { + char *p = argv[1]; + char *pe = p + strlen(p) + 1; + %% write init; + %% write exec; + } + printf("result = %i\n", res ); + return 0; +} +\end{verbatim} +\verbspace +\end{multicols} +\caption{Parsing a command line argument. +} +\label{cmd-line-parsing} +\end{figure} + +\subsection{Naming Ragel Blocks} + +\begin{verbatim} +machine fsm_name; +\end{verbatim} +\verbspace + +The \verb|machine| statement gives the name of the FSM. If present in a +specification, this statement must appear first. If a machine specification +does not have a name then Ragel uses the previous specification name. If no +previous specification name exists then this is an error. Because FSM +specifications persist in memory, a machine's statements can be spread across +multiple machine specifications. This allows one to break up a machine across +several files or draw in statements that are common to multiple machines using +the \verb|include| statement. + +\subsection{Machine Definition} +\label{definition} + +\begin{verbatim} +<name> = <expression>; +\end{verbatim} +\verbspace + +The machine definition statement associates an FSM expression with a name. Machine +expressions assigned to names can later be referenced in other expressions. A +definition statement on its own does not cause any states to be generated. It is simply a +description of a machine to be used later. States are generated only when a definition is +instantiated, which happens when a definition is referenced in an instantiated +expression. + +\subsection{Machine Instantiation} +\label{instantiation} + +\begin{verbatim} +<name> := <expression>; +\end{verbatim} +\verbspace + +The machine instantiation statement generates a set of states representing an +expression. Each instantiation generates a distinct set of states. The starting +state of the instantiation is written in the data section of the generated code +using the instantiation name. If a machine named +\verb|main| is instantiated, its start state is used as the +specification's start state and is assigned to the \verb|cs| variable by the +\verb|write init| command. If no \verb|main| machine is given, the start state +of the last machine instantiation to appear is used as the specification's +start state. + +From outside the execution loop, control may be passed to any machine by +assigning the entry point to the \verb|cs| variable. From inside the execution +loop, control may be passed to any machine instantiation using \verb|fcall|, +\verb|fgoto| or \verb|fnext| statements. + +\subsection{Including Ragel Code} + +\begin{verbatim} +include FsmName "inputfile.rl"; +\end{verbatim} +\verbspace + +The \verb|include| statement can be used to draw in the statements of another FSM +specification. Both the name and input file are optional, however at least one +must be given. Without an FSM name, the given input file is searched for an FSM +of the same name as the current specification. Without an input file the +current file is searched for a machine of the given name. If both are present, +the given input file is searched for a machine of the given name. + +Ragel searches for included files from the location of the current file. +Additional directories can be added to the search path using the \verb|-I| +option. + +\subsection{Importing Definitions} +\label{import} + +\begin{verbatim} +import "inputfile.h"; +\end{verbatim} +\verbspace + +The \verb|import| statement scrapes a file for sequences of tokens that match +the following forms. Ragel treats these forms as state machine definitions. + +\noindent\hspace*{24pt}\verb|name '=' number|\\ +\noindent\hspace*{24pt}\verb|name '=' lit_string|\\ +\noindent\hspace*{24pt}\verb|'define' name number|\\ +\noindent\hspace*{24pt}\verb|'define' name lit_string| +\vspace{12pt} + +If the input file is a Ragel program then tokens inside any Ragel +specifications are ignored. See Section \ref{export} for a description of +exporting machine definitions. + +Ragel searches for imported files from the location of the current file. +Additional directories can be added to the search path using the \verb|-I| +option. + +\section{Lexical Analysis of a Ragel Block} +\label{lexing} + +Within a machine specification the following lexical rules apply to the input. + +\begin{itemize} + +\item The \verb|#| symbol begins a comment that terminates at the next newline. + +\item The symbols \verb|""|, \verb|''|, \verb|//|, \verb|[]| behave as the +delimiters of literal strings. Within them, the following escape sequences +are interpreted: + +\verb| \0 \a \b \t \n \v \f \r| + +A backslash at the end of a line joins the following line onto the current. A +backslash preceding any other character removes special meaning. This applies +to terminating characters and to special characters in regular expression +literals. As an exception, regular expression literals do not support escape +sequences as the operands of a range within a list. See the bullet on regular +expressions in Section \ref{basic}. + +\item The symbols \verb|{}| delimit a block of host language code that will be +embedded into the machine as an action. Within the block of host language +code, basic lexical analysis of comments and strings is done in order to +correctly find the closing brace of the block. With the exception of FSM +commands embedded in code blocks, the entire block is preserved as is for +identical reproduction in the output code. + +\item The pattern \verb|[+-]?[0-9]+| denotes an integer in decimal format. +Integers used for specifying machines may be negative only if the alphabet type +is signed. Integers used for specifying priorities may be positive or negative. + +\item The pattern \verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal +format. + +\item The keywords are \verb|access|, \verb|action|, \verb|alphtype|, +\verb|getkey|, \verb|write|, \verb|machine| and \verb|include|. + +\item The pattern \verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier. + + +\item Any amount of whitespace may separate tokens. + +\end{itemize} + + +\section{Basic Machines} +\label{basic} + +The basic machines are the base operands of regular language expressions. They +are the smallest unit to which machine construction and manipulation operators +can be applied. + +\begin{itemize} + +\item \verb|'hello'| -- Concatenation Literal. Produces a machine that matches +the sequence of characters in the quoted string. If there are 5 characters +there will be 6 states chained together with the characters in the string. See +Section \ref{lexing} for information on valid escape sequences. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmconcat} +\end{center} +\graphspace + +It is possible +to make a concatenation literal case-insensitive by appending an \verb|i| to +the string, for example \verb|'cmd'i|. + +\item \verb|"hello"| -- Identical to the single quoted version. + +\item \verb|[hello]| -- Or Expression. Produces a union of characters. There +will be two states with a transition for each unique character between the two states. +The \verb|[]| delimiters behave like the quotes of a literal string. For example, +\verb|[ \t]| means tab or space. The \verb|or| expression supports character ranges +with the \verb|-| symbol as a separator. The meaning of the union can be negated +using an initial \verb|^| character as in standard regular expressions. +See Section \ref{lexing} for information on valid escape sequences +in \verb|or| expressions. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmor} +\end{center} +\graphspace + +\item \verb|''|, \verb|""|, and \verb|[]| -- Zero Length Machine. Produces a machine +that matches the zero length string. Zero length machines have one state that is both +a start state and a final state. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmnull} +\end{center} +\graphspace + +% FIXME: More on the range of values here. +\item \verb|42| -- Numerical Literal. Produces a two state machine with one +transition on the given number. The number may be in decimal or hexadecimal +format and should be in the range allowed by the alphabet type. The minimum and +maximum values permitted are defined by the host machine that Ragel is compiled +on. For example, numbers in a \verb|short| alphabet on an i386 machine should +be in the range \verb|-32768| to \verb|32767|. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmnum} +\end{center} +\graphspace + +\item \verb|/simple_regex/| -- Regular Expression. Regular expressions are +parsed as a series of expressions that are concatenated together. Each +concatenated expression +may be a literal character, the ``any'' character specified by the \verb|.| +symbol, or a union of characters specified by the \verb|[]| delimiters. If the +first character of a union is \verb|^| then it matches any character not in the +list. Within a union, a range of characters can be given by separating the first +and last characters of the range with the \verb|-| symbol. Each +concatenated machine may have repetition specified by following it with the +\verb|*| symbol. The standard escape sequences described in Section +\ref{lexing} are supported everywhere in regular expressions except as the +operands of a range within in a list. This notation also supports the \verb|i| +trailing option. Use it to produce case-insensitive machines, as in \verb|/GET/i|. + +Ragel does not support very complex regular expressions because the desired +results can always be achieved using the more general machine construction +operators listed in Section \ref{machconst}. The following diagram shows the +result of compiling \verb|/ab*[c-z].*[123]/|. \verb|DEF| represents the default +transition, which is taken if no other transition can be taken. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmregex} +\end{center} +\graphspace + +\item \verb|'a' .. 'z'| -- Range. Produces a machine that matches any +characters in the specified range. Allowable upper and lower bounds of the +range are concatenation literals of length one and numerical literals. For +example, \verb|0x10..0x20|, \verb|0..63|, and \verb|'a'..'z'| are valid ranges. +The bounds should be in the range allowed by the alphabet type. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmrange} +\end{center} +\graphspace + +\item \verb|variable_name| -- Lookup the machine definition assigned to the +variable name given and use an instance of it. See Section \ref{definition} for +an important note on what it means to reference a variable name. + +\item \verb|builtin_machine| -- There are several built-in machines available +for use. They are all two state machines for the purpose of matching common +classes of characters. They are: + +\begin{itemize} + +\item \verb|any | -- Any character in the alphabet. + +\item \verb|ascii | -- Ascii characters. \verb|0..127| + +\item \verb|extend| -- Ascii extended characters. This is the range +\verb|-128..127| for signed alphabets and the range \verb|0..255| for unsigned +alphabets. + +\item \verb|alpha | -- Alphabetic characters. \verb|[A-Za-z]| + +\item \verb|digit | -- Digits. \verb|[0-9]| + +\item \verb|alnum | -- Alpha numerics. \verb|[0-9A-Za-z]| + +\item \verb|lower | -- Lowercase characters. \verb|[a-z]| + +\item \verb|upper | -- Uppercase characters. \verb|[A-Z]| + +\item \verb|xdigit| -- Hexadecimal digits. \verb|[0-9A-Fa-f]| + +\item \verb|cntrl | -- Control characters. \verb|0..31| + +\item \verb|graph | -- Graphical characters. \verb|[!-~]| + +\item \verb|print | -- Printable characters. \verb|[ -~]| + +\item \verb|punct | -- Punctuation. Graphical characters that are not alphanumerics. +\verb|[!-/:-@[-`{-~]| + +\item \verb|space | -- Whitespace. \verb|[\t\v\f\n\r ]| + +\item \verb|zlen | -- Zero length string. \verb|""| + +\item \verb|empty | -- Empty set. Matches nothing. \verb|^any| + +\end{itemize} +\end{itemize} + +\section{Operator Precedence} +The following table shows operator precedence from lowest to highest. Operators +in the same precedence group are evaluated from left to right. + +\begin{tabular}{|c|c|c|} +\hline +1&\verb| , |&Join\\ +\hline +2&\verb/ | & - --/&Union, Intersection and Subtraction\\ +\hline +3&\verb| . <: :> :>> |&Concatenation\\ +\hline +4&\verb| : |&Label\\ +\hline +5&\verb| -> |&Epsilon Transition\\ +\hline +6&\verb| > @ $ % |&Transitions Actions and Priorities\\ +\hline +6&\verb| >/ $/ %/ </ @/ <>/ |&EOF Actions\\ +\hline +6&\verb| >! $! %! <! @! <>! |&Global Error Actions\\ +\hline +6&\verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions\\ +\hline +6&\verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions\\ +\hline +6&\verb| >* $* %* <* @* <>* |&From-State Action\\ +\hline +7&\verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition\\ +\hline +8&\verb| ! ^ |&Negation and Character-Level Negation\\ +\hline +9&\verb| ( <expr> ) |&Grouping\\ +\hline +\end{tabular} + +\section{Regular Language Operators} +\label{machconst} + +When using Ragel it is helpful to have a sense of how it constructs machines. +The determinization process can produce results that seem unusual to someone +not familiar with the NFA to DFA conversion algorithm. In this section we +describe Ragel's state machine operators. Though the operators are defined +using epsilon transitions, it should be noted that this is for discussion only. +The epsilon transitions described in this section do not persist, but are +immediately removed by the determinization process which is executed at every +operation. Ragel does not make use of any nondeterministic intermediate state +machines. + +To create an epsilon transition between two states \verb|x| and \verb|y| is to +copy all of the properties of \verb|y| into \verb|x|. This involves drawing in +all of \verb|y|'s to-state actions, EOF actions, etc., in addition to its +transitions. If \verb|x| and \verb|y| both have a transition out on the same +character, then the transitions must be combined. During transition +combination a new transition is made that goes to a new state that is the +combination of both target states. The new combination state is created using +the same epsilon transition method. The new state has an epsilon transition +drawn to all the states that compose it. Since the creation of new epsilon +transitions may be triggered every time an epsilon transition is drawn, the +process of drawing epsilon transitions is repeated until there are no more +epsilon transitions to be made. + +A very common error that is made when using Ragel is to make machines that do +too much. That is, to create machines that have unintentional +nondetermistic properties. This usually results from being unaware of the common strings +between machines that are combined together using the regular language +operators. This can involve never leaving a machine, causing its actions to be +propagated through all the following states. Or it can involve an alternation +where both branches are unintentionally taken simultaneously. + +This problem forces one to think hard about the language that needs to be +matched. To guard against this kind of problem one must ensure that the machine +specification is divided up using boundaries that do not allow ambiguities from +one portion of the machine to the next. See Chapter +\ref{controlling-nondeterminism} for more on this problem and how to solve it. + +The Graphviz tool is an immense help when debugging improperly compiled +machines or otherwise learning how to use Ragel. Graphviz Dot files can be +generated from Ragel programs using the \verb|-V| option. See Section +\ref{visualization} for more information. + + +\subsection{Union} + +\verb/expr | expr/ + +The union operation produces a machine that matches any string in machine one +or machine two. The operation first creates a new start state. Epsilon +transitions are drawn from the new start state to the start states of both +input machines. The resulting machine has a final state set equivalent to the +union of the final state sets of both input machines. In this operation, there +is the opportunity for nondeterminism among both branches. If there are +strings, or prefixes of strings that are matched by both machines then the new +machine will follow both parts of the alternation at once. The union operation is +shown below. + +\graphspace +\begin{center} +\includegraphics[scale=1.0]{opor} +\end{center} +\graphspace + +The following example demonstrates the union of three machines representing +common tokens. + +% GENERATE: exor +% OPT: -p +% %%{ +% machine exor; +\begin{inline_code} +\begin{verbatim} +# Hex digits, decimal digits, or identifiers +main := '0x' xdigit+ | digit+ | alpha alnum*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exor} +\end{center} +\graphspace + +\subsection{Intersection} + +\verb|expr & expr| + +Intersection produces a machine that matches any +string that is in both machine one and machine two. To achieve intersection, a +union is performed on the two machines. After the result has been made +deterministic, any final state that is not a combination of final states from +both machines has its final state status revoked. To complete the operation, +paths that do not lead to a final state are pruned from the machine. Therefore, +if there are any such paths in either of the expressions they will be removed +by the intersection operator. Intersection can be used to require that two +independent patterns be simultaneously satisfied as in the following example. + +% GENERATE: exinter +% OPT: -p +% %%{ +% machine exinter; +\begin{inline_code} +\begin{verbatim} +# Match lines four characters wide that contain +# words separated by whitespace. +main := + /[^\n][^\n][^\n][^\n]\n/* & + (/[a-z][a-z]*/ | [ \n])**; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exinter} +\end{center} +\graphspace + +\subsection{Difference} + +\verb|expr - expr| + +The difference operation produces a machine that matches +strings that are in machine one but are not in machine two. To achieve subtraction, +a union is performed on the two machines. After the result has been made +deterministic, any final state that came from machine two or is a combination +of states involving a final state from machine two has its final state status +revoked. As with intersection, the operation is completed by pruning any path +that does not lead to a final state. The following example demonstrates the +use of subtraction to exclude specific cases from a set. + +% GENERATE: exsubtr +% OPT: -p +% %%{ +% machine exsubtr; +\begin{inline_code} +\begin{verbatim} +# Subtract keywords from identifiers. +main := /[a-z][a-z]*/ - ( 'for' | 'int' ); +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exsubtr} +\end{center} +\graphspace + +\subsection{Strong Difference} +\label{strong_difference} + +\verb|expr -- expr| + +Strong difference produces a machine that matches any string of the first +machine that does not have any string of the second machine as a substring. In +the following example, strong subtraction is used to excluded \verb|CRLF| from +a sequence. In the corresponding visualization, the label \verb|DEF| is short +for default. The default transition is taken if no other transition can be +taken. + +% GENERATE: exstrongsubtr +% OPT: -p +% %%{ +% machine exstrongsubtr; +\begin{inline_code} +\begin{verbatim} +crlf = '\r\n'; +main := [a-z]+ ':' ( any* -- crlf ) crlf; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exstrongsubtr} +\end{center} +\graphspace + +This operator is equivalent to the following. + +\begin{verbatim} +expr - ( any* expr any* ) +\end{verbatim} +\verbspace + +\subsection{Concatenation} + +\verb|expr . expr| + +Concatenation produces a machine that matches all the strings in machine one followed by all +the strings in machine two. Concatenation draws epsilon transitions from the +final states of the first machine to the start state of the second machine. The +final states of the first machine lose their final state status, unless the +start state of the second machine is final as well. +Concatenation is the default operator. Two machines next to each other with no +operator between them results in concatenation. + +\graphspace +\begin{center} +\includegraphics[scale=1.0]{opconcat} +\end{center} +\graphspace + +The opportunity for nondeterministic behaviour results from the possibility of +the final states of the first machine accepting a string that is also accepted +by the start state of the second machine. +The most common scenario in which this happens is the +concatenation of a machine that repeats some pattern with a machine that gives +a terminating string, but the repetition machine does not exclude the +terminating string. The example in Section \ref{strong_difference} +guards against this. Another example is the expression \verb|("'" any* "'")|. +When executed the thread of control will +never leave the \verb|any*| machine. This is a problem especially if actions +are embedded to process the characters of the \verb|any*| component. + +In the following example, the first machine is always active due to the +nondeterministic nature of concatenation. This particular nondeterminism is intended +however because we wish to permit EOF strings before the end of the input. + +% GENERATE: exconcat +% OPT: -p +% %%{ +% machine exconcat; +\begin{inline_code} +\begin{verbatim} +# Require an eof marker on the last line. +main := /[^\n]*\n/* . 'EOF\n'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exconcat} +\end{center} +\graphspace + +There is a language +ambiguity involving concatenation and subtraction. Because concatenation is the +default operator for two +adjacent machines there is an ambiguity between subtraction of +a positive numerical literal and concatenation of a negative numerical literal. +For example, \verb|(x-7)| could be interpreted as \verb|(x . -7)| or +\verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence +over concatenation of a negative literal. We adhere to the rule that the default +concatenation operator takes effect only when there are no other operators between +two machines. Beware of writing machines such as \verb|(any -1)| when what is +desired is a concatenation of \verb|any| and \verb|-1|. Instead write +\verb|(any . -1)| or \verb|(any (-1))|. If in doubt of the meaning of your program do not +rely on the default concatenation operator; always use the \verb|.| symbol. + + +\subsection{Kleene Star} + +\verb|expr*| + +The machine resulting from the Kleene Star operator will match zero or more +repetitions of the machine it is applied to. +It creates a new start state and an additional final +state. Epsilon transitions are drawn between the new start state and the old start +state, between the new start state and the new final state, and +between the final states of the machine and the new start state. After the +machine is made deterministic the effect is of the final states getting all the +transitions of the start state. + +\graphspace +\begin{center} +\includegraphics[scale=1.0]{opstar} +\end{center} +\graphspace + +The possibility for nondeterministic behaviour arises if the final states have +transitions on any of the same characters as the start state. This is common +when applying kleene star to an alternation of tokens. Like the other problems +arising from nondeterministic behavior, this is discussed in more detail in Chapter +\ref{controlling-nondeterminism}. This particular problem can also be solved +by using the longest-match construction discussed in Section +\ref{generating-scanners} on scanners. + +In this +example, there is no nondeterminism introduced by the exterior kleene star due to +the newline at the end of the regular expression. Without the newline the +exterior kleene star would be redundant and there would be ambiguity between +repeating the inner range of the regular expression and the entire regular +expression. Though it would not cause a problem in this case, unnecessary +nondeterminism in the kleene star operator often causes undesired results for +new Ragel users and must be guarded against. + +% GENERATE: exstar +% OPT: -p +% %%{ +% machine exstar; +\begin{inline_code} +\begin{verbatim} +# Match any number of lines with only lowercase letters. +main := /[a-z]*\n/*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exstar} +\end{center} +\graphspace + +\subsection{One Or More Repetition} + +\verb|expr+| + +This operator produces the concatenation of the machine with the kleene star of +itself. The result will match one or more repetitions of the machine. The plus +operator is equivalent to \verb|(expr . expr*)|. + +% GENERATE: explus +% OPT: -p +% %%{ +% machine explus; +\begin{inline_code} +\begin{verbatim} +# Match alpha-numeric words. +main := alnum+; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{explus} +\end{center} +\graphspace + +\subsection{Optional} + +\verb|expr?| + +The {\em optional} operator produces a machine that accepts the machine +given or the zero length string. The optional operator is equivalent to +\verb/(expr | '' )/. In the following example the optional operator is used to +possibly extend a token. + +% GENERATE: exoption +% OPT: -p +% %%{ +% machine exoption; +\begin{inline_code} +\begin{verbatim} +# Match integers or floats. +main := digit+ ('.' digit+)?; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exoption} +\end{center} +\graphspace + +\subsection{Repetition} + +\noindent\hspace*{24pt}\verb|expr {n}| -- Exactly N copies of expr.\\ +\noindent\hspace*{24pt}\verb|expr {,n}| -- Zero to N copies of expr.\\ +\noindent\hspace*{24pt}\verb|expr {n,}| -- N or more copies of expr.\\ +\noindent\hspace*{24pt}\verb|expr {n,m}| -- N to M copies of expr. +\vspace{12pt} + +\subsection{Negation} + +\verb|!expr| + +Negation produces a machine that matches any string not matched by the given +machine. Negation is equivalent to \verb|(any* - expr)|. + +% GENERATE: exnegate +% OPT: -p +% %%{ +% machine exnegate; +\begin{inline_code} +\begin{verbatim} +# Accept anything but a string beginning with a digit. +main := ! ( digit any* ); +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exnegate} +\end{center} +\graphspace + +\subsection{Character-Level Negation} + +\verb|^expr| + +Character-level negation produces a machine that matches any single character +not matched by the given machine. Character-Level Negation is equivalent to +\verb|(any - expr)|. It must be applied only to machines that match strings of +length one. + +\section{State Machine Minimization} + +State machine minimization is the process of finding the minimal equivalent FSM accepting +the language. Minimization reduces the number of states in machines +by merging equivalent states. It does not change the behaviour of the machine +in any way. It will cause some states to be merged into one because they are +functionally equivalent. State minimization is on by default. It can be turned +off with the \verb|-n| option. + +The algorithm implemented is similar to Hopcroft's state minimization +algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in +memory, whereas Ragel supports arbitrary integer alphabets that cannot be +listed in memory. Though exact analysis is very difficult, Ragel minimization +runs close to O(n * log(n)) and requires O(n) temporary storage where +$n$ is the number of states. + +\section{Visualization} +\label{visualization} + +%In many cases, practical +%parsing programs will be too large to completely visualize with Graphviz. The +%proper approach is to reduce the language to the smallest subset possible that +%still exhibits the characteristics that one wishes to learn about or to fix. +%This can be done without modifying the source code using the \verb|-M| and +%\verb|-S| options. If a machine cannot be easily reduced, +%embeddings of unique actions can be very useful for tracing a +%particular component of a larger machine specification, since action names are +%written out on transition labels. + +Ragel is able to emit compiled state machines in Graphviz's Dot file format. +This is done using the \verb|-V| option. +Graphviz support allows users to perform +incremental visualization of their parsers. User actions are displayed on +transition labels of the graph. + +If the final graph is too large to be +meaningful, or even drawn, the user is able to inspect portions of the parser +by naming particular regular expression definitions with the \verb|-S| and +\verb|-M| options to the \verb|ragel| program. Use of Graphviz greatly +improves the Ragel programming experience. It allows users to learn Ragel by +experimentation and also to track down bugs caused by unintended +nondeterminism. + +Ragel has another option to help debugging. The \verb|-x| option causes Ragel +to emit the compiled machine in an XML format. + +\chapter{User Actions} + +Ragel permits the user to embed actions into the transitions of a regular +expression's corresponding state machine. These actions are executed when the +generated code moves over a transition. Like the regular expression operators, +the action embedding operators are fully compositional. They take a state +machine and an action as input, embed the action and yield a new state machine +that can be used in the construction of other machines. Due to the +compositional nature of embeddings, the user has complete freedom in the +placement of actions. + +A machine's transitions are categorized into four classes. The action embedding +operators access the transitions defined by these classes. The {\em entering +transition} operator \verb|>| isolates the start state, then embeds an action +into all transitions leaving it. The {\em finishing transition} operator +\verb|@| embeds an action into all transitions going into a final state. The +{\em all transition} operator \verb|$| embeds an action into all transitions of +an expression. The {\em leaving transition} operator \verb|%| provides access +to the yet-unmade transitions moving out of the machine via the final states. + +\section{Embedding Actions} + +\begin{verbatim} +action ActionName { + /* Code an action here. */ + count += 1; +} +\end{verbatim} +\verbspace + +The action statement defines a block of code that can be embedded into an FSM. +Action names can be referenced by the action embedding operators in +expressions. Though actions need not be named in this way (literal blocks +of code can be embedded directly when building machines), defining reusable +blocks of code whenever possible is good practice because it potentially increases the +degree to which the machine can be minimized. + +Within an action some Ragel expressions and statements are parsed and +translated. These allow the user to interact with the machine from action code. +See Section \ref{vals} for a complete list of statements and values available +in code blocks. + +\subsection{Entering Action} + +\verb|expr > action| + +The entering action operator embeds an action into all transitions +that enter into the machine from the start state. If the start state is final, +then the action is also embedded into the start state as a leaving action. This +means that if a machine accepts the zero-length string and control passes +through the start state then the entering action is executed. Note +that this can happen on both a following character and on the EOF event. + +In some machines the start state has transtions coming in from within the +machine. In these cases the start state is first isolated from the rest of the +machine ensuring that the entering actions are exected once only. + +% GENERATE: exstact +% OPT: -p +% %%{ +% machine exstact; +\begin{inline_code} +\begin{verbatim} +# Execute A at the beginning of a string of alpha. +action A {} +main := ( lower* >A ) . ' '; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exstact} +\end{center} +\graphspace + +\subsection{Finishing Action} + +\verb|expr @ action| + +The finishing action operator embeds an action into any transitions that move +the machine into a final state. Further input may move the machine out of the +final state, but keep it in the machine. Therefore finishing actions may be +executed more than once if a machine has any internal transitions out of a +final state. In the following example the final state has no transitions out +and the finishing action is executed only once. + +% GENERATE: exdoneact +% OPT: -p +% %%{ +% machine exdoneact; +% action A {} +\begin{inline_code} +\begin{verbatim} +# Execute A when the trailing space is seen. +main := ( lower* ' ' ) @A; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exdoneact} +\end{center} +\graphspace + +\subsection{All Transition Action} + +\verb|expr $ action| + +The all transition operator embeds an action into all transitions of a machine. +The action is executed whenever a transition of the machine is taken. In the +following example, A is executed on every character matched. + +% GENERATE: exallact +% OPT: -p +% %%{ +% machine exallact; +% action A {} +\begin{inline_code} +\begin{verbatim} +# Execute A on any characters of the machine. +main := ( 'm1' | 'm2' ) $A; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exallact} +\end{center} +\graphspace + +\subsection{Leaving Actions} +\label{out-actions} + +\verb|expr % action| + +The leaving action operator queues an action for embedding into the transitions +that go out of a machine via a final state. The action is first stored in +the machine's final states and is later transferred to any transitions that are +made going out of the machine by a kleene star or concatenation operation. + +If a final state of the machine is still final when compilation is complete +then the leaving action is also embedded as an EOF action. Therefore, leaving +the machine is defined as either leaving on a character or as state machine +acceptance. + +This operator allows one to associate an action with the termination of a +sequence, without being concerned about what particular character terminates +the sequence. In the following example, A is executed when leaving the alpha +machine on the newline character. + +% GENERATE: exoutact1 +% OPT: -p +% %%{ +% machine exoutact1; +% action A {} +\begin{inline_code} +\begin{verbatim} +# Match a word followed by a newline. Execute A when +# finishing the word. +main := ( lower+ %A ) . '\n'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exoutact1} +\end{center} +\graphspace + +In the following example, the \verb|term_word| action could be used to register +the appearance of a word and to clear the buffer that the \verb|lower| action used +to store the text of it. + +% GENERATE: exoutact2 +% OPT: -p +% %%{ +% machine exoutact2; +% action lower {} +% action space {} +% action term_word {} +% action newline {} +\begin{inline_code} +\begin{verbatim} +word = ( [a-z] @lower )+ %term_word; +main := word ( ' ' @space word )* '\n' @newline; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exoutact2} +\end{center} +\graphspace + +In this final example of the action embedding operators, A is executed upon entering +the alpha machine, B is executed on all transitions of the +alpha machine, C is executed when the alpha machine is exited by moving into the +newline machine and N is executed when the newline machine moves into a final +state. + +% GENERATE: exaction +% OPT: -p +% %%{ +% machine exaction; +% action A {} +% action B {} +% action C {} +% action N {} +\begin{inline_code} +\begin{verbatim} +# Execute A on starting the alpha machine, B on every transition +# moving through it and C upon finishing. Execute N on the newline. +main := ( lower* >A $B %C ) . '\n' @N; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exaction} +\end{center} +\graphspace + + +\section{State Action Embedding Operators} + +The state embedding operators allow one to embed actions into states. Like the +transition embedding operators, there are several different classes of states +that the operators access. The meanings of the symbols are similar to the +meanings of the symbols used for the transition embedding operators. The design +of the state selections was driven by a need to cover the states of an +expression with exactly one error action. + +Unlike the transition embedding operators, the state embedding operators are +also distinguished by the different kinds of events that embedded actions can +be associated with. Therefore the state embedding operators have two +components. The first, which is the first one or two characters, specifies the +class of states that the action will be embedded into. The second component +specifies the type of event the action will be executed on. The symbols of the +second component also have equivalent kewords. + +\begin{multicols}{2} +The different classes of states are: + +\noindent\hspace*{24pt}\verb|> | -- the start state\\ +\noindent\hspace*{24pt}\verb|< | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$ | -- all states\\ +\noindent\hspace*{24pt}\verb|% | -- final states\\ +\noindent\hspace*{24pt}\verb|@ | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>| -- any except start and final (middle) +\vspace{12pt} + +\columnbreak + +The different kinds of embeddings are: + +\noindent\hspace*{24pt}\verb|~| -- to-state actions (\verb|to|)\\ +\noindent\hspace*{24pt}\verb|*| -- from-state actions (\verb|from|)\\ +\noindent\hspace*{24pt}\verb|/| -- EOF actions (\verb|eof|)\\ +\noindent\hspace*{24pt}\verb|!| -- error actions (\verb|err|)\\ +\noindent\hspace*{24pt}\verb|^| -- local error actions (\verb|lerr|) +\vspace{12pt} + +\end{multicols} + +\subsection{To-State and From-State Actions} + +\subsubsection{To-State Actions} + +\noindent\hspace*{24pt}\verb|>~action >to(name) >to{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|<~action <to(name) <to{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$~action $to(name) $to{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%~action %to(name) %to{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@~action @to(name) @to{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle) +\vspace{12pt} + + +To-state actions are executed whenever the state machine moves into the +specified state, either by a natural movement over a transition or by an +action-based transfer of control such as \verb|fgoto|. They are executed after the +in-transition's actions but before the current character is advanced and +tested against the end of the input block. To-state embeddings stay with the +state. They are irrespective of the state's current set of transitions and any +future transitions that may be added in or out of the state. + +Note that the setting of the current state variable \verb|cs| outside of the +execute code is not considered by Ragel as moving into a state and consequently +the to-state actions of the new current state are not executed. This includes +the initialization of the current state when the machine begins. This is +because the entry point into the machine execution code is after the execution +of to-state actions. + +\subsubsection{From-State Actions} + +\noindent\hspace*{24pt}\verb|>*action >from(name) >from{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|<*action <from(name) <from{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$*action $from(name) $from{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%*action %from(name) %from{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@*action @from(name) @from{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle) +\vspace{12pt} + +From-state actions are executed whenever the state machine takes a transition from a +state, either to itself or to some other state. These actions are executed +immediately after the current character is tested against the input block end +marker and before the transition to take is sought based on the current +character. From-state actions are therefore executed even if a transition +cannot be found and the machine moves into the error state. Like to-state +embeddings, from-state embeddings stay with the state. + +\subsection{EOF Actions} + +\noindent\hspace*{24pt}\verb|>/action >eof(name) >eof{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|</action <eof(name) <eof{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$/action $eof(name) $eof{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%/action %eof(name) %eof{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@/action @eof(name) @eof{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle) +\vspace{12pt} + +The EOF action embedding operators enable the user to embed actions that are +executed at the end of the input stream. EOF actions are stored in states and +generated in the \verb|write exec| block. They are run when \verb|p == pe == eof| +as the execute block is finishing. EOF actions are free to adjust \verb|p| and +jump to another part of the machine to restart execution. + +\subsection{Handling Errors} + +In many applications it is useful to be able to react to parsing errors. The +user may wish to print an error message that depends on the context. It +may also be desirable to consume input in an attempt to return the input stream +to some known state and resume parsing. To support error handling and recovery, +Ragel provides error action embedding operators. There are two kinds of error +actions: global error actions and local error actions. +Error actions can be used to simply report errors, or by jumping to a machine +instantiation that consumes input, can attempt to recover from errors. + +\subsubsection{Global Error Actions} + +\noindent\hspace*{24pt}\verb|>!action >err(name) >err{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|<!action <err(name) <err{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$!action $err(name) $err{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%!action %err(name) %err{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@!action @err(name) @err{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle) +\vspace{12pt} + +Global error actions are stored in the states they are embedded into until +compilation is complete. They are then transferred to the transitions that move +into the error state. These transitions are taken on all input characters that +are not already covered by the state's transitions. If a state with an error +action is not final when compilation is complete, then the action is also +embedded as an EOF action. + +Error actions can be used to recover from errors by jumping back into the +machine with \verb|fgoto| and optionally altering \verb|p|. + +\subsubsection{Local Error Actions} + +\noindent\hspace*{24pt}\verb|>^action >lerr(name) >lerr{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|<^action <lerr(name) <lerr{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$^action $lerr(name) $lerr{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%^action %lerr(name) %lerr{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@^action @lerr(name) @lerr{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle) +\vspace{12pt} + +Like global error actions, local error actions are also stored in the states +they are embedded into until a transfer point. The transfer point is different +however. Each local error action embedding is associated with a name. When a +machine definition has been fully constructed, all local error action +embeddings associated with the same name as the machine definition are +transferred to the error transitions. At this time they are also embedded as +EOF actions in the case of non-final states. + +Local error actions can be used to specify an action to take when a particular +section of a larger state machine fails to match. A particular machine +definition's ``thread'' may die and the local error actions executed, however +the machine as a whole may continue to match input. + +There are two forms of local error action embeddings. In the first form the +name defaults to the current machine. In the second form the machine name can +be specified. This is useful when it is more convenient to specify the local +error action in a sub-definition that is used to construct the machine +definition that the local error action is associated with. To embed local +error actions and +explicitly state the machine definition on which the transfer is to happen use +\verb|(name, action)| as the action. + +\subsubsection{Example} + +The following example uses error actions to report an error and jump to a +machine that consumes the remainder of the line when parsing fails. After +consuming the line, the error recovery machine returns to the main loop. + +% GENERATE: erract +% %%{ +% machine erract; +% ws = ' '; +% address = 'foo AT bar..com'; +% date = 'Monday May 12'; +\begin{inline_code} +\begin{verbatim} +action cmd_err { + printf( "command error\n" ); + fhold; fgoto line; +} +action from_err { + printf( "from error\n" ); + fhold; fgoto line; +} +action to_err { + printf( "to error\n" ); + fhold; fgoto line; +} + +line := [^\n]* '\n' @{ fgoto main; }; + +main := ( + ( + 'from' @err(cmd_err) + ( ws+ address ws+ date '\n' ) $err(from_err) | + 'to' @err(cmd_err) + ( ws+ address '\n' ) $err(to_err) + ) +)*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% %% write data; +% void f() +% { +% %% write init; +% %% write exec; +% } +% END GENERATE + + + +\section{Action Ordering and Duplicates} + +When combining expressions that have embedded actions it is often the case that +a number of actions must be executed on a single input character. For example, +following a concatenation the leaving action of the left expression and the +entering action of the right expression will be embedded into one transition. +This requires a method of ordering actions that is intuitive and +predictable for the user, and repeatable for the compiler. + +We associate with the embedding of each action a unique timestamp that is +used to order actions that appear together on a single transition in the final +state machine. To accomplish this we recursively traverse the parse tree of +regular expressions and assign timestamps to action embeddings. References to +machine definitions are followed in the traversal. When we visit a +parse tree node we assign timestamps to all {\em entering} action embeddings, +recurse on the parse tree, then assign timestamps to the remaining {\em all}, +{\em finishing}, and {\em leaving} embeddings in the order in which they +appear. + +By default Ragel does not permit a single action to appear multiple times in an action +list. When the final machine has been created, actions that appear more than +once in a single transition, to-state, from-state or EOF action list have their +duplicates removed. +The first appearance of the action is preserved. This is useful in a number of +scenarios. First, it allows us to union machines with common prefixes without +worrying about the action embeddings in the prefix being duplicated. Second, it +prevents leaving actions from being transferred multiple times. This can +happen when a machine is repeated, then followed with another machine that +begins with a common character. For example: + +\begin{verbatim} +word = [a-z]+ %act; +main := word ( '\n' word )* '\n\n'; +\end{verbatim} +\verbspace + +Note that Ragel does not compare action bodies to determine if they have +identical program text. It simply checks for duplicates using each action +block's unique location in the program. + +The removal of duplicates can be turned off using the \verb|-d| option. + +\section{Values and Statements Available in Code Blocks} +\label{vals} + +The following values are available in code blocks: + +\begin{itemize} +\item \verb|fpc| -- A pointer to the current character. This is equivalent to +accessing the \verb|p| variable. + +\item \verb|fc| -- The current character. This is equivalent to the expression \verb|(*p)|. + +\item \verb|fcurs| -- An integer value representing the current state. This +value should only be read from. To move to a different place in the machine +from action code use the \verb|fgoto|, \verb|fnext| or \verb|fcall| statements. +Outside of the machine execution code the \verb|cs| variable may be modified. + +\item \verb|ftargs| -- An integer value representing the target state. This +value should only be read from. Again, \verb|fgoto|, \verb|fnext| and +\verb|fcall| can be used to move to a specific entry point. + +\item \verb|fentry(<label>)| -- Retrieve an integer value representing the +entry point \verb|label|. The integer value returned will be a compile time +constant. This number is suitable for later use in control flow transfer +statements that take an expression. This value should not be compared against +the current state because any given label can have multiple states representing +it. The value returned by \verb|fentry| can be any one of the multiple states that +it represents. +\end{itemize} + +The following statements are available in code blocks: + +\begin{itemize} + +\item \verb|fhold;| -- Do not advance over the current character. If processing +data in multiple buffer blocks, the \verb|fhold| statement should only be used +once in the set of actions executed on a character. Multiple calls may result +in backing up over the beginning of the buffer block. The \verb|fhold| +statement does not imply any transfer of control. It is equivalent to the +\verb|p--;| statement. + +\item \verb|fexec <expr>;| -- Set the next character to process. This can be +used to backtrack to previous input or advance ahead. +Unlike \verb|fhold|, which can be used +anywhere, \verb|fexec| requires the user to ensure that the target of the +backtrack is in the current buffer block or is known to be somewhere ahead of +it. The machine will continue iterating forward until \verb|pe| is arrived at, +\verb|fbreak| is called or the machine moves into the error state. In actions +embedded into transitions, the \verb|fexec| statement is equivalent to setting +\verb|p| to one position ahead of the next character to process. If the user +also modifies \verb|pe|, it is possible to change the buffer block entirely. + +\item \verb|fgoto <label>;| -- Jump to an entry point defined by +\verb|<label>|. The \verb|fgoto| statement immediately transfers control to +the destination state. + +\item \verb|fgoto *<expr>;| -- Jump to an entry point given by \verb|<expr>|. +The expression must evaluate to an integer value representing a state. + +\item \verb|fnext <label>;| -- Set the next state to be the entry point defined +by \verb|label|. The \verb|fnext| statement does not immediately jump to the +specified state. Any action code following the statement is executed. + +\item \verb|fnext *<expr>;| -- Set the next state to be the entry point given +by \verb|<expr>|. The expression must evaluate to an integer value representing +a state. + +\item \verb|fcall <label>;| -- Push the target state and jump to the entry +point defined by \verb|<label>|. The next \verb|fret| will jump to the target +of the transition on which the call was made. Use of \verb|fcall| requires +the declaration of a call stack. An array of integers named \verb|stack| and a +single integer named \verb|top| must be declared. With the \verb|fcall| +construct, control is immediately transferred to the destination state. +See section \ref{modularization} for more information. + +\item \verb|fcall *<expr>;| -- Push the current state and jump to the entry +point given by \verb|<expr>|. The expression must evaluate to an integer value +representing a state. + +\item \verb|fret;| -- Return to the target state of the transition on which the +last \verb|fcall| was made. Use of \verb|fret| requires the declaration of a +call stack. Control is immediately transferred to the destination state. + +\item \verb|fbreak;| -- Advance \verb|p|, save the target state to \verb|cs| +and immediately break out of the execute loop. This statement is useful +in conjunction with the \verb|noend| write option. Rather than process input +until \verb|pe| is arrived at, the fbreak statement +can be used to stop processing from an action. After an \verb|fbreak| +statement the \verb|p| variable will point to the next character in the input. The +current state will be the target of the current transition. Note that \verb|fbreak| +causes the target state's to-state actions to be skipped. + +\end{itemize} + +Once actions with control-flow commands are embedded into a +machine, the user must exercise caution when using the machine as the operand +to other machine construction operators. If an action jumps to another state +then unioning any transition that executes that action with another transition +that follows some other path will cause that other path to be lost. Using +commands that manually jump around a machine takes us out of the domain of +regular languages because transitions that the +machine construction operators are not aware of are introduced. These +commands should therefore be used with caution. + + +\chapter{Controlling Nondeterminism} +\label{controlling-nondeterminism} + +Along with the flexibility of arbitrary action embeddings comes a need to +control nondeterminism in regular expressions. If a regular expression is +ambiguous, then sub-components of a parser other than the intended parts may become +active. This means that actions that are irrelevant to the +current subset of the parser may be executed, causing problems for the +programmer. + +Tools that are based on regular expression engines and that are used for +recognition tasks will usually function as intended regardless of the presence +of ambiguities. It is quite common for users of scripting languages to write +regular expressions that are heavily ambiguous and it generally does not +matter. As long as one of the potential matches is recognized, there can be any +number of other matches present. In some parsing systems the run-time engine +can employ a strategy for resolving ambiguities, for example always pursuing +the longest possible match and discarding others. + +In Ragel, there is no regular expression run-time engine, just a simple state +machine execution model. When we begin to embed actions and face the +possibility of spurious action execution, it becomes clear that controlling +nondeterminism at the machine construction level is very important. Consider +the following example. + +% GENERATE: lines1 +% OPT: -p +% %%{ +% machine lines1; +% action first {} +% action tail {} +% word = [a-z]+; +\begin{inline_code} +\begin{verbatim} +ws = [\n\t ]; +line = word $first ( ws word $tail )* '\n'; +lines = line*; +\end{verbatim} +\end{inline_code} +\verbspace +% main := lines; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.53]{lines1} +\end{center} +\graphspace + +Since the \verb|ws| expression includes the newline character, we will +not finish the \verb|line| expression when a newline character is seen. We will +simultaneously pursue the possibility of matching further words on the same +line and the possibility of matching a second line. Evidence of this fact is +in the state tables. On several transitions both the \verb|first| and +\verb|tail| actions are executed. The solution here is simple: exclude +the newline character from the \verb|ws| expression. + +% GENERATE: lines2 +% OPT: -p +% %%{ +% machine lines2; +% action first {} +% action tail {} +% word = [a-z]+; +\begin{inline_code} +\begin{verbatim} +ws = [\t ]; +line = word $first ( ws word $tail )* '\n'; +lines = line*; +\end{verbatim} +\end{inline_code} +\verbspace +% main := lines; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{lines2} +\end{center} +\graphspace + +Solving this kind of problem is straightforward when the ambiguity is created +by strings that are a single character long. When the ambiguity is created by +strings that are multiple characters long we have a more difficult problem. +The following example is an incorrect attempt at a regular expression for C +language comments. + +% GENERATE: comments1 +% OPT: -p +% %%{ +% machine comments1; +% action comm {} +\begin{inline_code} +\begin{verbatim} +comment = '/*' ( any @comm )* '*/'; +main := comment ' '; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{comments1} +\end{center} +\graphspace + +Using standard concatenation, we will never leave the \verb|any*| expression. +We will forever entertain the possibility that a \verb|'*/'| string that we see +is contained in a longer comment and that, simultaneously, the comment has +ended. The concatenation of the \verb|comment| machine with \verb|SP| is done +to show this. When we match space, we are also still matching the comment body. + +One way to approach the problem is to exclude the terminating string +from the \verb|any*| expression using set difference. We must be careful to +exclude not just the terminating string, but any string that contains it as a +substring. A verbose, but proper specification of a C comment parser is given +by the following regular expression. + +% GENERATE: comments2 +% OPT: -p +% %%{ +% machine comments2; +% action comm {} +\begin{inline_code} +\begin{verbatim} +comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/'; +\end{verbatim} +\end{inline_code} +\verbspace +% main := comment; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{comments2} +\end{center} +\graphspace + +Note that Ragel's strong subtraction operator \verb|--| can also be used here. +In doing this subtraction we have phrased the problem of controlling non-determinism in +terms of excluding strings common to two expressions that interact when +combined. +We can also phrase the problem in terms of the transitions of the state +machines that implement these expressions. During the concatenation of +\verb|any*| and \verb|'*/'| we will be making transitions that are composed of +both the loop of the first expression and the final character of the second. +At this time we want the transition on the \verb|'/'| character to take precedence +over and disallow the transition that originated in the \verb|any*| loop. + +In another parsing problem, we wish to implement a lightweight tokenizer that we can +utilize in the composition of a larger machine. For example, some HTTP headers +have a token stream as a sub-language. The following example is an attempt +at a regular expression-based tokenizer that does not function correctly due to +unintended nondeterminism. + +% GENERATE: smallscanner +% OPT: -p +% %%{ +% machine smallscanner; +% action start_str {} +% action on_char {} +% action finish_str {} +\begin{inline_code} +\begin{verbatim} +header_contents = ( + lower+ >start_str $on_char %finish_str | + ' ' +)*; +\end{verbatim} +\end{inline_code} +\verbspace +% main := header_contents; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{smallscanner} +\end{center} +\graphspace + +In this case, the problem with using a standard kleene star operation is that +there is an ambiguity between extending a token and wrapping around the machine +to begin a new token. Using the standard operator, we get an undesirable +nondeterministic behaviour. Evidence of this can be seen on the transition out +of state one to itself. The transition extends the string, and simultaneously, +finishes the string only to immediately begin a new one. What is required is +for the +transitions that represent an extension of a token to take precedence over the +transitions that represent the beginning of a new token. For this problem +there is no simple solution that uses standard regular expression operators. + +\section{Priorities} + +A priority mechanism was devised and built into the determinization +process, specifically for the purpose of allowing the user to control +nondeterminism. Priorities are integer values embedded into transitions. When +the determinization process is combining transitions that have different +priorities, the transition with the higher priority is preserved and the +transition with the lower priority is dropped. + +Unfortunately, priorities can have unintended side effects because their +operation requires that they linger in transitions indefinitely. They must linger +because the Ragel program cannot know when the user is finished with a priority +embedding. A solution whereby they are explicitly deleted after use is +conceivable; however this is not very user-friendly. Priorities were therefore +made into named entities. Only priorities with the same name are allowed to +interact. This allows any number of priorities to coexist in one machine for +the purpose of controlling various different regular expression operations and +eliminates the need to ever delete them. Such a scheme allows the user to +choose a unique name, embed two different priority values using that name +and be confident that the priority embedding will be free of any side effects. + +In the first form of priority embedding the name defaults to the name of the machine +definition that the priority is assigned in. In this sense priorities are by +default local to the current machine definition or instantiation. Beware of +using this form in a longest-match machine, since there is only one name for +the entire set of longest match patterns. In the second form the priority's +name can be specified, allowing priority interaction across machine definition +boundaries. + +\begin{itemize} +\item \verb|expr > int| -- Sets starting transitions to have priority int. +\item \verb|expr @ int| -- Sets transitions that go into a final state to have priority int. +\item \verb|expr $ int| -- Sets all transitions to have priority int. +\item \verb|expr % int| -- Sets leaving transitions to +have priority int. When a transition is made going out of the machine (either +by concatenation or kleene star) its priority is immediately set to the +leaving priority. +\end{itemize} + +The second form of priority assignment allows the programmer to specify the name +to which the priority is assigned. + +\begin{itemize} +\item \verb|expr > (name, int)| -- Starting transitions. +\item \verb|expr @ (name, int)| -- Finishing transitions (into a final state). +\item \verb|expr $ (name, int)| -- All transitions. +\item \verb|expr % (name, int)| -- Leaving transitions. +\end{itemize} + +\section{Guarded Operators that Encapsulate Priorities} + +Priority embeddings are a very expressive mechanism. At the same time they +can be very confusing for the user. They force the user to imagine +the transitions inside two interacting expressions and work out the precise +effects of the operations between them. When we consider +that this problem is worsened by the +potential for side effects caused by unintended priority name collisions, we +see that exposing the user to priorities is undesirable. + +Fortunately, in practice the use of priorities has been necessary only in a +small number of scenarios. This allows us to encapsulate their functionality +into a small set of operators and fully hide them from the user. This is +advantageous from a language design point of view because it greatly simplifies +the design. + +Going back to the C comment example, we can now properly specify +it using a guarded concatenation operator which we call {\em finish-guarded +concatenation}. From the user's point of view, this operator terminates the +first machine when the second machine moves into a final state. It chooses a +unique name and uses it to embed a low priority into all +transitions of the first machine. A higher priority is then embedded into the +transitions of the second machine that enter into a final state. The following +example yields a machine identical to the example in Section +\ref{controlling-nondeterminism}. + +\begin{inline_code} +\begin{verbatim} +comment = '/*' ( any @comm )* :>> '*/'; +\end{verbatim} +\end{inline_code} +\verbspace + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{comments2} +\end{center} +\graphspace + +Another guarded operator is {\em left-guarded concatenation}, given by the +\verb|<:| compound symbol. This operator places a higher priority on all +transitions of the first machine. This is useful if one must forcibly separate +two lists that contain common elements. For example, one may need to tokenize a +stream, but first consume leading whitespace. + +Ragel also includes a {\em longest-match kleene star} operator, given by the +\verb|**| compound symbol. This +guarded operator embeds a high +priority into all transitions of the machine. +A lower priority is then embedded into the leaving transitions. When the +kleene star operator makes the epsilon transitions from +the final states into the new start state, the lower priority will be transferred +to the epsilon transitions. In cases where following an epsilon transition +out of a final state conflicts with an existing transition out of a final +state, the epsilon transition will be dropped. + +Other guarded operators are conceivable, such as guards on union that cause one +alternative to take precedence over another. These may be implemented when it +is clear they constitute a frequently used operation. +In the next section we discuss the explicit specification of state machines +using state charts. + +\subsection{Entry-Guarded Concatenation} + +\verb|expr :> expr| + +This operator concatenates two machines, but first assigns a low +priority to all transitions +of the first machine and a high priority to the starting transitions of the +second machine. This operator is useful if from the final states of the first +machine it is possible to accept the characters in the entering transitions of +the second machine. This operator effectively terminates the first machine +immediately upon starting the second machine, where otherwise they would be +pursued concurrently. In the following example, entry-guarded concatenation is +used to move out of a machine that matches everything at the first sign of an +end-of-input marker. + +% GENERATE: entryguard +% OPT: -p +% %%{ +% machine entryguard; +\begin{inline_code} +\begin{verbatim} +# Leave the catch-all machine on the first character of FIN. +main := any* :> 'FIN'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{entryguard} +\end{center} +\graphspace + +Entry-guarded concatenation is equivalent to the following: + +\begin{verbatim} +expr $(unique_name,0) . expr >(unique_name,1) +\end{verbatim} +\verbspace + +\subsection{Finish-Guarded Concatenation} + +\verb|expr :>> expr| + +This operator is +like the previous operator, except the higher priority is placed on the final +transitions of the second machine. This is useful if one wishes to entertain +the possibility of continuing to match the first machine right up until the +second machine enters a final state. In other words it terminates the first +machine only when the second accepts. In the following example, finish-guarded +concatenation causes the move out of the machine that matches everything to be +delayed until the full end-of-input marker has been matched. + +% GENERATE: finguard +% OPT: -p +% %%{ +% machine finguard; +\begin{inline_code} +\begin{verbatim} +# Leave the catch-all machine on the last character of FIN. +main := any* :>> 'FIN'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{finguard} +\end{center} +\graphspace + +Finish-guarded concatenation is equivalent to the following, with one +exception. If the right machine's start state is final, the higher priority is +also embedded into it as a leaving priority. This prevents the left machine +from persisting via the zero-length string. + +\begin{verbatim} +expr $(unique_name,0) . expr @(unique_name,1) +\end{verbatim} +\verbspace + +\subsection{Left-Guarded Concatenation} + +\verb|expr <: expr| + +This operator places +a higher priority on the left expression. It is useful if you want to prefix a +sequence with another sequence composed of some of the same characters. For +example, one can consume leading whitespace before tokenizing a sequence of +whitespace-separated words as in: + +% GENERATE: leftguard +% OPT: -p +% %%{ +% machine leftguard; +% action alpha {} +% action ws {} +% action start {} +% action fin {} +\begin{inline_code} +\begin{verbatim} +main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{leftguard} +\end{center} +\graphspace + +Left-guarded concatenation is equivalent to the following: + +\begin{verbatim} +expr $(unique_name,1) . expr >(unique_name,0) +\end{verbatim} +\verbspace + +\subsection{Longest-Match Kleene Star} +\label{longest_match_kleene_star} + +\verb|expr**| + +This version of kleene star puts a higher priority on staying in the +machine versus wrapping around and starting over. The LM kleene star is useful +when writing simple tokenizers. These machines are built by applying the +longest-match kleene star to an alternation of token patterns, as in the +following. + +% GENERATE: lmkleene +% OPT: -p +% %%{ +% machine exfinpri; +% action A {} +% action B {} +\begin{inline_code} +\begin{verbatim} +# Repeat tokens, but make sure to get the longest match. +main := ( + lower ( lower | digit )* %A | + digit+ %B | + ' ' +)**; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{lmkleene} +\end{center} +\graphspace + +If a regular kleene star were used the machine above would not be able to +distinguish between extending a word and beginning a new one. This operator is +equivalent to: + +\begin{verbatim} +( expr $(unique_name,1) %(unique_name,0) )* +\end{verbatim} +\verbspace + +When the kleene star is applied, transitions that go out of the machine and +back into it are made. These are assigned a priority of zero by the leaving +transition mechanism. This is less than the priority of one assigned to the +transitions leaving the final states but not leaving the machine. When +these transitions clash on the same character, the +transition that stays in the machine takes precedence. The transition +that wraps around is dropped. + +Note that this operator does not build a scanner in the traditional sense +because there is never any backtracking. To build a scanner with backtracking +use the Longest-Match machine construction described in Section +\ref{generating-scanners}. + +\chapter{Interface to Host Program} + +The Ragel code generator is very flexible. The generated code has no +dependencies and can be inserted in any function, perhaps inside a loop if +desired. The user is responsible for declaring and initializing a number of +required variables, including the current state and the pointer to the input +stream. These can live in any scope. Control of the input processing loop is +also possible: the user may break out of the processing loop and return to it +at any time. + +In the case of the C, D, and Go host languages, Ragel is able to generate very +fast-running code that implements state machines as directly executable code. +Since very large files strain the host language compiler, table-based code +generation is also supported. In the future we hope to provide a partitioned, +directly executable format that is able to reduce the burden on the host +compiler by splitting large machines across multiple functions. + +In the case of Java and Ruby, table-based code generation is the only code +style supported. In the future this may be expanded to include other code +styles. + +Ragel can be used to parse input in one block, or it can be used to parse input +in a sequence of blocks as it arrives from a file or socket. Parsing the input +in a sequence of blocks brings with it a few responsibilities. If the parser +utilizes a scanner, care must be taken to not break the input stream anywhere +but token boundaries. If pointers to the input stream are taken during +parsing, care must be taken to not use a pointer that has been invalidated by +movement to a subsequent block. If the current input data pointer is moved +backwards it must not be moved past the beginning of the current block. + +Figure \ref{basic-example} shows a simple Ragel program that does not have any +actions. The example tests the first argument of the program against a number +pattern and then prints the machine's acceptance status. + +\begin{figure} +\small +\begin{verbatim} +#include <stdio.h> +#include <string.h> +%%{ + machine foo; + write data; +}%% +int main( int argc, char **argv ) +{ + int cs; + if ( argc > 1 ) { + char *p = argv[1]; + char *pe = p + strlen( p ); + %%{ + main := [0-9]+ ( '.' [0-9]+ )?; + + write init; + write exec; + }%% + } + printf("result = %i\n", cs >= foo_first_final ); + return 0; +} +\end{verbatim} +\verbspace +\caption{A basic Ragel example without any actions. +} +\label{basic-example} +\end{figure} + +\section{Variables Used by Ragel} + +There are a number of variables that Ragel expects the user to declare. At a +very minimum the \verb|cs|, \verb|p| and \verb|pe| variables must be declared. +In Go, Java and Ruby code the \verb|data| variable must also be declared. If +EOF actions are used then the \verb|eof| variable is required. If +stack-based state machine control flow statements are used then the +\verb|stack| and \verb|top| variables are required. If a scanner is declared +then the \verb|act|, \verb|ts| and \verb|te| variables must be +declared. + +\begin{itemize} + +\item \verb|cs| - Current state. This must be an integer and it should persist +across invocations of the machine when the data is broken into blocks that are +processed independently. This variable may be modified from outside the +execution loop, but not from within. + +\item \verb|p| - Data pointer. In C/D code this variable is expected to be a +pointer to the character data to process. It should be initialized to the +beginning of the data block on every run of the machine. In Go, Java and Ruby it is +used as an offset to \verb|data| and must be an integer. In this case it should +be initialized to zero on every run of the machine. + +\item \verb|pe| - Data end pointer. This should be initialized to \verb|p| plus +the data length on every run of the machine. In Go, Java and Ruby code this should +be initialized to the data length. + +\item \verb|eof| - End of file pointer. This should be set to \verb|pe| when +the buffer block being processed is the last one, otherwise it should be set to +null. In Go, Java and Ruby code \verb|-1| must be used instead of null. If the EOF +event can be known only after the final buffer block has been processed, then +it is possible to set \verb|p = pe = eof| and run the execute block. + +\item \verb|data| - This variable is only required in Go, Java and Ruby code. It +must be an array containting the data to process. + +\item \verb|stack| - This must be an array of integers. It is used to store +integer values representing states. If the stack must resize dynamically the +Pre-push and Post-Pop statements can be used to do this (Sections +\ref{prepush} and \ref{postpop}). + +\item \verb|top| - This must be an integer value and will be used as an offset +to \verb|stack|, giving the next available spot on the top of the stack. + +\item \verb|act| - This must be an integer value. It is a variable sometimes +used by scanner code to keep track of the most recent successful pattern match. + +\item \verb|ts| - This must be a pointer to character data. In Go, Java and +Ruby code this must be an integer. See Section \ref{generating-scanners} for +more information. + +\item \verb|te| - Also a pointer to character data. + +\end{itemize} + +\section{Alphtype Statement} + +\begin{verbatim} +alphtype unsigned int; +\end{verbatim} +\verbspace + +The alphtype statement specifies the alphabet data type that the machine +operates on. During the compilation of the machine, integer literals are +expected to be in the range of possible values of the alphtype. The default +is \verb|char| for all languages except Go where the default is \verb|byte|. + +\begin{multicols}{2} +C/C++/Objective-C: +\begin{verbatim} + char unsigned char + short unsigned short + int unsigned int + long unsigned long +\end{verbatim} +\verbspace + +Go: +\begin{verbatim} + byte + int8 uint8 + int16 uint16 + int32 uint32 + int +\end{verbatim} +\verbspace + +Ruby: +\begin{verbatim} + char + int +\end{verbatim} +\verbspace + +\columnbreak + +Java: +\begin{verbatim} + char + byte + short + int +\end{verbatim} +\verbspace + +D: +\begin{verbatim} + char + byte ubyte + short ushort + wchar + int uint + dchar +\end{verbatim} +\verbspace + +\end{multicols} + +\section{Getkey Statement} + +\begin{verbatim} +getkey fpc->id; +\end{verbatim} +\verbspace + +This statement specifies to Ragel how to retrieve the current character from +from the pointer to the current element (\verb|p|). Any expression that returns +a value of the alphabet type +may be used. The getkey statement may be used for looking into element +structures or for translating the character to process. The getkey expression +defaults to \verb|(*p)|. In goto-driven machines the getkey expression may be +evaluated more than once per element processed, therefore it should not incur a +large cost nor preclude optimization. + +\section{Access Statement} + +\begin{verbatim} +access fsm->; +\end{verbatim} +\verbspace + +The access statement specifies how the generated code should +access the machine data that is persistent across processing buffer blocks. +This applies to all variables except \verb|p|, \verb|pe| and \verb|eof|. This includes +\verb|cs|, \verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|. +The access statement is useful if a machine is to be encapsulated inside a +structure in C code. It can be used to give the name of +a pointer to the structure. + +\section{Variable Statement} + +\begin{verbatim} +variable p fsm->p; +\end{verbatim} +\verbspace + +The variable statement specifies how to access a specific +variable. All of the variables that are declared by the user and +used by Ragel can be changed. This includes \verb|p|, \verb|pe|, \verb|eof|, \verb|cs|, +\verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|. +In Go, Ruby and Java code generation the \verb|data| variable can also be changed. + +\section{Pre-Push Statement} +\label{prepush} + +\begin{verbatim} +prepush { + /* stack growing code */ +} +\end{verbatim} +\verbspace + +The prepush statement allows the user to supply stack management code that is +written out during the generation of fcall, immediately before the current +state is pushed to the stack. This statement can be used to test the number of +available spaces and dynamically grow the stack if necessary. + +\section{Post-Pop Statement} +\label{postpop} + +\begin{verbatim} +postpop { + /* stack shrinking code */ +} +\end{verbatim} +\verbspace + +The postpop statement allows the user to supply stack management code that is +written out during the generation of fret, immediately after the next state is +popped from the stack. This statement can be used to dynamically shrink the +stack. + +\section{Write Statement} +\label{write-statement} + +\begin{verbatim} +write <component> [options]; +\end{verbatim} +\verbspace + +The write statement is used to generate parts of the machine. +There are seven +components that can be generated by a write statement. These components make up the +state machine's data, initialization code, execution code, and export definitions. +A write statement may appear before a machine is fully defined. +This allows one to write out the data first then later define the machine where +it is used. An example of this is shown in Figure \ref{fbreak-example}. + +\subsection{Write Data} +\begin{verbatim} +write data [options]; +\end{verbatim} +\verbspace + +The write data statement causes Ragel to emit the constant static data needed +by the machine. In table-driven output styles (see Section \ref{genout}) this +is a collection of arrays that represent the states and transitions of the +machine. In goto-driven machines much less data is emitted. At the very +minimum a start state \verb|name_start| is generated. All variables written +out in machine data have both the \verb|static| and \verb|const| properties and +are prefixed with the name of the machine and an +underscore. The data can be placed inside a class, inside a function, or it can +be defined as global data. + +Two variables are written that may be used to test the state of the machine +after a buffer block has been processed. The \verb|name_error| variable gives +the id of the state that the machine moves into when it cannot find a valid +transition to take. The machine immediately breaks out of the processing loop when +it finds itself in the error state. The error variable can be compared to the +current state to determine if the machine has failed to parse the input. If the +machine is complete, that is from every state there is a transition to a proper +state on every possible character of the alphabet, then no error state is required +and this variable will be set to -1. + +The \verb|name_first_final| variable stores the id of the first final state. +All of the machine's states are sorted by their final state status before +having their ids assigned. Checking if the machine has accepted its input can +then be done by checking if the current state is greater-than or equal to the +first final state. + +Data generation has several options: + +\noindent\hspace*{24pt}\verb|noerror | - Do not generate the integer variable that gives the id of the error state.\\ +\noindent\hspace*{24pt}\verb|nofinal | - Do not generate the integer variable that gives the id of the first final state.\\ +\noindent\hspace*{24pt}\verb|noprefix | - Do not prefix the variable names with the name of the machine. +\vspace{12pt} + +\begin{figure} +\small +\begin{verbatim} +#include <stdio.h> +%% machine foo; +%% write data; +int main( int argc, char **argv ) +{ + int cs, res = 0; + if ( argc > 1 ) { + char *p = argv[1]; + %%{ + main := + [a-z]+ + 0 @{ res = 1; fbreak; }; + write init; + write exec noend; + }%% + } + printf("execute = %i\n", res ); + return 0; +} +\end{verbatim} +\verbspace +\caption{Use of {\tt noend} write option and the {\tt fbreak} statement for +processing a string. +} +\label{fbreak-example} +\end{figure} + +\subsection{Write Start, First Final and Error} + +\begin{verbatim} +write start; +write first_final; +write error; +\end{verbatim} +\verbspace + +These three write statements provide an alternative means of accessing the +\verb|start|, \verb|first_final| and \verb|error| states. If there are many +different machine specifications in one file it is easy to get the prefix for +these wrong. This is especially true if the state machine boilerplate is +frequently made by a copy-paste-edit process. These write statements allow the +problem to be avoided. They can be used as follows: + +\begin{verbatim} +/* Did parsing succeed? */ +if ( cs < %%{ write first_final; }%% ) { + result = ERR_PARSE_ERROR; + goto fail; +} +\end{verbatim} +\verbspace + +\subsection{Write Init} +\begin{verbatim} +write init [options]; +\end{verbatim} +\verbspace + +The write init statement causes Ragel to emit initialization code. This should +be executed once before the machine is started. At a very minimum this sets the +current state to the start state. If other variables are needed by the +generated code, such as call stack variables or scanner management +variables, they are also initialized here. + +The \verb|nocs| option to the write init statement will cause ragel to skip +intialization of the cs variable. This is useful if the user wishes to use +custom logic to decide which state the specification should start in. + +\subsection{Write Exec} +\begin{verbatim} +write exec [options]; +\end{verbatim} +\verbspace + +The write exec statement causes Ragel to emit the state machine's execution code. +Ragel expects several variables to be available to this code. At a very minimum, the +generated code needs access to the current character position \verb|p|, the ending +position \verb|pe| and the current state \verb|cs| (though \verb|pe| +can be omitted using the \verb|noend| write option). +The \verb|p| variable is the cursor that the execute code will +used to traverse the input. The \verb|pe| variable should be set up to point to one +position past the last valid character in the buffer. + +Other variables are needed when certain features are used. For example using +the \verb|fcall| or \verb|fret| statements requires \verb|stack| and +\verb|top| variables to be defined. If a longest-match construction is used, +variables for managing backtracking are required. + +The write exec statement has one option. The \verb|noend| option tells Ragel +to generate code that ignores the end position \verb|pe|. In this +case the user must explicitly break out of the processing loop using +\verb|fbreak|, otherwise the machine will continue to process characters until +it moves into the error state. This option is useful if one wishes to process a +null terminated string. Rather than traverse the string to discover then length +before processing the input, the user can break out when the null character is +seen. The example in Figure \ref{fbreak-example} shows the use of the +\verb|noend| write option and the \verb|fbreak| statement for processing a string. + +\subsection{Write Exports} +\label{export} + +\begin{verbatim} +write exports; +\end{verbatim} +\verbspace + +The export feature can be used to export simple machine definitions. Machine definitions +are marked for export using the \verb|export| keyword. + +\begin{verbatim} +export machine_to_export = 0x44; +\end{verbatim} +\verbspace + +When the write exports statement is used these machines are +written out in the generated code. Defines are used for C and constant integers +are used for D, Java and Ruby. See Section \ref{import} for a description of the +import statement. + +\section{Maintaining Pointers to Input Data} + +In the creation of any parser it is not uncommon to require the collection of +the data being parsed. It is always possible to collect data into a growable +buffer as the machine moves over it, however the copying of data is a somewhat +wasteful use of processor cycles. The most efficient way to collect data from +the parser is to set pointers into the input then later reference them. This +poses a problem for uses of Ragel where the input data arrives in blocks, such +as over a socket or from a file. If a pointer is set in one buffer block but +must be used while parsing a following buffer block, some extra consideration +to correctness must be made. + +The scanner constructions exhibit this problem, requiring the maintenance +code described in Section \ref{generating-scanners}. If a longest-match +construction has been used somewhere in the machine then it is possible to +take advantage of the required prefix maintenance code in the driver program to +ensure pointers to the input are always valid. If laying down a pointer one can +set \verb|ts| at the same spot or ahead of it. When data is shifted in +between loops the user must also shift the pointer. In this way it is possible +to maintain pointers to the input that will always be consistent. + +\begin{figure} +\small +\begin{verbatim} + int have = 0; + while ( 1 ) { + char *p, *pe, *data = buf + have; + int len, space = BUFSIZE - have; + + if ( space == 0 ) { + fprintf(stderr, "BUFFER OUT OF SPACE\n"); + exit(1); + } + + len = fread( data, 1, space, stdin ); + if ( len == 0 ) + break; + + /* Find the last newline by searching backwards. */ + p = buf; + pe = data + len - 1; + while ( *pe != '\n' && pe >= buf ) + pe--; + pe += 1; + + %% write exec; + + /* How much is still in the buffer? */ + have = data + len - pe; + if ( have > 0 ) + memmove( buf, pe, have ); + + if ( len < space ) + break; + } +\end{verbatim} +\verbspace +\caption{An example of line-oriented processing. +} +\label{line-oriented} +\end{figure} + +In general, there are two approaches for guaranteeing the consistency of +pointers to input data. The first approach is the one just described; +lay down a marker from an action, +then later ensure that the data the marker points to is preserved ahead of +the buffer on the next execute invocation. This approach is good because it +allows the parser to decide on the pointer-use boundaries, which can be +arbitrarily complex parsing conditions. A downside is that it requires any +pointers that are set to be corrected in between execute invocations. + +The alternative is to find the pointer-use boundaries before invoking the execute +routine, then pass in the data using these boundaries. For example, if the +program must perform line-oriented processing, the user can scan backwards from +the end of an input block that has just been read in and process only up to the +first found newline. On the next input read, the new data is placed after the +partially read line and processing continues from the beginning of the line. +An example of line-oriented processing is given in Figure \ref{line-oriented}. + +\section{Specifying the Host Language} + +The \verb|ragel| program has a number of options for specifying the host +language. The host-language options are: + +\begin{itemize} +\item \verb|-C | for C/C++/Objective-C code (default) +\item \verb|-D | for D code. +\item \verb|-Z | for Go code. +\item \verb|-J | for Java code. +\item \verb|-R | for Ruby code. +\item \verb|-A | for C\# code. +\end{itemize} + +\section{Choosing a Generated Code Style} +\label{genout} + +There are three styles of code output to choose from. Code style affects the +size and speed of the compiled binary. Changing code style does not require any +change to the Ragel program. There are two table-driven formats and a goto +driven format. + +In addition to choosing a style to emit, there are various levels of action +code reuse to choose from. The maximum reuse levels (\verb|-T0|, \verb|-F0| +and \verb|-G0|) ensure that no FSM action code is ever duplicated by encoding +each transition's action list as static data and iterating +through the lists on every transition. This will normally result in a smaller +binary. The less action reuse options (\verb|-T1|, \verb|-F1| and \verb|-G1|) +will usually produce faster running code by expanding each transition's action +list into a single block of code, eliminating the need to iterate through the +lists. This duplicates action code instead of generating the logic necessary +for reuse. Consequently the binary will be larger. However, this tradeoff applies to +machines with moderate to dense action lists only. If a machine's transitions +frequently have less than two actions then the less reuse options will actually +produce both a smaller and a faster running binary due to less action sharing +overhead. The best way to choose the appropriate code style for your +application is to perform your own tests. + +The table-driven FSM represents the state machine as constant static data. There are +tables of states, transitions, indices and actions. The current state is +stored in a variable. The execution is simply a loop that looks up the current +state, looks up the transition to take, executes any actions and moves to the +target state. In general, the table-driven FSM can handle any machine, produces +a smaller binary and requires a less expensive host language compile, but +results in slower running code. Since the table-driven format is the most +flexible it is the default code style. + +The flat table-driven machine is a table-based machine that is optimized for +small alphabets. Where the regular table machine uses the current character as +the key in a binary search for the transition to take, the flat table machine +uses the current character as an index into an array of transitions. This is +faster in general, however is only suitable if the span of possible characters +is small. + +The goto-driven FSM represents the state machine using goto and switch +statements. The execution is a flat code block where the transition to take is +computed using switch statements and directly executable binary searches. In +general, the goto FSM produces faster code but results in a larger binary and a +more expensive host language compile. + +The goto-driven format has an additional action reuse level (\verb|-G2|) that +writes actions directly into the state transitioning logic rather than putting +all the actions together into a single switch. Generally this produces faster +running code because it allows the machine to encode the current state using +the processor's instruction pointer. Again, sparse machines may actually +compile to smaller binaries when \verb|-G2| is used due to less state and +action management overhead. For many parsing applications \verb|-G2| is the +preferred output format. + +\begin{center} + +Code Output Style Options + +\begin{tabular}{|c|c|c|} +\hline +\verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#\\ +\hline +\verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#\\ +\hline +\verb|-F0|&flat table-driven&C/D/Ruby/C\#\\ +\hline +\verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#\\ +\hline +\verb|-G0|&goto-driven&C/D/C\#\\ +\hline +\verb|-G1|&goto, expanded actions&C/D/C\#\\ +\hline +\verb|-G2|&goto, in-place actions&C/D/Go\\ +\hline +\end{tabular} +\end{center} + +\chapter{Beyond the Basic Model} + +\section{Parser Modularization} +\label{modularization} + +It is possible to use Ragel's machine construction and action embedding +operators to specify an entire parser using a single regular expression. In +many cases this is the desired way to specify a parser in Ragel. However, in +some scenarios the language to parse may be so large that it is difficult to +think about it as a single regular expression. It may also shift between distinct +parsing strategies, in which case modularization into several coherent blocks +of the language may be appropriate. + +It may also be the case that patterns that compile to a large number of states +must be used in a number of different contexts and referencing them in each +context results in a very large state machine. In this case, an ability to reuse +parsers would reduce code size. + +To address this, distinct regular expressions may be instantiated and linked +together by means of a jumping and calling mechanism. This mechanism is +analogous to the jumping to and calling of processor instructions. A jump +command, given in action code, causes control to be immediately passed to +another portion of the machine by way of setting the current state variable. A +call command causes the target state of the current transition to be pushed to +a state stack before control is transferred. Later on, the original location +may be returned to with a return statement. In the following example, distinct +state machines are used to handle the parsing of two types of headers. + +% GENERATE: call +% %%{ +% machine call; +\begin{inline_code} +\begin{verbatim} +action return { fret; } +action call_date { fcall date; } +action call_name { fcall name; } + +# A parser for date strings. +date := [0-9][0-9] '/' + [0-9][0-9] '/' + [0-9][0-9][0-9][0-9] '\n' @return; + +# A parser for name strings. +name := ( [a-zA-Z]+ | ' ' )** '\n' @return; + +# The main parser. +headers = + ( 'from' | 'to' ) ':' @call_name | + ( 'departed' | 'arrived' ) ':' @call_date; + +main := headers*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% %% write data; +% void f() +% { +% %% write init; +% %% write exec; +% } +% END GENERATE + +Calling and jumping should be used carefully as they are operations that take +one out of the domain of regular languages. A machine that contains a call or +jump statement in one of its actions should be used as an argument to a machine +construction operator only with considerable care. Since DFA transitions may +actually represent several NFA transitions, a call or jump embedded in one +machine can inadvertently terminate another machine that it shares prefixes +with. Despite this danger, theses statements have proven useful for tying +together sub-parsers of a language into a parser for the full language, +especially for the purpose of modularizing code and reducing the number of +states when the machine contains frequently recurring patterns. + +Section \ref{vals} describes the jump and call statements that are used to +transfer control. These statements make use of two variables that must be +declared by the user, \verb|stack| and \verb|top|. The \verb|stack| variable +must be an array of integers and \verb|top| must be a single integer, which +will point to the next available space in \verb|stack|. Sections \ref{prepush} +and \ref{postpop} describe the Pre-Push and Post-Pop statements which can be +used to implement a dynamically resizable array. + +\section{Referencing Names} +\label{labels} + +This section describes how to reference names in epsilon transitions (Section +\ref{state-charts}) and +action-based control-flow statements such as \verb|fgoto|. There is a hierarchy +of names implied in a Ragel specification. At the top level are the machine +instantiations. Beneath the instantiations are labels and references to machine +definitions. Beneath those are more labels and references to definitions, and +so on. + +Any name reference may contain multiple components separated with the \verb|::| +compound symbol. The search for the first component of a name reference is +rooted at the join expression that the epsilon transition or action embedding +is contained in. If the name reference is not contained in a join, +the search is rooted at the machine definition that the epsilon transition or +action embedding is contained in. Each component after the first is searched +for beginning at the location in the name tree that the previous reference +component refers to. + +In the case of action-based references, if the action is embedded more than +once, the local search is performed for each embedding and the result is the +union of all the searches. If no result is found for action-based references then +the search is repeated at the root of the name tree. Any action-based name +search may be forced into a strictly global search by prefixing the name +reference with \verb|::|. + +The final component of the name reference must resolve to a unique entry point. +If a name is unique in the entire name tree it can be referenced as is. If it +is not unique it can be specified by qualifying it with names above it in the +name tree. However, it can always be renamed. + +% FIXME: Should fit this in somewhere. +% Some kinds of name references are illegal. Cannot call into longest-match +% machine, can only call its start state. Cannot make a call to anywhere from +% any part of a longest-match machine except a rule's action. This would result +% in an eventual return to some point inside a longest-match other than the +% start state. This is banned for the same reason a call into the LM machine is +% banned. + + +\section{Scanners} +\label{generating-scanners} + +Scanners are very much intertwined with regular-languages and their +corresponding processors. For this reason Ragel supports the definition of +scanners. The generated code will repeatedly attempt to match patterns from a +list, favouring longer patterns over shorter patterns. In the case of +equal-length matches, the generated code will favour patterns that appear ahead +of others. When a scanner makes a match it executes the user code associated +with the match, consumes the input then resumes scanning. + +\begin{verbatim} +<machine_name> := |* + pattern1 => action1; + pattern2 => action2; + ... + *|; +\end{verbatim} +\verbspace + +On the surface, Ragel scanners are similar to those defined by Lex. Though +there is a key distinguishing feature: patterns may be arbitrary Ragel +expressions and can therefore contain embedded code. With a Ragel-based scanner +the user need not wait until the end of a pattern before user code can be +executed. + +Scanners can be used to process sub-languages, as well as for tokenizing +programming languages. In the following example a scanner is used to tokenize +the contents of a header field. + +\begin{inline_code} +\begin{verbatim} +word = [a-z]+; +head_name = 'Header'; + +header := |* + word; + ' '; + '\n' => { fret; }; +*|; + +main := ( head_name ':' @{ fcall header; } )*; +\end{verbatim} +\end{inline_code} +\verbspace + +The scanner construction has a purpose similar to the longest-match kleene star +operator \verb|**|. The key +difference is that a scanner is able to backtrack to match a previously matched +shorter string when the pursuit of a longer string fails. For this reason the +scanner construction operator is not a pure state machine construction +operator. It relies on several variables that enable it to backtrack and make +pointers to the matched input text available to the user. For this reason +scanners must be immediately instantiated. They cannot be defined inline or +referenced by another expression. Scanners must be jumped to or called. + +Scanners rely on the \verb|ts|, \verb|te| and \verb|act| +variables to be present so that they can backtrack and make pointers to the +matched text available to the user. If input is processed using multiple calls +to the execute code then the user must ensure that when a token is only +partially matched that the prefix is preserved on the subsequent invocation of +the execute code. + +The \verb|ts| variable must be defined as a pointer to the input data. +It is used for recording where the current token match begins. This variable +may be used in action code for retrieving the text of the current match. Ragel +ensures that in between tokens and outside of the longest-match machines that +this pointer is set to null. In between calls to the execute code the user must +check if \verb|ts| is set and if so, ensure that the data it points to is +preserved ahead of the next buffer block. This is described in more detail +below. + +The \verb|te| variable must also be defined as a pointer to the input data. +It is used for recording where a match ends and where scanning of the next +token should begin. This can also be used in action code for retrieving the +text of the current match. + +The \verb|act| variable must be defined as an integer type. It is used for +recording the identity of the last pattern matched when the scanner must go +past a matched pattern in an attempt to make a longer match. If the longer +match fails it may need to consult the \verb|act| variable. In some cases, use +of the \verb|act| +variable can be avoided because the value of the current state is enough +information to determine which token to accept, however in other cases this is +not enough and so the \verb|act| variable is used. + +When the longest-match operator is in use, the user's driver code must take on +some buffer management functions. The following algorithm gives an overview of +the steps that should be taken to properly use the longest-match operator. + +\begin{itemize} +\item Read a block of input data. +\item Run the execute code. +\item If \verb|ts| is set, the execute code will expect the incomplete +token to be preserved ahead of the buffer on the next invocation of the execute +code. +\begin{itemize} +\item Shift the data beginning at \verb|ts| and ending at \verb|pe| to the +beginning of the input buffer. +\item Reset \verb|ts| to the beginning of the buffer. +\item Shift \verb|te| by the distance from the old value of \verb|ts| +to the new value. The \verb|te| variable may or may not be valid. There is +no way to know if it holds a meaningful value because it is not kept at null +when it is not in use. It can be shifted regardless. +\end{itemize} +\item Read another block of data into the buffer, immediately following any +preserved data. +\item Run the scanner on the new data. +\end{itemize} + +Figure \ref{preserve_example} shows the required handling of an input stream in +which a token is broken by the input block boundaries. After processing up to +and including the ``t'' of ``characters'', the prefix of the string token must be +retained and processing should resume at the ``e'' on the next iteration of +the execute code. + +If one uses a large input buffer for collecting input then the number of times +the shifting must be done will be small. Furthermore, if one takes care not to +define tokens that are allowed to be very long and instead processes these +items using pure state machines or sub-scanners, then only a small amount of +data will ever need to be shifted. + +\begin{figure} +\small +\begin{verbatim} + a) A stream "of characters" to be scanned. + | | | + p ts pe + + b) "of characters" to be scanned. + | | | + ts p pe +\end{verbatim} +\verbspace +\caption{Following an invocation of the execute code there may be a partially +matched token (a). The data of the partially matched token +must be preserved ahead of the new data on the next invocation (b). +} +\label{preserve_example} +\end{figure} + +Since scanners attempt to make the longest possible match of input, patterns +such as identifiers require one character of lookahead in order to trigger a +match. In the case of the last token in the input stream the user must ensure +that the \verb|eof| variable is set so that the final token is flushed out. + +An example scanner processing loop is given in Figure \ref{scanner-loop}. + +\begin{figure} +\small +\begin{verbatim} + int have = 0; + bool done = false; + while ( !done ) { + /* How much space is in the buffer? */ + int space = BUFSIZE - have; + if ( space == 0 ) { + /* Buffer is full. */ + cerr << "TOKEN TOO BIG" << endl; + exit(1); + } + + /* Read in a block after any data we already have. */ + char *p = inbuf + have; + cin.read( p, space ); + int len = cin.gcount(); + + char *pe = p + len; + char *eof = 0; + + /* If no data was read indicate EOF. */ + if ( len == 0 ) { + eof = pe; + done = true; + } + + %% write exec; + + if ( cs == Scanner_error ) { + /* Machine failed before finding a token. */ + cerr << "PARSE ERROR" << endl; + exit(1); + } + + if ( ts == 0 ) + have = 0; + else { + /* There is a prefix to preserve, shift it over. */ + have = pe - ts; + memmove( inbuf, ts, have ); + te = inbuf + (te-ts); + ts = inbuf; + } + } +\end{verbatim} +\verbspace +\caption{A processing loop for a scanner. +} +\label{scanner-loop} +\end{figure} + +\section{State Charts} +\label{state-charts} + +In addition to supporting the construction of state machines using regular +languages, Ragel provides a way to manually specify state machines using +state charts. The comma operator combines machines together without any +implied transitions. The user can then manually link machines by specifying +epsilon transitions with the \verb|->| operator. Epsilon transitions are drawn +between the final states of a machine and entry points defined by labels. This +makes it possible to build machines using the explicit state-chart method while +making minimal changes to the Ragel language. + +An interesting feature of Ragel's state chart construction method is that it +can be mixed freely with regular expression constructions. A state chart may be +referenced from within a regular expression, or a regular expression may be +used in the definition of a state chart transition. + +\subsection{Join} + +\verb|expr , expr , ...| + +Join a list of machines together without +drawing any transitions, without setting up a start state, and without +designating any final states. Transitions between the machines may be specified +using labels and epsilon transitions. The start state must be explicity +specified with the ``start'' label. Final states may be specified with an +epsilon transition to the implicitly created ``final'' state. The join +operation allows one to build machines using a state chart model. + +\subsection{Label} + +\verb|label: expr| + +Attaches a label to an expression. Labels can be +used as the target of epsilon transitions and explicit control transfer +statements such as \verb|fgoto| and \verb|fnext| in action +code. + +\subsection{Epsilon} + +\verb|expr -> label| + +Draws an epsilon transition to the state defined +by \verb|label|. Epsilon transitions are made deterministic when join +operators are evaluated. Epsilon transitions that are not in a join operation +are made deterministic when the machine definition that contains the epsilon is +complete. See Section \ref{labels} for information on referencing labels. + +\subsection{Simplifying State Charts} + +There are two benefits to providing state charts in Ragel. The first is that it +allows us to take a state chart with a full listing of states and transitions +and simplify it in selective places using regular expressions. + +The state chart method of specifying parsers is very common. It is an +effective programming technique for producing robust code. The key disadvantage +becomes clear when one attempts to comprehend a large parser specified in this +way. These programs usually require many lines, causing logic to be spread out +over large distances in the source file. Remembering the function of a large +number of states can be difficult and organizing the parser in a sensible way +requires discipline because branches and repetition present many file layout +options. This kind of programming takes a specification with inherent +structure such as looping, alternation and concatenation and expresses it in a +flat form. + +If we could take an isolated component of a manually programmed state chart, +that is, a subset of states that has only one entry point, and implement it +using regular language operators then we could eliminate all the explicit +naming of the states contained in it. By eliminating explicitly named states +and replacing them with higher-level specifications we simplify a state machine +specification. + +For example, sometimes chains of states are needed, with only a small number of +possible characters appearing along the chain. These can easily be replaced +with a concatenation of characters. Sometimes a group of common states +implement a loop back to another single portion of the machine. Rather than +manually duplicate all the transitions that loop back, we may be able to +express the loop using a kleene star operator. + +Ragel allows one to take this state map simplification approach. We can build +state machines using a state map model and implement portions of the state map +using regular languages. In place of any transition in the state machine, +entire sub-machines can be given. These can encapsulate functionality +defined elsewhere. An important aspect of the Ragel approach is that when we +wrap up a collection of states using a regular expression we do not lose +access to the states and transitions. We can still execute code on the +transitions that we have encapsulated. + +\subsection{Dropping Down One Level of Abstraction} +\label{down} + +The second benefit of incorporating state charts into Ragel is that it permits +us to bypass the regular language abstraction if we need to. Ragel's action +embedding operators are sometimes insufficient for expressing certain parsing +tasks. In the same way that is useful for C language programmers to drop down +to assembly language programming using embedded assembler, it is sometimes +useful for the Ragel programmer to drop down to programming with state charts. + +In the following example, we wish to buffer the characters of an XML CDATA +sequence. The sequence is terminated by the string \verb|]]>|. The challenge +in our application is that we do not wish the terminating characters to be +buffered. An expression of the form \verb|any* @buffer :>> ']]>'| will not work +because the buffer will always contain the characters \verb|]]| on the end. +Instead, what we need is to delay the buffering of \verb|]| +characters until a time when we +abandon the terminating sequence and go back into the main loop. There is no +easy way to express this using Ragel's regular expression and action embedding +operators, and so an ability to drop down to the state chart method is useful. + +% GENERATE: dropdown +% OPT: -p +% %%{ +% machine dropdown; +\begin{inline_code} +\begin{verbatim} +action bchar { buff( fpc ); } # Buffer the current character. +action bbrack1 { buff( "]" ); } +action bbrack2 { buff( "]]" ); } + +CDATA_body = +start: ( + ']' -> one | + (any-']') @bchar ->start +), +one: ( + ']' -> two | + [^\]] @bbrack1 @bchar ->start +), +two: ( + '>' -> final | + ']' @bbrack1 -> two | + [^>\]] @bbrack2 @bchar ->start +); +\end{verbatim} +\end{inline_code} +\verbspace +% main := CDATA_body; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{dropdown} +\end{center} +\graphspace + + +\section{Semantic Conditions} +\label{semantic} + +Many communication protocols contain variable-length fields, where the length +of the field is given ahead of the field as a value. This +problem cannot be expressed using regular languages because of its +context-dependent nature. The prevalence of variable-length fields in +communication protocols motivated us to introduce semantic conditions into +the Ragel language. + +A semantic condition is a block of user code that is interpreted as an +expression and evaluated immediately +before a transition is taken. If the code returns a value of true, the +transition may be taken. We can now embed code that extracts the length of a +field, then proceed to match $n$ data values. + +% GENERATE: conds1 +% OPT: -p +% %%{ +% machine conds1; +% number = digit+; +\begin{inline_code} +\begin{verbatim} +action rec_num { i = 0; n = getnumber(); } +action test_len { i++ < n } +data_fields = ( + 'd' + [0-9]+ %rec_num + ':' + ( [a-z] when test_len )* +)**; +\end{verbatim} +\end{inline_code} +\verbspace +% main := data_fields; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{conds1} +\end{center} +\graphspace + +The Ragel implementation of semantic conditions does not force us to give up the +compositional property of Ragel definitions. For example, a machine that tests +the length of a field using conditions can be unioned with another machine +that accepts some of the same strings, without the two machines interfering with +one another. The user need not be concerned about whether or not the result of the +semantic condition will affect the matching of the second machine. + +To see this, first consider that when a user associates a condition with an +existing transition, the transition's label is translated from the base character +to its corresponding value in the space that represents ``condition $c$ true''. Should +the determinization process combine a state that has a conditional transition +with another state that has a transition on the same input character but +without a condition, then the condition-less transition first has its label +translated into two values, one to its corresponding value in the space that +represents ``condition $c$ true'' and another to its corresponding value in the +space that represents ``condition $c$ false''. It +is then safe to combine the two transitions. This is shown in the following +example. Two intersecting patterns are unioned, one with a condition and one +without. The condition embedded in the first pattern does not affect the second +pattern. + +% GENERATE: conds2 +% OPT: -p +% %%{ +% machine conds2; +% number = digit+; +\begin{inline_code} +\begin{verbatim} +action test_len { i++ < n } +action one { /* accept pattern one */ } +action two { /* accept pattern two */ } +patterns = + ( [a-z] when test_len )+ %one | + [a-z][a-z0-9]* %two; +main := patterns '\n'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{conds2} +\end{center} +\graphspace + +There are many more potential uses for semantic conditions. The user is free to +use arbitrary code and may therefore perform actions such as looking up names +in dictionaries, validating input using external parsing mechanisms or +performing checks on the semantic structure of input seen so far. In the next +section we describe how Ragel accommodates several common parser engineering +problems. + +The semantic condition feature works only with alphabet types that are smaller +in width than the \verb|long| type. To implement semantic conditions Ragel +needs to be able to allocate characters from the alphabet space. Ragel uses +these allocated characters to express "character C with condition P true" or "C +with P false." Since internally Ragel uses longs to store characters there is +no room left in the alphabet space unless an alphabet type smaller than long is +used. + +\section{Implementing Lookahead} + +There are a few strategies for implementing lookahead in Ragel programs. +Leaving actions, which are described in Section \ref{out-actions}, can be +used as a form of lookahead. Ragel also provides the \verb|fhold| directive +which can be used in actions to prevent the machine from advancing over the +current character. It is also possible to manually adjust the current character +position by shifting it backwards using \verb|fexec|, however when this is +done, care must be taken not to overstep the beginning of the current buffer +block. In both the use of \verb|fhold| and \verb|fexec| the user must be +cautious of combining the resulting machine with another in such a way that the +transition on which the current position is adjusted is not combined with a +transition from the other machine. + +\section{Parsing Recursive Language Structures} + +In general Ragel cannot handle recursive structures because the grammar is +interpreted as a regular language. However, depending on what needs to be +parsed it is sometimes practical to implement the recursive parts using manual +coding techniques. This often works in cases where the recursive structures are +simple and easy to recognize, such as in the balancing of parentheses + +One approach to parsing recursive structures is to use actions that increment +and decrement counters or otherwise recognize the entry to and exit from +recursive structures and then jump to the appropriate machine defnition using +\verb|fcall| and \verb|fret|. Alternatively, semantic conditions can be used to +test counter variables. + +A more traditional approach is to call a separate parsing function (expressed +in the host language) when a recursive structure is entered, then later return +when the end is recognized. + +\end{document} |