tilde strings, neq, and unsigned numbers in lexical defs

author: Adrian Thurston <thurston@complang.org> 2013-03-22 23:39:49 -0400
committer: Adrian Thurston <thurston@complang.org> 2013-03-22 23:39:49 -0400
commit: 9ddc931cbacf4d270a7113d07092203bec8aaaae (patch)
tree: b5591d9c8632de3f67e18aebc02216b4cd9a4663 /test
parent: 56a51921c90303d65fd8a4b3fee99240676d43dd (diff)
download: colm-9ddc931cbacf4d270a7113d07092203bec8aaaae.tar.gz
15 files changed, 6889 insertions, 6873 deletions
diff --git a/test/argv1.args b/test/argv1.args
deleted file mode 100644
index 3a4295fc..00000000
--- a/test/argv1.args
+++ /dev/null
@@ -1 +0,0 @@
-a b c 1 2 3
diff --git a/test/argv1.exp b/test/argv1.exp
deleted file mode 100644
index 6011e46d..00000000
--- a/test/argv1.exp
+++ /dev/null
@@ -1 +0,0 @@
-<__list0><str>./argv1.bin</str><str>a</str><str>b</str><str>c</str><str>1</str><str>2</str><str>3</str></__list0>
-\ No newline at end of file
diff --git a/test/argv1.lm b/test/argv1.lm
index aa77d524..9ffd4467 100644
--- a/test/argv1.lm
+++ b/test/argv1.lm
@@ -1,2 +1,8 @@
+##### LM #####
 
 print_xml( argv )
+print( '\n' )
+##### ARGS #####
+a b c 1 2 3
+##### EXP #####
+<__list0><str>./working/argv1.bin</str><str>a</str><str>b</str><str>c</str><str>1</str><str>2</str><str>3</str></__list0>
diff --git a/test/argv2.args b/test/argv2.args
deleted file mode 100644
index 60d8bc0b..00000000
--- a/test/argv2.args
+++ /dev/null
@@ -1 +0,0 @@
--qv -h -o output -iinput file --input=foo --input bar --help --verbose
diff --git a/test/argv2.exp b/test/argv2.exp
deleted file mode 100644
index 038294b0..00000000
--- a/test/argv2.exp
+++ /dev/null
@@ -1,10 +0,0 @@
-single: q
-single: v
-single: h
-with opt: o -> output
-with opt: i -> input
-file: file
-long: input -> foo
-long: input -> bar
-long: help
-long: verbose
diff --git a/test/argv2.lm b/test/argv2.lm
index 9d7792ab..eb1ba790 100644
--- a/test/argv2.lm
+++ b/test/argv2.lm
@@ -1,3 +1,4 @@
+##### LM #####
 lex
 	literal '-'
 	token file /^('-'|0)(^0)*/
@@ -74,3 +75,16 @@ for Item: item in Args {
 	}
 }
 
+##### ARGS #####
+-qv -h -o output -iinput file --input=foo --input bar --help --verbose
+##### EXP #####
+single: q
+single: v
+single: h
+with opt: o -> output
+with opt: i -> input
+file: file
+long: input -> foo
+long: input -> bar
+long: help
+long: verbose
diff --git a/test/exit1.exp b/test/exit1.exp
deleted file mode 100644
index 90be1f30..00000000
--- a/test/exit1.exp
+++ /dev/null
@@ -1 +0,0 @@
-before
diff --git a/test/exit1.lm b/test/exit1.lm
index a41a71d5..1a4f82aa 100644
--- a/test/exit1.lm
+++ b/test/exit1.lm
@@ -1,4 +1,7 @@
+##### LM #####
 
 print( 'before\n' )
 exit( 0 )
 print( 'after\n' )
+##### EXP #####
+before
diff --git a/test/exit2.exp b/test/exit2.exp
deleted file mode 100644
index e69de29b..00000000
--- a/test/exit2.exp
+++ /dev/null
diff --git a/test/exit2.lm b/test/exit2.lm
index 48284832..2105f987 100644
--- a/test/exit2.lm
+++ b/test/exit2.lm
@@ -1,3 +1,4 @@
+##### LM #####
 
 int f3()
 {
@@ -20,3 +21,4 @@ int f1()
 I: int = 1
 
 f1()
+##### EXP #####
diff --git a/test/exit3.exp b/test/exit3.exp
deleted file mode 100644
index e69de29b..00000000
--- a/test/exit3.exp
+++ /dev/null
diff --git a/test/exit3.lm b/test/exit3.lm
index 8c7a325f..4cf46862 100644
--- a/test/exit3.lm
+++ b/test/exit3.lm
@@ -1,3 +1,4 @@
+##### LM #####
 
 int f3()
 {
@@ -20,3 +21,4 @@ int f1()
 I: int = 1
 
 exit( 0 )
+##### EXP #####
diff --git a/test/repeat2.exp b/test/repeat2.exp
deleted file mode 100644
index 0bc0f8e7..00000000
--- a/test/repeat2.exp
+++ /dev/null
@@ -1,3552 +0,0 @@
-\documentclass[letterpaper,11pt,oneside]{book}
-\usepackage{graphicx}
-\usepackage{comment}
-\usepackage{multicol}
-\usepackage[
-	colorlinks=true,
-	linkcolor=black,
-	citecolor=green,
-	filecolor=black,
-	urlcolor=black]{hyperref}
-
-\topmargin -0.20in
-\oddsidemargin 0in
-\textwidth 6.5in
-\textheight 9in
-
-\setlength{\parskip}{0pt}
-\setlength{\topsep}{0pt}
-\setlength{\partopsep}{0pt}
-\setlength{\itemsep}{0pt}
-
-\input{version}
-
-\newcommand{\verbspace}{\vspace{10pt}}
-\newcommand{\graphspace}{\vspace{10pt}}
-
-\renewcommand\floatpagefraction{.99}
-\renewcommand\topfraction{.99}
-\renewcommand\bottomfraction{.99}
-\renewcommand\textfraction{.01}   
-\setcounter{totalnumber}{50}
-\setcounter{topnumber}{50}
-\setcounter{bottomnumber}{50}
-
-\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{}
-
-\begin{document}
-
-\thispagestyle{empty}
-\begin{center}
-\vspace*{3in}
-{\huge Ragel State Machine Compiler}\\
-\vspace*{12pt}
-{\Large User Guide}\\
-\vspace{1in}
-by\\
-\vspace{12pt}
-{\large Adrian Thurston}\\
-\end{center}
-\clearpage
-
-\pagenumbering{roman}
-
-\chapter*{License}
-Ragel version \version, \pubdate\\
-Copyright \copyright\ 2003-2012 Adrian D. Thurston
-\vspace{6mm}
-
-{\bf\it\noindent This document is part of Ragel, and as such, this document is
-released under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 2 of the License, or (at your option)
-any later version.
-}
-
-\vspace{5pt}
-
-{\bf\it\noindent Ragel is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
-details.
-}
-
-\vspace{5pt}
-
-{\bf\it\noindent You should have received a copy of the GNU General Public
-License along with Ragel; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-}
-
-\clearpage
-\tableofcontents
-\clearpage
-
-\pagenumbering{arabic}
-\chapter{Introduction}
-
-\section{Abstract}
-
-Regular expressions are used heavily in practice for the purpose of specifying
-parsers. They are normally used as black boxes linked together with program
-logic.  User actions are executed in between invocations of the regular
-expression engine. Adding actions before a pattern terminates requires patterns
-to be broken and pasted back together with program logic. The more user actions
-are needed, the less the advantages of regular expressions are seen. 
-
-Ragel is a software development tool that allows user actions to be 
-embedded into the transitions of a regular expression's corresponding state
-machine, eliminating the need to switch from the regular expression engine and
-user code execution environment and back again. As a result, expressions can be
-maximally continuous.  One is free to specify an entire parser using a single
-regular expression.  The single-expression model affords concise and elegant
-descriptions of languages and the generation of very simple, fast and robust
-code.  Ragel compiles executable finite state machines from a high level regular language
-notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby.
-
-In addition to building state machines from regular expressions, Ragel allows
-the programmer to directly specify state machines with state charts. These two
-notations may be freely combined. There are also facilities for controlling
-nondeterminism in the resulting machines and building scanners using patterns
-that themselves have embedded actions. Ragel can produce code that is small and
-runs very fast. Ragel can handle integer-sized alphabets and can compile very
-large state machines.
-
-\section{Motivation}
-
-When a programmer is faced with the task of producing a parser for a
-context-free language there are many tools to choose from. It is quite common
-to generate useful and efficient parsers for programming languages from a
-formal grammar. It is also quite common for programmers to avoid such tools
-when making parsers for simple computer languages, such as file formats and
-communication protocols.  Such languages are often regular and tools for
-processing the context-free languages are viewed as too heavyweight for the
-purpose of parsing regular languages. The extra run-time effort required for
-supporting the recursive nature of context-free languages is wasted.
-
-When we turn to the regular expression-based parsing tools, such as Lex, Re2C,
-and scripting languages such as Sed, Awk and Perl we find that they are split
-into two levels: a regular expression matching engine and some kind of program
-logic for linking patterns together.  For example, a Lex program is composed of
-sets of regular expressions. The implied program logic repeatedly attempts to
-match a pattern in the current set. When a match is found the associated user
-code executed. It requires the user to consider a language as a sequence of
-independent tokens. Scripting languages and regular expression libraries allow
-one to link patterns together using arbitrary program code.  This is very
-flexible and powerful, however we can be more concise and clear if we avoid
-gluing together regular expressions with if statements and while loops.
-
-This model of execution, where the runtime alternates between regular
-expression matching and user code exectution places restrictions on when
-action code may be executed. Since action code can only be associated with
-complete patterns, any action code that must be executed before an entire
-pattern is matched requires that the pattern be broken into smaller units.
-Instead of being forced to disrupt the regular expression syntax and write
-smaller expressions, it is desirable to retain a single expression and embed
-code for performing actions directly into the transitions that move over the
-characters. After all, capable programmers are astutely aware of the machinery
-underlying their programs, so why not provide them with access to that
-machinery? To achieve this we require an action execution model for associating
-code with the sub-expressions of a regular expression in a way that does not
-disrupt its syntax.
-
-The primary goal of Ragel is to provide developers with an ability to embed
-actions into the transitions and states of a regular expression's state machine
-in support of the definition of entire parsers or large sections of parsers
-using a single regular expression.  From the regular expression we gain a clear
-and concise statement of our language. From the state machine we obtain a very
-fast and robust executable that lends itself to many kinds of analysis and
-visualization.
-
-\section{Overview}
-
-Ragel is a language for specifying state machines. The Ragel program is a
-compiler that assembles a state machine definition to executable code.  Ragel
-is based on the principle that any regular language can be converted to a
-deterministic finite state automaton. Since every regular language has a state
-machine representation and vice versa, the terms regular language and state
-machine (or just machine) will be used interchangeably in this document.
-
-Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is
-designed to be generic and is not bound to any particular input or processing
-method. A Ragel machine expects to have data passed to it in buffer blocks.
-When there is no more input, the machine can be queried for acceptance.  In
-this way, a Ragel machine can be used to simply recognize a regular language
-like a regular expression library. By embedding code into the regular language,
-a Ragel machine can also be used to parse input.
-
-The Ragel language has many operators for constructing and manipulating
-machines. Machines are built up from smaller machines, to bigger ones, to the
-final machine representing the language that needs to be recognized or parsed.
-
-The core state machine construction operators are those found in most theory
-of computation textbooks. They date back to the 1950s and are widely studied.
-They are based on set operations and permit one to think of languages as a set
-of strings. They are Union, Intersection, Difference, Concatenation and Kleene
-Star. Put together, these operators make up what most people know as regular
-expressions. Ragel also provides a scanner construction operator 
-and provides operators for explicitly constructing machines
-using a state chart method. In the state chart method, one joins machines
-together without any implied transitions and then explicitly specifies where
-epsilon transitions should be drawn.
-
-The state machine manipulation operators are specific to Ragel. They allow the
-programmer to access the states and transitions of regular language's
-corresponding machine. There are two uses of the manipulation operators. The
-first and primary use is to embed code into transitions and states, allowing
-the programmer to specify the actions of the state machine.
-
-Ragel attempts to make the action embedding facility as intuitive as possible.
-To do so, a number of issues need to be addressed.  For example, when making a
-nondeterministic specification into a DFA using machines that have embedded
-actions, new transitions are often made that have the combined actions of
-several source transitions. Ragel ensures that multiple actions associated with
-a single transition are ordered consistently with respect to the order of
-reference and the natural ordering implied by the construction operators.
-
-The second use of the manipulation operators is to assign priorities to
-transitions. Priorities provide a convenient way of controlling any
-nondeterminism introduced by the construction operators. Suppose two
-transitions leave from the same state and go to distinct target states on the
-same character. If these transitions are assigned conflicting priorities, then
-during the determinization process the transition with the higher priority will
-take precedence over the transition with the lower priority. The lower priority
-transition gets abandoned. The transitions would otherwise be combined into a new
-transition that goes to a new state that is a combination of the original
-target states. Priorities are often required for segmenting machines. The most
-common uses of priorities have been encoded into a set of simple operators
-that should be used instead of priority embeddings whenever possible.
-
-For the purposes of embedding, Ragel divides transitions and states into
-different classes. There are four operators for embedding actions and
-priorities into the transitions of a state machine. It is possible to embed
-into entering transitions, finishing transitions, all transitions and leaving
-transitions. The embedding into leaving transitions is a special case.
-These transition embeddings get stored in the final states of a machine.  They
-are transferred to any transitions that are made going out of the machine by
-future concatenation or kleene star operations.
-
-There are several more operators for embedding actions into states. Like the
-transition embeddings, there are various different classes of states that the
-embedding operators access. For example, one can access start states, final
-states or all states, among others. Unlike the transition embeddings, there are
-several different types of state action embeddings. These are executed at
-various different times during the processing of input. It is possible to embed
-actions that are exectued on transitions into a state, on transitions out of a
-state, on transitions taken on the error event, or on transitions taken on the
-EOF event.
-
-Within actions, it is possible to influence the behaviour of the state machine.
-The user can write action code that jumps or calls to another portion of the
-machine, changes the current character being processed, or breaks out of the
-processing loop. With the state machine calling feature Ragel can be used to
-parse languages that are not regular. For example, one can parse balanced
-parentheses by calling into a parser when an open parenthesis character is seen
-and returning to the state on the top of the stack when the corresponding
-closing parenthesis character is seen. More complicated context-free languages
-such as expressions in C are out of the scope of Ragel. 
-
-Ragel also provides a scanner construction operator that can be used to build
-scanners much the same way that Lex is used. The Ragel generated code, which
-relies on user-defined variables for backtracking, repeatedly tries to match
-patterns to the input, favouring longer patterns over shorter ones and patterns
-that appear ahead of others when the lengths of the possible matches are
-identical. When a pattern is matched the associated action is executed. 
-
-The key distinguishing feature between scanners in Ragel and scanners in Lex is
-that Ragel patterns may be arbitrary Ragel expressions and can therefore
-contain embedded code. With a Ragel-based scanner the user need not wait until
-the end of a pattern before user code can be executed.
-
-Scanners do take Ragel out of the domain of pure state machines and require the
-user to maintain the backtracking related variables.  However, scanners
-integrate well with regular state machine instantiations. They can be called to
-or jumped to only when needed, or they can be called out of or jumped out of
-when a simpler, pure state machine model is appropriate.
-
-Two types of output code style are available. Ragel can produce a table-driven
-machine or a directly executable machine. The directly executable machine is
-much faster than the table-driven. On the other hand, the table-driven machine
-is more compact and less demanding on the host language compiler. It is better
-suited to compiling large state machines.
-
-\section{Related Work}
-
-Lex is perhaps the best-known tool for constructing parsers from regular
-expressions. In the Lex processing model, generated code attempts to match one
-of the user's regular expression patterns, favouring longer matches over
-shorter ones. Once a match is made it then executes the code associated with
-the pattern and consumes the matching string.  This process is repeated until
-the input is fully consumed. 
-
-Through the use of start conditions, related sets of patterns may be defined.
-The active set may be changed at any time.  This allows the user to define
-different lexical regions. It also allows the user to link patterns together by
-requiring that some patterns come before others.  This is quite like a
-concatenation operation. However, use of Lex for languages that require a
-considerable amount of pattern concatenation is inappropriate. In such cases a
-Lex program deteriorates into a manually specified state machine, where start
-conditions define the states and pattern actions define the transitions.  Lex
-is therefore best suited to parsing tasks where the language to be parsed can
-be described in terms of regions of tokens. 
-
-Lex is useful in many scenarios and has undoubtedly stood the test of time.
-There are, however, several drawbacks to using Lex.  Lex can impose too much
-overhead for parsing applications where buffering is not required because all
-the characters are available in a single string.  In these cases there is
-structure to the language to be parsed and a parser specification tool can
-help, but employing a heavyweight processing loop that imposes a stream
-``pull'' model and dynamic input buffer allocation is inappropriate.  An
-example of this kind of scenario is the conversion of floating point numbers
-contained in a string to their corresponding numerical values.
-
-Another drawback is the very issue that Ragel attempts to solve.
-It is not possible to execute a user action while
-matching a character contained inside a pattern. For example, if scanning a
-programming language and string literals can contain newlines which must be
-counted, a Lex user must break up a string literal pattern so as to associate
-an action with newlines. This forces the definition of a new start condition.
-Alternatively the user can reprocess the text of the matched string literal to
-count newlines. 
-
-
-The Re2C program defines an input processing model similar to that of Lex.
-Re2C focuses on making generated state machines run very fast and
-integrate easily into any program, free of dependencies.  Re2C generates
-directly executable code and is able to claim that generated parsers run nearly
-as fast as their hand-coded equivalents.  This is very important for user
-adoption, as programmers are reluctant to use a tool when a faster alternative
-exists.  A consideration to ease of use is also important because developers
-need the freedom to integrate the generated code as they see fit. 
-
-Many scripting languages provide ways of composing parsers by linking regular
-expressions using program logic. For example, Sed and Awk are two established
-Unix scripting tools that allow the programmer to exploit regular expressions
-for the purpose of locating and extracting text of interest. High-level
-programming languages such as Perl, Python, PHP and Ruby all provide regular
-expression libraries that allow the user to combine regular expressions with
-arbitrary code.
-
-In addition to supporting the linking of regular expressions with arbitrary
-program logic, the Perl programming language permits the embedding of code into
-regular expressions. Perl embeddings do not translate into the embedding of
-code into deterministic state machines. Perl regular expressions are in fact
-not fully compiled to deterministic machines when embedded code is involved.
-They are instead interpreted and involve backtracking. This is shown by the
-following Perl program. When it is fed the input \verb|abcd| the interpretor
-attempts to match the first alternative, printing \verb|a1 b1|.  When this
-possibility fails it backtracks and tries the second possibility, printing
-\verb|a2 b2|, at which point it succeeds.
-
-\begin{inline_code}
-\begin{verbatim}
-print "YES\n" if ( <STDIN> =~
-        /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) |
-         ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x )
-\end{verbatim}
-\end{inline_code}
-\verbspace
-
-In Ragel there is no regular expression interpretor. Aside from the scanner
-operator, all Ragel expressions are made into deterministic machines and the
-run time simply moves from state to state as it consumes input. An equivalent
-parser expressed in Ragel would attempt both of the alternatives concurrently,
-printing \verb|a1 a2 b1 b2|.
-
-\section{Development Status}
-
-Ragel is a relatively new tool and is under continuous development. As a rough
-release guide, minor revision number changes are for implementation
-improvements and feature additions. Major revision number changes are for
-implementation and language changes that do not preserve backwards
-compatibility. Though in the past this has not always held true: changes that
-break code have crept into minor version number changes. Typically, the
-documentation lags behind the development in the interest of documenting only
-the lasting features. The latest changes are always documented in the ChangeLog
-file. 
-
-\chapter{Constructing State Machines}
-
-\section{Ragel State Machine Specifications}
-
-A Ragel input file consists of a program in the host language that contains embedded machine
-specifications.  Ragel normally passes input straight to output.  When it sees
-a machine specification it stops to read the Ragel statements and possibly generate
-code in place of the specification.
-Afterwards it continues to pass input through.  There
-can be any number of FSM specifications in an input file. A multi-line FSM spec
-starts with \verb|%%{| and ends with \verb|}%%|. A single-line FSM spec starts
-with \verb|%%| and ends at the first newline.  
-
-While Ragel is looking for FSM specifications it does basic lexical analysis on
-the surrounding input. It interprets literal strings and comments so a
-\verb|%%| sequence in either of those will not trigger the parsing of an FSM
-specification. Ragel does not pass the input through any preprocessor nor does it
-interpret preprocessor directives itself so includes, defines and ifdef logic
-cannot be used to alter the parse of a Ragel input file. It is therefore not
-possible to use an \verb|#if 0| directive to comment out a machine as is
-commonly done in C code. As an alternative, a machine can be prevented from
-causing any generated output by commenting out write statements.
-
-In Figure \ref{cmd-line-parsing}, a multi-line specification is used to define the
-machine and single line specifications are used to trigger the writing of the machine
-data and execution code.
-
-\begin{figure}
-\small
-\begin{multicols}{2}
-\begin{verbatim}
-#include <string.h>
-#include <stdio.h>
-
-%%{ 
-    machine foo;
-    main := 
-        ( 'foo' | 'bar' ) 
-        0 @{ res = 1; };
-}%%
-
-%% write data;
-\end{verbatim}
-\verbspace
-\columnbreak
-\begin{verbatim}
-int main( int argc, char **argv )
-{
-    int cs, res = 0;
-    if ( argc > 1 ) {
-        char *p = argv[1];
-        char *pe = p + strlen(p) + 1;
-        %% write init;
-        %% write exec;
-    }
-    printf("result = %i\n", res );
-    return 0;
-}
-\end{verbatim}
-\verbspace
-\end{multicols}
-\caption{Parsing a command line argument.
-}
-\label{cmd-line-parsing}
-\end{figure}
-
-\subsection{Naming Ragel Blocks}
-
-\begin{verbatim}
-machine fsm_name;
-\end{verbatim}
-\verbspace
-
-The \verb|machine| statement gives the name of the FSM. If present in a
-specification, this statement must appear first. If a machine specification
-does not have a name then Ragel uses the previous specification name.  If no
-previous specification name exists then this is an error. Because FSM
-specifications persist in memory, a machine's statements can be spread across
-multiple machine specifications.  This allows one to break up a machine across
-several files or draw in statements that are common to multiple machines using
-the \verb|include| statement.
-
-\subsection{Machine Definition}
-\label{definition}
-
-\begin{verbatim}
-<name> = <expression>;
-\end{verbatim}
-\verbspace
-
-The machine definition statement associates an FSM expression with a name. Machine
-expressions assigned to names can later be referenced in other expressions. A
-definition statement on its own does not cause any states to be generated. It is simply a
-description of a machine to be used later. States are generated only when a definition is
-instantiated, which happens when a definition is referenced in an instantiated
-expression. 
-
-\subsection{Machine Instantiation}
-\label{instantiation}
-
-\begin{verbatim}
-<name> := <expression>;
-\end{verbatim}
-\verbspace
-
-The machine instantiation statement generates a set of states representing an
-expression. Each instantiation generates a distinct set of states.  The starting
-state of the instantiation is written in the data section of the generated code
-using the instantiation name.  If a machine named
-\verb|main| is instantiated, its start state is used as the
-specification's start state and is assigned to the \verb|cs| variable by the
-\verb|write init| command. If no \verb|main| machine is given, the start state
-of the last machine instantiation to appear is used as the specification's
-start state.
-
-From outside the execution loop, control may be passed to any machine by
-assigning the entry point to the \verb|cs| variable.  From inside the execution
-loop, control may be passed to any machine instantiation using \verb|fcall|,
-\verb|fgoto| or \verb|fnext| statements.
-
-\subsection{Including Ragel Code}
-
-\begin{verbatim}
-include FsmName "inputfile.rl";
-\end{verbatim}
-\verbspace
-
-The \verb|include| statement can be used to draw in the statements of another FSM
-specification. Both the name and input file are optional, however at least one
-must be given. Without an FSM name, the given input file is searched for an FSM
-of the same name as the current specification. Without an input file the
-current file is searched for a machine of the given name. If both are present,
-the given input file is searched for a machine of the given name.
-
-Ragel searches for included files from the location of the current file.
-Additional directories can be added to the search path using the \verb|-I|
-option.
-
-\subsection{Importing Definitions}
-\label{import}
-
-\begin{verbatim}
-import "inputfile.h";
-\end{verbatim}
-\verbspace
-
-The \verb|import| statement scrapes a file for sequences of tokens that match
-the following forms. Ragel treats these forms as state machine definitions.
-
-\noindent\hspace*{24pt}\verb|name '=' number|\\
-\noindent\hspace*{24pt}\verb|name '=' lit_string|\\
-\noindent\hspace*{24pt}\verb|'define' name number|\\
-\noindent\hspace*{24pt}\verb|'define' name lit_string|
-\vspace{12pt}
-
-If the input file is a Ragel program then tokens inside any Ragel
-specifications are ignored. See Section \ref{export} for a description of
-exporting machine definitions.
-
-Ragel searches for imported files from the location of the current file.
-Additional directories can be added to the search path using the \verb|-I|
-option.
-
-\section{Lexical Analysis of a Ragel Block}
-\label{lexing}
-
-Within a machine specification the following lexical rules apply to the input.
-
-\begin{itemize}
-
-\item The \verb|#| symbol begins a comment that terminates at the next newline.
-
-\item The symbols \verb|""|, \verb|''|, \verb|//|, \verb|[]| behave as the
-delimiters of literal strings. Within them, the following escape sequences 
-are interpreted: 
-
-\verb|    \0 \a \b \t \n \v \f \r|
-
-A backslash at the end of a line joins the following line onto the current. A
-backslash preceding any other character removes special meaning. This applies
-to terminating characters and to special characters in regular expression
-literals. As an exception, regular expression literals do not support escape
-sequences as the operands of a range within a list. See the bullet on regular
-expressions in Section \ref{basic}.
-
-\item The symbols \verb|{}| delimit a block of host language code that will be
-embedded into the machine as an action.  Within the block of host language
-code, basic lexical analysis of comments and strings is done in order to
-correctly find the closing brace of the block. With the exception of FSM
-commands embedded in code blocks, the entire block is preserved as is for
-identical reproduction in the output code.
-
-\item The pattern \verb|[+-]?[0-9]+| denotes an integer in decimal format.
-Integers used for specifying machines may be negative only if the alphabet type
-is signed. Integers used for specifying priorities may be positive or negative.
-
-\item The pattern \verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal
-format.
-
-\item The keywords are \verb|access|, \verb|action|, \verb|alphtype|,
-\verb|getkey|, \verb|write|, \verb|machine| and \verb|include|.
-
-\item The pattern \verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier.
-
-
-\item Any amount of whitespace may separate tokens.
-
-\end{itemize}
-
-
-\section{Basic Machines}
-\label{basic}
-
-The basic machines are the base operands of regular language expressions. They
-are the smallest unit to which machine construction and manipulation operators
-can be applied.
-
-\begin{itemize}
-
-\item \verb|'hello'| -- Concatenation Literal. Produces a machine that matches
-the sequence of characters in the quoted string. If there are 5 characters
-there will be 6 states chained together with the characters in the string. See
-Section \ref{lexing} for information on valid escape sequences. 
-
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{bmconcat}
-\end{center}
-\graphspace
-
-It is possible
-to make a concatenation literal case-insensitive by appending an \verb|i| to
-the string, for example \verb|'cmd'i|.
-
-\item \verb|"hello"| -- Identical to the single quoted version.
-
-\item \verb|[hello]| -- Or Expression. Produces a union of characters.  There
-will be two states with a transition for each unique character between the two states.
-The \verb|[]| delimiters behave like the quotes of a literal string. For example, 
-\verb|[ \t]| means tab or space. The \verb|or| expression supports character ranges
-with the \verb|-| symbol as a separator. The meaning of the union can be negated
-using an initial \verb|^| character as in standard regular expressions. 
-See Section \ref{lexing} for information on valid escape sequences
-in \verb|or| expressions.
-
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{bmor}
-\end{center}
-\graphspace
-
-\item \verb|''|, \verb|""|, and \verb|[]| -- Zero Length Machine.  Produces a machine
-that matches the zero length string. Zero length machines have one state that is both
-a start state and a final state.
-
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{bmnull}
-\end{center}
-\graphspace
-
-% FIXME: More on the range of values here.
-\item \verb|42| -- Numerical Literal. Produces a two state machine with one
-transition on the given number. The number may be in decimal or hexadecimal
-format and should be in the range allowed by the alphabet type. The minimum and
-maximum values permitted are defined by the host machine that Ragel is compiled
-on. For example, numbers in a \verb|short| alphabet on an i386 machine should
-be in the range \verb|-32768| to \verb|32767|.
-
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{bmnum}
-\end{center}
-\graphspace
-
-\item \verb|/simple_regex/| -- Regular Expression. Regular expressions are
-parsed as a series of expressions that are concatenated together. Each
-concatenated expression
-may be a literal character, the ``any'' character specified by the \verb|.|
-symbol, or a union of characters specified by the \verb|[]| delimiters. If the
-first character of a union is \verb|^| then it matches any character not in the
-list. Within a union, a range of characters can be given by separating the first
-and last characters of the range with the \verb|-| symbol. Each
-concatenated machine may have repetition specified by following it with the
-\verb|*| symbol. The standard escape sequences described in Section
-\ref{lexing} are supported everywhere in regular expressions except as the
-operands of a range within in a list. This notation also supports the \verb|i|
-trailing option. Use it to produce case-insensitive machines, as in \verb|/GET/i|.
-
-Ragel does not support very complex regular expressions because the desired
-results can always be achieved using the more general machine construction
-operators listed in Section \ref{machconst}. The following diagram shows the
-result of compiling \verb|/ab*[c-z].*[123]/|. \verb|DEF| represents the default
-transition, which is taken if no other transition can be taken. 
-
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{bmregex}
-\end{center}
-\graphspace
-
-\item \verb|'a' .. 'z'| -- Range. Produces a machine that matches any
-characters in the specified range.  Allowable upper and lower bounds of the
-range are concatenation literals of length one and numerical literals.  For
-example, \verb|0x10..0x20|, \verb|0..63|, and \verb|'a'..'z'| are valid ranges.
-The bounds should be in the range allowed by the alphabet type.
-
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{bmrange}
-\end{center}
-\graphspace
-
-\item \verb|variable_name| -- Lookup the machine definition assigned to the
-variable name given and use an instance of it. See Section \ref{definition} for
-an important note on what it means to reference a variable name.
-
-\item \verb|builtin_machine| -- There are several built-in machines available
-for use. They are all two state machines for the purpose of matching common
-classes of characters. They are:
-
-\begin{itemize}
-
-\item \verb|any   | -- Any character in the alphabet.
-
-\item \verb|ascii | -- Ascii characters. \verb|0..127|
-
-\item \verb|extend| -- Ascii extended characters. This is the range
-\verb|-128..127| for signed alphabets and the range \verb|0..255| for unsigned
-alphabets.
-
-\item \verb|alpha | -- Alphabetic characters. \verb|[A-Za-z]|
-
-\item \verb|digit | -- Digits. \verb|[0-9]|
-
-\item \verb|alnum | -- Alpha numerics. \verb|[0-9A-Za-z]|
-
-\item \verb|lower | -- Lowercase characters. \verb|[a-z]|
-
-\item \verb|upper | -- Uppercase characters. \verb|[A-Z]|
-
-\item \verb|xdigit| -- Hexadecimal digits. \verb|[0-9A-Fa-f]|
-
-\item \verb|cntrl | -- Control characters. \verb|0..31|
-
-\item \verb|graph | -- Graphical characters. \verb|[!-~]|
-
-\item \verb|print | -- Printable characters. \verb|[ -~]|
-
-\item \verb|punct | -- Punctuation. Graphical characters that are not alphanumerics.
-\verb|[!-/:-@[-`{-~]|
-
-\item \verb|space | -- Whitespace. \verb|[\t\v\f\n\r ]|
-
-\item \verb|zlen  | -- Zero length string. \verb|""|
-
-\item \verb|empty | -- Empty set. Matches nothing. \verb|^any|
-
-\end{itemize}
-\end{itemize}
-
-\section{Operator Precedence}
-The following table shows operator precedence from lowest to highest. Operators
-in the same precedence group are evaluated from left to right.
-
-\begin{tabular}{|c|c|c|}
-\hline
-1&\verb| , |&Join\\
-\hline
-2&\verb/ | & - --/&Union, Intersection and Subtraction\\
-\hline
-3&\verb| . <: :> :>> |&Concatenation\\
-\hline
-4&\verb| : |&Label\\
-\hline
-5&\verb| -> |&Epsilon Transition\\
-\hline
-6&\verb| >  @  $  % |&Transitions Actions and Priorities\\
-\hline
-6&\verb| >/  $/  %/  </  @/  <>/ |&EOF Actions\\
-\hline
-6&\verb| >!  $!  %!  <!  @!  <>! |&Global Error Actions\\
-\hline
-6&\verb| >^  $^  %^  <^  @^  <>^ |&Local Error Actions\\
-\hline
-6&\verb| >~  $~  %~  <~  @~  <>~ |&To-State Actions\\
-\hline
-6&\verb| >*  $*  %*  <*  @*  <>* |&From-State Action\\
-\hline
-7&\verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition\\
-\hline
-8&\verb| ! ^ |&Negation and Character-Level Negation\\
-\hline
-9&\verb| ( <expr> ) |&Grouping\\
-\hline
-\end{tabular}
-
-\section{Regular Language Operators}
-\label{machconst}
-
-When using Ragel it is helpful to have a sense of how it constructs machines.
-The determinization process can produce results that seem unusual to someone
-not familiar with the NFA to DFA conversion algorithm. In this section we
-describe Ragel's state machine operators. Though the operators are defined
-using epsilon transitions, it should be noted that this is for discussion only.
-The epsilon transitions described in this section do not persist, but are
-immediately removed by the determinization process which is executed at every
-operation. Ragel does not make use of any nondeterministic intermediate state
-machines. 
-
-To create an epsilon transition between two states \verb|x| and \verb|y| is to
-copy all of the properties of \verb|y| into \verb|x|. This involves drawing in
-all of \verb|y|'s to-state actions, EOF actions, etc., in addition to its
-transitions. If \verb|x| and \verb|y| both have a transition out on the same
-character, then the transitions must be combined.  During transition
-combination a new transition is made that goes to a new state that is the
-combination of both target states. The new combination state is created using
-the same epsilon transition method.  The new state has an epsilon transition
-drawn to all the states that compose it. Since the creation of new epsilon
-transitions may be triggered every time an epsilon transition is drawn, the
-process of drawing epsilon transitions is repeated until there are no more
-epsilon transitions to be made.
-
-A very common error that is made when using Ragel is to make machines that do
-too much. That is, to create machines that have unintentional
-nondetermistic properties. This usually results from being unaware of the common strings
-between machines that are combined together using the regular language
-operators. This can involve never leaving a machine, causing its actions to be
-propagated through all the following states. Or it can involve an alternation
-where both branches are unintentionally taken simultaneously.
-
-This problem forces one to think hard about the language that needs to be
-matched. To guard against this kind of problem one must ensure that the machine
-specification is divided up using boundaries that do not allow ambiguities from
-one portion of the machine to the next. See Chapter
-\ref{controlling-nondeterminism} for more on this problem and how to solve it.
-
-The Graphviz tool is an immense help when debugging improperly compiled
-machines or otherwise learning how to use Ragel. Graphviz Dot files can be
-generated from Ragel programs using the \verb|-V| option. See Section
-\ref{visualization} for more information.
-
-
-\subsection{Union}
-
-\verb/expr | expr/
-
-The union operation produces a machine that matches any string in machine one
-or machine two. The operation first creates a new start state. Epsilon
-transitions are drawn from the new start state to the start states of both
-input machines.  The resulting machine has a final state set equivalent to the
-union of the final state sets of both input machines. In this operation, there
-is the opportunity for nondeterminism among both branches. If there are
-strings, or prefixes of strings that are matched by both machines then the new
-machine will follow both parts of the alternation at once. The union operation is
-shown below.
-
-\graphspace
-\begin{center}
-\includegraphics[scale=1.0]{opor}
-\end{center}
-\graphspace
-
-The following example demonstrates the union of three machines representing
-common tokens.
-
-% GENERATE: exor
-% OPT: -p
-% %%{
-% machine exor;
-\begin{inline_code}
-\begin{verbatim}
-# Hex digits, decimal digits, or identifiers
-main := '0x' xdigit+ | digit+ | alpha alnum*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exor}
-\end{center}
-\graphspace
-
-\subsection{Intersection}
-
-\verb|expr & expr|
-
-Intersection produces a machine that matches any
-string that is in both machine one and machine two. To achieve intersection, a
-union is performed on the two machines. After the result has been made
-deterministic, any final state that is not a combination of final states from
-both machines has its final state status revoked. To complete the operation,
-paths that do not lead to a final state are pruned from the machine. Therefore,
-if there are any such paths in either of the expressions they will be removed
-by the intersection operator.  Intersection can be used to require that two
-independent patterns be simultaneously satisfied as in the following example.
-
-% GENERATE: exinter
-% OPT: -p
-% %%{
-% machine exinter;
-\begin{inline_code}
-\begin{verbatim}
-# Match lines four characters wide that contain 
-# words separated by whitespace.
-main :=
-    /[^\n][^\n][^\n][^\n]\n/* &
-    (/[a-z][a-z]*/ | [ \n])**;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exinter}
-\end{center}
-\graphspace
-
-\subsection{Difference}
-
-\verb|expr - expr|
-
-The difference operation produces a machine that matches
-strings that are in machine one but are not in machine two. To achieve subtraction,
-a union is performed on the two machines. After the result has been made
-deterministic, any final state that came from machine two or is a combination
-of states involving a final state from machine two has its final state status
-revoked. As with intersection, the operation is completed by pruning any path
-that does not lead to a final state.  The following example demonstrates the
-use of subtraction to exclude specific cases from a set.
-
-% GENERATE: exsubtr
-% OPT: -p
-% %%{
-% machine exsubtr;
-\begin{inline_code}
-\begin{verbatim}
-# Subtract keywords from identifiers.
-main := /[a-z][a-z]*/ - ( 'for' | 'int' );
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exsubtr}
-\end{center}
-\graphspace
-
-\subsection{Strong Difference}
-\label{strong_difference}
-
-\verb|expr -- expr|
-
-Strong difference produces a machine that matches any string of the first
-machine that does not have any string of the second machine as a substring. In
-the following example, strong subtraction is used to excluded \verb|CRLF| from
-a sequence. In the corresponding visualization, the label \verb|DEF| is short
-for default. The default transition is taken if no other transition can be
-taken.
-
-% GENERATE: exstrongsubtr
-% OPT: -p
-% %%{
-% machine exstrongsubtr;
-\begin{inline_code}
-\begin{verbatim}
-crlf = '\r\n';
-main := [a-z]+ ':' ( any* -- crlf ) crlf;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exstrongsubtr}
-\end{center}
-\graphspace
-
-This operator is equivalent to the following.
-
-\begin{verbatim}
-expr - ( any* expr any* )
-\end{verbatim}
-\verbspace
-
-\subsection{Concatenation}
-
-\verb|expr . expr|
-
-Concatenation produces a machine that matches all the strings in machine one followed by all
-the strings in machine two.  Concatenation draws epsilon transitions from the
-final states of the first machine to the start state of the second machine. The
-final states of the first machine lose their final state status, unless the
-start state of the second machine is final as well. 
-Concatenation is the default operator. Two machines next to each other with no
-operator between them results in concatenation.
-
-\graphspace
-\begin{center}
-\includegraphics[scale=1.0]{opconcat}
-\end{center}
-\graphspace
-
-The opportunity for nondeterministic behaviour results from the possibility of
-the final states of the first machine accepting a string that is also accepted
-by the start state of the second machine.
-The most common scenario in which this happens is the
-concatenation of a machine that repeats some pattern with a machine that gives
-a terminating string, but the repetition machine does not exclude the
-terminating string. The example in Section \ref{strong_difference}
-guards against this. Another example is the expression \verb|("'" any* "'")|.
-When executed the thread of control will
-never leave the \verb|any*| machine.  This is a problem especially if actions
-are embedded to process the characters of the \verb|any*| component.
-
-In the following example, the first machine is always active due to the
-nondeterministic nature of concatenation. This particular nondeterminism is intended
-however because we wish to permit EOF strings before the end of the input.
-
-% GENERATE: exconcat
-% OPT: -p
-% %%{
-% machine exconcat;
-\begin{inline_code}
-\begin{verbatim}
-# Require an eof marker on the last line.
-main := /[^\n]*\n/* . 'EOF\n';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exconcat}
-\end{center}
-\graphspace
-
-There is a language
-ambiguity involving concatenation and subtraction. Because concatenation is the 
-default operator for two
-adjacent machines there is an ambiguity between subtraction of
-a positive numerical literal and concatenation of a negative numerical literal.
-For example, \verb|(x-7)| could be interpreted as \verb|(x . -7)| or 
-\verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence
-over concatenation of a negative literal. We adhere to the rule that the default
-concatenation operator takes effect only when there are no other operators between
-two machines. Beware of writing machines such as \verb|(any -1)| when what is
-desired is a concatenation of \verb|any| and \verb|-1|. Instead write 
-\verb|(any . -1)| or \verb|(any (-1))|. If in doubt of the meaning of your program do not
-rely on the default concatenation operator; always use the \verb|.| symbol.
-
-
-\subsection{Kleene Star}
-
-\verb|expr*|
-
-The machine resulting from the Kleene Star operator will match zero or more
-repetitions of the machine it is applied to.
-It creates a new start state and an additional final
-state.  Epsilon transitions are drawn between the new start state and the old start
-state, between the new start state and the new final state, and
-between the final states of the machine and the new start state.  After the
-machine is made deterministic the effect is of the final states getting all the
-transitions of the start state. 
-
-\graphspace
-\begin{center}
-\includegraphics[scale=1.0]{opstar}
-\end{center}
-\graphspace
-
-The possibility for nondeterministic behaviour arises if the final states have
-transitions on any of the same characters as the start state.  This is common
-when applying kleene star to an alternation of tokens. Like the other problems
-arising from nondeterministic behavior, this is discussed in more detail in Chapter
-\ref{controlling-nondeterminism}. This particular problem can also be solved
-by using the longest-match construction discussed in Section 
-\ref{generating-scanners} on scanners.
-
-In this 
-example, there is no nondeterminism introduced by the exterior kleene star due to
-the newline at the end of the regular expression. Without the newline the
-exterior kleene star would be redundant and there would be ambiguity between
-repeating the inner range of the regular expression and the entire regular
-expression. Though it would not cause a problem in this case, unnecessary
-nondeterminism in the kleene star operator often causes undesired results for
-new Ragel users and must be guarded against.
-
-% GENERATE: exstar
-% OPT: -p
-% %%{
-% machine exstar;
-\begin{inline_code}
-\begin{verbatim}
-# Match any number of lines with only lowercase letters.
-main := /[a-z]*\n/*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exstar}
-\end{center}
-\graphspace
-
-\subsection{One Or More Repetition}
-
-\verb|expr+|
-
-This operator produces the concatenation of the machine with the kleene star of
-itself. The result will match one or more repetitions of the machine. The plus
-operator is equivalent to \verb|(expr . expr*)|.  
-
-% GENERATE: explus
-% OPT: -p
-% %%{
-% machine explus;
-\begin{inline_code}
-\begin{verbatim}
-# Match alpha-numeric words.
-main := alnum+;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{explus}
-\end{center}
-\graphspace
-
-\subsection{Optional}
-
-\verb|expr?|
-
-The {\em optional} operator produces a machine that accepts the machine
-given or the zero length string. The optional operator is equivalent to
-\verb/(expr | '' )/. In the following example the optional operator is used to
-possibly extend a token.
-
-% GENERATE: exoption
-% OPT: -p
-% %%{
-% machine exoption;
-\begin{inline_code}
-\begin{verbatim}
-# Match integers or floats.
-main := digit+ ('.' digit+)?;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exoption}
-\end{center}
-\graphspace
-
-\subsection{Repetition}
-
-\noindent\hspace*{24pt}\verb|expr {n}| -- Exactly N copies of expr.\\
-\noindent\hspace*{24pt}\verb|expr {,n}| -- Zero to N copies of expr.\\
-\noindent\hspace*{24pt}\verb|expr {n,}| -- N or more copies of expr.\\
-\noindent\hspace*{24pt}\verb|expr {n,m}| -- N to M copies of expr.
-\vspace{12pt}
-
-\subsection{Negation}
-
-\verb|!expr|
-
-Negation produces a machine that matches any string not matched by the given
-machine. Negation is equivalent to \verb|(any* - expr)|.
-
-% GENERATE: exnegate
-% OPT: -p
-% %%{
-% machine exnegate;
-\begin{inline_code}
-\begin{verbatim}
-# Accept anything but a string beginning with a digit.
-main := ! ( digit any* );
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exnegate}
-\end{center}
-\graphspace
-
-\subsection{Character-Level Negation}
-
-\verb|^expr|
-
-Character-level negation produces a machine that matches any single character
-not matched by the given machine. Character-Level Negation is equivalent to
-\verb|(any - expr)|. It must be applied only to machines that match strings of
-length one.
-
-\section{State Machine Minimization}
-
-State machine minimization is the process of finding the minimal equivalent FSM accepting
-the language. Minimization reduces the number of states in machines
-by merging equivalent states. It does not change the behaviour of the machine
-in any way. It will cause some states to be merged into one because they are
-functionally equivalent. State minimization is on by default. It can be turned
-off with the \verb|-n| option.
-
-The algorithm implemented is similar to Hopcroft's state minimization
-algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in
-memory, whereas Ragel supports arbitrary integer alphabets that cannot be
-listed in memory. Though exact analysis is very difficult, Ragel minimization
-runs close to O(n * log(n)) and requires O(n) temporary storage where
-$n$ is the number of states.
-
-\section{Visualization}
-\label{visualization}
-
-%In many cases, practical
-%parsing programs will be too large to completely visualize with Graphviz.  The
-%proper approach is to reduce the language to the smallest subset possible that
-%still exhibits the characteristics that one wishes to learn about or to fix.
-%This can be done without modifying the source code using the \verb|-M| and
-%\verb|-S| options. If a machine cannot be easily reduced,
-%embeddings of unique actions can be very useful for tracing a
-%particular component of a larger machine specification, since action names are
-%written out on transition labels.
-
-Ragel is able to emit compiled state machines in Graphviz's Dot file format.
-This is done using the \verb|-V| option.
-Graphviz support allows users to perform
-incremental visualization of their parsers. User actions are displayed on
-transition labels of the graph. 
-
-If the final graph is too large to be
-meaningful, or even drawn, the user is able to inspect portions of the parser
-by naming particular regular expression definitions with the \verb|-S| and
-\verb|-M| options to the \verb|ragel| program. Use of Graphviz greatly
-improves the Ragel programming experience. It allows users to learn Ragel by
-experimentation and also to track down bugs caused by unintended
-nondeterminism.
-
-Ragel has another option to help debugging. The \verb|-x| option causes Ragel
-to emit the compiled machine in an XML format.
-
-\chapter{User Actions}
-
-Ragel permits the user to embed actions into the transitions of a regular
-expression's corresponding state machine. These actions are executed when the
-generated code moves over a transition.  Like the regular expression operators,
-the action embedding operators are fully compositional. They take a state
-machine and an action as input, embed the action and yield a new state machine
-that can be used in the construction of other machines. Due to the
-compositional nature of embeddings, the user has complete freedom in the
-placement of actions.
-
-A machine's transitions are categorized into four classes. The action embedding
-operators access the transitions defined by these classes.  The {\em entering
-transition} operator \verb|>| isolates the start state, then embeds an action
-into all transitions leaving it. The {\em finishing transition} operator
-\verb|@| embeds an action into all transitions going into a final state.  The
-{\em all transition} operator \verb|$| embeds an action into all transitions of
-an expression. The {\em leaving transition} operator \verb|%| provides access
-to the yet-unmade transitions moving out of the machine via the final states. 
-
-\section{Embedding Actions}
-
-\begin{verbatim}
-action ActionName {
-    /* Code an action here. */
-    count += 1;
-}
-\end{verbatim}
-\verbspace
-
-The action statement defines a block of code that can be embedded into an FSM.
-Action names can be referenced by the action embedding operators in
-expressions. Though actions need not be named in this way (literal blocks
-of code can be embedded directly when building machines), defining reusable
-blocks of code whenever possible is good practice because it potentially increases the
-degree to which the machine can be minimized. 
-
-Within an action some Ragel expressions and statements are parsed and
-translated. These allow the user to interact with the machine from action code.
-See Section \ref{vals} for a complete list of statements and values available
-in code blocks. 
-
-\subsection{Entering Action}
-
-\verb|expr > action| 
-
-The entering action operator embeds an action into all transitions
-that enter into the machine from the start state. If the start state is final,
-then the action is also embedded into the start state as a leaving action. This
-means that if a machine accepts the zero-length string and control passes
-through the start state then the entering action is executed. Note
-that this can happen on both a following character and on the EOF event.
-
-In some machines the start state has transtions coming in from within the
-machine. In these cases the start state is first isolated from the rest of the
-machine ensuring that the entering actions are exected once only.
-
-% GENERATE: exstact
-% OPT: -p
-% %%{
-% machine exstact;
-\begin{inline_code}
-\begin{verbatim}
-# Execute A at the beginning of a string of alpha.
-action A {}
-main := ( lower* >A ) . ' ';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exstact}
-\end{center}
-\graphspace
-
-\subsection{Finishing Action}
-
-\verb|expr @ action|
-
-The finishing action operator embeds an action into any transitions that move
-the machine into a final state. Further input may move the machine out of the
-final state, but keep it in the machine. Therefore finishing actions may be
-executed more than once if a machine has any internal transitions out of a
-final state. In the following example the final state has no transitions out
-and the finishing action is executed only once.
-
-% GENERATE: exdoneact
-% OPT: -p
-% %%{
-% machine exdoneact;
-% action A {}
-\begin{inline_code}
-\begin{verbatim}
-# Execute A when the trailing space is seen.
-main := ( lower* ' ' ) @A;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exdoneact}
-\end{center}
-\graphspace
-
-\subsection{All Transition Action}
-
-\verb|expr $ action|
-
-The all transition operator embeds an action into all transitions of a machine.
-The action is executed whenever a transition of the machine is taken. In the
-following example, A is executed on every character matched.
-
-% GENERATE: exallact
-% OPT: -p
-% %%{
-% machine exallact;
-% action A {}
-\begin{inline_code}
-\begin{verbatim}
-# Execute A on any characters of the machine.
-main := ( 'm1' | 'm2' ) $A;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exallact}
-\end{center}
-\graphspace
-
-\subsection{Leaving Actions}
-\label{out-actions}
-
-\verb|expr % action|
-
-The leaving action operator queues an action for embedding into the transitions
-that go out of a machine via a final state. The action is first stored in
-the machine's final states and is later transferred to any transitions that are
-made going out of the machine by a kleene star or concatenation operation.
-
-If a final state of the machine is still final when compilation is complete
-then the leaving action is also embedded as an EOF action. Therefore, leaving
-the machine is defined as either leaving on a character or as state machine
-acceptance.
-
-This operator allows one to associate an action with the termination of a
-sequence, without being concerned about what particular character terminates
-the sequence. In the following example, A is executed when leaving the alpha
-machine on the newline character.
-
-% GENERATE: exoutact1
-% OPT: -p
-% %%{
-% machine exoutact1;
-% action A {}
-\begin{inline_code}
-\begin{verbatim}
-# Match a word followed by a newline. Execute A when 
-# finishing the word.
-main := ( lower+ %A ) . '\n';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exoutact1}
-\end{center}
-\graphspace
-
-In the following example, the \verb|term_word| action could be used to register
-the appearance of a word and to clear the buffer that the \verb|lower| action used
-to store the text of it.
-
-% GENERATE: exoutact2
-% OPT: -p
-% %%{
-% machine exoutact2;
-% action lower {}
-% action space {}
-% action term_word {}
-% action newline {}
-\begin{inline_code}
-\begin{verbatim}
-word = ( [a-z] @lower )+ %term_word;
-main := word ( ' ' @space word )* '\n' @newline;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exoutact2}
-\end{center}
-\graphspace
-
-In this final example of the action embedding operators, A is executed upon entering
-the alpha machine, B is executed on all transitions of the
-alpha machine, C is executed when the alpha machine is exited by moving into the
-newline machine and N is executed when the newline machine moves into a final
-state.  
-
-% GENERATE: exaction
-% OPT: -p
-% %%{
-% machine exaction;
-% action A {}
-% action B {}
-% action C {}
-% action N {}
-\begin{inline_code}
-\begin{verbatim}
-# Execute A on starting the alpha machine, B on every transition 
-# moving through it and C upon finishing. Execute N on the newline.
-main := ( lower* >A $B %C ) . '\n' @N;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{exaction}
-\end{center}
-\graphspace
-
-
-\section{State Action Embedding Operators}
-
-The state embedding operators allow one to embed actions into states. Like the
-transition embedding operators, there are several different classes of states
-that the operators access. The meanings of the symbols are similar to the
-meanings of the symbols used for the transition embedding operators. The design
-of the state selections was driven by a need to cover the states of an
-expression with exactly one error action.
-
-Unlike the transition embedding operators, the state embedding operators are
-also distinguished by the different kinds of events that embedded actions can
-be associated with. Therefore the state embedding operators have two
-components.  The first, which is the first one or two characters, specifies the
-class of states that the action will be embedded into. The second component
-specifies the type of event the action will be executed on. The symbols of the
-second component also have equivalent kewords. 
-
-\begin{multicols}{2}
-The different classes of states are:
-
-\noindent\hspace*{24pt}\verb|> | -- the start state\\
-\noindent\hspace*{24pt}\verb|< | -- any state except the start state\\
-\noindent\hspace*{24pt}\verb|$ | -- all states\\
-\noindent\hspace*{24pt}\verb|% | -- final states\\
-\noindent\hspace*{24pt}\verb|@ | -- any state except final states\\
-\noindent\hspace*{24pt}\verb|<>| -- any except start and final (middle)
-\vspace{12pt}
-
-\columnbreak
-
-The different kinds of embeddings are:
-
-\noindent\hspace*{24pt}\verb|~| -- to-state actions (\verb|to|)\\
-\noindent\hspace*{24pt}\verb|*| -- from-state actions (\verb|from|)\\
-\noindent\hspace*{24pt}\verb|/| -- EOF actions (\verb|eof|)\\
-\noindent\hspace*{24pt}\verb|!| -- error actions (\verb|err|)\\
-\noindent\hspace*{24pt}\verb|^| -- local error actions (\verb|lerr|)
-\vspace{12pt}
-
-\end{multicols}
-
-\subsection{To-State and From-State Actions}
-
-\subsubsection{To-State Actions}
-
-\noindent\hspace*{24pt}\verb|>~action      >to(name)      >to{...} | -- the start state\\
-\noindent\hspace*{24pt}\verb|<~action      <to(name)      <to{...} | -- any state except the start state\\
-\noindent\hspace*{24pt}\verb|$~action      $to(name)      $to{...} | -- all states\\
-\noindent\hspace*{24pt}\verb|%~action      %to(name)      %to{...} | -- final states\\
-\noindent\hspace*{24pt}\verb|@~action      @to(name)      @to{...} | -- any state except final states\\
-\noindent\hspace*{24pt}\verb|<>~action     <>to(name)     <>to{...}| -- any except start and final (middle)
-\vspace{12pt}
-
-
-To-state actions are executed whenever the state machine moves into the
-specified state, either by a natural movement over a transition or by an
-action-based transfer of control such as \verb|fgoto|. They are executed after the
-in-transition's actions but before the current character is advanced and
-tested against the end of the input block. To-state embeddings stay with the
-state. They are irrespective of the state's current set of transitions and any
-future transitions that may be added in or out of the state.
-
-Note that the setting of the current state variable \verb|cs| outside of the
-execute code is not considered by Ragel as moving into a state and consequently
-the to-state actions of the new current state are not executed. This includes
-the initialization of the current state when the machine begins.  This is
-because the entry point into the machine execution code is after the execution
-of to-state actions.
-
-\subsubsection{From-State Actions}
-
-\noindent\hspace*{24pt}\verb|>*action     >from(name)     >from{...} | -- the start state\\
-\noindent\hspace*{24pt}\verb|<*action     <from(name)     <from{...} | -- any state except the start state\\
-\noindent\hspace*{24pt}\verb|$*action     $from(name)     $from{...} | -- all states\\
-\noindent\hspace*{24pt}\verb|%*action     %from(name)     %from{...} | -- final states\\
-\noindent\hspace*{24pt}\verb|@*action     @from(name)     @from{...} | -- any state except final states\\
-\noindent\hspace*{24pt}\verb|<>*action    <>from(name)    <>from{...}| -- any except start and final (middle)
-\vspace{12pt}
-
-From-state actions are executed whenever the state machine takes a transition from a
-state, either to itself or to some other state. These actions are executed
-immediately after the current character is tested against the input block end
-marker and before the transition to take is sought based on the current
-character. From-state actions are therefore executed even if a transition
-cannot be found and the machine moves into the error state.  Like to-state
-embeddings, from-state embeddings stay with the state.
-
-\subsection{EOF Actions}
-
-\noindent\hspace*{24pt}\verb|>/action     >eof(name)     >eof{...} | -- the start state\\
-\noindent\hspace*{24pt}\verb|</action     <eof(name)     <eof{...} | -- any state except the start state\\
-\noindent\hspace*{24pt}\verb|$/action     $eof(name)     $eof{...} | -- all states\\
-\noindent\hspace*{24pt}\verb|%/action     %eof(name)     %eof{...} | -- final states\\
-\noindent\hspace*{24pt}\verb|@/action     @eof(name)     @eof{...} | -- any state except final states\\
-\noindent\hspace*{24pt}\verb|<>/action    <>eof(name)    <>eof{...}| -- any except start and final (middle)
-\vspace{12pt}
-
-The EOF action embedding operators enable the user to embed actions that are
-executed at the end of the input stream. EOF actions are stored in states and
-generated in the \verb|write exec| block. They are run when \verb|p == pe == eof|
-as the execute block is finishing. EOF actions are free to adjust \verb|p| and
-jump to another part of the machine to restart execution.
-
-\subsection{Handling Errors}
-
-In many applications it is useful to be able to react to parsing errors.  The
-user may wish to print an error message that depends on the context.  It
-may also be desirable to consume input in an attempt to return the input stream
-to some known state and resume parsing. To support error handling and recovery,
-Ragel provides error action embedding operators. There are two kinds of error
-actions: global error actions and local error actions.
-Error actions can be used to simply report errors, or by jumping to a machine
-instantiation that consumes input, can attempt to recover from errors.  
-
-\subsubsection{Global Error Actions}
-
-\noindent\hspace*{24pt}\verb|>!action     >err(name)     >err{...} | -- the start state\\
-\noindent\hspace*{24pt}\verb|<!action     <err(name)     <err{...} | -- any state except the start state\\
-\noindent\hspace*{24pt}\verb|$!action     $err(name)     $err{...} | -- all states\\
-\noindent\hspace*{24pt}\verb|%!action     %err(name)     %err{...} | -- final states\\
-\noindent\hspace*{24pt}\verb|@!action     @err(name)     @err{...} | -- any state except final states\\
-\noindent\hspace*{24pt}\verb|<>!action    <>err(name)    <>err{...}| -- any except start and final (middle)
-\vspace{12pt}
-
-Global error actions are stored in the states they are embedded into until
-compilation is complete. They are then transferred to the transitions that move
-into the error state. These transitions are taken on all input characters that
-are not already covered by the state's transitions. If a state with an error
-action is not final when compilation is complete, then the action is also
-embedded as an EOF action.
-
-Error actions can be used to recover from errors by jumping back into the
-machine with \verb|fgoto| and optionally altering \verb|p|.
-
-\subsubsection{Local Error Actions}
-
-\noindent\hspace*{24pt}\verb|>^action     >lerr(name)     >lerr{...} | -- the start state\\
-\noindent\hspace*{24pt}\verb|<^action     <lerr(name)     <lerr{...} | -- any state except the start state\\
-\noindent\hspace*{24pt}\verb|$^action     $lerr(name)     $lerr{...} | -- all states\\
-\noindent\hspace*{24pt}\verb|%^action     %lerr(name)     %lerr{...} | -- final states\\
-\noindent\hspace*{24pt}\verb|@^action     @lerr(name)     @lerr{...} | -- any state except final states\\
-\noindent\hspace*{24pt}\verb|<>^action    <>lerr(name)    <>lerr{...}| -- any except start and final (middle)
-\vspace{12pt}
-
-Like global error actions, local error actions are also stored in the states
-they are embedded into until a transfer point. The transfer point is different
-however. Each local error action embedding is associated with a name. When a
-machine definition has been fully constructed, all local error action
-embeddings associated with the same name as the machine definition are
-transferred to the error transitions. At this time they are also embedded as
-EOF actions in the case of non-final states.
-
-Local error actions can be used to specify an action to take when a particular
-section of a larger state machine fails to match. A particular machine
-definition's ``thread'' may die and the local error actions executed, however
-the machine as a whole may continue to match input.
-
-There are two forms of local error action embeddings. In the first form the
-name defaults to the current machine. In the second form the machine name can
-be specified.  This is useful when it is more convenient to specify the local
-error action in a sub-definition that is used to construct the machine
-definition that the local error action is associated with. To embed local 
-error actions and
-explicitly state the machine definition on which the transfer is to happen use
-\verb|(name, action)| as the action.
-
-\subsubsection{Example}
-
-The following example uses error actions to report an error and jump to a
-machine that consumes the remainder of the line when parsing fails. After
-consuming the line, the error recovery machine returns to the main loop.
-
-% GENERATE: erract
-% %%{
-%   machine erract;
-%   ws = ' ';
-%   address = 'foo AT bar..com';
-%   date = 'Monday May 12';
-\begin{inline_code}
-\begin{verbatim}
-action cmd_err { 
-    printf( "command error\n" ); 
-    fhold; fgoto line;
-}
-action from_err { 
-    printf( "from error\n" ); 
-    fhold; fgoto line; 
-}
-action to_err { 
-    printf( "to error\n" ); 
-    fhold; fgoto line;
-}
-
-line := [^\n]* '\n' @{ fgoto main; };
-
-main := (
-    (
-        'from' @err(cmd_err) 
-            ( ws+ address ws+ date '\n' ) $err(from_err) |
-        'to' @err(cmd_err)
-            ( ws+ address '\n' ) $err(to_err)
-    ) 
-)*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% %% write data;
-% void f()
-% {
-%   %% write init;
-%   %% write exec;
-% }
-% END GENERATE
-
-
-
-\section{Action Ordering and Duplicates}
-
-When combining expressions that have embedded actions it is often the case that
-a number of actions must be executed on a single input character. For example,
-following a concatenation the leaving action of the left expression and the
-entering action of the right expression will be embedded into one transition.
-This requires a method of ordering actions that is intuitive and
-predictable for the user, and repeatable for the compiler. 
-
-We associate with the embedding of each action a unique timestamp that is
-used to order actions that appear together on a single transition in the final
-state machine. To accomplish this we recursively traverse the parse tree of
-regular expressions and assign timestamps to action embeddings. References to
-machine definitions are followed in the traversal. When we visit a
-parse tree node we assign timestamps to all {\em entering} action embeddings,
-recurse on the parse tree, then assign timestamps to the remaining {\em all},
-{\em finishing}, and {\em leaving} embeddings in the order in which they
-appear.
-
-By default Ragel does not permit a single action to appear multiple times in an action
-list. When the final machine has been created, actions that appear more than
-once in a single transition, to-state, from-state or EOF action list have their
-duplicates removed.
-The first appearance of the action is preserved. This is useful in a number of
-scenarios. First, it allows us to union machines with common prefixes without
-worrying about the action embeddings in the prefix being duplicated. Second, it
-prevents leaving actions from being transferred multiple times. This can
-happen when a machine is repeated, then followed with another machine that
-begins with a common character. For example:
-
-\begin{verbatim}
-word = [a-z]+ %act;
-main := word ( '\n' word )* '\n\n';
-\end{verbatim}
-\verbspace
-
-Note that Ragel does not compare action bodies to determine if they have
-identical program text. It simply checks for duplicates using each action
-block's unique location in the program.
-
-The removal of duplicates can be turned off using the \verb|-d| option.
-
-\section{Values and Statements Available in Code Blocks}
-\label{vals}
-
-The following values are available in code blocks:
-
-\begin{itemize}
-\item \verb|fpc| -- A pointer to the current character. This is equivalent to
-accessing the \verb|p| variable.
-
-\item \verb|fc| -- The current character. This is equivalent to the expression \verb|(*p)|.
-
-\item \verb|fcurs| -- An integer value representing the current state. This
-value should only be read from. To move to a different place in the machine
-from action code use the \verb|fgoto|, \verb|fnext| or \verb|fcall| statements.
-Outside of the machine execution code the \verb|cs| variable may be modified.
-
-\item \verb|ftargs| -- An integer value representing the target state. This
-value should only be read from. Again, \verb|fgoto|, \verb|fnext| and
-\verb|fcall| can be used to move to a specific entry point.
-
-\item \verb|fentry(<label>)| -- Retrieve an integer value representing the
-entry point \verb|label|. The integer value returned will be a compile time
-constant. This number is suitable for later use in control flow transfer
-statements that take an expression. This value should not be compared against
-the current state because any given label can have multiple states representing
-it. The value returned by \verb|fentry| can be any one of the multiple states that
-it represents.
-\end{itemize}
-
-The following statements are available in code blocks:
-
-\begin{itemize}
-
-\item \verb|fhold;| -- Do not advance over the current character. If processing
-data in multiple buffer blocks, the \verb|fhold| statement should only be used
-once in the set of actions executed on a character.  Multiple calls may result
-in backing up over the beginning of the buffer block. The \verb|fhold|
-statement does not imply any transfer of control. It is equivalent to the
-\verb|p--;| statement. 
-
-\item \verb|fexec <expr>;| -- Set the next character to process. This can be
-used to backtrack to previous input or advance ahead.
-Unlike \verb|fhold|, which can be used
-anywhere, \verb|fexec| requires the user to ensure that the target of the
-backtrack is in the current buffer block or is known to be somewhere ahead of
-it. The machine will continue iterating forward until \verb|pe| is arrived at,
-\verb|fbreak| is called or the machine moves into the error state. In actions
-embedded into transitions, the \verb|fexec| statement is equivalent to setting
-\verb|p| to one position ahead of the next character to process.  If the user
-also modifies \verb|pe|, it is possible to change the buffer block entirely.
-
-\item \verb|fgoto <label>;| -- Jump to an entry point defined by
-\verb|<label>|.  The \verb|fgoto| statement immediately transfers control to
-the destination state.
-
-\item \verb|fgoto *<expr>;| -- Jump to an entry point given by \verb|<expr>|.
-The expression must evaluate to an integer value representing a state.
-
-\item \verb|fnext <label>;| -- Set the next state to be the entry point defined
-by \verb|label|.  The \verb|fnext| statement does not immediately jump to the
-specified state. Any action code following the statement is executed.
-
-\item \verb|fnext *<expr>;| -- Set the next state to be the entry point given
-by \verb|<expr>|. The expression must evaluate to an integer value representing
-a state.
-
-\item \verb|fcall <label>;| -- Push the target state and jump to the entry
-point defined by \verb|<label>|.  The next \verb|fret| will jump to the target
-of the transition on which the call was made. Use of \verb|fcall| requires
-the declaration of a call stack. An array of integers named \verb|stack| and a
-single integer named \verb|top| must be declared. With the \verb|fcall|
-construct, control is immediately transferred to the destination state.
-See section \ref{modularization} for more information.
-
-\item \verb|fcall *<expr>;| -- Push the current state and jump to the entry
-point given by \verb|<expr>|. The expression must evaluate to an integer value
-representing a state.
-
-\item \verb|fret;| -- Return to the target state of the transition on which the
-last \verb|fcall| was made.  Use of \verb|fret| requires the declaration of a
-call stack. Control is immediately transferred to the destination state.
-
-\item \verb|fbreak;| -- Advance \verb|p|, save the target state to \verb|cs|
-and immediately break out of the execute loop. This statement is useful
-in conjunction with the \verb|noend| write option. Rather than process input
-until \verb|pe| is arrived at, the fbreak statement
-can be used to stop processing from an action.  After an \verb|fbreak|
-statement the \verb|p| variable will point to the next character in the input. The
-current state will be the target of the current transition. Note that \verb|fbreak|
-causes the target state's to-state actions to be skipped.
-
-\end{itemize}
-
-Once actions with control-flow commands are embedded into a
-machine, the user must exercise caution when using the machine as the operand
-to other machine construction operators. If an action jumps to another state
-then unioning any transition that executes that action with another transition
-that follows some other path will cause that other path to be lost. Using
-commands that manually jump around a machine takes us out of the domain of
-regular languages because transitions that the
-machine construction operators are not aware of are introduced.  These
-commands should therefore be used with caution.
-
-
-\chapter{Controlling Nondeterminism}
-\label{controlling-nondeterminism}
-
-Along with the flexibility of arbitrary action embeddings comes a need to
-control nondeterminism in regular expressions. If a regular expression is
-ambiguous, then sub-components of a parser other than the intended parts may become
-active. This means that actions that are irrelevant to the
-current subset of the parser may be executed, causing problems for the
-programmer.
-
-Tools that are based on regular expression engines and that are used for
-recognition tasks will usually function as intended regardless of the presence
-of ambiguities. It is quite common for users of scripting languages to write
-regular expressions that are heavily ambiguous and it generally does not
-matter. As long as one of the potential matches is recognized, there can be any
-number of other matches present.  In some parsing systems the run-time engine
-can employ a strategy for resolving ambiguities, for example always pursuing
-the longest possible match and discarding others.
-
-In Ragel, there is no regular expression run-time engine, just a simple state
-machine execution model. When we begin to embed actions and face the
-possibility of spurious action execution, it becomes clear that controlling
-nondeterminism at the machine construction level is very important. Consider
-the following example.
-
-% GENERATE: lines1
-% OPT: -p
-% %%{
-% machine lines1;
-% action first {}
-% action tail {}
-% word = [a-z]+;
-\begin{inline_code}
-\begin{verbatim}
-ws = [\n\t ];
-line = word $first ( ws word $tail )* '\n';
-lines = line*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% main := lines;
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.53]{lines1}
-\end{center}
-\graphspace
-
-Since the \verb|ws| expression includes the newline character, we will
-not finish the \verb|line| expression when a newline character is seen. We will
-simultaneously pursue the possibility of matching further words on the same
-line and the possibility of matching a second line. Evidence of this fact is 
-in the state tables. On several transitions both the \verb|first| and
-\verb|tail| actions are executed.  The solution here is simple: exclude
-the newline character from the \verb|ws| expression. 
-
-% GENERATE: lines2
-% OPT: -p
-% %%{
-% machine lines2;
-% action first {}
-% action tail {}
-% word = [a-z]+;
-\begin{inline_code}
-\begin{verbatim}
-ws = [\t ];
-line = word $first ( ws word $tail )* '\n';
-lines = line*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% main := lines;
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{lines2}
-\end{center}
-\graphspace
-
-Solving this kind of problem is straightforward when the ambiguity is created
-by strings that are a single character long.  When the ambiguity is created by
-strings that are multiple characters long we have a more difficult problem.
-The following example is an incorrect attempt at a regular expression for C
-language comments. 
-
-% GENERATE: comments1
-% OPT: -p
-% %%{
-% machine comments1;
-% action comm {}
-\begin{inline_code}
-\begin{verbatim}
-comment = '/*' ( any @comm )* '*/';
-main := comment ' ';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{comments1}
-\end{center}
-\graphspace
-
-Using standard concatenation, we will never leave the \verb|any*| expression.
-We will forever entertain the possibility that a \verb|'*/'| string that we see
-is contained in a longer comment and that, simultaneously, the comment has
-ended.  The concatenation of the \verb|comment| machine with \verb|SP| is done
-to show this. When we match space, we are also still matching the comment body.
-
-One way to approach the problem is to exclude the terminating string
-from the \verb|any*| expression using set difference. We must be careful to
-exclude not just the terminating string, but any string that contains it as a
-substring. A verbose, but proper specification of a C comment parser is given
-by the following regular expression. 
-
-% GENERATE: comments2
-% OPT: -p
-% %%{
-% machine comments2;
-% action comm {}
-\begin{inline_code}
-\begin{verbatim}
-comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% main := comment;
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{comments2}
-\end{center}
-\graphspace
-
-Note that Ragel's strong subtraction operator \verb|--| can also be used here.
-In doing this subtraction we have phrased the problem of controlling non-determinism in
-terms of excluding strings common to two expressions that interact when
-combined.
-We can also phrase the problem in terms of the transitions of the state
-machines that implement these expressions. During the concatenation of
-\verb|any*| and \verb|'*/'| we will be making transitions that are composed of
-both the loop of the first expression and the final character of the second.
-At this time we want the transition on the \verb|'/'| character to take precedence
-over and disallow the transition that originated in the \verb|any*| loop.
-
-In another parsing problem, we wish to implement a lightweight tokenizer that we can
-utilize in the composition of a larger machine. For example, some HTTP headers
-have a token stream as a sub-language. The following example is an attempt
-at a regular expression-based tokenizer that does not function correctly due to
-unintended nondeterminism.
-
-% GENERATE: smallscanner
-% OPT: -p
-% %%{
-% machine smallscanner;
-% action start_str {}
-% action on_char {}
-% action finish_str {}
-\begin{inline_code}
-\begin{verbatim}
-header_contents = ( 
-    lower+ >start_str $on_char %finish_str | 
-    ' '
-)*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% main := header_contents;
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{smallscanner}
-\end{center}
-\graphspace
-
-In this case, the problem with using a standard kleene star operation is that
-there is an ambiguity between extending a token and wrapping around the machine
-to begin a new token. Using the standard operator, we get an undesirable
-nondeterministic behaviour. Evidence of this can be seen on the transition out
-of state one to itself.  The transition extends the string, and simultaneously,
-finishes the string only to immediately begin a new one.  What is required is
-for the
-transitions that represent an extension of a token to take precedence over the
-transitions that represent the beginning of a new token. For this problem
-there is no simple solution that uses standard regular expression operators.
-
-\section{Priorities}
-
-A priority mechanism was devised and built into the determinization
-process, specifically for the purpose of allowing the user to control
-nondeterminism.  Priorities are integer values embedded into transitions. When
-the determinization process is combining transitions that have different
-priorities, the transition with the higher priority is preserved and the
-transition with the lower priority is dropped.
-
-Unfortunately, priorities can have unintended side effects because their
-operation requires that they linger in transitions indefinitely. They must linger
-because the Ragel program cannot know when the user is finished with a priority
-embedding.  A solution whereby they are explicitly deleted after use is
-conceivable; however this is not very user-friendly.  Priorities were therefore
-made into named entities. Only priorities with the same name are allowed to
-interact.  This allows any number of priorities to coexist in one machine for
-the purpose of controlling various different regular expression operations and
-eliminates the need to ever delete them. Such a scheme allows the user to
-choose a unique name, embed two different priority values using that name
-and be confident that the priority embedding will be free of any side effects.
-
-In the first form of priority embedding the name defaults to the name of the machine
-definition that the priority is assigned in. In this sense priorities are by
-default local to the current machine definition or instantiation. Beware of
-using this form in a longest-match machine, since there is only one name for
-the entire set of longest match patterns. In the second form the priority's
-name can be specified, allowing priority interaction across machine definition
-boundaries.
-
-\begin{itemize}
-\item \verb|expr > int| -- Sets starting transitions to have priority int.
-\item \verb|expr @ int| -- Sets transitions that go into a final state to have priority int. 
-\item \verb|expr $ int| -- Sets all transitions to have priority int.
-\item \verb|expr % int| -- Sets leaving transitions to
-have priority int. When a transition is made going out of the machine (either
-by concatenation or kleene star) its priority is immediately set to the 
-leaving priority.  
-\end{itemize}
-
-The second form of priority assignment allows the programmer to specify the name
-to which the priority is assigned.
-
-\begin{itemize}
-\item \verb|expr > (name, int)| -- Starting transitions.
-\item \verb|expr @ (name, int)| -- Finishing transitions (into a final state).
-\item \verb|expr $ (name, int)| -- All transitions.
-\item \verb|expr % (name, int)| -- Leaving transitions.
-\end{itemize}
-
-\section{Guarded Operators that Encapsulate Priorities}
-
-Priority embeddings are a very expressive mechanism. At the same time they
-can be very confusing for the user. They force the user to imagine
-the transitions inside two interacting expressions and work out the precise
-effects of the operations between them. When we consider
-that this problem is worsened by the
-potential for side effects caused by unintended priority name collisions, we
-see that exposing the user to priorities is undesirable.
-
-Fortunately, in practice the use of priorities has been necessary only in a
-small number of scenarios.  This allows us to encapsulate their functionality
-into a small set of operators and fully hide them from the user. This is
-advantageous from a language design point of view because it greatly simplifies
-the design.  
-
-Going back to the C comment example, we can now properly specify
-it using a guarded concatenation operator which we call {\em finish-guarded
-concatenation}. From the user's point of view, this operator terminates the
-first machine when the second machine moves into a final state.  It chooses a
-unique name and uses it to embed a low priority into all
-transitions of the first machine. A higher priority is then embedded into the
-transitions of the second machine that enter into a final state. The following
-example yields a machine identical to the example in Section 
-\ref{controlling-nondeterminism}.
-
-\begin{inline_code}
-\begin{verbatim}
-comment = '/*' ( any @comm )* :>> '*/';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{comments2}
-\end{center}
-\graphspace
-
-Another guarded operator is {\em left-guarded concatenation}, given by the
-\verb|<:| compound symbol. This operator places a higher priority on all
-transitions of the first machine. This is useful if one must forcibly separate
-two lists that contain common elements. For example, one may need to tokenize a
-stream, but first consume leading whitespace.
-
-Ragel also includes a {\em longest-match kleene star} operator, given by the
-\verb|**| compound symbol. This 
-guarded operator embeds a high
-priority into all transitions of the machine. 
-A lower priority is then embedded into the leaving transitions.  When the
-kleene star operator makes the epsilon transitions from
-the final states into the new start state, the lower priority will be transferred
-to the epsilon transitions. In cases where following an epsilon transition
-out of a final state conflicts with an existing transition out of a final
-state, the epsilon transition will be dropped.
-
-Other guarded operators are conceivable, such as guards on union that cause one
-alternative to take precedence over another. These may be implemented when it
-is clear they constitute a frequently used operation.
-In the next section we discuss the explicit specification of state machines
-using state charts.
-
-\subsection{Entry-Guarded Concatenation}
-
-\verb|expr :> expr| 
-
-This operator concatenates two machines, but first assigns a low
-priority to all transitions
-of the first machine and a high priority to the starting transitions of the
-second machine. This operator is useful if from the final states of the first
-machine it is possible to accept the characters in the entering transitions of
-the second machine. This operator effectively terminates the first machine
-immediately upon starting the second machine, where otherwise they would be
-pursued concurrently. In the following example, entry-guarded concatenation is
-used to move out of a machine that matches everything at the first sign of an
-end-of-input marker.
-
-% GENERATE: entryguard
-% OPT: -p
-% %%{
-% machine entryguard;
-\begin{inline_code}
-\begin{verbatim}
-# Leave the catch-all machine on the first character of FIN.
-main := any* :> 'FIN';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{entryguard}
-\end{center}
-\graphspace
-
-Entry-guarded concatenation is equivalent to the following:
-
-\begin{verbatim}
-expr $(unique_name,0) . expr >(unique_name,1)
-\end{verbatim}
-\verbspace
-
-\subsection{Finish-Guarded Concatenation}
-
-\verb|expr :>> expr|
-
-This operator is
-like the previous operator, except the higher priority is placed on the final
-transitions of the second machine. This is useful if one wishes to entertain
-the possibility of continuing to match the first machine right up until the
-second machine enters a final state. In other words it terminates the first
-machine only when the second accepts. In the following example, finish-guarded
-concatenation causes the move out of the machine that matches everything to be
-delayed until the full end-of-input marker has been matched.
-
-% GENERATE: finguard
-% OPT: -p
-% %%{
-% machine finguard;
-\begin{inline_code}
-\begin{verbatim}
-# Leave the catch-all machine on the last character of FIN.
-main := any* :>> 'FIN';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{finguard}
-\end{center}
-\graphspace
-
-Finish-guarded concatenation is equivalent to the following, with one
-exception. If the right machine's start state is final, the higher priority is
-also embedded into it as a leaving priority. This prevents the left machine
-from persisting via the zero-length string.
-
-\begin{verbatim}
-expr $(unique_name,0) . expr @(unique_name,1)
-\end{verbatim}
-\verbspace
-
-\subsection{Left-Guarded Concatenation}
-
-\verb|expr <: expr| 
-
-This operator places
-a higher priority on the left expression. It is useful if you want to prefix a
-sequence with another sequence composed of some of the same characters. For
-example, one can consume leading whitespace before tokenizing a sequence of
-whitespace-separated words as in:
-
-% GENERATE: leftguard
-% OPT: -p
-% %%{
-% machine leftguard;
-% action alpha {}
-% action ws {}
-% action start {}
-% action fin {}
-\begin{inline_code}
-\begin{verbatim}
-main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{leftguard}
-\end{center}
-\graphspace
-
-Left-guarded concatenation is equivalent to the following:
-
-\begin{verbatim}
-expr $(unique_name,1) . expr >(unique_name,0)
-\end{verbatim}
-\verbspace
-
-\subsection{Longest-Match Kleene Star}
-\label{longest_match_kleene_star}
-
-\verb|expr**| 
-
-This version of kleene star puts a higher priority on staying in the
-machine versus wrapping around and starting over. The LM kleene star is useful
-when writing simple tokenizers.  These machines are built by applying the
-longest-match kleene star to an alternation of token patterns, as in the
-following.
-
-% GENERATE: lmkleene
-% OPT: -p
-% %%{
-% machine exfinpri;
-% action A {}
-% action B {}
-\begin{inline_code}
-\begin{verbatim}
-# Repeat tokens, but make sure to get the longest match.
-main := (
-    lower ( lower | digit )* %A | 
-    digit+ %B | 
-    ' '
-)**;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{lmkleene}
-\end{center}
-\graphspace
-
-If a regular kleene star were used the machine above would not be able to
-distinguish between extending a word and beginning a new one.  This operator is
-equivalent to:
-
-\begin{verbatim}
-( expr $(unique_name,1) %(unique_name,0) )*
-\end{verbatim}
-\verbspace
-
-When the kleene star is applied, transitions that go out of the machine and
-back into it are made. These are assigned a priority of zero by the leaving 
-transition mechanism. This is less than the priority of one assigned to the
-transitions leaving the final states but not leaving the machine. When 
-these transitions clash on the same character, the 
-transition that stays in the machine takes precedence.  The transition
-that wraps around is dropped.
-
-Note that this operator does not build a scanner in the traditional sense
-because there is never any backtracking. To build a scanner with backtracking
-use the Longest-Match machine construction described in Section
-\ref{generating-scanners}.
-
-\chapter{Interface to Host Program}
-
-The Ragel code generator is very flexible. The generated code has no
-dependencies and can be inserted in any function, perhaps inside a loop if
-desired.  The user is responsible for declaring and initializing a number of
-required variables, including the current state and the pointer to the input
-stream. These can live in any scope. Control of the input processing loop is
-also possible: the user may break out of the processing loop and return to it
-at any time.
-
-In the case of the C, D, and Go host languages, Ragel is able to generate very
-fast-running code that implements state machines as directly executable code.
-Since very large files strain the host language compiler, table-based code
-generation is also supported. In the future we hope to provide a partitioned,
-directly executable format that is able to reduce the burden on the host
-compiler by splitting large machines across multiple functions.
-
-In the case of Java and Ruby, table-based code generation is the only code
-style supported. In the future this may be expanded to include other code
-styles.
-
-Ragel can be used to parse input in one block, or it can be used to parse input
-in a sequence of blocks as it arrives from a file or socket.  Parsing the input
-in a sequence of blocks brings with it a few responsibilities. If the parser
-utilizes a scanner, care must be taken to not break the input stream anywhere
-but token boundaries.  If pointers to the input stream are taken during
-parsing, care must be taken to not use a pointer that has been invalidated by
-movement to a subsequent block.  If the current input data pointer is moved
-backwards it must not be moved past the beginning of the current block.
-
-Figure \ref{basic-example} shows a simple Ragel program that does not have any
-actions. The example tests the first argument of the program against a number
-pattern and then prints the machine's acceptance status.
-
-\begin{figure}
-\small
-\begin{verbatim}
-#include <stdio.h>
-#include <string.h>
-%%{
-    machine foo;
-    write data;
-}%%
-int main( int argc, char **argv )
-{
-    int cs;
-    if ( argc > 1 ) {
-        char *p = argv[1];
-        char *pe = p + strlen( p );
-        %%{ 
-            main := [0-9]+ ( '.' [0-9]+ )?;
-
-            write init;
-            write exec;
-        }%%
-    }
-    printf("result = %i\n", cs >= foo_first_final );
-    return 0;
-}
-\end{verbatim}
-\verbspace
-\caption{A basic Ragel example without any actions.
-}
-\label{basic-example}
-\end{figure}
-
-\section{Variables Used by Ragel}
-
-There are a number of variables that Ragel expects the user to declare. At a
-very minimum the \verb|cs|, \verb|p| and \verb|pe| variables must be declared.
-In Go, Java and Ruby code the \verb|data| variable must also be declared. If
-EOF actions are used then the \verb|eof| variable is required. If
-stack-based state machine control flow statements are used then the
-\verb|stack| and \verb|top| variables are required. If a scanner is declared
-then the \verb|act|, \verb|ts| and \verb|te| variables must be
-declared.
-
-\begin{itemize}
-
-\item \verb|cs| - Current state. This must be an integer and it should persist
-across invocations of the machine when the data is broken into blocks that are
-processed independently. This variable may be modified from outside the
-execution loop, but not from within.
-
-\item \verb|p| - Data pointer. In C/D code this variable is expected to be a
-pointer to the character data to process. It should be initialized to the
-beginning of the data block on every run of the machine. In Go, Java and Ruby it is
-used as an offset to \verb|data| and must be an integer. In this case it should
-be initialized to zero on every run of the machine.
-
-\item \verb|pe| - Data end pointer. This should be initialized to \verb|p| plus
-the data length on every run of the machine. In Go, Java and Ruby code this should
-be initialized to the data length.
-
-\item \verb|eof| - End of file pointer. This should be set to \verb|pe| when
-the buffer block being processed is the last one, otherwise it should be set to
-null. In Go, Java and Ruby code \verb|-1| must be used instead of null. If the EOF
-event can be known only after the final buffer block has been processed, then
-it is possible to set \verb|p = pe = eof| and run the execute block.
-
-\item \verb|data| - This variable is only required in Go, Java and Ruby code. It
-must be an array containting the data to process.
-
-\item \verb|stack| - This must be an array of integers. It is used to store
-integer values representing states. If the stack must resize dynamically the
-Pre-push and Post-Pop statements can be used to do this (Sections
-\ref{prepush} and \ref{postpop}).
-
-\item \verb|top| - This must be an integer value and will be used as an offset
-to \verb|stack|, giving the next available spot on the top of the stack.
-
-\item \verb|act| - This must be an integer value. It is a variable sometimes
-used by scanner code to keep track of the most recent successful pattern match.
-
-\item \verb|ts| - This must be a pointer to character data. In Go, Java and
-Ruby code this must be an integer. See Section \ref{generating-scanners} for
-more information.
-
-\item \verb|te| - Also a pointer to character data.
-
-\end{itemize}
-
-\section{Alphtype Statement}
-
-\begin{verbatim}
-alphtype unsigned int;
-\end{verbatim}
-\verbspace
-
-The alphtype statement specifies the alphabet data type that the machine
-operates on. During the compilation of the machine, integer literals are
-expected to be in the range of possible values of the alphtype. The default
-is \verb|char| for all languages except Go where the default is \verb|byte|.
-
-\begin{multicols}{2}
-C/C++/Objective-C:
-\begin{verbatim}
-          char      unsigned char      
-          short     unsigned short
-          int       unsigned int
-          long      unsigned long
-\end{verbatim}
-\verbspace
-
-Go:
-\begin{verbatim}
-          byte
-          int8      uint8
-          int16     uint16
-          int32     uint32
-          int
-\end{verbatim}
-\verbspace
-
-Ruby: 
-\begin{verbatim}
-          char 
-          int
-\end{verbatim}
-\verbspace
-
-\columnbreak
-
-Java:
-\begin{verbatim}
-          char 
-          byte 
-          short 
-          int
-\end{verbatim}
-\verbspace
-
-D:
-\begin{verbatim}
-          char 
-          byte      ubyte   
-          short     ushort 
-          wchar 
-          int       uint 
-          dchar
-\end{verbatim}
-\verbspace
-
-\end{multicols}
-
-\section{Getkey Statement}
-
-\begin{verbatim}
-getkey fpc->id;
-\end{verbatim}
-\verbspace
-
-This statement specifies to Ragel how to retrieve the current character from 
-from the pointer to the current element (\verb|p|). Any expression that returns
-a value of the alphabet type
-may be used. The getkey statement may be used for looking into element
-structures or for translating the character to process. The getkey expression
-defaults to \verb|(*p)|. In goto-driven machines the getkey expression may be
-evaluated more than once per element processed, therefore it should not incur a
-large cost nor preclude optimization.
-
-\section{Access Statement}
-
-\begin{verbatim}
-access fsm->;
-\end{verbatim}
-\verbspace
-
-The access statement specifies how the generated code should
-access the machine data that is persistent across processing buffer blocks.
-This applies to all variables except \verb|p|, \verb|pe| and \verb|eof|. This includes
-\verb|cs|, \verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|.
-The access statement is useful if a machine is to be encapsulated inside a
-structure in C code. It can be used to give the name of
-a pointer to the structure.
-
-\section{Variable Statement}
-
-\begin{verbatim}
-variable p fsm->p;
-\end{verbatim}
-\verbspace
-
-The variable statement specifies how to access a specific
-variable. All of the variables that are declared by the user and
-used by Ragel can be changed. This includes \verb|p|, \verb|pe|, \verb|eof|, \verb|cs|,
-\verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|.
-In Go, Ruby and Java code generation the \verb|data| variable can also be changed.
-
-\section{Pre-Push Statement}
-\label{prepush}
-
-\begin{verbatim}
-prepush { 
-    /* stack growing code */
-}
-\end{verbatim}
-\verbspace
-
-The prepush statement allows the user to supply stack management code that is
-written out during the generation of fcall, immediately before the current
-state is pushed to the stack. This statement can be used to test the number of
-available spaces and dynamically grow the stack if necessary.
-
-\section{Post-Pop Statement}
-\label{postpop}
-
-\begin{verbatim}
-postpop { 
-    /* stack shrinking code */
-}
-\end{verbatim}
-\verbspace
-
-The postpop statement allows the user to supply stack management code that is
-written out during the generation of fret, immediately after the next state is
-popped from the stack. This statement can be used to dynamically shrink the
-stack.
-
-\section{Write Statement}
-\label{write-statement}
-
-\begin{verbatim}
-write <component> [options];
-\end{verbatim}
-\verbspace
-
-The write statement is used to generate parts of the machine. 
-There are seven
-components that can be generated by a write statement. These components make up the
-state machine's data, initialization code, execution code, and export definitions.
-A write statement may appear before a machine is fully defined.
-This allows one to write out the data first then later define the machine where
-it is used. An example of this is shown in Figure \ref{fbreak-example}.
-
-\subsection{Write Data}
-\begin{verbatim}
-write data [options];
-\end{verbatim}
-\verbspace
-
-The write data statement causes Ragel to emit the constant static data needed
-by the machine. In table-driven output styles (see Section \ref{genout}) this
-is a collection of arrays that represent the states and transitions of the
-machine.  In goto-driven machines much less data is emitted. At the very
-minimum a start state \verb|name_start| is generated.  All variables written
-out in machine data have both the \verb|static| and \verb|const| properties and
-are prefixed with the name of the machine and an
-underscore. The data can be placed inside a class, inside a function, or it can
-be defined as global data.
-
-Two variables are written that may be used to test the state of the machine
-after a buffer block has been processed. The \verb|name_error| variable gives
-the id of the state that the machine moves into when it cannot find a valid
-transition to take. The machine immediately breaks out of the processing loop when
-it finds itself in the error state. The error variable can be compared to the
-current state to determine if the machine has failed to parse the input. If the
-machine is complete, that is from every state there is a transition to a proper
-state on every possible character of the alphabet, then no error state is required
-and this variable will be set to -1.
-
-The \verb|name_first_final| variable stores the id of the first final state.
-All of the machine's states are sorted by their final state status before
-having their ids assigned. Checking if the machine has accepted its input can
-then be done by checking if the current state is greater-than or equal to the
-first final state.
-
-Data generation has several options:
-
-\noindent\hspace*{24pt}\verb|noerror  | - Do not generate the integer variable that gives the id of the error state.\\
-\noindent\hspace*{24pt}\verb|nofinal  | - Do not generate the integer variable that gives the id of the first final state.\\
-\noindent\hspace*{24pt}\verb|noprefix | - Do not prefix the variable names with the name of the machine.
-\vspace{12pt}
-
-\begin{figure}
-\small
-\begin{verbatim}
-#include <stdio.h>
-%% machine foo;
-%% write data;
-int main( int argc, char **argv )
-{
-    int cs, res = 0;
-    if ( argc > 1 ) {
-        char *p = argv[1];
-        %%{ 
-            main := 
-                [a-z]+ 
-                0 @{ res = 1; fbreak; };
-            write init;
-            write exec noend;
-        }%%
-    }
-    printf("execute = %i\n", res );
-    return 0;
-}
-\end{verbatim}
-\verbspace
-\caption{Use of {\tt noend} write option and the {\tt fbreak} statement for
-processing a string.
-}
-\label{fbreak-example}
-\end{figure}
-
-\subsection{Write Start, First Final and Error}
-
-\begin{verbatim}
-write start;
-write first_final;
-write error;
-\end{verbatim}
-\verbspace
-
-These three write statements provide an alternative means of accessing the
-\verb|start|, \verb|first_final| and \verb|error| states. If there are many
-different machine specifications in one file it is easy to get the prefix for
-these wrong. This is especially true if the state machine boilerplate is
-frequently made by a copy-paste-edit process. These write statements allow the
-problem to be avoided. They can be used as follows:
-
-\begin{verbatim}
-/* Did parsing succeed? */
-if ( cs < %%{ write first_final; }%% ) {
-    result = ERR_PARSE_ERROR;
-    goto fail;
-}
-\end{verbatim}
-\verbspace
-
-\subsection{Write Init}
-\begin{verbatim}
-write init [options];
-\end{verbatim}
-\verbspace
-
-The write init statement causes Ragel to emit initialization code. This should
-be executed once before the machine is started. At a very minimum this sets the
-current state to the start state. If other variables are needed by the
-generated code, such as call stack variables or scanner management
-variables, they are also initialized here.
-
-The \verb|nocs| option to the write init statement will cause ragel to skip
-intialization of the cs variable. This is useful if the user wishes to use
-custom logic to decide which state the specification should start in.
-
-\subsection{Write Exec}
-\begin{verbatim}
-write exec [options];
-\end{verbatim}
-\verbspace
-
-The write exec statement causes Ragel to emit the state machine's execution code.
-Ragel expects several variables to be available to this code. At a very minimum, the
-generated code needs access to the current character position \verb|p|, the ending
-position \verb|pe| and the current state \verb|cs| (though \verb|pe|
-can be omitted using the \verb|noend| write option).
-The \verb|p| variable is the cursor that the execute code will
-used to traverse the input. The \verb|pe| variable should be set up to point to one
-position past the last valid character in the buffer.
-
-Other variables are needed when certain features are used. For example using
-the \verb|fcall| or \verb|fret| statements requires \verb|stack| and
-\verb|top| variables to be defined. If a longest-match construction is used,
-variables for managing backtracking are required.
-
-The write exec statement has one option. The \verb|noend| option tells Ragel
-to generate code that ignores the end position \verb|pe|. In this
-case the user must explicitly break out of the processing loop using
-\verb|fbreak|, otherwise the machine will continue to process characters until
-it moves into the error state. This option is useful if one wishes to process a
-null terminated string. Rather than traverse the string to discover then length
-before processing the input, the user can break out when the null character is
-seen.  The example in Figure \ref{fbreak-example} shows the use of the
-\verb|noend| write option and the \verb|fbreak| statement for processing a string.
-
-\subsection{Write Exports}
-\label{export}
-
-\begin{verbatim}
-write exports;
-\end{verbatim}
-\verbspace
-
-The export feature can be used to export simple machine definitions. Machine definitions
-are marked for export using the \verb|export| keyword.
-
-\begin{verbatim}
-export machine_to_export = 0x44;
-\end{verbatim}
-\verbspace
-
-When the write exports statement is used these machines are 
-written out in the generated code. Defines are used for C and constant integers
-are used for D, Java and Ruby. See Section \ref{import} for a description of the
-import statement.
-
-\section{Maintaining Pointers to Input Data}
-
-In the creation of any parser it is not uncommon to require the collection of
-the data being parsed.  It is always possible to collect data into a growable
-buffer as the machine moves over it, however the copying of data is a somewhat
-wasteful use of processor cycles. The most efficient way to collect data from
-the parser is to set pointers into the input then later reference them.  This
-poses a problem for uses of Ragel where the input data arrives in blocks, such
-as over a socket or from a file. If a pointer is set in one buffer block but
-must be used while parsing a following buffer block, some extra consideration
-to correctness must be made.
-
-The scanner constructions exhibit this problem, requiring the maintenance
-code described in Section \ref{generating-scanners}. If a longest-match
-construction has been used somewhere in the machine then it is possible to
-take advantage of the required prefix maintenance code in the driver program to
-ensure pointers to the input are always valid. If laying down a pointer one can
-set \verb|ts| at the same spot or ahead of it. When data is shifted in
-between loops the user must also shift the pointer.  In this way it is possible
-to maintain pointers to the input that will always be consistent.
-
-\begin{figure}
-\small
-\begin{verbatim}
-    int have = 0;
-    while ( 1 ) {
-        char *p, *pe, *data = buf + have;
-        int len, space = BUFSIZE - have;
-
-        if ( space == 0 ) { 
-            fprintf(stderr, "BUFFER OUT OF SPACE\n");
-            exit(1);
-        }
-
-        len = fread( data, 1, space, stdin );
-        if ( len == 0 )
-            break;
-
-        /* Find the last newline by searching backwards. */
-        p = buf;
-        pe = data + len - 1;
-        while ( *pe != '\n' && pe >= buf )
-            pe--;
-        pe += 1;
-
-        %% write exec;
-
-        /* How much is still in the buffer? */
-        have = data + len - pe;
-        if ( have > 0 )
-            memmove( buf, pe, have );
-
-        if ( len < space )
-            break;
-    }
-\end{verbatim}
-\verbspace
-\caption{An example of line-oriented processing.
-}
-\label{line-oriented}
-\end{figure}
-
-In general, there are two approaches for guaranteeing the consistency of
-pointers to input data. The first approach is the one just described;
-lay down a marker from an action,
-then later ensure that the data the marker points to is preserved ahead of
-the buffer on the next execute invocation. This approach is good because it
-allows the parser to decide on the pointer-use boundaries, which can be
-arbitrarily complex parsing conditions. A downside is that it requires any
-pointers that are set to be corrected in between execute invocations.
-
-The alternative is to find the pointer-use boundaries before invoking the execute
-routine, then pass in the data using these boundaries. For example, if the
-program must perform line-oriented processing, the user can scan backwards from
-the end of an input block that has just been read in and process only up to the
-first found newline. On the next input read, the new data is placed after the
-partially read line and processing continues from the beginning of the line.
-An example of line-oriented processing is given in Figure \ref{line-oriented}.
-
-\section{Specifying the Host Language}
-
-The \verb|ragel| program has a number of options for specifying the host
-language. The host-language options are:
-
-\begin{itemize}
-\item \verb|-C  | for C/C++/Objective-C code (default)
-\item \verb|-D  | for D code.
-\item \verb|-Z  | for Go code.
-\item \verb|-J  | for Java code.
-\item \verb|-R  | for Ruby code.
-\item \verb|-A  | for C\# code.
-\end{itemize}
-
-\section{Choosing a Generated Code Style}
-\label{genout}
-
-There are three styles of code output to choose from. Code style affects the
-size and speed of the compiled binary. Changing code style does not require any
-change to the Ragel program. There are two table-driven formats and a goto
-driven format.
-
-In addition to choosing a style to emit, there are various levels of action
-code reuse to choose from.  The maximum reuse levels (\verb|-T0|, \verb|-F0|
-and \verb|-G0|) ensure that no FSM action code is ever duplicated by encoding
-each transition's action list as static data and iterating
-through the lists on every transition. This will normally result in a smaller
-binary. The less action reuse options (\verb|-T1|, \verb|-F1| and \verb|-G1|)
-will usually produce faster running code by expanding each transition's action
-list into a single block of code, eliminating the need to iterate through the
-lists. This duplicates action code instead of generating the logic necessary
-for reuse. Consequently the binary will be larger. However, this tradeoff applies to
-machines with moderate to dense action lists only. If a machine's transitions
-frequently have less than two actions then the less reuse options will actually
-produce both a smaller and a faster running binary due to less action sharing
-overhead. The best way to choose the appropriate code style for your
-application is to perform your own tests.
-
-The table-driven FSM represents the state machine as constant static data. There are
-tables of states, transitions, indices and actions. The current state is
-stored in a variable. The execution is simply a loop that looks up the current
-state, looks up the transition to take, executes any actions and moves to the
-target state. In general, the table-driven FSM can handle any machine, produces
-a smaller binary and requires a less expensive host language compile, but
-results in slower running code.  Since the table-driven format is the most
-flexible it is the default code style.
-
-The flat table-driven machine is a table-based machine that is optimized for
-small alphabets. Where the regular table machine uses the current character as
-the key in a binary search for the transition to take, the flat table machine
-uses the current character as an index into an array of transitions. This is
-faster in general, however is only suitable if the span of possible characters
-is small.
-
-The goto-driven FSM represents the state machine using goto and switch
-statements. The execution is a flat code block where the transition to take is
-computed using switch statements and directly executable binary searches.  In
-general, the goto FSM produces faster code but results in a larger binary and a
-more expensive host language compile.
-
-The goto-driven format has an additional action reuse level (\verb|-G2|) that
-writes actions directly into the state transitioning logic rather than putting
-all the actions together into a single switch. Generally this produces faster
-running code because it allows the machine to encode the current state using
-the processor's instruction pointer. Again, sparse machines may actually
-compile to smaller binaries when \verb|-G2| is used due to less state and
-action management overhead. For many parsing applications \verb|-G2| is the
-preferred output format.
-
-\begin{center}
-
-Code Output Style Options
-
-\begin{tabular}{|c|c|c|}
-\hline
-\verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#\\
-\hline
-\verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#\\
-\hline
-\verb|-F0|&flat table-driven&C/D/Ruby/C\#\\
-\hline
-\verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#\\
-\hline
-\verb|-G0|&goto-driven&C/D/C\#\\
-\hline
-\verb|-G1|&goto, expanded actions&C/D/C\#\\
-\hline
-\verb|-G2|&goto, in-place actions&C/D/Go\\
-\hline
-\end{tabular}
-\end{center}
-
-\chapter{Beyond the Basic Model}
-
-\section{Parser Modularization}
-\label{modularization}
-
-It is possible to use Ragel's machine construction and action embedding
-operators to specify an entire parser using a single regular expression. In
-many cases this is the desired way to specify a parser in Ragel. However, in
-some scenarios the language to parse may be so large that it is difficult to
-think about it as a single regular expression. It may also shift between distinct
-parsing strategies, in which case modularization into several coherent blocks
-of the language may be appropriate.
-
-It may also be the case that patterns that compile to a large number of states
-must be used in a number of different contexts and referencing them in each
-context results in a very large state machine. In this case, an ability to reuse
-parsers would reduce code size.
-
-To address this, distinct regular expressions may be instantiated and linked
-together by means of a jumping and calling mechanism. This mechanism is
-analogous to the jumping to and calling of processor instructions. A jump
-command, given in action code, causes control to be immediately passed to
-another portion of the machine by way of setting the current state variable. A
-call command causes the target state of the current transition to be pushed to
-a state stack before control is transferred.  Later on, the original location
-may be returned to with a return statement. In the following example, distinct
-state machines are used to handle the parsing of two types of headers.
-
-% GENERATE: call
-% %%{
-%   machine call;
-\begin{inline_code}
-\begin{verbatim}
-action return { fret; }
-action call_date { fcall date; }
-action call_name { fcall name; }
-
-# A parser for date strings.
-date := [0-9][0-9] '/' 
-        [0-9][0-9] '/' 
-        [0-9][0-9][0-9][0-9] '\n' @return;
-
-# A parser for name strings.
-name := ( [a-zA-Z]+ | ' ' )** '\n' @return;
-
-# The main parser.
-headers = 
-    ( 'from' | 'to' ) ':' @call_name | 
-    ( 'departed' | 'arrived' ) ':' @call_date;
-
-main := headers*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% %% write data;
-% void f()
-% {
-%   %% write init;
-%   %% write exec;
-% }
-% END GENERATE
-
-Calling and jumping should be used carefully as they are operations that take
-one out of the domain of regular languages. A machine that contains a call or
-jump statement in one of its actions should be used as an argument to a machine
-construction operator only with considerable care. Since DFA transitions may
-actually represent several NFA transitions, a call or jump embedded in one
-machine can inadvertently terminate another machine that it shares prefixes
-with. Despite this danger, theses statements have proven useful for tying
-together sub-parsers of a language into a parser for the full language,
-especially for the purpose of modularizing code and reducing the number of
-states when the machine contains frequently recurring patterns.
-
-Section \ref{vals} describes the jump and call statements that are used to
-transfer control. These statements make use of two variables that must be
-declared by the user, \verb|stack| and \verb|top|. The \verb|stack| variable
-must be an array of integers and \verb|top| must be a single integer, which
-will point to the next available space in \verb|stack|. Sections \ref{prepush}
-and \ref{postpop} describe the Pre-Push and Post-Pop statements which can be
-used to implement a dynamically resizable array.
-
-\section{Referencing Names}
-\label{labels}
-
-This section describes how to reference names in epsilon transitions (Section
-\ref{state-charts}) and
-action-based control-flow statements such as \verb|fgoto|. There is a hierarchy
-of names implied in a Ragel specification.  At the top level are the machine
-instantiations. Beneath the instantiations are labels and references to machine
-definitions. Beneath those are more labels and references to definitions, and
-so on.
-
-Any name reference may contain multiple components separated with the \verb|::|
-compound symbol.  The search for the first component of a name reference is
-rooted at the join expression that the epsilon transition or action embedding
-is contained in. If the name reference is not contained in a join,
-the search is rooted at the machine definition that the epsilon transition or
-action embedding is contained in. Each component after the first is searched
-for beginning at the location in the name tree that the previous reference
-component refers to.
-
-In the case of action-based references, if the action is embedded more than
-once, the local search is performed for each embedding and the result is the
-union of all the searches. If no result is found for action-based references then
-the search is repeated at the root of the name tree.  Any action-based name
-search may be forced into a strictly global search by prefixing the name
-reference with \verb|::|.
-
-The final component of the name reference must resolve to a unique entry point.
-If a name is unique in the entire name tree it can be referenced as is. If it
-is not unique it can be specified by qualifying it with names above it in the
-name tree. However, it can always be renamed.
-
-% FIXME: Should fit this in somewhere.
-% Some kinds of name references are illegal. Cannot call into longest-match
-% machine, can only call its start state. Cannot make a call to anywhere from
-% any part of a longest-match machine except a rule's action. This would result
-% in an eventual return to some point inside a longest-match other than the
-% start state. This is banned for the same reason a call into the LM machine is
-% banned.
-
-
-\section{Scanners}
-\label{generating-scanners}
-
-Scanners are very much intertwined with regular-languages and their
-corresponding processors. For this reason Ragel supports the definition of
-scanners.  The generated code will repeatedly attempt to match patterns from a
-list, favouring longer patterns over shorter patterns.  In the case of
-equal-length matches, the generated code will favour patterns that appear ahead
-of others. When a scanner makes a match it executes the user code associated
-with the match, consumes the input then resumes scanning.
-
-\begin{verbatim}
-<machine_name> := |* 
-        pattern1 => action1;
-        pattern2 => action2;
-        ...
-    *|;
-\end{verbatim}
-\verbspace
-
-On the surface, Ragel scanners are similar to those defined by Lex. Though
-there is a key distinguishing feature: patterns may be arbitrary Ragel
-expressions and can therefore contain embedded code. With a Ragel-based scanner
-the user need not wait until the end of a pattern before user code can be
-executed.
-
-Scanners can be used to process sub-languages, as well as for tokenizing
-programming languages. In the following example a scanner is used to tokenize
-the contents of a header field.
-
-\begin{inline_code}
-\begin{verbatim}
-word = [a-z]+;
-head_name = 'Header';
-
-header := |*
-    word;
-    ' ';
-    '\n' => { fret; };
-*|;
-
-main := ( head_name ':' @{ fcall header; } )*;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-
-The scanner construction has a purpose similar to the longest-match kleene star
-operator \verb|**|. The key
-difference is that a scanner is able to backtrack to match a previously matched
-shorter string when the pursuit of a longer string fails.  For this reason the
-scanner construction operator is not a pure state machine construction
-operator. It relies on several variables that enable it to backtrack and make
-pointers to the matched input text available to the user.  For this reason
-scanners must be immediately instantiated. They cannot be defined inline or
-referenced by another expression. Scanners must be jumped to or called.
-
-Scanners rely on the \verb|ts|, \verb|te| and \verb|act|
-variables to be present so that they can backtrack and make pointers to the
-matched text available to the user. If input is processed using multiple calls
-to the execute code then the user must ensure that when a token is only
-partially matched that the prefix is preserved on the subsequent invocation of
-the execute code.
-
-The \verb|ts| variable must be defined as a pointer to the input data.
-It is used for recording where the current token match begins. This variable
-may be used in action code for retrieving the text of the current match.  Ragel
-ensures that in between tokens and outside of the longest-match machines that
-this pointer is set to null. In between calls to the execute code the user must
-check if \verb|ts| is set and if so, ensure that the data it points to is
-preserved ahead of the next buffer block. This is described in more detail
-below.
-
-The \verb|te| variable must also be defined as a pointer to the input data.
-It is used for recording where a match ends and where scanning of the next
-token should begin. This can also be used in action code for retrieving the
-text of the current match.
-
-The \verb|act| variable must be defined as an integer type. It is used for
-recording the identity of the last pattern matched when the scanner must go
-past a matched pattern in an attempt to make a longer match. If the longer
-match fails it may need to consult the \verb|act| variable. In some cases, use 
-of the \verb|act|
-variable can be avoided because the value of the current state is enough
-information to determine which token to accept, however in other cases this is
-not enough and so the \verb|act| variable is used. 
-
-When the longest-match operator is in use, the user's driver code must take on
-some buffer management functions. The following algorithm gives an overview of
-the steps that should be taken to properly use the longest-match operator.
-
-\begin{itemize}
-\item Read a block of input data.
-\item Run the execute code.
-\item If \verb|ts| is set, the execute code will expect the incomplete
-token to be preserved ahead of the buffer on the next invocation of the execute
-code.  
-\begin{itemize}
-\item Shift the data beginning at \verb|ts| and ending at \verb|pe| to the
-beginning of the input buffer.
-\item Reset \verb|ts| to the beginning of the buffer. 
-\item Shift \verb|te| by the distance from the old value of \verb|ts|
-to the new value. The \verb|te| variable may or may not be valid.  There is
-no way to know if it holds a meaningful value because it is not kept at null
-when it is not in use. It can be shifted regardless.
-\end{itemize}
-\item Read another block of data into the buffer, immediately following any
-preserved data.
-\item Run the scanner on the new data.
-\end{itemize}
-
-Figure \ref{preserve_example} shows the required handling of an input stream in
-which a token is broken by the input block boundaries. After processing up to
-and including the ``t'' of ``characters'', the prefix of the string token must be
-retained and processing should resume at the ``e'' on the next iteration of
-the execute code.
-
-If one uses a large input buffer for collecting input then the number of times
-the shifting must be done will be small. Furthermore, if one takes care not to
-define tokens that are allowed to be very long and instead processes these
-items using pure state machines or sub-scanners, then only a small amount of
-data will ever need to be shifted.
-
-\begin{figure}
-\small
-\begin{verbatim}
-      a)           A stream "of characters" to be scanned.
-                   |        |          |
-                   p        ts         pe
-
-      b)           "of characters" to be scanned.
-                   |          |        |
-                   ts         p        pe
-\end{verbatim}
-\verbspace
-\caption{Following an invocation of the execute code there may be a partially
-matched token (a). The data of the partially matched token 
-must be preserved ahead of the new data on the next invocation (b).
-}
-\label{preserve_example}
-\end{figure}
-
-Since scanners attempt to make the longest possible match of input, patterns
-such as identifiers require one character of lookahead in order to trigger a
-match. In the case of the last token in the input stream the user must ensure
-that the \verb|eof| variable is set so that the final token is flushed out.
-
-An example scanner processing loop is given in Figure \ref{scanner-loop}.
-
-\begin{figure}
-\small
-\begin{verbatim}
-    int have = 0;
-    bool done = false;
-    while ( !done ) {
-        /* How much space is in the buffer? */
-        int space = BUFSIZE - have;
-        if ( space == 0 ) {
-            /* Buffer is full. */
-            cerr << "TOKEN TOO BIG" << endl;
-            exit(1);
-        }
-
-        /* Read in a block after any data we already have. */
-        char *p = inbuf + have;
-        cin.read( p, space );
-        int len = cin.gcount();
-
-        char *pe = p + len;
-        char *eof = 0;
-
-        /* If no data was read indicate EOF. */
-        if ( len == 0 ) {
-            eof = pe;
-            done = true;
-        }
-
-        %% write exec;
-
-        if ( cs == Scanner_error ) {
-            /* Machine failed before finding a token. */
-            cerr << "PARSE ERROR" << endl;
-            exit(1);
-        }
-
-        if ( ts == 0 )
-            have = 0;
-        else {
-            /* There is a prefix to preserve, shift it over. */
-            have = pe - ts;
-            memmove( inbuf, ts, have );
-            te = inbuf + (te-ts);
-            ts = inbuf;
-        }
-    }
-\end{verbatim}
-\verbspace
-\caption{A processing loop for a scanner.
-}
-\label{scanner-loop}
-\end{figure}
-
-\section{State Charts}
-\label{state-charts}
-
-In addition to supporting the construction of state machines using regular
-languages, Ragel provides a way to manually specify state machines using
-state charts.  The comma operator combines machines together without any
-implied transitions. The user can then manually link machines by specifying
-epsilon transitions with the \verb|->| operator.  Epsilon transitions are drawn
-between the final states of a machine and entry points defined by labels.  This
-makes it possible to build machines using the explicit state-chart method while
-making minimal changes to the Ragel language. 
-
-An interesting feature of Ragel's state chart construction method is that it
-can be mixed freely with regular expression constructions. A state chart may be
-referenced from within a regular expression, or a regular expression may be
-used in the definition of a state chart transition.
-
-\subsection{Join}
-
-\verb|expr , expr , ...|
-
-Join a list of machines together without
-drawing any transitions, without setting up a start state, and without
-designating any final states. Transitions between the machines may be specified
-using labels and epsilon transitions. The start state must be explicity
-specified with the ``start'' label. Final states may be specified with an
-epsilon transition to the implicitly created ``final'' state. The join
-operation allows one to build machines using a state chart model.
-
-\subsection{Label}
-
-\verb|label: expr| 
-
-Attaches a label to an expression. Labels can be
-used as the target of epsilon transitions and explicit control transfer
-statements such as \verb|fgoto| and \verb|fnext| in action
-code.
-
-\subsection{Epsilon}
-
-\verb|expr -> label| 
-
-Draws an epsilon transition to the state defined
-by \verb|label|.  Epsilon transitions are made deterministic when join
-operators are evaluated. Epsilon transitions that are not in a join operation
-are made deterministic when the machine definition that contains the epsilon is
-complete. See Section \ref{labels} for information on referencing labels.
-
-\subsection{Simplifying State Charts}
-
-There are two benefits to providing state charts in Ragel. The first is that it
-allows us to take a state chart with a full listing of states and transitions
-and simplify it in selective places using regular expressions.
-
-The state chart method of specifying parsers is very common.  It is an
-effective programming technique for producing robust code. The key disadvantage
-becomes clear when one attempts to comprehend a large parser specified in this
-way.  These programs usually require many lines, causing logic to be spread out
-over large distances in the source file. Remembering the function of a large
-number of states can be difficult and organizing the parser in a sensible way
-requires discipline because branches and repetition present many file layout
-options.  This kind of programming takes a specification with inherent
-structure such as looping, alternation and concatenation and expresses it in a
-flat form. 
-
-If we could take an isolated component of a manually programmed state chart,
-that is, a subset of states that has only one entry point, and implement it
-using regular language operators then we could eliminate all the explicit
-naming of the states contained in it. By eliminating explicitly named states
-and replacing them with higher-level specifications we simplify a state machine
-specification.
-
-For example, sometimes chains of states are needed, with only a small number of
-possible characters appearing along the chain. These can easily be replaced
-with a concatenation of characters. Sometimes a group of common states
-implement a loop back to another single portion of the machine. Rather than
-manually duplicate all the transitions that loop back, we may be able to
-express the loop using a kleene star operator.
-
-Ragel allows one to take this state map simplification approach. We can build
-state machines using a state map model and implement portions of the state map
-using regular languages. In place of any transition in the state machine,
-entire sub-machines can be given. These can encapsulate functionality
-defined elsewhere. An important aspect of the Ragel approach is that when we
-wrap up a collection of states using a regular expression we do not lose
-access to the states and transitions. We can still execute code on the
-transitions that we have encapsulated.
-
-\subsection{Dropping Down One Level of Abstraction}
-\label{down}
-
-The second benefit of incorporating state charts into Ragel is that it permits
-us to bypass the regular language abstraction if we need to. Ragel's action
-embedding operators are sometimes insufficient for expressing certain parsing
-tasks.  In the same way that is useful for C language programmers to drop down
-to assembly language programming using embedded assembler, it is sometimes
-useful for the Ragel programmer to drop down to programming with state charts.
-
-In the following example, we wish to buffer the characters of an XML CDATA
-sequence. The sequence is terminated by the string \verb|]]>|. The challenge
-in our application is that we do not wish the terminating characters to be
-buffered. An expression of the form \verb|any* @buffer :>> ']]>'| will not work
-because the buffer will always contain the characters \verb|]]| on the end.
-Instead, what we need is to delay the buffering of \verb|]|
-characters until a time when we
-abandon the terminating sequence and go back into the main loop. There is no
-easy way to express this using Ragel's regular expression and action embedding
-operators, and so an ability to drop down to the state chart method is useful.
-
-% GENERATE: dropdown
-% OPT: -p
-% %%{
-% machine dropdown;
-\begin{inline_code}
-\begin{verbatim}
-action bchar { buff( fpc ); }    # Buffer the current character.
-action bbrack1 { buff( "]" ); }
-action bbrack2 { buff( "]]" ); }
-
-CDATA_body =
-start: (
-     ']' -> one |
-     (any-']') @bchar ->start
-),
-one: (
-     ']' -> two |
-     [^\]] @bbrack1 @bchar ->start
-),
-two: (
-     '>' -> final |
-     ']' @bbrack1 -> two |
-     [^>\]] @bbrack2 @bchar ->start
-);
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% main := CDATA_body;
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{dropdown}
-\end{center}
-\graphspace
-
-
-\section{Semantic Conditions}
-\label{semantic}
-
-Many communication protocols contain variable-length fields, where the length
-of the field is given ahead of the field as a value. This
-problem cannot be expressed using regular languages because of its
-context-dependent nature. The prevalence of variable-length fields in
-communication protocols motivated us to introduce semantic conditions into
-the Ragel language.
-
-A semantic condition is a block of user code that is interpreted as an
-expression and evaluated immediately
-before a transition is taken. If the code returns a value of true, the
-transition may be taken.  We can now embed code that extracts the length of a
-field, then proceed to match $n$ data values.
-
-% GENERATE: conds1
-% OPT: -p
-% %%{
-% machine conds1;
-% number = digit+;
-\begin{inline_code}
-\begin{verbatim}
-action rec_num { i = 0; n = getnumber(); }
-action test_len { i++ < n }
-data_fields = (
-    'd' 
-    [0-9]+ %rec_num 
-    ':'
-    ( [a-z] when test_len )*
-)**;
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% main := data_fields;
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{conds1}
-\end{center}
-\graphspace
-
-The Ragel implementation of semantic conditions does not force us to give up the
-compositional property of Ragel definitions. For example, a machine that tests
-the length of a field using conditions can be unioned with another machine
-that accepts some of the same strings, without the two machines interfering with
-one another. The user need not be concerned about whether or not the result of the
-semantic condition will affect the matching of the second machine.
-
-To see this, first consider that when a user associates a condition with an
-existing transition, the transition's label is translated from the base character
-to its corresponding value in the space that represents ``condition $c$ true''. Should
-the determinization process combine a state that has a conditional transition
-with another state that has a transition on the same input character but
-without a condition, then the condition-less transition first has its label
-translated into two values, one to its corresponding value in the space that
-represents ``condition $c$ true'' and another to its corresponding value in the
-space that represents ``condition $c$ false''. It
-is then safe to combine the two transitions. This is shown in the following
-example.  Two intersecting patterns are unioned, one with a condition and one
-without. The condition embedded in the first pattern does not affect the second
-pattern.
-
-% GENERATE: conds2
-% OPT: -p
-% %%{
-% machine conds2;
-% number = digit+;
-\begin{inline_code}
-\begin{verbatim}
-action test_len { i++ < n }
-action one { /* accept pattern one */ }
-action two { /* accept pattern two */ }
-patterns = 
-    ( [a-z] when test_len )+ %one |
-    [a-z][a-z0-9]* %two;
-main := patterns '\n';
-\end{verbatim}
-\end{inline_code}
-\verbspace
-% }%%
-% END GENERATE
-
-\graphspace
-\begin{center}
-\includegraphics[scale=0.55]{conds2}
-\end{center}
-\graphspace
-
-There are many more potential uses for semantic conditions. The user is free to
-use arbitrary code and may therefore perform actions such as looking up names
-in dictionaries, validating input using external parsing mechanisms or
-performing checks on the semantic structure of input seen so far. In the next
-section we describe how Ragel accommodates several common parser engineering
-problems.
-
-The semantic condition feature works only with alphabet types that are smaller
-in width than the \verb|long| type. To implement semantic conditions Ragel
-needs to be able to allocate characters from the alphabet space. Ragel uses
-these allocated characters to express "character C with condition P true" or "C
-with P false." Since internally Ragel uses longs to store characters there is
-no room left in the alphabet space unless an alphabet type smaller than long is
-used.
-
-\section{Implementing Lookahead}
-
-There are a few strategies for implementing lookahead in Ragel programs.
-Leaving actions, which are described in Section \ref{out-actions}, can be
-used as a form of lookahead.  Ragel also provides the \verb|fhold| directive
-which can be used in actions to prevent the machine from advancing over the
-current character. It is also possible to manually adjust the current character
-position by shifting it backwards using \verb|fexec|, however when this is
-done, care must be taken not to overstep the beginning of the current buffer
-block. In both the use of \verb|fhold| and \verb|fexec| the user must be
-cautious of combining the resulting machine with another in such a way that the
-transition on which the current position is adjusted is not combined with a
-transition from the other machine.
-
-\section{Parsing Recursive Language Structures}
-
-In general Ragel cannot handle recursive structures because the grammar is
-interpreted as a regular language. However, depending on what needs to be
-parsed it is sometimes practical to implement the recursive parts using manual
-coding techniques. This often works in cases where the recursive structures are
-simple and easy to recognize, such as in the balancing of parentheses
-
-One approach to parsing recursive structures is to use actions that increment
-and decrement counters or otherwise recognize the entry to and exit from
-recursive structures and then jump to the appropriate machine defnition using
-\verb|fcall| and \verb|fret|. Alternatively, semantic conditions can be used to
-test counter variables.
-
-A more traditional approach is to call a separate parsing function (expressed
-in the host language) when a recursive structure is entered, then later return
-when the end is recognized.
-
-\end{document}
diff --git a/test/repeat2.in b/test/repeat2.in
deleted file mode 100644
index 7ebfdda3..00000000
--- a/test/repeat2.in
+++ /dev/null
@@ -1,3307 +0,0 @@
-.title Ragel State Machine Compiler
-
-.subtitle User Guide
-
-.author Adrian Thurston
-
-.license
-
-This document is part of Ragel, and as such, this document is
-released under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 2 of the License, or (at your option)
-any later version.
-
-Ragel is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
-details.
-
-You should have received a copy of the GNU General Public
-License along with Ragel; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-.chapter Introduction
-
-.section Abstract
-
-Regular expressions are used heavily in practice for the purpose of specifying
-parsers. They are normally used as black boxes linked together with program
-logic.  User actions are executed in between invocations of the regular
-expression engine. Adding actions before a pattern terminates requires patterns
-to be broken and pasted back together with program logic. The more user actions
-are needed, the less the advantages of regular expressions are seen. 
-
-Ragel is a software development tool that allows user actions to be 
-embedded into the transitions of a regular expression's corresponding state
-machine, eliminating the need to switch from the regular expression engine and
-user code execution environment and back again. As a result, expressions can be
-maximally continuous.  One is free to specify an entire parser using a single
-regular expression.  The single-expression model affords concise and elegant
-descriptions of languages and the generation of very simple, fast and robust
-code.  Ragel compiles executable finite state machines from a high level regular language
-notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby.
-
-In addition to building state machines from regular expressions, Ragel allows
-the programmer to directly specify state machines with state charts. These two
-notations may be freely combined. There are also facilities for controlling
-nondeterminism in the resulting machines and building scanners using patterns
-that themselves have embedded actions. Ragel can produce code that is small and
-runs very fast. Ragel can handle integer-sized alphabets and can compile very
-large state machines.
-
-.section Motivation
-
-When a programmer is faced with the task of producing a parser for a
-context-free language there are many tools to choose from. It is quite common
-to generate useful and efficient parsers for programming languages from a
-formal grammar. It is also quite common for programmers to avoid such tools
-when making parsers for simple computer languages, such as file formats and
-communication protocols.  Such languages are often regular and tools for
-processing the context-free languages are viewed as too heavyweight for the
-purpose of parsing regular languages. The extra run-time effort required for
-supporting the recursive nature of context-free languages is wasted.
-
-When we turn to the regular expression-based parsing tools, such as Lex, Re2C,
-and scripting languages such as Sed, Awk and Perl we find that they are split
-into two levels: a regular expression matching engine and some kind of program
-logic for linking patterns together.  For example, a Lex program is composed of
-sets of regular expressions. The implied program logic repeatedly attempts to
-match a pattern in the current set. When a match is found the associated user
-code executed. It requires the user to consider a language as a sequence of
-independent tokens. Scripting languages and regular expression libraries allow
-one to link patterns together using arbitrary program code.  This is very
-flexible and powerful, however we can be more concise and clear if we avoid
-gluing together regular expressions with if statements and while loops.
-
-This model of execution, where the runtime alternates between regular
-expression matching and user code exectution places restrictions on when
-action code may be executed. Since action code can only be associated with
-complete patterns, any action code that must be executed before an entire
-pattern is matched requires that the pattern be broken into smaller units.
-Instead of being forced to disrupt the regular expression syntax and write
-smaller expressions, it is desirable to retain a single expression and embed
-code for performing actions directly into the transitions that move over the
-characters. After all, capable programmers are astutely aware of the machinery
-underlying their programs, so why not provide them with access to that
-machinery? To achieve this we require an action execution model for associating
-code with the sub-expressions of a regular expression in a way that does not
-disrupt its syntax.
-
-The primary goal of Ragel is to provide developers with an ability to embed
-actions into the transitions and states of a regular expression's state machine
-in support of the definition of entire parsers or large sections of parsers
-using a single regular expression.  From the regular expression we gain a clear
-and concise statement of our language. From the state machine we obtain a very
-fast and robust executable that lends itself to many kinds of analysis and
-visualization.
-
-.section Overview
-
-Ragel is a language for specifying state machines. The Ragel program is a
-compiler that assembles a state machine definition to executable code.  Ragel
-is based on the principle that any regular language can be converted to a
-deterministic finite state automaton. Since every regular language has a state
-machine representation and vice versa, the terms regular language and state
-machine (or just machine) will be used interchangeably in this document.
-
-Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is
-designed to be generic and is not bound to any particular input or processing
-method. A Ragel machine expects to have data passed to it in buffer blocks.
-When there is no more input, the machine can be queried for acceptance.  In
-this way, a Ragel machine can be used to simply recognize a regular language
-like a regular expression library. By embedding code into the regular language,
-a Ragel machine can also be used to parse input.
-
-The Ragel language has many operators for constructing and manipulating
-machines. Machines are built up from smaller machines, to bigger ones, to the
-final machine representing the language that needs to be recognized or parsed.
-
-The core state machine construction operators are those found in most theory
-of computation textbooks. They date back to the 1950s and are widely studied.
-They are based on set operations and permit one to think of languages as a set
-of strings. They are Union, Intersection, Difference, Concatenation and Kleene
-Star. Put together, these operators make up what most people know as regular
-expressions. Ragel also provides a scanner construction operator 
-and provides operators for explicitly constructing machines
-using a state chart method. In the state chart method, one joins machines
-together without any implied transitions and then explicitly specifies where
-epsilon transitions should be drawn.
-
-The state machine manipulation operators are specific to Ragel. They allow the
-programmer to access the states and transitions of regular language's
-corresponding machine. There are two uses of the manipulation operators. The
-first and primary use is to embed code into transitions and states, allowing
-the programmer to specify the actions of the state machine.
-
-Ragel attempts to make the action embedding facility as intuitive as possible.
-To do so, a number of issues need to be addressed.  For example, when making a
-nondeterministic specification into a DFA using machines that have embedded
-actions, new transitions are often made that have the combined actions of
-several source transitions. Ragel ensures that multiple actions associated with
-a single transition are ordered consistently with respect to the order of
-reference and the natural ordering implied by the construction operators.
-
-The second use of the manipulation operators is to assign priorities to
-transitions. Priorities provide a convenient way of controlling any
-nondeterminism introduced by the construction operators. Suppose two
-transitions leave from the same state and go to distinct target states on the
-same character. If these transitions are assigned conflicting priorities, then
-during the determinization process the transition with the higher priority will
-take precedence over the transition with the lower priority. The lower priority
-transition gets abandoned. The transitions would otherwise be combined into a new
-transition that goes to a new state that is a combination of the original
-target states. Priorities are often required for segmenting machines. The most
-common uses of priorities have been encoded into a set of simple operators
-that should be used instead of priority embeddings whenever possible.
-
-For the purposes of embedding, Ragel divides transitions and states into
-different classes. There are four operators for embedding actions and
-priorities into the transitions of a state machine. It is possible to embed
-into entering transitions, finishing transitions, all transitions and leaving
-transitions. The embedding into leaving transitions is a special case.
-These transition embeddings get stored in the final states of a machine.  They
-are transferred to any transitions that are made going out of the machine by
-future concatenation or kleene star operations.
-
-There are several more operators for embedding actions into states. Like the
-transition embeddings, there are various different classes of states that the
-embedding operators access. For example, one can access start states, final
-states or all states, among others. Unlike the transition embeddings, there are
-several different types of state action embeddings. These are executed at
-various different times during the processing of input. It is possible to embed
-actions that are exectued on transitions into a state, on transitions out of a
-state, on transitions taken on the error event, or on transitions taken on the
-EOF event.
-
-Within actions, it is possible to influence the behaviour of the state machine.
-The user can write action code that jumps or calls to another portion of the
-machine, changes the current character being processed, or breaks out of the
-processing loop. With the state machine calling feature Ragel can be used to
-parse languages that are not regular. For example, one can parse balanced
-parentheses by calling into a parser when an open parenthesis character is seen
-and returning to the state on the top of the stack when the corresponding
-closing parenthesis character is seen. More complicated context-free languages
-such as expressions in C are out of the scope of Ragel. 
-
-Ragel also provides a scanner construction operator that can be used to build
-scanners much the same way that Lex is used. The Ragel generated code, which
-relies on user-defined variables for backtracking, repeatedly tries to match
-patterns to the input, favouring longer patterns over shorter ones and patterns
-that appear ahead of others when the lengths of the possible matches are
-identical. When a pattern is matched the associated action is executed. 
-
-The key distinguishing feature between scanners in Ragel and scanners in Lex is
-that Ragel patterns may be arbitrary Ragel expressions and can therefore
-contain embedded code. With a Ragel-based scanner the user need not wait until
-the end of a pattern before user code can be executed.
-
-Scanners do take Ragel out of the domain of pure state machines and require the
-user to maintain the backtracking related variables.  However, scanners
-integrate well with regular state machine instantiations. They can be called to
-or jumped to only when needed, or they can be called out of or jumped out of
-when a simpler, pure state machine model is appropriate.
-
-Two types of output code style are available. Ragel can produce a table-driven
-machine or a directly executable machine. The directly executable machine is
-much faster than the table-driven. On the other hand, the table-driven machine
-is more compact and less demanding on the host language compiler. It is better
-suited to compiling large state machines.
-
-.section Related Work
-
-Lex is perhaps the best-known tool for constructing parsers from regular
-expressions. In the Lex processing model, generated code attempts to match one
-of the user's regular expression patterns, favouring longer matches over
-shorter ones. Once a match is made it then executes the code associated with
-the pattern and consumes the matching string.  This process is repeated until
-the input is fully consumed. 
-
-Through the use of start conditions, related sets of patterns may be defined.
-The active set may be changed at any time.  This allows the user to define
-different lexical regions. It also allows the user to link patterns together by
-requiring that some patterns come before others.  This is quite like a
-concatenation operation. However, use of Lex for languages that require a
-considerable amount of pattern concatenation is inappropriate. In such cases a
-Lex program deteriorates into a manually specified state machine, where start
-conditions define the states and pattern actions define the transitions.  Lex
-is therefore best suited to parsing tasks where the language to be parsed can
-be described in terms of regions of tokens. 
-
-Lex is useful in many scenarios and has undoubtedly stood the test of time.
-There are, however, several drawbacks to using Lex.  Lex can impose too much
-overhead for parsing applications where buffering is not required because all
-the characters are available in a single string.  In these cases there is
-structure to the language to be parsed and a parser specification tool can
-help, but employing a heavyweight processing loop that imposes a stream
-``pull'' model and dynamic input buffer allocation is inappropriate.  An
-example of this kind of scenario is the conversion of floating point numbers
-contained in a string to their corresponding numerical values.
-
-Another drawback is the very issue that Ragel attempts to solve.
-It is not possible to execute a user action while
-matching a character contained inside a pattern. For example, if scanning a
-programming language and string literals can contain newlines which must be
-counted, a Lex user must break up a string literal pattern so as to associate
-an action with newlines. This forces the definition of a new start condition.
-Alternatively the user can reprocess the text of the matched string literal to
-count newlines. 
-
-.comment
-
-How ragel is different from Lex.
-
-Like Re2c, Ragel provides a simple execution model that does not make any
-assumptions as to how the input is collected.  Also, Ragel does not do any
-buffering in the generated code. Consequently there are no dependencies on
-external functions such as .verb|malloc|. 
-
-If buffering is required it can be manually implemented by embedding actions
-that copy the current character to a buffer, or data can be passed to the
-parser using known block boundaries. If the longest-match operator is used,
-Ragel requires the user to ensure that the ending portion of the input buffer
-is preserved when the buffer is exhaused before a token is fully matched. The
-user should move the token prefix to a new memory location, such as back to the
-beginning of the input buffer, then place the subsequently read input
-immediately after the prefix.
-
-These properties of Ragel make it more work to write a program that requires
-the longest-match operator or buffering of input, however they make Ragel a
-more flexible tool that can produce very simple and fast-running programs under
-a variety of input acquisition arrangements.
-
-In Ragel, it is not necessary
-to introduce start conditions to concatenate tokens and retain action
-execution. Ragel allows one to structure a parser as a series of tokens, but
-does not require it.
-
-Like Lex and Re2C, Ragel is able to process input using a longest-match
-execution model, however the core of the Ragel language specifies parsers at a
-much lower level. This core is built around a pure state machine model. When
-building basic machines there is no implied algorithm for processing input
-other than to move from state to state on the transitions of the machine. This
-core of pure state machine operations makes Ragel well suited to handling
-parsing problems not based on token scanning. Should one need to use a
-longest-match model, the functionality is available and the lower level state
-machine construction facilities can be used to specify the patterns of a
-longest-match machine.
-
-This is not possible in Ragel. One can only program
-a longest-match instantiation with a fixed set of rules. One can jump to
-another longest-match machine that employs the same machine definitions in the
-construction of its rules, however no states will be shared.
-
-In Ragel, input may be re-parsed using a
-different machine, but since the action to be executed is associated with
-transitions of the compiled state machine, the longest-match construction does
-not permit a single rule to be excluded from the active set. It cannot be done
-ahead of time nor in the excluded rule's action.
-
-.end comment
-
-The Re2C program defines an input processing model similar to that of Lex.
-Re2C focuses on making generated state machines run very fast and
-integrate easily into any program, free of dependencies.  Re2C generates
-directly executable code and is able to claim that generated parsers run nearly
-as fast as their hand-coded equivalents.  This is very important for user
-adoption, as programmers are reluctant to use a tool when a faster alternative
-exists.  A consideration to ease of use is also important because developers
-need the freedom to integrate the generated code as they see fit. 
-
-Many scripting languages provide ways of composing parsers by linking regular
-expressions using program logic. For example, Sed and Awk are two established
-Unix scripting tools that allow the programmer to exploit regular expressions
-for the purpose of locating and extracting text of interest. High-level
-programming languages such as Perl, Python, PHP and Ruby all provide regular
-expression libraries that allow the user to combine regular expressions with
-arbitrary code.
-
-In addition to supporting the linking of regular expressions with arbitrary
-program logic, the Perl programming language permits the embedding of code into
-regular expressions. Perl embeddings do not translate into the embedding of
-code into deterministic state machines. Perl regular expressions are in fact
-not fully compiled to deterministic machines when embedded code is involved.
-They are instead interpreted and involve backtracking. This is shown by the
-following Perl program. When it is fed the input .verb|abcd| the interpretor
-attempts to match the first alternative, printing .verb|a1 b1|.  When this
-possibility fails it backtracks and tries the second possibility, printing
-.verb|a2 b2|, at which point it succeeds.
-
-.code
-print "YES\n" if ( <STDIN> =~
-        /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) |
-         ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x )
-.end code
-
-In Ragel there is no regular expression interpretor. Aside from the scanner
-operator, all Ragel expressions are made into deterministic machines and the
-run time simply moves from state to state as it consumes input. An equivalent
-parser expressed in Ragel would attempt both of the alternatives concurrently,
-printing .verb|a1 a2 b1 b2|.
-
-.section Development Status
-
-Ragel is a relatively new tool and is under continuous development. As a rough
-release guide, minor revision number changes are for implementation
-improvements and feature additions. Major revision number changes are for
-implementation and language changes that do not preserve backwards
-compatibility. Though in the past this has not always held true: changes that
-break code have crept into minor version number changes. Typically, the
-documentation lags behind the development in the interest of documenting only
-the lasting features. The latest changes are always documented in the ChangeLog
-file. 
-
-.chapter Constructing State Machines
-
-.section Ragel State Machine Specifications
-
-A Ragel input file consists of a program in the host language that contains embedded machine
-specifications.  Ragel normally passes input straight to output.  When it sees
-a machine specification it stops to read the Ragel statements and possibly generate
-code in place of the specification.
-Afterwards it continues to pass input through.  There
-can be any number of FSM specifications in an input file. A multi-line FSM spec
-starts with .verb|%%{| and ends with .verb|}%%|. A single-line FSM spec starts
-with .verb|%%| and ends at the first newline.  
-
-While Ragel is looking for FSM specifications it does basic lexical analysis on
-the surrounding input. It interprets literal strings and comments so a
-.verb|%%| sequence in either of those will not trigger the parsing of an FSM
-specification. Ragel does not pass the input through any preprocessor nor does it
-interpret preprocessor directives itself so includes, defines and ifdef logic
-cannot be used to alter the parse of a Ragel input file. It is therefore not
-possible to use an .verb|#if 0| directive to comment out a machine as is
-commonly done in C code. As an alternative, a machine can be prevented from
-causing any generated output by commenting out write statements.
-
-In Figure .ref{cmd-line-parsing}, a multi-line specification is used to define the
-machine and single line specifications are used to trigger the writing of the machine
-data and execution code.
-
-.figure cmd-line-parsing
-.multicols
-.verbatim
-#include <string.h>
-#include <stdio.h>
-
-%%{ 
-    machine foo;
-    main := 
-        ( 'foo' | 'bar' ) 
-        0 @{ res = 1; };
-}%%
-
-%% write data;
-.end verbatim
-.columnbreak
-.verbatim
-int main( int argc, char **argv )
-{
-    int cs, res = 0;
-    if ( argc > 1 ) {
-        char *p = argv[1];
-        char *pe = p + strlen(p) + 1;
-        %% write init;
-        %% write exec;
-    }
-    printf("result = %i\n", res );
-    return 0;
-}
-.end verbatim
-.end multicols
-.caption Parsing a command line argument.
-.end figure
-
-.subsection Naming Ragel Blocks
-
-.verbatim
-machine fsm_name;
-.end verbatim
-
-The .verb|machine| statement gives the name of the FSM. If present in a
-specification, this statement must appear first. If a machine specification
-does not have a name then Ragel uses the previous specification name.  If no
-previous specification name exists then this is an error. Because FSM
-specifications persist in memory, a machine's statements can be spread across
-multiple machine specifications.  This allows one to break up a machine across
-several files or draw in statements that are common to multiple machines using
-the .verb|include| statement.
-
-.subsection Machine Definition
-.label{definition}
-
-.verbatim
-<name> = <expression>;
-.end verbatim 
-
-The machine definition statement associates an FSM expression with a name. Machine
-expressions assigned to names can later be referenced in other expressions. A
-definition statement on its own does not cause any states to be generated. It is simply a
-description of a machine to be used later. States are generated only when a definition is
-instantiated, which happens when a definition is referenced in an instantiated
-expression. 
-
-.subsection Machine Instantiation
-.label{instantiation}
-
-.verbatim
-<name> := <expression>;
-.end verbatim
-
-The machine instantiation statement generates a set of states representing an
-expression. Each instantiation generates a distinct set of states.  The starting
-state of the instantiation is written in the data section of the generated code
-using the instantiation name.  If a machine named
-.verb|main| is instantiated, its start state is used as the
-specification's start state and is assigned to the .verb|cs| variable by the
-.verb|write init| command. If no .verb|main| machine is given, the start state
-of the last machine instantiation to appear is used as the specification's
-start state.
-
-From outside the execution loop, control may be passed to any machine by
-assigning the entry point to the .verb|cs| variable.  From inside the execution
-loop, control may be passed to any machine instantiation using .verb|fcall|,
-.verb|fgoto| or .verb|fnext| statements.
-
-.subsection Including Ragel Code
-
-.verbatim
-include FsmName "inputfile.rl";
-.end verbatim
-
-The .verb|include| statement can be used to draw in the statements of another FSM
-specification. Both the name and input file are optional, however at least one
-must be given. Without an FSM name, the given input file is searched for an FSM
-of the same name as the current specification. Without an input file the
-current file is searched for a machine of the given name. If both are present,
-the given input file is searched for a machine of the given name.
-
-Ragel searches for included files from the location of the current file.
-Additional directories can be added to the search path using the .verb|-I|
-option.
-
-.subsection Importing Definitions
-.label{import}
-
-.verbatim
-import "inputfile.h";
-.end verbatim
-
-The .verb|import| statement scrapes a file for sequences of tokens that match
-the following forms. Ragel treats these forms as state machine definitions.
-
-.list
-.li .verb|name '=' number|
-.li .verb|name '=' lit_string|
-.li .verb|'define' name number|
-.li .verb|'define' name lit_string|
-.end list
-
-If the input file is a Ragel program then tokens inside any Ragel
-specifications are ignored. See Section .ref{export} for a description of
-exporting machine definitions.
-
-Ragel searches for imported files from the location of the current file.
-Additional directories can be added to the search path using the .verb|-I|
-option.
-
-.section Lexical Analysis of a Ragel Block
-.label{lexing}
-
-Within a machine specification the following lexical rules apply to the input.
-
-.itemize
-
-.item The .verb|#| symbol begins a comment that terminates at the next newline.
-
-.item The symbols .verb|""|, .verb|''|, .verb|//|, .verb|[]| behave as the
-delimiters of literal strings. Within them, the following escape sequences 
-are interpreted: 
-
-.verb|    \0 \a \b \t \n \v \f \r|
-
-A backslash at the end of a line joins the following line onto the current. A
-backslash preceding any other character removes special meaning. This applies
-to terminating characters and to special characters in regular expression
-literals. As an exception, regular expression literals do not support escape
-sequences as the operands of a range within a list. See the bullet on regular
-expressions in Section .ref{basic}.
-
-.item The symbols .verb|{}| delimit a block of host language code that will be
-embedded into the machine as an action.  Within the block of host language
-code, basic lexical analysis of comments and strings is done in order to
-correctly find the closing brace of the block. With the exception of FSM
-commands embedded in code blocks, the entire block is preserved as is for
-identical reproduction in the output code.
-
-.item The pattern .verb|[+-]?[0-9]+| denotes an integer in decimal format.
-Integers used for specifying machines may be negative only if the alphabet type
-is signed. Integers used for specifying priorities may be positive or negative.
-
-.item The pattern .verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal
-format.
-
-.item The keywords are .verb|access|, .verb|action|, .verb|alphtype|,
-.verb|getkey|, .verb|write|, .verb|machine| and .verb|include|.
-
-.item The pattern .verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier.
-
-.comment
-.item The allowable symbols are:
-
-.verb/    ( ) ! ^ * ? + : -> - | & . , := = ; > @ $ % /\\
-.verb|    >/  $/  %/  </  @/  <>/ >!  $!  %!  <!  @!  <>!|\\
-.verb|    >^  $^  %^  <^  @^  <>^ >~  $~  %~  <~  @~  <>~|\\
-.verb|    >*  $*  %*  <*  @*  <>*|
-.end comment
-
-.item Any amount of whitespace may separate tokens.
-
-.end itemize
-
-.comment
-.section Parse of an FSM Specification
-
-The following statements are possible within an FSM specification. The
-requirements for trailing semicolons loosely follow that of C. 
-A block
-specifying code does not require a trailing semicolon. An expression
-statement does require a trailing semicolon.
-.end comment
-
-.section Basic Machines
-.label{basic}
-
-The basic machines are the base operands of regular language expressions. They
-are the smallest unit to which machine construction and manipulation operators
-can be applied.
-
-.itemize
-
-.item .verb|'hello'| -- Concatenation Literal. Produces a machine that matches
-the sequence of characters in the quoted string. If there are 5 characters
-there will be 6 states chained together with the characters in the string. See
-Section .ref{lexing} for information on valid escape sequences. 
-
-.comment
-% GENERATE: bmconcat
-% OPT: -p
-% %%{
-% machine bmconcat;
-.verbatim
-main := 'hello';
-.end verbatim
-% }%%
-% END GENERATE
-.end comment
-
-.graphic bmconcat
-
-It is possible
-to make a concatenation literal case-insensitive by appending an .verb|i| to
-the string, for example .verb|'cmd'i|.
-
-.item .verb|"hello"| -- Identical to the single quoted version.
-
-.item .verb|[hello]| -- Or Expression. Produces a union of characters.  There
-will be two states with a transition for each unique character between the two states.
-The .verb|[]| delimiters behave like the quotes of a literal string. For example, 
-.verb|[ \t]| means tab or space. The .verb|or| expression supports character ranges
-with the .verb|-| symbol as a separator. The meaning of the union can be negated
-using an initial .verb|^| character as in standard regular expressions. 
-See Section .ref{lexing} for information on valid escape sequences
-in .verb|or| expressions.
-
-.comment
-% GENERATE: bmor
-% OPT: -p
-% %%{
-% machine bmor;
-.verbatim
-main := [hello];
-.end verbatim
-% }%%
-% END GENERATE
-.end comment
-
-.graphic bmor
-
-.item .verb|''|, .verb|""|, and .verb|[]| -- Zero Length Machine.  Produces a machine
-that matches the zero length string. Zero length machines have one state that is both
-a start state and a final state.
-
-.comment
-% GENERATE: bmnull
-% OPT: -p
-% %%{
-% machine bmnull;
-.verbatim
-main := '';
-.end verbatim
-% }%%
-% END GENERATE
-.end comment
-
-.graphic bmnull
-
-% FIXME: More on the range of values here.
-.item .verb|42| -- Numerical Literal. Produces a two state machine with one
-transition on the given number. The number may be in decimal or hexadecimal
-format and should be in the range allowed by the alphabet type. The minimum and
-maximum values permitted are defined by the host machine that Ragel is compiled
-on. For example, numbers in a .verb|short| alphabet on an i386 machine should
-be in the range .verb|-32768| to .verb|32767|.
-
-.comment
-% GENERATE: bmnum
-% %%{
-% machine bmnum;
-.verbatim
-main := 42;
-.end verbatim
-% }%%
-% END GENERATE
-.end comment
-
-.graphic bmnum
-
-.item .verb|/simple_regex/| -- Regular Expression. Regular expressions are
-parsed as a series of expressions that are concatenated together. Each
-concatenated expression
-may be a literal character, the ``any'' character specified by the .verb|.|
-symbol, or a union of characters specified by the .verb|[]| delimiters. If the
-first character of a union is .verb|^| then it matches any character not in the
-list. Within a union, a range of characters can be given by separating the first
-and last characters of the range with the .verb|-| symbol. Each
-concatenated machine may have repetition specified by following it with the
-.verb|*| symbol. The standard escape sequences described in Section
-.ref{lexing} are supported everywhere in regular expressions except as the
-operands of a range within in a list. This notation also supports the .verb|i|
-trailing option. Use it to produce case-insensitive machines, as in .verb|/GET/i|.
-
-Ragel does not support very complex regular expressions because the desired
-results can always be achieved using the more general machine construction
-operators listed in Section .ref{machconst}. The following diagram shows the
-result of compiling .verb|/ab*[c-z].*[123]/|. .verb|DEF| represents the default
-transition, which is taken if no other transition can be taken. 
-
-.comment
-% GENERATE: bmregex
-% OPT: -p
-% %%{
-% machine bmregex;
-.verbatim
-main := /ab*[c-z].*[123]/;
-.end verbatim
-% }%%
-% END GENERATE
-.end comment
-
-.graphic bmregex
-
-.item .verb|'a' .. 'z'| -- Range. Produces a machine that matches any
-characters in the specified range.  Allowable upper and lower bounds of the
-range are concatenation literals of length one and numerical literals.  For
-example, .verb|0x10..0x20|, .verb|0..63|, and .verb|'a'..'z'| are valid ranges.
-The bounds should be in the range allowed by the alphabet type.
-
-.comment
-% GENERATE: bmrange
-% OPT: -p
-% %%{
-% machine bmrange;
-.verbatim
-main := 'a' .. 'z';
-.end verbatim
-% }%%
-% END GENERATE
-.end comment
-
-.graphic bmrange
-
-.item .verb|variable_name| -- Lookup the machine definition assigned to the
-variable name given and use an instance of it. See Section .ref{definition} for
-an important note on what it means to reference a variable name.
-
-.item .verb|builtin_machine| -- There are several built-in machines available
-for use. They are all two state machines for the purpose of matching common
-classes of characters. They are:
-
-.itemize
-
-.item .verb|any   | -- Any character in the alphabet.
-
-.item .verb|ascii | -- Ascii characters. .verb|0..127|
-
-.item .verb|extend| -- Ascii extended characters. This is the range
-.verb|-128..127| for signed alphabets and the range .verb|0..255| for unsigned
-alphabets.
-
-.item .verb|alpha | -- Alphabetic characters. .verb|[A-Za-z]|
-
-.item .verb|digit | -- Digits. .verb|[0-9]|
-
-.item .verb|alnum | -- Alpha numerics. .verb|[0-9A-Za-z]|
-
-.item .verb|lower | -- Lowercase characters. .verb|[a-z]|
-
-.item .verb|upper | -- Uppercase characters. .verb|[A-Z]|
-
-.item .verb|xdigit| -- Hexadecimal digits. .verb|[0-9A-Fa-f]|
-
-.item .verb|cntrl | -- Control characters. .verb|0..31|
-
-.item .verb|graph | -- Graphical characters. .verb|[!-~]|
-
-.item .verb|print | -- Printable characters. .verb|[ -~]|
-
-.item .verb|punct | -- Punctuation. Graphical characters that are not alphanumerics.
-.verb|[!-/:-@[-`{-~]|
-
-.item .verb|space | -- Whitespace. .verb|[\t\v\f\n\r ]|
-
-.item .verb|zlen  | -- Zero length string. .verb|""|
-
-.item .verb|empty | -- Empty set. Matches nothing. .verb|^any|
-
-.end itemize
-.end itemize
-
-.section Operator Precedence
-The following table shows operator precedence from lowest to highest. Operators
-in the same precedence group are evaluated from left to right.
-
-.tabular
-.row 1&.verb| , |&Join
-.row 2&.verb/ | & - --/&Union, Intersection and Subtraction
-.row 3&.verb| . <: :> :>> |&Concatenation
-.row 4&.verb| : |&Label
-.row 5&.verb| -> |&Epsilon Transition
-.row 6&.verb| >  @  $  % |&Transitions Actions and Priorities
-.row 6&.verb| >/  $/  %/  </  @/  <>/ |&EOF Actions
-.row 6&.verb| >!  $!  %!  <!  @!  <>! |&Global Error Actions
-.row 6&.verb| >^  $^  %^  <^  @^  <>^ |&Local Error Actions
-.row 6&.verb| >~  $~  %~  <~  @~  <>~ |&To-State Actions
-.row 6&.verb| >*  $*  %*  <*  @*  <>* |&From-State Action
-.row 7&.verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition
-.row 8&.verb| ! ^ |&Negation and Character-Level Negation
-.row 9&.verb| ( <expr> ) |&Grouping
-.end tabular
-
-.section Regular Language Operators
-.label{machconst}
-
-When using Ragel it is helpful to have a sense of how it constructs machines.
-The determinization process can produce results that seem unusual to someone
-not familiar with the NFA to DFA conversion algorithm. In this section we
-describe Ragel's state machine operators. Though the operators are defined
-using epsilon transitions, it should be noted that this is for discussion only.
-The epsilon transitions described in this section do not persist, but are
-immediately removed by the determinization process which is executed at every
-operation. Ragel does not make use of any nondeterministic intermediate state
-machines. 
-
-To create an epsilon transition between two states .verb|x| and .verb|y| is to
-copy all of the properties of .verb|y| into .verb|x|. This involves drawing in
-all of .verb|y|'s to-state actions, EOF actions, etc., in addition to its
-transitions. If .verb|x| and .verb|y| both have a transition out on the same
-character, then the transitions must be combined.  During transition
-combination a new transition is made that goes to a new state that is the
-combination of both target states. The new combination state is created using
-the same epsilon transition method.  The new state has an epsilon transition
-drawn to all the states that compose it. Since the creation of new epsilon
-transitions may be triggered every time an epsilon transition is drawn, the
-process of drawing epsilon transitions is repeated until there are no more
-epsilon transitions to be made.
-
-A very common error that is made when using Ragel is to make machines that do
-too much. That is, to create machines that have unintentional
-nondetermistic properties. This usually results from being unaware of the common strings
-between machines that are combined together using the regular language
-operators. This can involve never leaving a machine, causing its actions to be
-propagated through all the following states. Or it can involve an alternation
-where both branches are unintentionally taken simultaneously.
-
-This problem forces one to think hard about the language that needs to be
-matched. To guard against this kind of problem one must ensure that the machine
-specification is divided up using boundaries that do not allow ambiguities from
-one portion of the machine to the next. See Chapter
-.ref{controlling-nondeterminism} for more on this problem and how to solve it.
-
-The Graphviz tool is an immense help when debugging improperly compiled
-machines or otherwise learning how to use Ragel. Graphviz Dot files can be
-generated from Ragel programs using the .verb|-V| option. See Section
-.ref{visualization} for more information.
-
-
-.subsection Union
-
-.verb/expr | expr/
-
-The union operation produces a machine that matches any string in machine one
-or machine two. The operation first creates a new start state. Epsilon
-transitions are drawn from the new start state to the start states of both
-input machines.  The resulting machine has a final state set equivalent to the
-union of the final state sets of both input machines. In this operation, there
-is the opportunity for nondeterminism among both branches. If there are
-strings, or prefixes of strings that are matched by both machines then the new
-machine will follow both parts of the alternation at once. The union operation is
-shown below.
-
-.graphic opor 1.0
-
-The following example demonstrates the union of three machines representing
-common tokens.
-
-% GENERATE: exor
-% OPT: -p
-% %%{
-% machine exor;
-.code
-# Hex digits, decimal digits, or identifiers
-main := '0x' xdigit+ | digit+ | alpha alnum*;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exor
-
-.subsection Intersection
-
-.verb|expr & expr|
-
-Intersection produces a machine that matches any
-string that is in both machine one and machine two. To achieve intersection, a
-union is performed on the two machines. After the result has been made
-deterministic, any final state that is not a combination of final states from
-both machines has its final state status revoked. To complete the operation,
-paths that do not lead to a final state are pruned from the machine. Therefore,
-if there are any such paths in either of the expressions they will be removed
-by the intersection operator.  Intersection can be used to require that two
-independent patterns be simultaneously satisfied as in the following example.
-
-% GENERATE: exinter
-% OPT: -p
-% %%{
-% machine exinter;
-.code
-# Match lines four characters wide that contain 
-# words separated by whitespace.
-main :=
-    /[^\n][^\n][^\n][^\n]\n/* &
-    (/[a-z][a-z]*/ | [ \n])**;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exinter
-
-.subsection Difference
-
-.verb|expr - expr|
-
-The difference operation produces a machine that matches
-strings that are in machine one but are not in machine two. To achieve subtraction,
-a union is performed on the two machines. After the result has been made
-deterministic, any final state that came from machine two or is a combination
-of states involving a final state from machine two has its final state status
-revoked. As with intersection, the operation is completed by pruning any path
-that does not lead to a final state.  The following example demonstrates the
-use of subtraction to exclude specific cases from a set.
-
-% GENERATE: exsubtr
-% OPT: -p
-% %%{
-% machine exsubtr;
-.code
-# Subtract keywords from identifiers.
-main := /[a-z][a-z]*/ - ( 'for' | 'int' );
-.end code
-% }%%
-% END GENERATE
-
-.graphic exsubtr
-
-.subsection Strong Difference
-.label{strong_difference}
-
-.verb|expr -- expr|
-
-Strong difference produces a machine that matches any string of the first
-machine that does not have any string of the second machine as a substring. In
-the following example, strong subtraction is used to excluded .verb|CRLF| from
-a sequence. In the corresponding visualization, the label .verb|DEF| is short
-for default. The default transition is taken if no other transition can be
-taken.
-
-% GENERATE: exstrongsubtr
-% OPT: -p
-% %%{
-% machine exstrongsubtr;
-.code
-crlf = '\r\n';
-main := [a-z]+ ':' ( any* -- crlf ) crlf;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exstrongsubtr
-
-This operator is equivalent to the following.
-
-.verbatim
-expr - ( any* expr any* )
-.end verbatim
-
-.subsection Concatenation
-
-.verb|expr . expr|
-
-Concatenation produces a machine that matches all the strings in machine one followed by all
-the strings in machine two.  Concatenation draws epsilon transitions from the
-final states of the first machine to the start state of the second machine. The
-final states of the first machine lose their final state status, unless the
-start state of the second machine is final as well. 
-Concatenation is the default operator. Two machines next to each other with no
-operator between them results in concatenation.
-
-.graphic opconcat 1.0
-
-The opportunity for nondeterministic behaviour results from the possibility of
-the final states of the first machine accepting a string that is also accepted
-by the start state of the second machine.
-The most common scenario in which this happens is the
-concatenation of a machine that repeats some pattern with a machine that gives
-a terminating string, but the repetition machine does not exclude the
-terminating string. The example in Section .ref{strong_difference}
-guards against this. Another example is the expression .verb|("'" any* "'")|.
-When executed the thread of control will
-never leave the .verb|any*| machine.  This is a problem especially if actions
-are embedded to process the characters of the .verb|any*| component.
-
-In the following example, the first machine is always active due to the
-nondeterministic nature of concatenation. This particular nondeterminism is intended
-however because we wish to permit EOF strings before the end of the input.
-
-% GENERATE: exconcat
-% OPT: -p
-% %%{
-% machine exconcat;
-.code
-# Require an eof marker on the last line.
-main := /[^\n]*\n/* . 'EOF\n';
-.end code
-% }%%
-% END GENERATE
-
-.graphic exconcat
-
-There is a language
-ambiguity involving concatenation and subtraction. Because concatenation is the 
-default operator for two
-adjacent machines there is an ambiguity between subtraction of
-a positive numerical literal and concatenation of a negative numerical literal.
-For example, .verb|(x-7)| could be interpreted as .verb|(x . -7)| or 
-.verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence
-over concatenation of a negative literal. We adhere to the rule that the default
-concatenation operator takes effect only when there are no other operators between
-two machines. Beware of writing machines such as .verb|(any -1)| when what is
-desired is a concatenation of .verb|any| and .verb|-1|. Instead write 
-.verb|(any . -1)| or .verb|(any (-1))|. If in doubt of the meaning of your program do not
-rely on the default concatenation operator; always use the .verb|.| symbol.
-
-
-.subsection Kleene Star
-
-.verb|expr*|
-
-The machine resulting from the Kleene Star operator will match zero or more
-repetitions of the machine it is applied to.
-It creates a new start state and an additional final
-state.  Epsilon transitions are drawn between the new start state and the old start
-state, between the new start state and the new final state, and
-between the final states of the machine and the new start state.  After the
-machine is made deterministic the effect is of the final states getting all the
-transitions of the start state. 
-
-.graphic opstar 1.0
-
-The possibility for nondeterministic behaviour arises if the final states have
-transitions on any of the same characters as the start state.  This is common
-when applying kleene star to an alternation of tokens. Like the other problems
-arising from nondeterministic behavior, this is discussed in more detail in Chapter
-.ref{controlling-nondeterminism}. This particular problem can also be solved
-by using the longest-match construction discussed in Section 
-.ref{generating-scanners} on scanners.
-
-In this 
-example, there is no nondeterminism introduced by the exterior kleene star due to
-the newline at the end of the regular expression. Without the newline the
-exterior kleene star would be redundant and there would be ambiguity between
-repeating the inner range of the regular expression and the entire regular
-expression. Though it would not cause a problem in this case, unnecessary
-nondeterminism in the kleene star operator often causes undesired results for
-new Ragel users and must be guarded against.
-
-% GENERATE: exstar
-% OPT: -p
-% %%{
-% machine exstar;
-.code
-# Match any number of lines with only lowercase letters.
-main := /[a-z]*\n/*;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exstar
-
-.subsection One Or More Repetition
-
-.verb|expr+|
-
-This operator produces the concatenation of the machine with the kleene star of
-itself. The result will match one or more repetitions of the machine. The plus
-operator is equivalent to .verb|(expr . expr*)|.  
-
-% GENERATE: explus
-% OPT: -p
-% %%{
-% machine explus;
-.code
-# Match alpha-numeric words.
-main := alnum+;
-.end code
-% }%%
-% END GENERATE
-
-.graphic explus
-
-.subsection Optional
-
-.verb|expr?|
-
-The .em{optional} operator produces a machine that accepts the machine
-given or the zero length string. The optional operator is equivalent to
-.verb/(expr | '' )/. In the following example the optional operator is used to
-possibly extend a token.
-
-% GENERATE: exoption
-% OPT: -p
-% %%{
-% machine exoption;
-.code
-# Match integers or floats.
-main := digit+ ('.' digit+)?;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exoption
-
-.subsection Repetition
-
-.list 
-.li .verb|expr {n}| -- Exactly N copies of expr.
-.li .verb|expr {,n}| -- Zero to N copies of expr.
-.li .verb|expr {n,}| -- N or more copies of expr.
-.li .verb|expr {n,m}| -- N to M copies of expr.
-.end list
-
-.subsection Negation
-
-.verb|!expr|
-
-Negation produces a machine that matches any string not matched by the given
-machine. Negation is equivalent to .verb|(any* - expr)|.
-
-% GENERATE: exnegate
-% OPT: -p
-% %%{
-% machine exnegate;
-.code
-# Accept anything but a string beginning with a digit.
-main := ! ( digit any* );
-.end code
-% }%%
-% END GENERATE
-
-.graphic exnegate
-
-.subsection Character-Level Negation
-
-.verb|^expr|
-
-Character-level negation produces a machine that matches any single character
-not matched by the given machine. Character-Level Negation is equivalent to
-.verb|(any - expr)|. It must be applied only to machines that match strings of
-length one.
-
-.section State Machine Minimization
-
-State machine minimization is the process of finding the minimal equivalent FSM accepting
-the language. Minimization reduces the number of states in machines
-by merging equivalent states. It does not change the behaviour of the machine
-in any way. It will cause some states to be merged into one because they are
-functionally equivalent. State minimization is on by default. It can be turned
-off with the .verb|-n| option.
-
-The algorithm implemented is similar to Hopcroft's state minimization
-algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in
-memory, whereas Ragel supports arbitrary integer alphabets that cannot be
-listed in memory. Though exact analysis is very difficult, Ragel minimization
-runs close to O(n * log(n)) and requires O(n) temporary storage where
-$n$ is the number of states.
-
-.section Visualization
-.label{visualization}
-
-%In many cases, practical
-%parsing programs will be too large to completely visualize with Graphviz.  The
-%proper approach is to reduce the language to the smallest subset possible that
-%still exhibits the characteristics that one wishes to learn about or to fix.
-%This can be done without modifying the source code using the .verb|-M| and
-%.verb|-S| options. If a machine cannot be easily reduced,
-%embeddings of unique actions can be very useful for tracing a
-%particular component of a larger machine specification, since action names are
-%written out on transition labels.
-
-Ragel is able to emit compiled state machines in Graphviz's Dot file format.
-This is done using the .verb|-V| option.
-Graphviz support allows users to perform
-incremental visualization of their parsers. User actions are displayed on
-transition labels of the graph. 
-
-If the final graph is too large to be
-meaningful, or even drawn, the user is able to inspect portions of the parser
-by naming particular regular expression definitions with the .verb|-S| and
-.verb|-M| options to the .verb|ragel| program. Use of Graphviz greatly
-improves the Ragel programming experience. It allows users to learn Ragel by
-experimentation and also to track down bugs caused by unintended
-nondeterminism.
-
-Ragel has another option to help debugging. The .verb|-x| option causes Ragel
-to emit the compiled machine in an XML format.
-
-.chapter User Actions
-
-Ragel permits the user to embed actions into the transitions of a regular
-expression's corresponding state machine. These actions are executed when the
-generated code moves over a transition.  Like the regular expression operators,
-the action embedding operators are fully compositional. They take a state
-machine and an action as input, embed the action and yield a new state machine
-that can be used in the construction of other machines. Due to the
-compositional nature of embeddings, the user has complete freedom in the
-placement of actions.
-
-A machine's transitions are categorized into four classes. The action embedding
-operators access the transitions defined by these classes.  The .em{entering
-transition} operator .verb|>| isolates the start state, then embeds an action
-into all transitions leaving it. The .em{finishing transition} operator
-.verb|@| embeds an action into all transitions going into a final state.  The
-.em{all transition} operator .verb|$| embeds an action into all transitions of
-an expression. The .em{leaving transition} operator .verb|%| provides access
-to the yet-unmade transitions moving out of the machine via the final states. 
-
-.section Embedding Actions
-
-.verbatim
-action ActionName {
-    /* Code an action here. */
-    count += 1;
-}
-.end verbatim
-
-The action statement defines a block of code that can be embedded into an FSM.
-Action names can be referenced by the action embedding operators in
-expressions. Though actions need not be named in this way (literal blocks
-of code can be embedded directly when building machines), defining reusable
-blocks of code whenever possible is good practice because it potentially increases the
-degree to which the machine can be minimized. 
-
-Within an action some Ragel expressions and statements are parsed and
-translated. These allow the user to interact with the machine from action code.
-See Section .ref{vals} for a complete list of statements and values available
-in code blocks. 
-
-.subsection Entering Action
-
-.verb|expr > action| 
-
-The entering action operator embeds an action into all transitions
-that enter into the machine from the start state. If the start state is final,
-then the action is also embedded into the start state as a leaving action. This
-means that if a machine accepts the zero-length string and control passes
-through the start state then the entering action is executed. Note
-that this can happen on both a following character and on the EOF event.
-
-In some machines the start state has transtions coming in from within the
-machine. In these cases the start state is first isolated from the rest of the
-machine ensuring that the entering actions are exected once only.
-
-% GENERATE: exstact
-% OPT: -p
-% %%{
-% machine exstact;
-.code
-# Execute A at the beginning of a string of alpha.
-action A {}
-main := ( lower* >A ) . ' ';
-.end code
-% }%%
-% END GENERATE
-
-.graphic exstact
-
-.subsection Finishing Action
-
-.verb|expr @ action|
-
-The finishing action operator embeds an action into any transitions that move
-the machine into a final state. Further input may move the machine out of the
-final state, but keep it in the machine. Therefore finishing actions may be
-executed more than once if a machine has any internal transitions out of a
-final state. In the following example the final state has no transitions out
-and the finishing action is executed only once.
-
-% GENERATE: exdoneact
-% OPT: -p
-% %%{
-% machine exdoneact;
-% action A {}
-.code
-# Execute A when the trailing space is seen.
-main := ( lower* ' ' ) @A;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exdoneact
-
-.subsection All Transition Action
-
-.verb|expr $ action|
-
-The all transition operator embeds an action into all transitions of a machine.
-The action is executed whenever a transition of the machine is taken. In the
-following example, A is executed on every character matched.
-
-% GENERATE: exallact
-% OPT: -p
-% %%{
-% machine exallact;
-% action A {}
-.code
-# Execute A on any characters of the machine.
-main := ( 'm1' | 'm2' ) $A;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exallact
-
-.subsection Leaving Actions
-.label{out-actions}
-
-.verb|expr % action|
-
-The leaving action operator queues an action for embedding into the transitions
-that go out of a machine via a final state. The action is first stored in
-the machine's final states and is later transferred to any transitions that are
-made going out of the machine by a kleene star or concatenation operation.
-
-If a final state of the machine is still final when compilation is complete
-then the leaving action is also embedded as an EOF action. Therefore, leaving
-the machine is defined as either leaving on a character or as state machine
-acceptance.
-
-This operator allows one to associate an action with the termination of a
-sequence, without being concerned about what particular character terminates
-the sequence. In the following example, A is executed when leaving the alpha
-machine on the newline character.
-
-% GENERATE: exoutact1
-% OPT: -p
-% %%{
-% machine exoutact1;
-% action A {}
-.code
-# Match a word followed by a newline. Execute A when 
-# finishing the word.
-main := ( lower+ %A ) . '\n';
-.end code
-% }%%
-% END GENERATE
-
-.graphic exoutact1
-
-In the following example, the .verb|term_word| action could be used to register
-the appearance of a word and to clear the buffer that the .verb|lower| action used
-to store the text of it.
-
-% GENERATE: exoutact2
-% OPT: -p
-% %%{
-% machine exoutact2;
-% action lower {}
-% action space {}
-% action term_word {}
-% action newline {}
-.code
-word = ( [a-z] @lower )+ %term_word;
-main := word ( ' ' @space word )* '\n' @newline;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exoutact2
-
-In this final example of the action embedding operators, A is executed upon entering
-the alpha machine, B is executed on all transitions of the
-alpha machine, C is executed when the alpha machine is exited by moving into the
-newline machine and N is executed when the newline machine moves into a final
-state.  
-
-% GENERATE: exaction
-% OPT: -p
-% %%{
-% machine exaction;
-% action A {}
-% action B {}
-% action C {}
-% action N {}
-.code
-# Execute A on starting the alpha machine, B on every transition 
-# moving through it and C upon finishing. Execute N on the newline.
-main := ( lower* >A $B %C ) . '\n' @N;
-.end code
-% }%%
-% END GENERATE
-
-.graphic exaction
-
-
-.section State Action Embedding Operators
-
-The state embedding operators allow one to embed actions into states. Like the
-transition embedding operators, there are several different classes of states
-that the operators access. The meanings of the symbols are similar to the
-meanings of the symbols used for the transition embedding operators. The design
-of the state selections was driven by a need to cover the states of an
-expression with exactly one error action.
-
-Unlike the transition embedding operators, the state embedding operators are
-also distinguished by the different kinds of events that embedded actions can
-be associated with. Therefore the state embedding operators have two
-components.  The first, which is the first one or two characters, specifies the
-class of states that the action will be embedded into. The second component
-specifies the type of event the action will be executed on. The symbols of the
-second component also have equivalent kewords. 
-
-.multicols
-The different classes of states are:
-
-.list
-.li .verb|> | -- the start state
-.li .verb|< | -- any state except the start state
-.li .verb|$ | -- all states
-.li .verb|% | -- final states
-.li .verb|@ | -- any state except final states
-.li .verb|<>| -- any except start and final (middle)
-.end list
-
-.columnbreak
-
-The different kinds of embeddings are:
-
-.list
-.li .verb|~| -- to-state actions (.verb|to|)
-.li .verb|*| -- from-state actions (.verb|from|)
-.li .verb|/| -- EOF actions (.verb|eof|)
-.li .verb|!| -- error actions (.verb|err|)
-.li .verb|^| -- local error actions (.verb|lerr|)
-.end list
-
-.end multicols
-
-.subsection To-State and From-State Actions
-
-.subsubsection To-State Actions
-
-.list
-.li .verb|>~action      >to(name)      >to{...} | -- the start state
-.li .verb|<~action      <to(name)      <to{...} | -- any state except the start state
-.li .verb|$~action      $to(name)      $to{...} | -- all states
-.li .verb|%~action      %to(name)      %to{...} | -- final states
-.li .verb|@~action      @to(name)      @to{...} | -- any state except final states
-.li .verb|<>~action     <>to(name)     <>to{...}| -- any except start and final (middle)
-.end list
-
-
-To-state actions are executed whenever the state machine moves into the
-specified state, either by a natural movement over a transition or by an
-action-based transfer of control such as .verb|fgoto|. They are executed after the
-in-transition's actions but before the current character is advanced and
-tested against the end of the input block. To-state embeddings stay with the
-state. They are irrespective of the state's current set of transitions and any
-future transitions that may be added in or out of the state.
-
-Note that the setting of the current state variable .verb|cs| outside of the
-execute code is not considered by Ragel as moving into a state and consequently
-the to-state actions of the new current state are not executed. This includes
-the initialization of the current state when the machine begins.  This is
-because the entry point into the machine execution code is after the execution
-of to-state actions.
-
-.subsubsection From-State Actions
-
-.list
-.li .verb|>*action     >from(name)     >from{...} | -- the start state
-.li .verb|<*action     <from(name)     <from{...} | -- any state except the start state
-.li .verb|$*action     $from(name)     $from{...} | -- all states
-.li .verb|%*action     %from(name)     %from{...} | -- final states
-.li .verb|@*action     @from(name)     @from{...} | -- any state except final states
-.li .verb|<>*action    <>from(name)    <>from{...}| -- any except start and final (middle)
-.end list
-
-From-state actions are executed whenever the state machine takes a transition from a
-state, either to itself or to some other state. These actions are executed
-immediately after the current character is tested against the input block end
-marker and before the transition to take is sought based on the current
-character. From-state actions are therefore executed even if a transition
-cannot be found and the machine moves into the error state.  Like to-state
-embeddings, from-state embeddings stay with the state.
-
-.subsection EOF Actions
-
-.list
-.li .verb|>/action     >eof(name)     >eof{...} | -- the start state
-.li .verb|</action     <eof(name)     <eof{...} | -- any state except the start state
-.li .verb|$/action     $eof(name)     $eof{...} | -- all states
-.li .verb|%/action     %eof(name)     %eof{...} | -- final states
-.li .verb|@/action     @eof(name)     @eof{...} | -- any state except final states
-.li .verb|<>/action    <>eof(name)    <>eof{...}| -- any except start and final (middle)
-.end list
-
-The EOF action embedding operators enable the user to embed actions that are
-executed at the end of the input stream. EOF actions are stored in states and
-generated in the .verb|write exec| block. They are run when .verb|p == pe == eof|
-as the execute block is finishing. EOF actions are free to adjust .verb|p| and
-jump to another part of the machine to restart execution.
-
-.subsection Handling Errors
-
-In many applications it is useful to be able to react to parsing errors.  The
-user may wish to print an error message that depends on the context.  It
-may also be desirable to consume input in an attempt to return the input stream
-to some known state and resume parsing. To support error handling and recovery,
-Ragel provides error action embedding operators. There are two kinds of error
-actions: global error actions and local error actions.
-Error actions can be used to simply report errors, or by jumping to a machine
-instantiation that consumes input, can attempt to recover from errors.  
-
-.subsubsection Global Error Actions
-
-.list
-.li .verb|>!action     >err(name)     >err{...} | -- the start state
-.li .verb|<!action     <err(name)     <err{...} | -- any state except the start state
-.li .verb|$!action     $err(name)     $err{...} | -- all states
-.li .verb|%!action     %err(name)     %err{...} | -- final states
-.li .verb|@!action     @err(name)     @err{...} | -- any state except final states
-.li .verb|<>!action    <>err(name)    <>err{...}| -- any except start and final (middle)
-.end list
-
-Global error actions are stored in the states they are embedded into until
-compilation is complete. They are then transferred to the transitions that move
-into the error state. These transitions are taken on all input characters that
-are not already covered by the state's transitions. If a state with an error
-action is not final when compilation is complete, then the action is also
-embedded as an EOF action.
-
-Error actions can be used to recover from errors by jumping back into the
-machine with .verb|fgoto| and optionally altering .verb|p|.
-
-.subsubsection Local Error Actions
-
-.list
-.li .verb|>^action     >lerr(name)     >lerr{...} | -- the start state
-.li .verb|<^action     <lerr(name)     <lerr{...} | -- any state except the start state
-.li .verb|$^action     $lerr(name)     $lerr{...} | -- all states
-.li .verb|%^action     %lerr(name)     %lerr{...} | -- final states
-.li .verb|@^action     @lerr(name)     @lerr{...} | -- any state except final states
-.li .verb|<>^action    <>lerr(name)    <>lerr{...}| -- any except start and final (middle)
-.end list
-
-Like global error actions, local error actions are also stored in the states
-they are embedded into until a transfer point. The transfer point is different
-however. Each local error action embedding is associated with a name. When a
-machine definition has been fully constructed, all local error action
-embeddings associated with the same name as the machine definition are
-transferred to the error transitions. At this time they are also embedded as
-EOF actions in the case of non-final states.
-
-Local error actions can be used to specify an action to take when a particular
-section of a larger state machine fails to match. A particular machine
-definition's ``thread'' may die and the local error actions executed, however
-the machine as a whole may continue to match input.
-
-There are two forms of local error action embeddings. In the first form the
-name defaults to the current machine. In the second form the machine name can
-be specified.  This is useful when it is more convenient to specify the local
-error action in a sub-definition that is used to construct the machine
-definition that the local error action is associated with. To embed local 
-error actions and
-explicitly state the machine definition on which the transfer is to happen use
-.verb|(name, action)| as the action.
-
-.subsubsection Example
-
-The following example uses error actions to report an error and jump to a
-machine that consumes the remainder of the line when parsing fails. After
-consuming the line, the error recovery machine returns to the main loop.
-
-% GENERATE: erract
-% %%{
-%   machine erract;
-%   ws = ' ';
-%   address = 'foo AT bar..com';
-%   date = 'Monday May 12';
-.code
-action cmd_err { 
-    printf( "command error\n" ); 
-    fhold; fgoto line;
-}
-action from_err { 
-    printf( "from error\n" ); 
-    fhold; fgoto line; 
-}
-action to_err { 
-    printf( "to error\n" ); 
-    fhold; fgoto line;
-}
-
-line := [^\n]* '\n' @{ fgoto main; };
-
-main := (
-    (
-        'from' @err(cmd_err) 
-            ( ws+ address ws+ date '\n' ) $err(from_err) |
-        'to' @err(cmd_err)
-            ( ws+ address '\n' ) $err(to_err)
-    ) 
-)*;
-.end code
-% }%%
-% %% write data;
-% void f()
-% {
-%   %% write init;
-%   %% write exec;
-% }
-% END GENERATE
-
-
-
-.section Action Ordering and Duplicates
-
-When combining expressions that have embedded actions it is often the case that
-a number of actions must be executed on a single input character. For example,
-following a concatenation the leaving action of the left expression and the
-entering action of the right expression will be embedded into one transition.
-This requires a method of ordering actions that is intuitive and
-predictable for the user, and repeatable for the compiler. 
-
-We associate with the embedding of each action a unique timestamp that is
-used to order actions that appear together on a single transition in the final
-state machine. To accomplish this we recursively traverse the parse tree of
-regular expressions and assign timestamps to action embeddings. References to
-machine definitions are followed in the traversal. When we visit a
-parse tree node we assign timestamps to all .em{entering} action embeddings,
-recurse on the parse tree, then assign timestamps to the remaining .em{all},
-.em{finishing}, and .em{leaving} embeddings in the order in which they
-appear.
-
-By default Ragel does not permit a single action to appear multiple times in an action
-list. When the final machine has been created, actions that appear more than
-once in a single transition, to-state, from-state or EOF action list have their
-duplicates removed.
-The first appearance of the action is preserved. This is useful in a number of
-scenarios. First, it allows us to union machines with common prefixes without
-worrying about the action embeddings in the prefix being duplicated. Second, it
-prevents leaving actions from being transferred multiple times. This can
-happen when a machine is repeated, then followed with another machine that
-begins with a common character. For example:
-
-.verbatim
-word = [a-z]+ %act;
-main := word ( '\n' word )* '\n\n';
-.end verbatim
-
-Note that Ragel does not compare action bodies to determine if they have
-identical program text. It simply checks for duplicates using each action
-block's unique location in the program.
-
-The removal of duplicates can be turned off using the .verb|-d| option.
-
-.section Values and Statements Available in Code Blocks
-.label{vals}
-
-The following values are available in code blocks:
-
-.itemize
-.item .verb|fpc| -- A pointer to the current character. This is equivalent to
-accessing the .verb|p| variable.
-
-.item .verb|fc| -- The current character. This is equivalent to the expression .verb|(*p)|.
-
-.item .verb|fcurs| -- An integer value representing the current state. This
-value should only be read from. To move to a different place in the machine
-from action code use the .verb|fgoto|, .verb|fnext| or .verb|fcall| statements.
-Outside of the machine execution code the .verb|cs| variable may be modified.
-
-.item .verb|ftargs| -- An integer value representing the target state. This
-value should only be read from. Again, .verb|fgoto|, .verb|fnext| and
-.verb|fcall| can be used to move to a specific entry point.
-
-.item .verb|fentry(<label>)| -- Retrieve an integer value representing the
-entry point .verb|label|. The integer value returned will be a compile time
-constant. This number is suitable for later use in control flow transfer
-statements that take an expression. This value should not be compared against
-the current state because any given label can have multiple states representing
-it. The value returned by .verb|fentry| can be any one of the multiple states that
-it represents.
-.end itemize
-
-The following statements are available in code blocks:
-
-.itemize
-
-.item .verb|fhold;| -- Do not advance over the current character. If processing
-data in multiple buffer blocks, the .verb|fhold| statement should only be used
-once in the set of actions executed on a character.  Multiple calls may result
-in backing up over the beginning of the buffer block. The .verb|fhold|
-statement does not imply any transfer of control. It is equivalent to the
-.verb|p--;| statement. 
-
-.item .verb|fexec <expr>;| -- Set the next character to process. This can be
-used to backtrack to previous input or advance ahead.
-Unlike .verb|fhold|, which can be used
-anywhere, .verb|fexec| requires the user to ensure that the target of the
-backtrack is in the current buffer block or is known to be somewhere ahead of
-it. The machine will continue iterating forward until .verb|pe| is arrived at,
-.verb|fbreak| is called or the machine moves into the error state. In actions
-embedded into transitions, the .verb|fexec| statement is equivalent to setting
-.verb|p| to one position ahead of the next character to process.  If the user
-also modifies .verb|pe|, it is possible to change the buffer block entirely.
-
-.item .verb|fgoto <label>;| -- Jump to an entry point defined by
-.verb|<label>|.  The .verb|fgoto| statement immediately transfers control to
-the destination state.
-
-.item .verb|fgoto *<expr>;| -- Jump to an entry point given by .verb|<expr>|.
-The expression must evaluate to an integer value representing a state.
-
-.item .verb|fnext <label>;| -- Set the next state to be the entry point defined
-by .verb|label|.  The .verb|fnext| statement does not immediately jump to the
-specified state. Any action code following the statement is executed.
-
-.item .verb|fnext *<expr>;| -- Set the next state to be the entry point given
-by .verb|<expr>|. The expression must evaluate to an integer value representing
-a state.
-
-.item .verb|fcall <label>;| -- Push the target state and jump to the entry
-point defined by .verb|<label>|.  The next .verb|fret| will jump to the target
-of the transition on which the call was made. Use of .verb|fcall| requires
-the declaration of a call stack. An array of integers named .verb|stack| and a
-single integer named .verb|top| must be declared. With the .verb|fcall|
-construct, control is immediately transferred to the destination state.
-See section .ref{modularization} for more information.
-
-.item .verb|fcall *<expr>;| -- Push the current state and jump to the entry
-point given by .verb|<expr>|. The expression must evaluate to an integer value
-representing a state.
-
-.item .verb|fret;| -- Return to the target state of the transition on which the
-last .verb|fcall| was made.  Use of .verb|fret| requires the declaration of a
-call stack. Control is immediately transferred to the destination state.
-
-.item .verb|fbreak;| -- Advance .verb|p|, save the target state to .verb|cs|
-and immediately break out of the execute loop. This statement is useful
-in conjunction with the .verb|noend| write option. Rather than process input
-until .verb|pe| is arrived at, the fbreak statement
-can be used to stop processing from an action.  After an .verb|fbreak|
-statement the .verb|p| variable will point to the next character in the input. The
-current state will be the target of the current transition. Note that .verb|fbreak|
-causes the target state's to-state actions to be skipped.
-
-.end itemize
-
-Once actions with control-flow commands are embedded into a
-machine, the user must exercise caution when using the machine as the operand
-to other machine construction operators. If an action jumps to another state
-then unioning any transition that executes that action with another transition
-that follows some other path will cause that other path to be lost. Using
-commands that manually jump around a machine takes us out of the domain of
-regular languages because transitions that the
-machine construction operators are not aware of are introduced.  These
-commands should therefore be used with caution.
-
-
-.chapter Controlling Nondeterminism
-.label{controlling-nondeterminism}
-
-Along with the flexibility of arbitrary action embeddings comes a need to
-control nondeterminism in regular expressions. If a regular expression is
-ambiguous, then sub-components of a parser other than the intended parts may become
-active. This means that actions that are irrelevant to the
-current subset of the parser may be executed, causing problems for the
-programmer.
-
-Tools that are based on regular expression engines and that are used for
-recognition tasks will usually function as intended regardless of the presence
-of ambiguities. It is quite common for users of scripting languages to write
-regular expressions that are heavily ambiguous and it generally does not
-matter. As long as one of the potential matches is recognized, there can be any
-number of other matches present.  In some parsing systems the run-time engine
-can employ a strategy for resolving ambiguities, for example always pursuing
-the longest possible match and discarding others.
-
-In Ragel, there is no regular expression run-time engine, just a simple state
-machine execution model. When we begin to embed actions and face the
-possibility of spurious action execution, it becomes clear that controlling
-nondeterminism at the machine construction level is very important. Consider
-the following example.
-
-% GENERATE: lines1
-% OPT: -p
-% %%{
-% machine lines1;
-% action first {}
-% action tail {}
-% word = [a-z]+;
-.code
-ws = [\n\t ];
-line = word $first ( ws word $tail )* '\n';
-lines = line*;
-.end code
-% main := lines;
-% }%%
-% END GENERATE
-
-.graphic lines1 0.53
-
-Since the .verb|ws| expression includes the newline character, we will
-not finish the .verb|line| expression when a newline character is seen. We will
-simultaneously pursue the possibility of matching further words on the same
-line and the possibility of matching a second line. Evidence of this fact is 
-in the state tables. On several transitions both the .verb|first| and
-.verb|tail| actions are executed.  The solution here is simple: exclude
-the newline character from the .verb|ws| expression. 
-
-% GENERATE: lines2
-% OPT: -p
-% %%{
-% machine lines2;
-% action first {}
-% action tail {}
-% word = [a-z]+;
-.code
-ws = [\t ];
-line = word $first ( ws word $tail )* '\n';
-lines = line*;
-.end code
-% main := lines;
-% }%%
-% END GENERATE
-
-.graphic lines2
-
-Solving this kind of problem is straightforward when the ambiguity is created
-by strings that are a single character long.  When the ambiguity is created by
-strings that are multiple characters long we have a more difficult problem.
-The following example is an incorrect attempt at a regular expression for C
-language comments. 
-
-% GENERATE: comments1
-% OPT: -p
-% %%{
-% machine comments1;
-% action comm {}
-.code
-comment = '/*' ( any @comm )* '*/';
-main := comment ' ';
-.end code
-% }%%
-% END GENERATE
-
-.graphic comments1
-
-Using standard concatenation, we will never leave the .verb|any*| expression.
-We will forever entertain the possibility that a .verb|'*/'| string that we see
-is contained in a longer comment and that, simultaneously, the comment has
-ended.  The concatenation of the .verb|comment| machine with .verb|SP| is done
-to show this. When we match space, we are also still matching the comment body.
-
-One way to approach the problem is to exclude the terminating string
-from the .verb|any*| expression using set difference. We must be careful to
-exclude not just the terminating string, but any string that contains it as a
-substring. A verbose, but proper specification of a C comment parser is given
-by the following regular expression. 
-
-% GENERATE: comments2
-% OPT: -p
-% %%{
-% machine comments2;
-% action comm {}
-.code
-comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/';
-.end code
-% main := comment;
-% }%%
-% END GENERATE
-
-.graphic comments2
-
-Note that Ragel's strong subtraction operator .verb|--| can also be used here.
-In doing this subtraction we have phrased the problem of controlling non-determinism in
-terms of excluding strings common to two expressions that interact when
-combined.
-We can also phrase the problem in terms of the transitions of the state
-machines that implement these expressions. During the concatenation of
-.verb|any*| and .verb|'*/'| we will be making transitions that are composed of
-both the loop of the first expression and the final character of the second.
-At this time we want the transition on the .verb|'/'| character to take precedence
-over and disallow the transition that originated in the .verb|any*| loop.
-
-In another parsing problem, we wish to implement a lightweight tokenizer that we can
-utilize in the composition of a larger machine. For example, some HTTP headers
-have a token stream as a sub-language. The following example is an attempt
-at a regular expression-based tokenizer that does not function correctly due to
-unintended nondeterminism.
-
-% GENERATE: smallscanner
-% OPT: -p
-% %%{
-% machine smallscanner;
-% action start_str {}
-% action on_char {}
-% action finish_str {}
-.code
-header_contents = ( 
-    lower+ >start_str $on_char %finish_str | 
-    ' '
-)*;
-.end code
-% main := header_contents;
-% }%%
-% END GENERATE
-
-.graphic smallscanner
-
-In this case, the problem with using a standard kleene star operation is that
-there is an ambiguity between extending a token and wrapping around the machine
-to begin a new token. Using the standard operator, we get an undesirable
-nondeterministic behaviour. Evidence of this can be seen on the transition out
-of state one to itself.  The transition extends the string, and simultaneously,
-finishes the string only to immediately begin a new one.  What is required is
-for the
-transitions that represent an extension of a token to take precedence over the
-transitions that represent the beginning of a new token. For this problem
-there is no simple solution that uses standard regular expression operators.
-
-.section Priorities
-
-A priority mechanism was devised and built into the determinization
-process, specifically for the purpose of allowing the user to control
-nondeterminism.  Priorities are integer values embedded into transitions. When
-the determinization process is combining transitions that have different
-priorities, the transition with the higher priority is preserved and the
-transition with the lower priority is dropped.
-
-Unfortunately, priorities can have unintended side effects because their
-operation requires that they linger in transitions indefinitely. They must linger
-because the Ragel program cannot know when the user is finished with a priority
-embedding.  A solution whereby they are explicitly deleted after use is
-conceivable; however this is not very user-friendly.  Priorities were therefore
-made into named entities. Only priorities with the same name are allowed to
-interact.  This allows any number of priorities to coexist in one machine for
-the purpose of controlling various different regular expression operations and
-eliminates the need to ever delete them. Such a scheme allows the user to
-choose a unique name, embed two different priority values using that name
-and be confident that the priority embedding will be free of any side effects.
-
-In the first form of priority embedding the name defaults to the name of the machine
-definition that the priority is assigned in. In this sense priorities are by
-default local to the current machine definition or instantiation. Beware of
-using this form in a longest-match machine, since there is only one name for
-the entire set of longest match patterns. In the second form the priority's
-name can be specified, allowing priority interaction across machine definition
-boundaries.
-
-.itemize
-.item .verb|expr > int| -- Sets starting transitions to have priority int.
-.item .verb|expr @ int| -- Sets transitions that go into a final state to have priority int. 
-.item .verb|expr $ int| -- Sets all transitions to have priority int.
-.item .verb|expr % int| -- Sets leaving transitions to
-have priority int. When a transition is made going out of the machine (either
-by concatenation or kleene star) its priority is immediately set to the 
-leaving priority.  
-.end itemize
-
-The second form of priority assignment allows the programmer to specify the name
-to which the priority is assigned.
-
-.itemize
-.item .verb|expr > (name, int)| -- Starting transitions.
-.item .verb|expr @ (name, int)| -- Finishing transitions (into a final state).
-.item .verb|expr $ (name, int)| -- All transitions.
-.item .verb|expr % (name, int)| -- Leaving transitions.
-.end itemize
-
-.section Guarded Operators that Encapsulate Priorities
-
-Priority embeddings are a very expressive mechanism. At the same time they
-can be very confusing for the user. They force the user to imagine
-the transitions inside two interacting expressions and work out the precise
-effects of the operations between them. When we consider
-that this problem is worsened by the
-potential for side effects caused by unintended priority name collisions, we
-see that exposing the user to priorities is undesirable.
-
-Fortunately, in practice the use of priorities has been necessary only in a
-small number of scenarios.  This allows us to encapsulate their functionality
-into a small set of operators and fully hide them from the user. This is
-advantageous from a language design point of view because it greatly simplifies
-the design.  
-
-Going back to the C comment example, we can now properly specify
-it using a guarded concatenation operator which we call .em{finish-guarded
-concatenation}. From the user's point of view, this operator terminates the
-first machine when the second machine moves into a final state.  It chooses a
-unique name and uses it to embed a low priority into all
-transitions of the first machine. A higher priority is then embedded into the
-transitions of the second machine that enter into a final state. The following
-example yields a machine identical to the example in Section 
-.ref{controlling-nondeterminism}.
-
-.code
-comment = '/*' ( any @comm )* :>> '*/';
-.end code
-
-.graphic comments2
-
-Another guarded operator is .em{left-guarded concatenation}, given by the
-.verb|<:| compound symbol. This operator places a higher priority on all
-transitions of the first machine. This is useful if one must forcibly separate
-two lists that contain common elements. For example, one may need to tokenize a
-stream, but first consume leading whitespace.
-
-Ragel also includes a .em{longest-match kleene star} operator, given by the
-.verb|**| compound symbol. This 
-guarded operator embeds a high
-priority into all transitions of the machine. 
-A lower priority is then embedded into the leaving transitions.  When the
-kleene star operator makes the epsilon transitions from
-the final states into the new start state, the lower priority will be transferred
-to the epsilon transitions. In cases where following an epsilon transition
-out of a final state conflicts with an existing transition out of a final
-state, the epsilon transition will be dropped.
-
-Other guarded operators are conceivable, such as guards on union that cause one
-alternative to take precedence over another. These may be implemented when it
-is clear they constitute a frequently used operation.
-In the next section we discuss the explicit specification of state machines
-using state charts.
-
-.subsection Entry-Guarded Concatenation
-
-.verb|expr :> expr| 
-
-This operator concatenates two machines, but first assigns a low
-priority to all transitions
-of the first machine and a high priority to the starting transitions of the
-second machine. This operator is useful if from the final states of the first
-machine it is possible to accept the characters in the entering transitions of
-the second machine. This operator effectively terminates the first machine
-immediately upon starting the second machine, where otherwise they would be
-pursued concurrently. In the following example, entry-guarded concatenation is
-used to move out of a machine that matches everything at the first sign of an
-end-of-input marker.
-
-% GENERATE: entryguard
-% OPT: -p
-% %%{
-% machine entryguard;
-.code
-# Leave the catch-all machine on the first character of FIN.
-main := any* :> 'FIN';
-.end code
-% }%%
-% END GENERATE
-
-.graphic entryguard
-
-Entry-guarded concatenation is equivalent to the following:
-
-.verbatim
-expr $(unique_name,0) . expr >(unique_name,1)
-.end verbatim
-
-.subsection Finish-Guarded Concatenation
-
-.verb|expr :>> expr|
-
-This operator is
-like the previous operator, except the higher priority is placed on the final
-transitions of the second machine. This is useful if one wishes to entertain
-the possibility of continuing to match the first machine right up until the
-second machine enters a final state. In other words it terminates the first
-machine only when the second accepts. In the following example, finish-guarded
-concatenation causes the move out of the machine that matches everything to be
-delayed until the full end-of-input marker has been matched.
-
-% GENERATE: finguard
-% OPT: -p
-% %%{
-% machine finguard;
-.code
-# Leave the catch-all machine on the last character of FIN.
-main := any* :>> 'FIN';
-.end code
-% }%%
-% END GENERATE
-
-.graphic finguard
-
-Finish-guarded concatenation is equivalent to the following, with one
-exception. If the right machine's start state is final, the higher priority is
-also embedded into it as a leaving priority. This prevents the left machine
-from persisting via the zero-length string.
-
-.verbatim
-expr $(unique_name,0) . expr @(unique_name,1)
-.end verbatim
-
-.subsection Left-Guarded Concatenation
-
-.verb|expr <: expr| 
-
-This operator places
-a higher priority on the left expression. It is useful if you want to prefix a
-sequence with another sequence composed of some of the same characters. For
-example, one can consume leading whitespace before tokenizing a sequence of
-whitespace-separated words as in:
-
-% GENERATE: leftguard
-% OPT: -p
-% %%{
-% machine leftguard;
-% action alpha {}
-% action ws {}
-% action start {}
-% action fin {}
-.code
-main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*;
-.end code
-% }%%
-% END GENERATE
-
-.graphic leftguard
-
-Left-guarded concatenation is equivalent to the following:
-
-.verbatim
-expr $(unique_name,1) . expr >(unique_name,0)
-.end verbatim
-
-.subsection Longest-Match Kleene Star
-.label{longest_match_kleene_star}
-
-.verb|expr**| 
-
-This version of kleene star puts a higher priority on staying in the
-machine versus wrapping around and starting over. The LM kleene star is useful
-when writing simple tokenizers.  These machines are built by applying the
-longest-match kleene star to an alternation of token patterns, as in the
-following.
-
-% GENERATE: lmkleene
-% OPT: -p
-% %%{
-% machine exfinpri;
-% action A {}
-% action B {}
-.code
-# Repeat tokens, but make sure to get the longest match.
-main := (
-    lower ( lower | digit )* %A | 
-    digit+ %B | 
-    ' '
-)**;
-.end code
-% }%%
-% END GENERATE
-
-.graphic lmkleene
-
-If a regular kleene star were used the machine above would not be able to
-distinguish between extending a word and beginning a new one.  This operator is
-equivalent to:
-
-.verbatim
-( expr $(unique_name,1) %(unique_name,0) )*
-.end verbatim
-
-When the kleene star is applied, transitions that go out of the machine and
-back into it are made. These are assigned a priority of zero by the leaving 
-transition mechanism. This is less than the priority of one assigned to the
-transitions leaving the final states but not leaving the machine. When 
-these transitions clash on the same character, the 
-transition that stays in the machine takes precedence.  The transition
-that wraps around is dropped.
-
-Note that this operator does not build a scanner in the traditional sense
-because there is never any backtracking. To build a scanner with backtracking
-use the Longest-Match machine construction described in Section
-.ref{generating-scanners}.
-
-.chapter Interface to Host Program
-
-The Ragel code generator is very flexible. The generated code has no
-dependencies and can be inserted in any function, perhaps inside a loop if
-desired.  The user is responsible for declaring and initializing a number of
-required variables, including the current state and the pointer to the input
-stream. These can live in any scope. Control of the input processing loop is
-also possible: the user may break out of the processing loop and return to it
-at any time.
-
-In the case of the C, D, and Go host languages, Ragel is able to generate very
-fast-running code that implements state machines as directly executable code.
-Since very large files strain the host language compiler, table-based code
-generation is also supported. In the future we hope to provide a partitioned,
-directly executable format that is able to reduce the burden on the host
-compiler by splitting large machines across multiple functions.
-
-In the case of Java and Ruby, table-based code generation is the only code
-style supported. In the future this may be expanded to include other code
-styles.
-
-Ragel can be used to parse input in one block, or it can be used to parse input
-in a sequence of blocks as it arrives from a file or socket.  Parsing the input
-in a sequence of blocks brings with it a few responsibilities. If the parser
-utilizes a scanner, care must be taken to not break the input stream anywhere
-but token boundaries.  If pointers to the input stream are taken during
-parsing, care must be taken to not use a pointer that has been invalidated by
-movement to a subsequent block.  If the current input data pointer is moved
-backwards it must not be moved past the beginning of the current block.
-
-Figure .ref{basic-example} shows a simple Ragel program that does not have any
-actions. The example tests the first argument of the program against a number
-pattern and then prints the machine's acceptance status.
-
-.figure basic-example
-.verbatim
-#include <stdio.h>
-#include <string.h>
-%%{
-    machine foo;
-    write data;
-}%%
-int main( int argc, char **argv )
-{
-    int cs;
-    if ( argc > 1 ) {
-        char *p = argv[1];
-        char *pe = p + strlen( p );
-        %%{ 
-            main := [0-9]+ ( '.' [0-9]+ )?;
-
-            write init;
-            write exec;
-        }%%
-    }
-    printf("result = %i\n", cs >= foo_first_final );
-    return 0;
-}
-.end verbatim
-.caption A basic Ragel example without any actions.
-.end figure
-
-.section Variables Used by Ragel
-
-There are a number of variables that Ragel expects the user to declare. At a
-very minimum the .verb|cs|, .verb|p| and .verb|pe| variables must be declared.
-In Go, Java and Ruby code the .verb|data| variable must also be declared. If
-EOF actions are used then the .verb|eof| variable is required. If
-stack-based state machine control flow statements are used then the
-.verb|stack| and .verb|top| variables are required. If a scanner is declared
-then the .verb|act|, .verb|ts| and .verb|te| variables must be
-declared.
-
-.itemize
-
-.item .verb|cs| - Current state. This must be an integer and it should persist
-across invocations of the machine when the data is broken into blocks that are
-processed independently. This variable may be modified from outside the
-execution loop, but not from within.
-
-.item .verb|p| - Data pointer. In C/D code this variable is expected to be a
-pointer to the character data to process. It should be initialized to the
-beginning of the data block on every run of the machine. In Go, Java and Ruby it is
-used as an offset to .verb|data| and must be an integer. In this case it should
-be initialized to zero on every run of the machine.
-
-.item .verb|pe| - Data end pointer. This should be initialized to .verb|p| plus
-the data length on every run of the machine. In Go, Java and Ruby code this should
-be initialized to the data length.
-
-.item .verb|eof| - End of file pointer. This should be set to .verb|pe| when
-the buffer block being processed is the last one, otherwise it should be set to
-null. In Go, Java and Ruby code .verb|-1| must be used instead of null. If the EOF
-event can be known only after the final buffer block has been processed, then
-it is possible to set .verb|p = pe = eof| and run the execute block.
-
-.item .verb|data| - This variable is only required in Go, Java and Ruby code. It
-must be an array containting the data to process.
-
-.item .verb|stack| - This must be an array of integers. It is used to store
-integer values representing states. If the stack must resize dynamically the
-Pre-push and Post-Pop statements can be used to do this (Sections
-.ref{prepush} and .ref{postpop}).
-
-.item .verb|top| - This must be an integer value and will be used as an offset
-to .verb|stack|, giving the next available spot on the top of the stack.
-
-.item .verb|act| - This must be an integer value. It is a variable sometimes
-used by scanner code to keep track of the most recent successful pattern match.
-
-.item .verb|ts| - This must be a pointer to character data. In Go, Java and
-Ruby code this must be an integer. See Section .ref{generating-scanners} for
-more information.
-
-.item .verb|te| - Also a pointer to character data.
-
-.end itemize
-
-.section Alphtype Statement
-
-.verbatim
-alphtype unsigned int;
-.end verbatim
-
-The alphtype statement specifies the alphabet data type that the machine
-operates on. During the compilation of the machine, integer literals are
-expected to be in the range of possible values of the alphtype. The default
-is .verb|char| for all languages except Go where the default is .verb|byte|.
-
-.multicols
-C/C++/Objective-C:
-.verbatim
-          char      unsigned char      
-          short     unsigned short
-          int       unsigned int
-          long      unsigned long
-.end verbatim
-
-Go:
-.verbatim
-          byte
-          int8      uint8
-          int16     uint16
-          int32     uint32
-          int
-.end verbatim
-
-Ruby: 
-.verbatim
-          char 
-          int
-.end verbatim
-
-.columnbreak
-
-Java:
-.verbatim
-          char 
-          byte 
-          short 
-          int
-.end verbatim
-
-D:
-.verbatim
-          char 
-          byte      ubyte   
-          short     ushort 
-          wchar 
-          int       uint 
-          dchar
-.end verbatim
-
-.end multicols
-
-.section Getkey Statement
-
-.verbatim
-getkey fpc->id;
-.end verbatim
-
-This statement specifies to Ragel how to retrieve the current character from 
-from the pointer to the current element (.verb|p|). Any expression that returns
-a value of the alphabet type
-may be used. The getkey statement may be used for looking into element
-structures or for translating the character to process. The getkey expression
-defaults to .verb|(*p)|. In goto-driven machines the getkey expression may be
-evaluated more than once per element processed, therefore it should not incur a
-large cost nor preclude optimization.
-
-.section Access Statement
-
-.verbatim
-access fsm->;
-.end verbatim
-
-The access statement specifies how the generated code should
-access the machine data that is persistent across processing buffer blocks.
-This applies to all variables except .verb|p|, .verb|pe| and .verb|eof|. This includes
-.verb|cs|, .verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|.
-The access statement is useful if a machine is to be encapsulated inside a
-structure in C code. It can be used to give the name of
-a pointer to the structure.
-
-.section Variable Statement
-
-.verbatim
-variable p fsm->p;
-.end verbatim
-
-The variable statement specifies how to access a specific
-variable. All of the variables that are declared by the user and
-used by Ragel can be changed. This includes .verb|p|, .verb|pe|, .verb|eof|, .verb|cs|,
-.verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|.
-In Go, Ruby and Java code generation the .verb|data| variable can also be changed.
-
-.section Pre-Push Statement
-.label{prepush}
-
-.verbatim
-prepush { 
-    /* stack growing code */
-}
-.end verbatim
-
-The prepush statement allows the user to supply stack management code that is
-written out during the generation of fcall, immediately before the current
-state is pushed to the stack. This statement can be used to test the number of
-available spaces and dynamically grow the stack if necessary.
-
-.section Post-Pop Statement
-.label{postpop}
-
-.verbatim
-postpop { 
-    /* stack shrinking code */
-}
-.end verbatim
-
-The postpop statement allows the user to supply stack management code that is
-written out during the generation of fret, immediately after the next state is
-popped from the stack. This statement can be used to dynamically shrink the
-stack.
-
-.section Write Statement
-.label{write-statement}
-
-.verbatim
-write <component> [options];
-.end verbatim
-
-The write statement is used to generate parts of the machine. 
-There are seven
-components that can be generated by a write statement. These components make up the
-state machine's data, initialization code, execution code, and export definitions.
-A write statement may appear before a machine is fully defined.
-This allows one to write out the data first then later define the machine where
-it is used. An example of this is shown in Figure .ref{fbreak-example}.
-
-.subsection Write Data
-.verbatim
-write data [options];
-.end verbatim
-
-The write data statement causes Ragel to emit the constant static data needed
-by the machine. In table-driven output styles (see Section .ref{genout}) this
-is a collection of arrays that represent the states and transitions of the
-machine.  In goto-driven machines much less data is emitted. At the very
-minimum a start state .verb|name_start| is generated.  All variables written
-out in machine data have both the .verb|static| and .verb|const| properties and
-are prefixed with the name of the machine and an
-underscore. The data can be placed inside a class, inside a function, or it can
-be defined as global data.
-
-Two variables are written that may be used to test the state of the machine
-after a buffer block has been processed. The .verb|name_error| variable gives
-the id of the state that the machine moves into when it cannot find a valid
-transition to take. The machine immediately breaks out of the processing loop when
-it finds itself in the error state. The error variable can be compared to the
-current state to determine if the machine has failed to parse the input. If the
-machine is complete, that is from every state there is a transition to a proper
-state on every possible character of the alphabet, then no error state is required
-and this variable will be set to -1.
-
-The .verb|name_first_final| variable stores the id of the first final state.
-All of the machine's states are sorted by their final state status before
-having their ids assigned. Checking if the machine has accepted its input can
-then be done by checking if the current state is greater-than or equal to the
-first final state.
-
-Data generation has several options:
-
-.list
-.li .verb|noerror  | - Do not generate the integer variable that gives the id of the error state.
-.li .verb|nofinal  | - Do not generate the integer variable that gives the id of the first final state.
-.li .verb|noprefix | - Do not prefix the variable names with the name of the machine.
-.end list
-
-.figure fbreak-example
-.verbatim
-#include <stdio.h>
-%% machine foo;
-%% write data;
-int main( int argc, char **argv )
-{
-    int cs, res = 0;
-    if ( argc > 1 ) {
-        char *p = argv[1];
-        %%{ 
-            main := 
-                [a-z]+ 
-                0 @{ res = 1; fbreak; };
-            write init;
-            write exec noend;
-        }%%
-    }
-    printf("execute = %i\n", res );
-    return 0;
-}
-.end verbatim
-.caption Use of .tt{noend} write option and the .tt{fbreak} statement for
-processing a string.
-.end figure
-
-.subsection Write Start, First Final and Error
-
-.verbatim
-write start;
-write first_final;
-write error;
-.end verbatim
-
-These three write statements provide an alternative means of accessing the
-.verb|start|, .verb|first_final| and .verb|error| states. If there are many
-different machine specifications in one file it is easy to get the prefix for
-these wrong. This is especially true if the state machine boilerplate is
-frequently made by a copy-paste-edit process. These write statements allow the
-problem to be avoided. They can be used as follows:
-
-.verbatim
-/* Did parsing succeed? */
-if ( cs < %%{ write first_final; }%% ) {
-    result = ERR_PARSE_ERROR;
-    goto fail;
-}
-.end verbatim
-
-.subsection Write Init
-.verbatim
-write init [options];
-.end verbatim
-
-The write init statement causes Ragel to emit initialization code. This should
-be executed once before the machine is started. At a very minimum this sets the
-current state to the start state. If other variables are needed by the
-generated code, such as call stack variables or scanner management
-variables, they are also initialized here.
-
-The .verb|nocs| option to the write init statement will cause ragel to skip
-intialization of the cs variable. This is useful if the user wishes to use
-custom logic to decide which state the specification should start in.
-
-.subsection Write Exec
-.verbatim
-write exec [options];
-.end verbatim
-
-The write exec statement causes Ragel to emit the state machine's execution code.
-Ragel expects several variables to be available to this code. At a very minimum, the
-generated code needs access to the current character position .verb|p|, the ending
-position .verb|pe| and the current state .verb|cs| (though .verb|pe|
-can be omitted using the .verb|noend| write option).
-The .verb|p| variable is the cursor that the execute code will
-used to traverse the input. The .verb|pe| variable should be set up to point to one
-position past the last valid character in the buffer.
-
-Other variables are needed when certain features are used. For example using
-the .verb|fcall| or .verb|fret| statements requires .verb|stack| and
-.verb|top| variables to be defined. If a longest-match construction is used,
-variables for managing backtracking are required.
-
-The write exec statement has one option. The .verb|noend| option tells Ragel
-to generate code that ignores the end position .verb|pe|. In this
-case the user must explicitly break out of the processing loop using
-.verb|fbreak|, otherwise the machine will continue to process characters until
-it moves into the error state. This option is useful if one wishes to process a
-null terminated string. Rather than traverse the string to discover then length
-before processing the input, the user can break out when the null character is
-seen.  The example in Figure .ref{fbreak-example} shows the use of the
-.verb|noend| write option and the .verb|fbreak| statement for processing a string.
-
-.subsection Write Exports
-.label{export}
-
-.verbatim
-write exports;
-.end verbatim
-
-The export feature can be used to export simple machine definitions. Machine definitions
-are marked for export using the .verb|export| keyword.
-
-.verbatim
-export machine_to_export = 0x44;
-.end verbatim
-
-When the write exports statement is used these machines are 
-written out in the generated code. Defines are used for C and constant integers
-are used for D, Java and Ruby. See Section .ref{import} for a description of the
-import statement.
-
-.section Maintaining Pointers to Input Data
-
-In the creation of any parser it is not uncommon to require the collection of
-the data being parsed.  It is always possible to collect data into a growable
-buffer as the machine moves over it, however the copying of data is a somewhat
-wasteful use of processor cycles. The most efficient way to collect data from
-the parser is to set pointers into the input then later reference them.  This
-poses a problem for uses of Ragel where the input data arrives in blocks, such
-as over a socket or from a file. If a pointer is set in one buffer block but
-must be used while parsing a following buffer block, some extra consideration
-to correctness must be made.
-
-The scanner constructions exhibit this problem, requiring the maintenance
-code described in Section .ref{generating-scanners}. If a longest-match
-construction has been used somewhere in the machine then it is possible to
-take advantage of the required prefix maintenance code in the driver program to
-ensure pointers to the input are always valid. If laying down a pointer one can
-set .verb|ts| at the same spot or ahead of it. When data is shifted in
-between loops the user must also shift the pointer.  In this way it is possible
-to maintain pointers to the input that will always be consistent.
-
-.figure line-oriented
-.verbatim
-    int have = 0;
-    while ( 1 ) {
-        char *p, *pe, *data = buf + have;
-        int len, space = BUFSIZE - have;
-
-        if ( space == 0 ) { 
-            fprintf(stderr, "BUFFER OUT OF SPACE\n");
-            exit(1);
-        }
-
-        len = fread( data, 1, space, stdin );
-        if ( len == 0 )
-            break;
-
-        /* Find the last newline by searching backwards. */
-        p = buf;
-        pe = data + len - 1;
-        while ( *pe != '\n' && pe >= buf )
-            pe--;
-        pe += 1;
-
-        %% write exec;
-
-        /* How much is still in the buffer? */
-        have = data + len - pe;
-        if ( have > 0 )
-            memmove( buf, pe, have );
-
-        if ( len < space )
-            break;
-    }
-.end verbatim
-.caption An example of line-oriented processing.
-.end figure
-
-In general, there are two approaches for guaranteeing the consistency of
-pointers to input data. The first approach is the one just described;
-lay down a marker from an action,
-then later ensure that the data the marker points to is preserved ahead of
-the buffer on the next execute invocation. This approach is good because it
-allows the parser to decide on the pointer-use boundaries, which can be
-arbitrarily complex parsing conditions. A downside is that it requires any
-pointers that are set to be corrected in between execute invocations.
-
-The alternative is to find the pointer-use boundaries before invoking the execute
-routine, then pass in the data using these boundaries. For example, if the
-program must perform line-oriented processing, the user can scan backwards from
-the end of an input block that has just been read in and process only up to the
-first found newline. On the next input read, the new data is placed after the
-partially read line and processing continues from the beginning of the line.
-An example of line-oriented processing is given in Figure .ref{line-oriented}.
-
-.section Specifying the Host Language
-
-The .verb|ragel| program has a number of options for specifying the host
-language. The host-language options are:
-
-.itemize
-.item .verb|-C  | for C/C++/Objective-C code (default)
-.item .verb|-D  | for D code.
-.item .verb|-Z  | for Go code.
-.item .verb|-J  | for Java code.
-.item .verb|-R  | for Ruby code.
-.item .verb|-A  | for C\# code.
-.end itemize
-
-.section Choosing a Generated Code Style
-.label{genout}
-
-There are three styles of code output to choose from. Code style affects the
-size and speed of the compiled binary. Changing code style does not require any
-change to the Ragel program. There are two table-driven formats and a goto
-driven format.
-
-In addition to choosing a style to emit, there are various levels of action
-code reuse to choose from.  The maximum reuse levels (.verb|-T0|, .verb|-F0|
-and .verb|-G0|) ensure that no FSM action code is ever duplicated by encoding
-each transition's action list as static data and iterating
-through the lists on every transition. This will normally result in a smaller
-binary. The less action reuse options (.verb|-T1|, .verb|-F1| and .verb|-G1|)
-will usually produce faster running code by expanding each transition's action
-list into a single block of code, eliminating the need to iterate through the
-lists. This duplicates action code instead of generating the logic necessary
-for reuse. Consequently the binary will be larger. However, this tradeoff applies to
-machines with moderate to dense action lists only. If a machine's transitions
-frequently have less than two actions then the less reuse options will actually
-produce both a smaller and a faster running binary due to less action sharing
-overhead. The best way to choose the appropriate code style for your
-application is to perform your own tests.
-
-The table-driven FSM represents the state machine as constant static data. There are
-tables of states, transitions, indices and actions. The current state is
-stored in a variable. The execution is simply a loop that looks up the current
-state, looks up the transition to take, executes any actions and moves to the
-target state. In general, the table-driven FSM can handle any machine, produces
-a smaller binary and requires a less expensive host language compile, but
-results in slower running code.  Since the table-driven format is the most
-flexible it is the default code style.
-
-The flat table-driven machine is a table-based machine that is optimized for
-small alphabets. Where the regular table machine uses the current character as
-the key in a binary search for the transition to take, the flat table machine
-uses the current character as an index into an array of transitions. This is
-faster in general, however is only suitable if the span of possible characters
-is small.
-
-The goto-driven FSM represents the state machine using goto and switch
-statements. The execution is a flat code block where the transition to take is
-computed using switch statements and directly executable binary searches.  In
-general, the goto FSM produces faster code but results in a larger binary and a
-more expensive host language compile.
-
-The goto-driven format has an additional action reuse level (.verb|-G2|) that
-writes actions directly into the state transitioning logic rather than putting
-all the actions together into a single switch. Generally this produces faster
-running code because it allows the machine to encode the current state using
-the processor's instruction pointer. Again, sparse machines may actually
-compile to smaller binaries when .verb|-G2| is used due to less state and
-action management overhead. For many parsing applications .verb|-G2| is the
-preferred output format.
-
-.center
-
-Code Output Style Options
-
-.tabular
-.row .verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#
-.row .verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#
-.row .verb|-F0|&flat table-driven&C/D/Ruby/C\#
-.row .verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#
-.row .verb|-G0|&goto-driven&C/D/C\#
-.row .verb|-G1|&goto, expanded actions&C/D/C\#
-.row .verb|-G2|&goto, in-place actions&C/D/Go
-.end tabular
-.end center
-
-.chapter Beyond the Basic Model
-
-.section Parser Modularization
-.label{modularization}
-
-It is possible to use Ragel's machine construction and action embedding
-operators to specify an entire parser using a single regular expression. In
-many cases this is the desired way to specify a parser in Ragel. However, in
-some scenarios the language to parse may be so large that it is difficult to
-think about it as a single regular expression. It may also shift between distinct
-parsing strategies, in which case modularization into several coherent blocks
-of the language may be appropriate.
-
-It may also be the case that patterns that compile to a large number of states
-must be used in a number of different contexts and referencing them in each
-context results in a very large state machine. In this case, an ability to reuse
-parsers would reduce code size.
-
-To address this, distinct regular expressions may be instantiated and linked
-together by means of a jumping and calling mechanism. This mechanism is
-analogous to the jumping to and calling of processor instructions. A jump
-command, given in action code, causes control to be immediately passed to
-another portion of the machine by way of setting the current state variable. A
-call command causes the target state of the current transition to be pushed to
-a state stack before control is transferred.  Later on, the original location
-may be returned to with a return statement. In the following example, distinct
-state machines are used to handle the parsing of two types of headers.
-
-% GENERATE: call
-% %%{
-%   machine call;
-.code
-action return { fret; }
-action call_date { fcall date; }
-action call_name { fcall name; }
-
-# A parser for date strings.
-date := [0-9][0-9] '/' 
-        [0-9][0-9] '/' 
-        [0-9][0-9][0-9][0-9] '\n' @return;
-
-# A parser for name strings.
-name := ( [a-zA-Z]+ | ' ' )** '\n' @return;
-
-# The main parser.
-headers = 
-    ( 'from' | 'to' ) ':' @call_name | 
-    ( 'departed' | 'arrived' ) ':' @call_date;
-
-main := headers*;
-.end code
-% }%%
-% %% write data;
-% void f()
-% {
-%   %% write init;
-%   %% write exec;
-% }
-% END GENERATE
-
-Calling and jumping should be used carefully as they are operations that take
-one out of the domain of regular languages. A machine that contains a call or
-jump statement in one of its actions should be used as an argument to a machine
-construction operator only with considerable care. Since DFA transitions may
-actually represent several NFA transitions, a call or jump embedded in one
-machine can inadvertently terminate another machine that it shares prefixes
-with. Despite this danger, theses statements have proven useful for tying
-together sub-parsers of a language into a parser for the full language,
-especially for the purpose of modularizing code and reducing the number of
-states when the machine contains frequently recurring patterns.
-
-Section .ref{vals} describes the jump and call statements that are used to
-transfer control. These statements make use of two variables that must be
-declared by the user, .verb|stack| and .verb|top|. The .verb|stack| variable
-must be an array of integers and .verb|top| must be a single integer, which
-will point to the next available space in .verb|stack|. Sections .ref{prepush}
-and .ref{postpop} describe the Pre-Push and Post-Pop statements which can be
-used to implement a dynamically resizable array.
-
-.section Referencing Names
-.label{labels}
-
-This section describes how to reference names in epsilon transitions (Section
-.ref{state-charts}) and
-action-based control-flow statements such as .verb|fgoto|. There is a hierarchy
-of names implied in a Ragel specification.  At the top level are the machine
-instantiations. Beneath the instantiations are labels and references to machine
-definitions. Beneath those are more labels and references to definitions, and
-so on.
-
-Any name reference may contain multiple components separated with the .verb|::|
-compound symbol.  The search for the first component of a name reference is
-rooted at the join expression that the epsilon transition or action embedding
-is contained in. If the name reference is not contained in a join,
-the search is rooted at the machine definition that the epsilon transition or
-action embedding is contained in. Each component after the first is searched
-for beginning at the location in the name tree that the previous reference
-component refers to.
-
-In the case of action-based references, if the action is embedded more than
-once, the local search is performed for each embedding and the result is the
-union of all the searches. If no result is found for action-based references then
-the search is repeated at the root of the name tree.  Any action-based name
-search may be forced into a strictly global search by prefixing the name
-reference with .verb|::|.
-
-The final component of the name reference must resolve to a unique entry point.
-If a name is unique in the entire name tree it can be referenced as is. If it
-is not unique it can be specified by qualifying it with names above it in the
-name tree. However, it can always be renamed.
-
-% FIXME: Should fit this in somewhere.
-% Some kinds of name references are illegal. Cannot call into longest-match
-% machine, can only call its start state. Cannot make a call to anywhere from
-% any part of a longest-match machine except a rule's action. This would result
-% in an eventual return to some point inside a longest-match other than the
-% start state. This is banned for the same reason a call into the LM machine is
-% banned.
-
-
-.section Scanners
-.label{generating-scanners}
-
-Scanners are very much intertwined with regular-languages and their
-corresponding processors. For this reason Ragel supports the definition of
-scanners.  The generated code will repeatedly attempt to match patterns from a
-list, favouring longer patterns over shorter patterns.  In the case of
-equal-length matches, the generated code will favour patterns that appear ahead
-of others. When a scanner makes a match it executes the user code associated
-with the match, consumes the input then resumes scanning.
-
-.verbatim
-<machine_name> := |* 
-        pattern1 => action1;
-        pattern2 => action2;
-        ...
-    *|;
-.end verbatim
-
-On the surface, Ragel scanners are similar to those defined by Lex. Though
-there is a key distinguishing feature: patterns may be arbitrary Ragel
-expressions and can therefore contain embedded code. With a Ragel-based scanner
-the user need not wait until the end of a pattern before user code can be
-executed.
-
-Scanners can be used to process sub-languages, as well as for tokenizing
-programming languages. In the following example a scanner is used to tokenize
-the contents of a header field.
-
-.code
-word = [a-z]+;
-head_name = 'Header';
-
-header := |*
-    word;
-    ' ';
-    '\n' => { fret; };
-*|;
-
-main := ( head_name ':' @{ fcall header; } )*;
-.end code
-
-The scanner construction has a purpose similar to the longest-match kleene star
-operator .verb|**|. The key
-difference is that a scanner is able to backtrack to match a previously matched
-shorter string when the pursuit of a longer string fails.  For this reason the
-scanner construction operator is not a pure state machine construction
-operator. It relies on several variables that enable it to backtrack and make
-pointers to the matched input text available to the user.  For this reason
-scanners must be immediately instantiated. They cannot be defined inline or
-referenced by another expression. Scanners must be jumped to or called.
-
-Scanners rely on the .verb|ts|, .verb|te| and .verb|act|
-variables to be present so that they can backtrack and make pointers to the
-matched text available to the user. If input is processed using multiple calls
-to the execute code then the user must ensure that when a token is only
-partially matched that the prefix is preserved on the subsequent invocation of
-the execute code.
-
-The .verb|ts| variable must be defined as a pointer to the input data.
-It is used for recording where the current token match begins. This variable
-may be used in action code for retrieving the text of the current match.  Ragel
-ensures that in between tokens and outside of the longest-match machines that
-this pointer is set to null. In between calls to the execute code the user must
-check if .verb|ts| is set and if so, ensure that the data it points to is
-preserved ahead of the next buffer block. This is described in more detail
-below.
-
-The .verb|te| variable must also be defined as a pointer to the input data.
-It is used for recording where a match ends and where scanning of the next
-token should begin. This can also be used in action code for retrieving the
-text of the current match.
-
-The .verb|act| variable must be defined as an integer type. It is used for
-recording the identity of the last pattern matched when the scanner must go
-past a matched pattern in an attempt to make a longer match. If the longer
-match fails it may need to consult the .verb|act| variable. In some cases, use 
-of the .verb|act|
-variable can be avoided because the value of the current state is enough
-information to determine which token to accept, however in other cases this is
-not enough and so the .verb|act| variable is used. 
-
-When the longest-match operator is in use, the user's driver code must take on
-some buffer management functions. The following algorithm gives an overview of
-the steps that should be taken to properly use the longest-match operator.
-
-.itemize
-.item Read a block of input data.
-.item Run the execute code.
-.item If .verb|ts| is set, the execute code will expect the incomplete
-token to be preserved ahead of the buffer on the next invocation of the execute
-code.  
-.itemize
-.item Shift the data beginning at .verb|ts| and ending at .verb|pe| to the
-beginning of the input buffer.
-.item Reset .verb|ts| to the beginning of the buffer. 
-.item Shift .verb|te| by the distance from the old value of .verb|ts|
-to the new value. The .verb|te| variable may or may not be valid.  There is
-no way to know if it holds a meaningful value because it is not kept at null
-when it is not in use. It can be shifted regardless.
-.end itemize
-.item Read another block of data into the buffer, immediately following any
-preserved data.
-.item Run the scanner on the new data.
-.end itemize
-
-Figure .ref{preserve_example} shows the required handling of an input stream in
-which a token is broken by the input block boundaries. After processing up to
-and including the ``t'' of ``characters'', the prefix of the string token must be
-retained and processing should resume at the ``e'' on the next iteration of
-the execute code.
-
-If one uses a large input buffer for collecting input then the number of times
-the shifting must be done will be small. Furthermore, if one takes care not to
-define tokens that are allowed to be very long and instead processes these
-items using pure state machines or sub-scanners, then only a small amount of
-data will ever need to be shifted.
-
-.figure preserve_example
-.verbatim
-      a)           A stream "of characters" to be scanned.
-                   |        |          |
-                   p        ts         pe
-
-      b)           "of characters" to be scanned.
-                   |          |        |
-                   ts         p        pe
-.end verbatim
-.caption Following an invocation of the execute code there may be a partially
-matched token (a). The data of the partially matched token 
-must be preserved ahead of the new data on the next invocation (b).
-.end figure
-
-Since scanners attempt to make the longest possible match of input, patterns
-such as identifiers require one character of lookahead in order to trigger a
-match. In the case of the last token in the input stream the user must ensure
-that the .verb|eof| variable is set so that the final token is flushed out.
-
-An example scanner processing loop is given in Figure .ref{scanner-loop}.
-
-.figure scanner-loop
-.verbatim
-    int have = 0;
-    bool done = false;
-    while ( !done ) {
-        /* How much space is in the buffer? */
-        int space = BUFSIZE - have;
-        if ( space == 0 ) {
-            /* Buffer is full. */
-            cerr << "TOKEN TOO BIG" << endl;
-            exit(1);
-        }
-
-        /* Read in a block after any data we already have. */
-        char *p = inbuf + have;
-        cin.read( p, space );
-        int len = cin.gcount();
-
-        char *pe = p + len;
-        char *eof = 0;
-
-        /* If no data was read indicate EOF. */
-        if ( len == 0 ) {
-            eof = pe;
-            done = true;
-        }
-
-        %% write exec;
-
-        if ( cs == Scanner_error ) {
-            /* Machine failed before finding a token. */
-            cerr << "PARSE ERROR" << endl;
-            exit(1);
-        }
-
-        if ( ts == 0 )
-            have = 0;
-        else {
-            /* There is a prefix to preserve, shift it over. */
-            have = pe - ts;
-            memmove( inbuf, ts, have );
-            te = inbuf + (te-ts);
-            ts = inbuf;
-        }
-    }
-.end verbatim
-.caption A processing loop for a scanner.
-.end figure
-
-.section State Charts
-.label{state-charts}
-
-In addition to supporting the construction of state machines using regular
-languages, Ragel provides a way to manually specify state machines using
-state charts.  The comma operator combines machines together without any
-implied transitions. The user can then manually link machines by specifying
-epsilon transitions with the .verb|->| operator.  Epsilon transitions are drawn
-between the final states of a machine and entry points defined by labels.  This
-makes it possible to build machines using the explicit state-chart method while
-making minimal changes to the Ragel language. 
-
-An interesting feature of Ragel's state chart construction method is that it
-can be mixed freely with regular expression constructions. A state chart may be
-referenced from within a regular expression, or a regular expression may be
-used in the definition of a state chart transition.
-
-.subsection Join
-
-.verb|expr , expr , ...|
-
-Join a list of machines together without
-drawing any transitions, without setting up a start state, and without
-designating any final states. Transitions between the machines may be specified
-using labels and epsilon transitions. The start state must be explicity
-specified with the ``start'' label. Final states may be specified with an
-epsilon transition to the implicitly created ``final'' state. The join
-operation allows one to build machines using a state chart model.
-
-.subsection Label
-
-.verb|label: expr| 
-
-Attaches a label to an expression. Labels can be
-used as the target of epsilon transitions and explicit control transfer
-statements such as .verb|fgoto| and .verb|fnext| in action
-code.
-
-.subsection Epsilon
-
-.verb|expr -> label| 
-
-Draws an epsilon transition to the state defined
-by .verb|label|.  Epsilon transitions are made deterministic when join
-operators are evaluated. Epsilon transitions that are not in a join operation
-are made deterministic when the machine definition that contains the epsilon is
-complete. See Section .ref{labels} for information on referencing labels.
-
-.subsection Simplifying State Charts
-
-There are two benefits to providing state charts in Ragel. The first is that it
-allows us to take a state chart with a full listing of states and transitions
-and simplify it in selective places using regular expressions.
-
-The state chart method of specifying parsers is very common.  It is an
-effective programming technique for producing robust code. The key disadvantage
-becomes clear when one attempts to comprehend a large parser specified in this
-way.  These programs usually require many lines, causing logic to be spread out
-over large distances in the source file. Remembering the function of a large
-number of states can be difficult and organizing the parser in a sensible way
-requires discipline because branches and repetition present many file layout
-options.  This kind of programming takes a specification with inherent
-structure such as looping, alternation and concatenation and expresses it in a
-flat form. 
-
-If we could take an isolated component of a manually programmed state chart,
-that is, a subset of states that has only one entry point, and implement it
-using regular language operators then we could eliminate all the explicit
-naming of the states contained in it. By eliminating explicitly named states
-and replacing them with higher-level specifications we simplify a state machine
-specification.
-
-For example, sometimes chains of states are needed, with only a small number of
-possible characters appearing along the chain. These can easily be replaced
-with a concatenation of characters. Sometimes a group of common states
-implement a loop back to another single portion of the machine. Rather than
-manually duplicate all the transitions that loop back, we may be able to
-express the loop using a kleene star operator.
-
-Ragel allows one to take this state map simplification approach. We can build
-state machines using a state map model and implement portions of the state map
-using regular languages. In place of any transition in the state machine,
-entire sub-machines can be given. These can encapsulate functionality
-defined elsewhere. An important aspect of the Ragel approach is that when we
-wrap up a collection of states using a regular expression we do not lose
-access to the states and transitions. We can still execute code on the
-transitions that we have encapsulated.
-
-.subsection Dropping Down One Level of Abstraction
-.label{down}
-
-The second benefit of incorporating state charts into Ragel is that it permits
-us to bypass the regular language abstraction if we need to. Ragel's action
-embedding operators are sometimes insufficient for expressing certain parsing
-tasks.  In the same way that is useful for C language programmers to drop down
-to assembly language programming using embedded assembler, it is sometimes
-useful for the Ragel programmer to drop down to programming with state charts.
-
-In the following example, we wish to buffer the characters of an XML CDATA
-sequence. The sequence is terminated by the string .verb|]]>|. The challenge
-in our application is that we do not wish the terminating characters to be
-buffered. An expression of the form .verb|any* @buffer :>> ']]>'| will not work
-because the buffer will always contain the characters .verb|]]| on the end.
-Instead, what we need is to delay the buffering of .verb|]|
-characters until a time when we
-abandon the terminating sequence and go back into the main loop. There is no
-easy way to express this using Ragel's regular expression and action embedding
-operators, and so an ability to drop down to the state chart method is useful.
-
-% GENERATE: dropdown
-% OPT: -p
-% %%{
-% machine dropdown;
-.code
-action bchar { buff( fpc ); }    # Buffer the current character.
-action bbrack1 { buff( "]" ); }
-action bbrack2 { buff( "]]" ); }
-
-CDATA_body =
-start: (
-     ']' -> one |
-     (any-']') @bchar ->start
-),
-one: (
-     ']' -> two |
-     [^\]] @bbrack1 @bchar ->start
-),
-two: (
-     '>' -> final |
-     ']' @bbrack1 -> two |
-     [^>\]] @bbrack2 @bchar ->start
-);
-.end code
-% main := CDATA_body;
-% }%%
-% END GENERATE
-
-.graphic dropdown
-
-
-.section Semantic Conditions
-.label{semantic}
-
-Many communication protocols contain variable-length fields, where the length
-of the field is given ahead of the field as a value. This
-problem cannot be expressed using regular languages because of its
-context-dependent nature. The prevalence of variable-length fields in
-communication protocols motivated us to introduce semantic conditions into
-the Ragel language.
-
-A semantic condition is a block of user code that is interpreted as an
-expression and evaluated immediately
-before a transition is taken. If the code returns a value of true, the
-transition may be taken.  We can now embed code that extracts the length of a
-field, then proceed to match $n$ data values.
-
-% GENERATE: conds1
-% OPT: -p
-% %%{
-% machine conds1;
-% number = digit+;
-.code
-action rec_num { i = 0; n = getnumber(); }
-action test_len { i++ < n }
-data_fields = (
-    'd' 
-    [0-9]+ %rec_num 
-    ':'
-    ( [a-z] when test_len )*
-)**;
-.end code
-% main := data_fields;
-% }%%
-% END GENERATE
-
-.graphic conds1
-
-The Ragel implementation of semantic conditions does not force us to give up the
-compositional property of Ragel definitions. For example, a machine that tests
-the length of a field using conditions can be unioned with another machine
-that accepts some of the same strings, without the two machines interfering with
-one another. The user need not be concerned about whether or not the result of the
-semantic condition will affect the matching of the second machine.
-
-To see this, first consider that when a user associates a condition with an
-existing transition, the transition's label is translated from the base character
-to its corresponding value in the space that represents ``condition $c$ true''. Should
-the determinization process combine a state that has a conditional transition
-with another state that has a transition on the same input character but
-without a condition, then the condition-less transition first has its label
-translated into two values, one to its corresponding value in the space that
-represents ``condition $c$ true'' and another to its corresponding value in the
-space that represents ``condition $c$ false''. It
-is then safe to combine the two transitions. This is shown in the following
-example.  Two intersecting patterns are unioned, one with a condition and one
-without. The condition embedded in the first pattern does not affect the second
-pattern.
-
-% GENERATE: conds2
-% OPT: -p
-% %%{
-% machine conds2;
-% number = digit+;
-.code
-action test_len { i++ < n }
-action one { /* accept pattern one */ }
-action two { /* accept pattern two */ }
-patterns = 
-    ( [a-z] when test_len )+ %one |
-    [a-z][a-z0-9]* %two;
-main := patterns '\n';
-.end code
-% }%%
-% END GENERATE
-
-.graphic conds2
-
-There are many more potential uses for semantic conditions. The user is free to
-use arbitrary code and may therefore perform actions such as looking up names
-in dictionaries, validating input using external parsing mechanisms or
-performing checks on the semantic structure of input seen so far. In the next
-section we describe how Ragel accommodates several common parser engineering
-problems.
-
-The semantic condition feature works only with alphabet types that are smaller
-in width than the .verb|long| type. To implement semantic conditions Ragel
-needs to be able to allocate characters from the alphabet space. Ragel uses
-these allocated characters to express "character C with condition P true" or "C
-with P false." Since internally Ragel uses longs to store characters there is
-no room left in the alphabet space unless an alphabet type smaller than long is
-used.
-
-.section Implementing Lookahead
-
-There are a few strategies for implementing lookahead in Ragel programs.
-Leaving actions, which are described in Section .ref{out-actions}, can be
-used as a form of lookahead.  Ragel also provides the .verb|fhold| directive
-which can be used in actions to prevent the machine from advancing over the
-current character. It is also possible to manually adjust the current character
-position by shifting it backwards using .verb|fexec|, however when this is
-done, care must be taken not to overstep the beginning of the current buffer
-block. In both the use of .verb|fhold| and .verb|fexec| the user must be
-cautious of combining the resulting machine with another in such a way that the
-transition on which the current position is adjusted is not combined with a
-transition from the other machine.
-
-.section Parsing Recursive Language Structures
-
-In general Ragel cannot handle recursive structures because the grammar is
-interpreted as a regular language. However, depending on what needs to be
-parsed it is sometimes practical to implement the recursive parts using manual
-coding techniques. This often works in cases where the recursive structures are
-simple and easy to recognize, such as in the balancing of parentheses
-
-One approach to parsing recursive structures is to use actions that increment
-and decrement counters or otherwise recognize the entry to and exit from
-recursive structures and then jump to the appropriate machine defnition using
-.verb|fcall| and .verb|fret|. Alternatively, semantic conditions can be used to
-test counter variables.
-
-A more traditional approach is to call a separate parsing function (expressed
-in the host language) when a recursive structure is entered, then later return
-when the end is recognized.
diff --git a/test/repeat2.lm b/test/repeat2.lm
index d9104492..39ffe53b 100644
--- a/test/repeat2.lm
+++ b/test/repeat2.lm
@@ -1,3 +1,4 @@
+##### LM #####
 #
 #   Copyright 2012 Adrian Thurston <thurston@complang.org>
 #
@@ -544,3 +545,6864 @@ print(
 	~
 	~\end{document}
 )
+##### IN #####
+.title Ragel State Machine Compiler
+
+.subtitle User Guide
+
+.author Adrian Thurston
+
+.license
+
+This document is part of Ragel, and as such, this document is
+released under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+Ragel is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public
+License along with Ragel; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+.chapter Introduction
+
+.section Abstract
+
+Regular expressions are used heavily in practice for the purpose of specifying
+parsers. They are normally used as black boxes linked together with program
+logic.  User actions are executed in between invocations of the regular
+expression engine. Adding actions before a pattern terminates requires patterns
+to be broken and pasted back together with program logic. The more user actions
+are needed, the less the advantages of regular expressions are seen. 
+
+Ragel is a software development tool that allows user actions to be 
+embedded into the transitions of a regular expression's corresponding state
+machine, eliminating the need to switch from the regular expression engine and
+user code execution environment and back again. As a result, expressions can be
+maximally continuous.  One is free to specify an entire parser using a single
+regular expression.  The single-expression model affords concise and elegant
+descriptions of languages and the generation of very simple, fast and robust
+code.  Ragel compiles executable finite state machines from a high level regular language
+notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby.
+
+In addition to building state machines from regular expressions, Ragel allows
+the programmer to directly specify state machines with state charts. These two
+notations may be freely combined. There are also facilities for controlling
+nondeterminism in the resulting machines and building scanners using patterns
+that themselves have embedded actions. Ragel can produce code that is small and
+runs very fast. Ragel can handle integer-sized alphabets and can compile very
+large state machines.
+
+.section Motivation
+
+When a programmer is faced with the task of producing a parser for a
+context-free language there are many tools to choose from. It is quite common
+to generate useful and efficient parsers for programming languages from a
+formal grammar. It is also quite common for programmers to avoid such tools
+when making parsers for simple computer languages, such as file formats and
+communication protocols.  Such languages are often regular and tools for
+processing the context-free languages are viewed as too heavyweight for the
+purpose of parsing regular languages. The extra run-time effort required for
+supporting the recursive nature of context-free languages is wasted.
+
+When we turn to the regular expression-based parsing tools, such as Lex, Re2C,
+and scripting languages such as Sed, Awk and Perl we find that they are split
+into two levels: a regular expression matching engine and some kind of program
+logic for linking patterns together.  For example, a Lex program is composed of
+sets of regular expressions. The implied program logic repeatedly attempts to
+match a pattern in the current set. When a match is found the associated user
+code executed. It requires the user to consider a language as a sequence of
+independent tokens. Scripting languages and regular expression libraries allow
+one to link patterns together using arbitrary program code.  This is very
+flexible and powerful, however we can be more concise and clear if we avoid
+gluing together regular expressions with if statements and while loops.
+
+This model of execution, where the runtime alternates between regular
+expression matching and user code exectution places restrictions on when
+action code may be executed. Since action code can only be associated with
+complete patterns, any action code that must be executed before an entire
+pattern is matched requires that the pattern be broken into smaller units.
+Instead of being forced to disrupt the regular expression syntax and write
+smaller expressions, it is desirable to retain a single expression and embed
+code for performing actions directly into the transitions that move over the
+characters. After all, capable programmers are astutely aware of the machinery
+underlying their programs, so why not provide them with access to that
+machinery? To achieve this we require an action execution model for associating
+code with the sub-expressions of a regular expression in a way that does not
+disrupt its syntax.
+
+The primary goal of Ragel is to provide developers with an ability to embed
+actions into the transitions and states of a regular expression's state machine
+in support of the definition of entire parsers or large sections of parsers
+using a single regular expression.  From the regular expression we gain a clear
+and concise statement of our language. From the state machine we obtain a very
+fast and robust executable that lends itself to many kinds of analysis and
+visualization.
+
+.section Overview
+
+Ragel is a language for specifying state machines. The Ragel program is a
+compiler that assembles a state machine definition to executable code.  Ragel
+is based on the principle that any regular language can be converted to a
+deterministic finite state automaton. Since every regular language has a state
+machine representation and vice versa, the terms regular language and state
+machine (or just machine) will be used interchangeably in this document.
+
+Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is
+designed to be generic and is not bound to any particular input or processing
+method. A Ragel machine expects to have data passed to it in buffer blocks.
+When there is no more input, the machine can be queried for acceptance.  In
+this way, a Ragel machine can be used to simply recognize a regular language
+like a regular expression library. By embedding code into the regular language,
+a Ragel machine can also be used to parse input.
+
+The Ragel language has many operators for constructing and manipulating
+machines. Machines are built up from smaller machines, to bigger ones, to the
+final machine representing the language that needs to be recognized or parsed.
+
+The core state machine construction operators are those found in most theory
+of computation textbooks. They date back to the 1950s and are widely studied.
+They are based on set operations and permit one to think of languages as a set
+of strings. They are Union, Intersection, Difference, Concatenation and Kleene
+Star. Put together, these operators make up what most people know as regular
+expressions. Ragel also provides a scanner construction operator 
+and provides operators for explicitly constructing machines
+using a state chart method. In the state chart method, one joins machines
+together without any implied transitions and then explicitly specifies where
+epsilon transitions should be drawn.
+
+The state machine manipulation operators are specific to Ragel. They allow the
+programmer to access the states and transitions of regular language's
+corresponding machine. There are two uses of the manipulation operators. The
+first and primary use is to embed code into transitions and states, allowing
+the programmer to specify the actions of the state machine.
+
+Ragel attempts to make the action embedding facility as intuitive as possible.
+To do so, a number of issues need to be addressed.  For example, when making a
+nondeterministic specification into a DFA using machines that have embedded
+actions, new transitions are often made that have the combined actions of
+several source transitions. Ragel ensures that multiple actions associated with
+a single transition are ordered consistently with respect to the order of
+reference and the natural ordering implied by the construction operators.
+
+The second use of the manipulation operators is to assign priorities to
+transitions. Priorities provide a convenient way of controlling any
+nondeterminism introduced by the construction operators. Suppose two
+transitions leave from the same state and go to distinct target states on the
+same character. If these transitions are assigned conflicting priorities, then
+during the determinization process the transition with the higher priority will
+take precedence over the transition with the lower priority. The lower priority
+transition gets abandoned. The transitions would otherwise be combined into a new
+transition that goes to a new state that is a combination of the original
+target states. Priorities are often required for segmenting machines. The most
+common uses of priorities have been encoded into a set of simple operators
+that should be used instead of priority embeddings whenever possible.
+
+For the purposes of embedding, Ragel divides transitions and states into
+different classes. There are four operators for embedding actions and
+priorities into the transitions of a state machine. It is possible to embed
+into entering transitions, finishing transitions, all transitions and leaving
+transitions. The embedding into leaving transitions is a special case.
+These transition embeddings get stored in the final states of a machine.  They
+are transferred to any transitions that are made going out of the machine by
+future concatenation or kleene star operations.
+
+There are several more operators for embedding actions into states. Like the
+transition embeddings, there are various different classes of states that the
+embedding operators access. For example, one can access start states, final
+states or all states, among others. Unlike the transition embeddings, there are
+several different types of state action embeddings. These are executed at
+various different times during the processing of input. It is possible to embed
+actions that are exectued on transitions into a state, on transitions out of a
+state, on transitions taken on the error event, or on transitions taken on the
+EOF event.
+
+Within actions, it is possible to influence the behaviour of the state machine.
+The user can write action code that jumps or calls to another portion of the
+machine, changes the current character being processed, or breaks out of the
+processing loop. With the state machine calling feature Ragel can be used to
+parse languages that are not regular. For example, one can parse balanced
+parentheses by calling into a parser when an open parenthesis character is seen
+and returning to the state on the top of the stack when the corresponding
+closing parenthesis character is seen. More complicated context-free languages
+such as expressions in C are out of the scope of Ragel. 
+
+Ragel also provides a scanner construction operator that can be used to build
+scanners much the same way that Lex is used. The Ragel generated code, which
+relies on user-defined variables for backtracking, repeatedly tries to match
+patterns to the input, favouring longer patterns over shorter ones and patterns
+that appear ahead of others when the lengths of the possible matches are
+identical. When a pattern is matched the associated action is executed. 
+
+The key distinguishing feature between scanners in Ragel and scanners in Lex is
+that Ragel patterns may be arbitrary Ragel expressions and can therefore
+contain embedded code. With a Ragel-based scanner the user need not wait until
+the end of a pattern before user code can be executed.
+
+Scanners do take Ragel out of the domain of pure state machines and require the
+user to maintain the backtracking related variables.  However, scanners
+integrate well with regular state machine instantiations. They can be called to
+or jumped to only when needed, or they can be called out of or jumped out of
+when a simpler, pure state machine model is appropriate.
+
+Two types of output code style are available. Ragel can produce a table-driven
+machine or a directly executable machine. The directly executable machine is
+much faster than the table-driven. On the other hand, the table-driven machine
+is more compact and less demanding on the host language compiler. It is better
+suited to compiling large state machines.
+
+.section Related Work
+
+Lex is perhaps the best-known tool for constructing parsers from regular
+expressions. In the Lex processing model, generated code attempts to match one
+of the user's regular expression patterns, favouring longer matches over
+shorter ones. Once a match is made it then executes the code associated with
+the pattern and consumes the matching string.  This process is repeated until
+the input is fully consumed. 
+
+Through the use of start conditions, related sets of patterns may be defined.
+The active set may be changed at any time.  This allows the user to define
+different lexical regions. It also allows the user to link patterns together by
+requiring that some patterns come before others.  This is quite like a
+concatenation operation. However, use of Lex for languages that require a
+considerable amount of pattern concatenation is inappropriate. In such cases a
+Lex program deteriorates into a manually specified state machine, where start
+conditions define the states and pattern actions define the transitions.  Lex
+is therefore best suited to parsing tasks where the language to be parsed can
+be described in terms of regions of tokens. 
+
+Lex is useful in many scenarios and has undoubtedly stood the test of time.
+There are, however, several drawbacks to using Lex.  Lex can impose too much
+overhead for parsing applications where buffering is not required because all
+the characters are available in a single string.  In these cases there is
+structure to the language to be parsed and a parser specification tool can
+help, but employing a heavyweight processing loop that imposes a stream
+``pull'' model and dynamic input buffer allocation is inappropriate.  An
+example of this kind of scenario is the conversion of floating point numbers
+contained in a string to their corresponding numerical values.
+
+Another drawback is the very issue that Ragel attempts to solve.
+It is not possible to execute a user action while
+matching a character contained inside a pattern. For example, if scanning a
+programming language and string literals can contain newlines which must be
+counted, a Lex user must break up a string literal pattern so as to associate
+an action with newlines. This forces the definition of a new start condition.
+Alternatively the user can reprocess the text of the matched string literal to
+count newlines. 
+
+.comment
+
+How ragel is different from Lex.
+
+Like Re2c, Ragel provides a simple execution model that does not make any
+assumptions as to how the input is collected.  Also, Ragel does not do any
+buffering in the generated code. Consequently there are no dependencies on
+external functions such as .verb|malloc|. 
+
+If buffering is required it can be manually implemented by embedding actions
+that copy the current character to a buffer, or data can be passed to the
+parser using known block boundaries. If the longest-match operator is used,
+Ragel requires the user to ensure that the ending portion of the input buffer
+is preserved when the buffer is exhaused before a token is fully matched. The
+user should move the token prefix to a new memory location, such as back to the
+beginning of the input buffer, then place the subsequently read input
+immediately after the prefix.
+
+These properties of Ragel make it more work to write a program that requires
+the longest-match operator or buffering of input, however they make Ragel a
+more flexible tool that can produce very simple and fast-running programs under
+a variety of input acquisition arrangements.
+
+In Ragel, it is not necessary
+to introduce start conditions to concatenate tokens and retain action
+execution. Ragel allows one to structure a parser as a series of tokens, but
+does not require it.
+
+Like Lex and Re2C, Ragel is able to process input using a longest-match
+execution model, however the core of the Ragel language specifies parsers at a
+much lower level. This core is built around a pure state machine model. When
+building basic machines there is no implied algorithm for processing input
+other than to move from state to state on the transitions of the machine. This
+core of pure state machine operations makes Ragel well suited to handling
+parsing problems not based on token scanning. Should one need to use a
+longest-match model, the functionality is available and the lower level state
+machine construction facilities can be used to specify the patterns of a
+longest-match machine.
+
+This is not possible in Ragel. One can only program
+a longest-match instantiation with a fixed set of rules. One can jump to
+another longest-match machine that employs the same machine definitions in the
+construction of its rules, however no states will be shared.
+
+In Ragel, input may be re-parsed using a
+different machine, but since the action to be executed is associated with
+transitions of the compiled state machine, the longest-match construction does
+not permit a single rule to be excluded from the active set. It cannot be done
+ahead of time nor in the excluded rule's action.
+
+.end comment
+
+The Re2C program defines an input processing model similar to that of Lex.
+Re2C focuses on making generated state machines run very fast and
+integrate easily into any program, free of dependencies.  Re2C generates
+directly executable code and is able to claim that generated parsers run nearly
+as fast as their hand-coded equivalents.  This is very important for user
+adoption, as programmers are reluctant to use a tool when a faster alternative
+exists.  A consideration to ease of use is also important because developers
+need the freedom to integrate the generated code as they see fit. 
+
+Many scripting languages provide ways of composing parsers by linking regular
+expressions using program logic. For example, Sed and Awk are two established
+Unix scripting tools that allow the programmer to exploit regular expressions
+for the purpose of locating and extracting text of interest. High-level
+programming languages such as Perl, Python, PHP and Ruby all provide regular
+expression libraries that allow the user to combine regular expressions with
+arbitrary code.
+
+In addition to supporting the linking of regular expressions with arbitrary
+program logic, the Perl programming language permits the embedding of code into
+regular expressions. Perl embeddings do not translate into the embedding of
+code into deterministic state machines. Perl regular expressions are in fact
+not fully compiled to deterministic machines when embedded code is involved.
+They are instead interpreted and involve backtracking. This is shown by the
+following Perl program. When it is fed the input .verb|abcd| the interpretor
+attempts to match the first alternative, printing .verb|a1 b1|.  When this
+possibility fails it backtracks and tries the second possibility, printing
+.verb|a2 b2|, at which point it succeeds.
+
+.code
+print "YES\n" if ( <STDIN> =~
+        /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) |
+         ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x )
+.end code
+
+In Ragel there is no regular expression interpretor. Aside from the scanner
+operator, all Ragel expressions are made into deterministic machines and the
+run time simply moves from state to state as it consumes input. An equivalent
+parser expressed in Ragel would attempt both of the alternatives concurrently,
+printing .verb|a1 a2 b1 b2|.
+
+.section Development Status
+
+Ragel is a relatively new tool and is under continuous development. As a rough
+release guide, minor revision number changes are for implementation
+improvements and feature additions. Major revision number changes are for
+implementation and language changes that do not preserve backwards
+compatibility. Though in the past this has not always held true: changes that
+break code have crept into minor version number changes. Typically, the
+documentation lags behind the development in the interest of documenting only
+the lasting features. The latest changes are always documented in the ChangeLog
+file. 
+
+.chapter Constructing State Machines
+
+.section Ragel State Machine Specifications
+
+A Ragel input file consists of a program in the host language that contains embedded machine
+specifications.  Ragel normally passes input straight to output.  When it sees
+a machine specification it stops to read the Ragel statements and possibly generate
+code in place of the specification.
+Afterwards it continues to pass input through.  There
+can be any number of FSM specifications in an input file. A multi-line FSM spec
+starts with .verb|%%{| and ends with .verb|}%%|. A single-line FSM spec starts
+with .verb|%%| and ends at the first newline.  
+
+While Ragel is looking for FSM specifications it does basic lexical analysis on
+the surrounding input. It interprets literal strings and comments so a
+.verb|%%| sequence in either of those will not trigger the parsing of an FSM
+specification. Ragel does not pass the input through any preprocessor nor does it
+interpret preprocessor directives itself so includes, defines and ifdef logic
+cannot be used to alter the parse of a Ragel input file. It is therefore not
+possible to use an .verb|#if 0| directive to comment out a machine as is
+commonly done in C code. As an alternative, a machine can be prevented from
+causing any generated output by commenting out write statements.
+
+In Figure .ref{cmd-line-parsing}, a multi-line specification is used to define the
+machine and single line specifications are used to trigger the writing of the machine
+data and execution code.
+
+.figure cmd-line-parsing
+.multicols
+.verbatim
+#include <string.h>
+#include <stdio.h>
+
+%%{ 
+    machine foo;
+    main := 
+        ( 'foo' | 'bar' ) 
+        0 @{ res = 1; };
+}%%
+
+%% write data;
+.end verbatim
+.columnbreak
+.verbatim
+int main( int argc, char **argv )
+{
+    int cs, res = 0;
+    if ( argc > 1 ) {
+        char *p = argv[1];
+        char *pe = p + strlen(p) + 1;
+        %% write init;
+        %% write exec;
+    }
+    printf("result = %i\n", res );
+    return 0;
+}
+.end verbatim
+.end multicols
+.caption Parsing a command line argument.
+.end figure
+
+.subsection Naming Ragel Blocks
+
+.verbatim
+machine fsm_name;
+.end verbatim
+
+The .verb|machine| statement gives the name of the FSM. If present in a
+specification, this statement must appear first. If a machine specification
+does not have a name then Ragel uses the previous specification name.  If no
+previous specification name exists then this is an error. Because FSM
+specifications persist in memory, a machine's statements can be spread across
+multiple machine specifications.  This allows one to break up a machine across
+several files or draw in statements that are common to multiple machines using
+the .verb|include| statement.
+
+.subsection Machine Definition
+.label{definition}
+
+.verbatim
+<name> = <expression>;
+.end verbatim 
+
+The machine definition statement associates an FSM expression with a name. Machine
+expressions assigned to names can later be referenced in other expressions. A
+definition statement on its own does not cause any states to be generated. It is simply a
+description of a machine to be used later. States are generated only when a definition is
+instantiated, which happens when a definition is referenced in an instantiated
+expression. 
+
+.subsection Machine Instantiation
+.label{instantiation}
+
+.verbatim
+<name> := <expression>;
+.end verbatim
+
+The machine instantiation statement generates a set of states representing an
+expression. Each instantiation generates a distinct set of states.  The starting
+state of the instantiation is written in the data section of the generated code
+using the instantiation name.  If a machine named
+.verb|main| is instantiated, its start state is used as the
+specification's start state and is assigned to the .verb|cs| variable by the
+.verb|write init| command. If no .verb|main| machine is given, the start state
+of the last machine instantiation to appear is used as the specification's
+start state.
+
+From outside the execution loop, control may be passed to any machine by
+assigning the entry point to the .verb|cs| variable.  From inside the execution
+loop, control may be passed to any machine instantiation using .verb|fcall|,
+.verb|fgoto| or .verb|fnext| statements.
+
+.subsection Including Ragel Code
+
+.verbatim
+include FsmName "inputfile.rl";
+.end verbatim
+
+The .verb|include| statement can be used to draw in the statements of another FSM
+specification. Both the name and input file are optional, however at least one
+must be given. Without an FSM name, the given input file is searched for an FSM
+of the same name as the current specification. Without an input file the
+current file is searched for a machine of the given name. If both are present,
+the given input file is searched for a machine of the given name.
+
+Ragel searches for included files from the location of the current file.
+Additional directories can be added to the search path using the .verb|-I|
+option.
+
+.subsection Importing Definitions
+.label{import}
+
+.verbatim
+import "inputfile.h";
+.end verbatim
+
+The .verb|import| statement scrapes a file for sequences of tokens that match
+the following forms. Ragel treats these forms as state machine definitions.
+
+.list
+.li .verb|name '=' number|
+.li .verb|name '=' lit_string|
+.li .verb|'define' name number|
+.li .verb|'define' name lit_string|
+.end list
+
+If the input file is a Ragel program then tokens inside any Ragel
+specifications are ignored. See Section .ref{export} for a description of
+exporting machine definitions.
+
+Ragel searches for imported files from the location of the current file.
+Additional directories can be added to the search path using the .verb|-I|
+option.
+
+.section Lexical Analysis of a Ragel Block
+.label{lexing}
+
+Within a machine specification the following lexical rules apply to the input.
+
+.itemize
+
+.item The .verb|#| symbol begins a comment that terminates at the next newline.
+
+.item The symbols .verb|""|, .verb|''|, .verb|//|, .verb|[]| behave as the
+delimiters of literal strings. Within them, the following escape sequences 
+are interpreted: 
+
+.verb|    \0 \a \b \t \n \v \f \r|
+
+A backslash at the end of a line joins the following line onto the current. A
+backslash preceding any other character removes special meaning. This applies
+to terminating characters and to special characters in regular expression
+literals. As an exception, regular expression literals do not support escape
+sequences as the operands of a range within a list. See the bullet on regular
+expressions in Section .ref{basic}.
+
+.item The symbols .verb|{}| delimit a block of host language code that will be
+embedded into the machine as an action.  Within the block of host language
+code, basic lexical analysis of comments and strings is done in order to
+correctly find the closing brace of the block. With the exception of FSM
+commands embedded in code blocks, the entire block is preserved as is for
+identical reproduction in the output code.
+
+.item The pattern .verb|[+-]?[0-9]+| denotes an integer in decimal format.
+Integers used for specifying machines may be negative only if the alphabet type
+is signed. Integers used for specifying priorities may be positive or negative.
+
+.item The pattern .verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal
+format.
+
+.item The keywords are .verb|access|, .verb|action|, .verb|alphtype|,
+.verb|getkey|, .verb|write|, .verb|machine| and .verb|include|.
+
+.item The pattern .verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier.
+
+.comment
+.item The allowable symbols are:
+
+.verb/    ( ) ! ^ * ? + : -> - | & . , := = ; > @ $ % /\\
+.verb|    >/  $/  %/  </  @/  <>/ >!  $!  %!  <!  @!  <>!|\\
+.verb|    >^  $^  %^  <^  @^  <>^ >~  $~  %~  <~  @~  <>~|\\
+.verb|    >*  $*  %*  <*  @*  <>*|
+.end comment
+
+.item Any amount of whitespace may separate tokens.
+
+.end itemize
+
+.comment
+.section Parse of an FSM Specification
+
+The following statements are possible within an FSM specification. The
+requirements for trailing semicolons loosely follow that of C. 
+A block
+specifying code does not require a trailing semicolon. An expression
+statement does require a trailing semicolon.
+.end comment
+
+.section Basic Machines
+.label{basic}
+
+The basic machines are the base operands of regular language expressions. They
+are the smallest unit to which machine construction and manipulation operators
+can be applied.
+
+.itemize
+
+.item .verb|'hello'| -- Concatenation Literal. Produces a machine that matches
+the sequence of characters in the quoted string. If there are 5 characters
+there will be 6 states chained together with the characters in the string. See
+Section .ref{lexing} for information on valid escape sequences. 
+
+.comment
+% GENERATE: bmconcat
+% OPT: -p
+% %%{
+% machine bmconcat;
+.verbatim
+main := 'hello';
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmconcat
+
+It is possible
+to make a concatenation literal case-insensitive by appending an .verb|i| to
+the string, for example .verb|'cmd'i|.
+
+.item .verb|"hello"| -- Identical to the single quoted version.
+
+.item .verb|[hello]| -- Or Expression. Produces a union of characters.  There
+will be two states with a transition for each unique character between the two states.
+The .verb|[]| delimiters behave like the quotes of a literal string. For example, 
+.verb|[ \t]| means tab or space. The .verb|or| expression supports character ranges
+with the .verb|-| symbol as a separator. The meaning of the union can be negated
+using an initial .verb|^| character as in standard regular expressions. 
+See Section .ref{lexing} for information on valid escape sequences
+in .verb|or| expressions.
+
+.comment
+% GENERATE: bmor
+% OPT: -p
+% %%{
+% machine bmor;
+.verbatim
+main := [hello];
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmor
+
+.item .verb|''|, .verb|""|, and .verb|[]| -- Zero Length Machine.  Produces a machine
+that matches the zero length string. Zero length machines have one state that is both
+a start state and a final state.
+
+.comment
+% GENERATE: bmnull
+% OPT: -p
+% %%{
+% machine bmnull;
+.verbatim
+main := '';
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmnull
+
+% FIXME: More on the range of values here.
+.item .verb|42| -- Numerical Literal. Produces a two state machine with one
+transition on the given number. The number may be in decimal or hexadecimal
+format and should be in the range allowed by the alphabet type. The minimum and
+maximum values permitted are defined by the host machine that Ragel is compiled
+on. For example, numbers in a .verb|short| alphabet on an i386 machine should
+be in the range .verb|-32768| to .verb|32767|.
+
+.comment
+% GENERATE: bmnum
+% %%{
+% machine bmnum;
+.verbatim
+main := 42;
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmnum
+
+.item .verb|/simple_regex/| -- Regular Expression. Regular expressions are
+parsed as a series of expressions that are concatenated together. Each
+concatenated expression
+may be a literal character, the ``any'' character specified by the .verb|.|
+symbol, or a union of characters specified by the .verb|[]| delimiters. If the
+first character of a union is .verb|^| then it matches any character not in the
+list. Within a union, a range of characters can be given by separating the first
+and last characters of the range with the .verb|-| symbol. Each
+concatenated machine may have repetition specified by following it with the
+.verb|*| symbol. The standard escape sequences described in Section
+.ref{lexing} are supported everywhere in regular expressions except as the
+operands of a range within in a list. This notation also supports the .verb|i|
+trailing option. Use it to produce case-insensitive machines, as in .verb|/GET/i|.
+
+Ragel does not support very complex regular expressions because the desired
+results can always be achieved using the more general machine construction
+operators listed in Section .ref{machconst}. The following diagram shows the
+result of compiling .verb|/ab*[c-z].*[123]/|. .verb|DEF| represents the default
+transition, which is taken if no other transition can be taken. 
+
+.comment
+% GENERATE: bmregex
+% OPT: -p
+% %%{
+% machine bmregex;
+.verbatim
+main := /ab*[c-z].*[123]/;
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmregex
+
+.item .verb|'a' .. 'z'| -- Range. Produces a machine that matches any
+characters in the specified range.  Allowable upper and lower bounds of the
+range are concatenation literals of length one and numerical literals.  For
+example, .verb|0x10..0x20|, .verb|0..63|, and .verb|'a'..'z'| are valid ranges.
+The bounds should be in the range allowed by the alphabet type.
+
+.comment
+% GENERATE: bmrange
+% OPT: -p
+% %%{
+% machine bmrange;
+.verbatim
+main := 'a' .. 'z';
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmrange
+
+.item .verb|variable_name| -- Lookup the machine definition assigned to the
+variable name given and use an instance of it. See Section .ref{definition} for
+an important note on what it means to reference a variable name.
+
+.item .verb|builtin_machine| -- There are several built-in machines available
+for use. They are all two state machines for the purpose of matching common
+classes of characters. They are:
+
+.itemize
+
+.item .verb|any   | -- Any character in the alphabet.
+
+.item .verb|ascii | -- Ascii characters. .verb|0..127|
+
+.item .verb|extend| -- Ascii extended characters. This is the range
+.verb|-128..127| for signed alphabets and the range .verb|0..255| for unsigned
+alphabets.
+
+.item .verb|alpha | -- Alphabetic characters. .verb|[A-Za-z]|
+
+.item .verb|digit | -- Digits. .verb|[0-9]|
+
+.item .verb|alnum | -- Alpha numerics. .verb|[0-9A-Za-z]|
+
+.item .verb|lower | -- Lowercase characters. .verb|[a-z]|
+
+.item .verb|upper | -- Uppercase characters. .verb|[A-Z]|
+
+.item .verb|xdigit| -- Hexadecimal digits. .verb|[0-9A-Fa-f]|
+
+.item .verb|cntrl | -- Control characters. .verb|0..31|
+
+.item .verb|graph | -- Graphical characters. .verb|[!-~]|
+
+.item .verb|print | -- Printable characters. .verb|[ -~]|
+
+.item .verb|punct | -- Punctuation. Graphical characters that are not alphanumerics.
+.verb|[!-/:-@[-`{-~]|
+
+.item .verb|space | -- Whitespace. .verb|[\t\v\f\n\r ]|
+
+.item .verb|zlen  | -- Zero length string. .verb|""|
+
+.item .verb|empty | -- Empty set. Matches nothing. .verb|^any|
+
+.end itemize
+.end itemize
+
+.section Operator Precedence
+The following table shows operator precedence from lowest to highest. Operators
+in the same precedence group are evaluated from left to right.
+
+.tabular
+.row 1&.verb| , |&Join
+.row 2&.verb/ | & - --/&Union, Intersection and Subtraction
+.row 3&.verb| . <: :> :>> |&Concatenation
+.row 4&.verb| : |&Label
+.row 5&.verb| -> |&Epsilon Transition
+.row 6&.verb| >  @  $  % |&Transitions Actions and Priorities
+.row 6&.verb| >/  $/  %/  </  @/  <>/ |&EOF Actions
+.row 6&.verb| >!  $!  %!  <!  @!  <>! |&Global Error Actions
+.row 6&.verb| >^  $^  %^  <^  @^  <>^ |&Local Error Actions
+.row 6&.verb| >~  $~  %~  <~  @~  <>~ |&To-State Actions
+.row 6&.verb| >*  $*  %*  <*  @*  <>* |&From-State Action
+.row 7&.verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition
+.row 8&.verb| ! ^ |&Negation and Character-Level Negation
+.row 9&.verb| ( <expr> ) |&Grouping
+.end tabular
+
+.section Regular Language Operators
+.label{machconst}
+
+When using Ragel it is helpful to have a sense of how it constructs machines.
+The determinization process can produce results that seem unusual to someone
+not familiar with the NFA to DFA conversion algorithm. In this section we
+describe Ragel's state machine operators. Though the operators are defined
+using epsilon transitions, it should be noted that this is for discussion only.
+The epsilon transitions described in this section do not persist, but are
+immediately removed by the determinization process which is executed at every
+operation. Ragel does not make use of any nondeterministic intermediate state
+machines. 
+
+To create an epsilon transition between two states .verb|x| and .verb|y| is to
+copy all of the properties of .verb|y| into .verb|x|. This involves drawing in
+all of .verb|y|'s to-state actions, EOF actions, etc., in addition to its
+transitions. If .verb|x| and .verb|y| both have a transition out on the same
+character, then the transitions must be combined.  During transition
+combination a new transition is made that goes to a new state that is the
+combination of both target states. The new combination state is created using
+the same epsilon transition method.  The new state has an epsilon transition
+drawn to all the states that compose it. Since the creation of new epsilon
+transitions may be triggered every time an epsilon transition is drawn, the
+process of drawing epsilon transitions is repeated until there are no more
+epsilon transitions to be made.
+
+A very common error that is made when using Ragel is to make machines that do
+too much. That is, to create machines that have unintentional
+nondetermistic properties. This usually results from being unaware of the common strings
+between machines that are combined together using the regular language
+operators. This can involve never leaving a machine, causing its actions to be
+propagated through all the following states. Or it can involve an alternation
+where both branches are unintentionally taken simultaneously.
+
+This problem forces one to think hard about the language that needs to be
+matched. To guard against this kind of problem one must ensure that the machine
+specification is divided up using boundaries that do not allow ambiguities from
+one portion of the machine to the next. See Chapter
+.ref{controlling-nondeterminism} for more on this problem and how to solve it.
+
+The Graphviz tool is an immense help when debugging improperly compiled
+machines or otherwise learning how to use Ragel. Graphviz Dot files can be
+generated from Ragel programs using the .verb|-V| option. See Section
+.ref{visualization} for more information.
+
+
+.subsection Union
+
+.verb/expr | expr/
+
+The union operation produces a machine that matches any string in machine one
+or machine two. The operation first creates a new start state. Epsilon
+transitions are drawn from the new start state to the start states of both
+input machines.  The resulting machine has a final state set equivalent to the
+union of the final state sets of both input machines. In this operation, there
+is the opportunity for nondeterminism among both branches. If there are
+strings, or prefixes of strings that are matched by both machines then the new
+machine will follow both parts of the alternation at once. The union operation is
+shown below.
+
+.graphic opor 1.0
+
+The following example demonstrates the union of three machines representing
+common tokens.
+
+% GENERATE: exor
+% OPT: -p
+% %%{
+% machine exor;
+.code
+# Hex digits, decimal digits, or identifiers
+main := '0x' xdigit+ | digit+ | alpha alnum*;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exor
+
+.subsection Intersection
+
+.verb|expr & expr|
+
+Intersection produces a machine that matches any
+string that is in both machine one and machine two. To achieve intersection, a
+union is performed on the two machines. After the result has been made
+deterministic, any final state that is not a combination of final states from
+both machines has its final state status revoked. To complete the operation,
+paths that do not lead to a final state are pruned from the machine. Therefore,
+if there are any such paths in either of the expressions they will be removed
+by the intersection operator.  Intersection can be used to require that two
+independent patterns be simultaneously satisfied as in the following example.
+
+% GENERATE: exinter
+% OPT: -p
+% %%{
+% machine exinter;
+.code
+# Match lines four characters wide that contain 
+# words separated by whitespace.
+main :=
+    /[^\n][^\n][^\n][^\n]\n/* &
+    (/[a-z][a-z]*/ | [ \n])**;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exinter
+
+.subsection Difference
+
+.verb|expr - expr|
+
+The difference operation produces a machine that matches
+strings that are in machine one but are not in machine two. To achieve subtraction,
+a union is performed on the two machines. After the result has been made
+deterministic, any final state that came from machine two or is a combination
+of states involving a final state from machine two has its final state status
+revoked. As with intersection, the operation is completed by pruning any path
+that does not lead to a final state.  The following example demonstrates the
+use of subtraction to exclude specific cases from a set.
+
+% GENERATE: exsubtr
+% OPT: -p
+% %%{
+% machine exsubtr;
+.code
+# Subtract keywords from identifiers.
+main := /[a-z][a-z]*/ - ( 'for' | 'int' );
+.end code
+% }%%
+% END GENERATE
+
+.graphic exsubtr
+
+.subsection Strong Difference
+.label{strong_difference}
+
+.verb|expr -- expr|
+
+Strong difference produces a machine that matches any string of the first
+machine that does not have any string of the second machine as a substring. In
+the following example, strong subtraction is used to excluded .verb|CRLF| from
+a sequence. In the corresponding visualization, the label .verb|DEF| is short
+for default. The default transition is taken if no other transition can be
+taken.
+
+% GENERATE: exstrongsubtr
+% OPT: -p
+% %%{
+% machine exstrongsubtr;
+.code
+crlf = '\r\n';
+main := [a-z]+ ':' ( any* -- crlf ) crlf;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exstrongsubtr
+
+This operator is equivalent to the following.
+
+.verbatim
+expr - ( any* expr any* )
+.end verbatim
+
+.subsection Concatenation
+
+.verb|expr . expr|
+
+Concatenation produces a machine that matches all the strings in machine one followed by all
+the strings in machine two.  Concatenation draws epsilon transitions from the
+final states of the first machine to the start state of the second machine. The
+final states of the first machine lose their final state status, unless the
+start state of the second machine is final as well. 
+Concatenation is the default operator. Two machines next to each other with no
+operator between them results in concatenation.
+
+.graphic opconcat 1.0
+
+The opportunity for nondeterministic behaviour results from the possibility of
+the final states of the first machine accepting a string that is also accepted
+by the start state of the second machine.
+The most common scenario in which this happens is the
+concatenation of a machine that repeats some pattern with a machine that gives
+a terminating string, but the repetition machine does not exclude the
+terminating string. The example in Section .ref{strong_difference}
+guards against this. Another example is the expression .verb|("'" any* "'")|.
+When executed the thread of control will
+never leave the .verb|any*| machine.  This is a problem especially if actions
+are embedded to process the characters of the .verb|any*| component.
+
+In the following example, the first machine is always active due to the
+nondeterministic nature of concatenation. This particular nondeterminism is intended
+however because we wish to permit EOF strings before the end of the input.
+
+% GENERATE: exconcat
+% OPT: -p
+% %%{
+% machine exconcat;
+.code
+# Require an eof marker on the last line.
+main := /[^\n]*\n/* . 'EOF\n';
+.end code
+% }%%
+% END GENERATE
+
+.graphic exconcat
+
+There is a language
+ambiguity involving concatenation and subtraction. Because concatenation is the 
+default operator for two
+adjacent machines there is an ambiguity between subtraction of
+a positive numerical literal and concatenation of a negative numerical literal.
+For example, .verb|(x-7)| could be interpreted as .verb|(x . -7)| or 
+.verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence
+over concatenation of a negative literal. We adhere to the rule that the default
+concatenation operator takes effect only when there are no other operators between
+two machines. Beware of writing machines such as .verb|(any -1)| when what is
+desired is a concatenation of .verb|any| and .verb|-1|. Instead write 
+.verb|(any . -1)| or .verb|(any (-1))|. If in doubt of the meaning of your program do not
+rely on the default concatenation operator; always use the .verb|.| symbol.
+
+
+.subsection Kleene Star
+
+.verb|expr*|
+
+The machine resulting from the Kleene Star operator will match zero or more
+repetitions of the machine it is applied to.
+It creates a new start state and an additional final
+state.  Epsilon transitions are drawn between the new start state and the old start
+state, between the new start state and the new final state, and
+between the final states of the machine and the new start state.  After the
+machine is made deterministic the effect is of the final states getting all the
+transitions of the start state. 
+
+.graphic opstar 1.0
+
+The possibility for nondeterministic behaviour arises if the final states have
+transitions on any of the same characters as the start state.  This is common
+when applying kleene star to an alternation of tokens. Like the other problems
+arising from nondeterministic behavior, this is discussed in more detail in Chapter
+.ref{controlling-nondeterminism}. This particular problem can also be solved
+by using the longest-match construction discussed in Section 
+.ref{generating-scanners} on scanners.
+
+In this 
+example, there is no nondeterminism introduced by the exterior kleene star due to
+the newline at the end of the regular expression. Without the newline the
+exterior kleene star would be redundant and there would be ambiguity between
+repeating the inner range of the regular expression and the entire regular
+expression. Though it would not cause a problem in this case, unnecessary
+nondeterminism in the kleene star operator often causes undesired results for
+new Ragel users and must be guarded against.
+
+% GENERATE: exstar
+% OPT: -p
+% %%{
+% machine exstar;
+.code
+# Match any number of lines with only lowercase letters.
+main := /[a-z]*\n/*;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exstar
+
+.subsection One Or More Repetition
+
+.verb|expr+|
+
+This operator produces the concatenation of the machine with the kleene star of
+itself. The result will match one or more repetitions of the machine. The plus
+operator is equivalent to .verb|(expr . expr*)|.  
+
+% GENERATE: explus
+% OPT: -p
+% %%{
+% machine explus;
+.code
+# Match alpha-numeric words.
+main := alnum+;
+.end code
+% }%%
+% END GENERATE
+
+.graphic explus
+
+.subsection Optional
+
+.verb|expr?|
+
+The .em{optional} operator produces a machine that accepts the machine
+given or the zero length string. The optional operator is equivalent to
+.verb/(expr | '' )/. In the following example the optional operator is used to
+possibly extend a token.
+
+% GENERATE: exoption
+% OPT: -p
+% %%{
+% machine exoption;
+.code
+# Match integers or floats.
+main := digit+ ('.' digit+)?;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exoption
+
+.subsection Repetition
+
+.list 
+.li .verb|expr {n}| -- Exactly N copies of expr.
+.li .verb|expr {,n}| -- Zero to N copies of expr.
+.li .verb|expr {n,}| -- N or more copies of expr.
+.li .verb|expr {n,m}| -- N to M copies of expr.
+.end list
+
+.subsection Negation
+
+.verb|!expr|
+
+Negation produces a machine that matches any string not matched by the given
+machine. Negation is equivalent to .verb|(any* - expr)|.
+
+% GENERATE: exnegate
+% OPT: -p
+% %%{
+% machine exnegate;
+.code
+# Accept anything but a string beginning with a digit.
+main := ! ( digit any* );
+.end code
+% }%%
+% END GENERATE
+
+.graphic exnegate
+
+.subsection Character-Level Negation
+
+.verb|^expr|
+
+Character-level negation produces a machine that matches any single character
+not matched by the given machine. Character-Level Negation is equivalent to
+.verb|(any - expr)|. It must be applied only to machines that match strings of
+length one.
+
+.section State Machine Minimization
+
+State machine minimization is the process of finding the minimal equivalent FSM accepting
+the language. Minimization reduces the number of states in machines
+by merging equivalent states. It does not change the behaviour of the machine
+in any way. It will cause some states to be merged into one because they are
+functionally equivalent. State minimization is on by default. It can be turned
+off with the .verb|-n| option.
+
+The algorithm implemented is similar to Hopcroft's state minimization
+algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in
+memory, whereas Ragel supports arbitrary integer alphabets that cannot be
+listed in memory. Though exact analysis is very difficult, Ragel minimization
+runs close to O(n * log(n)) and requires O(n) temporary storage where
+$n$ is the number of states.
+
+.section Visualization
+.label{visualization}
+
+%In many cases, practical
+%parsing programs will be too large to completely visualize with Graphviz.  The
+%proper approach is to reduce the language to the smallest subset possible that
+%still exhibits the characteristics that one wishes to learn about or to fix.
+%This can be done without modifying the source code using the .verb|-M| and
+%.verb|-S| options. If a machine cannot be easily reduced,
+%embeddings of unique actions can be very useful for tracing a
+%particular component of a larger machine specification, since action names are
+%written out on transition labels.
+
+Ragel is able to emit compiled state machines in Graphviz's Dot file format.
+This is done using the .verb|-V| option.
+Graphviz support allows users to perform
+incremental visualization of their parsers. User actions are displayed on
+transition labels of the graph. 
+
+If the final graph is too large to be
+meaningful, or even drawn, the user is able to inspect portions of the parser
+by naming particular regular expression definitions with the .verb|-S| and
+.verb|-M| options to the .verb|ragel| program. Use of Graphviz greatly
+improves the Ragel programming experience. It allows users to learn Ragel by
+experimentation and also to track down bugs caused by unintended
+nondeterminism.
+
+Ragel has another option to help debugging. The .verb|-x| option causes Ragel
+to emit the compiled machine in an XML format.
+
+.chapter User Actions
+
+Ragel permits the user to embed actions into the transitions of a regular
+expression's corresponding state machine. These actions are executed when the
+generated code moves over a transition.  Like the regular expression operators,
+the action embedding operators are fully compositional. They take a state
+machine and an action as input, embed the action and yield a new state machine
+that can be used in the construction of other machines. Due to the
+compositional nature of embeddings, the user has complete freedom in the
+placement of actions.
+
+A machine's transitions are categorized into four classes. The action embedding
+operators access the transitions defined by these classes.  The .em{entering
+transition} operator .verb|>| isolates the start state, then embeds an action
+into all transitions leaving it. The .em{finishing transition} operator
+.verb|@| embeds an action into all transitions going into a final state.  The
+.em{all transition} operator .verb|$| embeds an action into all transitions of
+an expression. The .em{leaving transition} operator .verb|%| provides access
+to the yet-unmade transitions moving out of the machine via the final states. 
+
+.section Embedding Actions
+
+.verbatim
+action ActionName {
+    /* Code an action here. */
+    count += 1;
+}
+.end verbatim
+
+The action statement defines a block of code that can be embedded into an FSM.
+Action names can be referenced by the action embedding operators in
+expressions. Though actions need not be named in this way (literal blocks
+of code can be embedded directly when building machines), defining reusable
+blocks of code whenever possible is good practice because it potentially increases the
+degree to which the machine can be minimized. 
+
+Within an action some Ragel expressions and statements are parsed and
+translated. These allow the user to interact with the machine from action code.
+See Section .ref{vals} for a complete list of statements and values available
+in code blocks. 
+
+.subsection Entering Action
+
+.verb|expr > action| 
+
+The entering action operator embeds an action into all transitions
+that enter into the machine from the start state. If the start state is final,
+then the action is also embedded into the start state as a leaving action. This
+means that if a machine accepts the zero-length string and control passes
+through the start state then the entering action is executed. Note
+that this can happen on both a following character and on the EOF event.
+
+In some machines the start state has transtions coming in from within the
+machine. In these cases the start state is first isolated from the rest of the
+machine ensuring that the entering actions are exected once only.
+
+% GENERATE: exstact
+% OPT: -p
+% %%{
+% machine exstact;
+.code
+# Execute A at the beginning of a string of alpha.
+action A {}
+main := ( lower* >A ) . ' ';
+.end code
+% }%%
+% END GENERATE
+
+.graphic exstact
+
+.subsection Finishing Action
+
+.verb|expr @ action|
+
+The finishing action operator embeds an action into any transitions that move
+the machine into a final state. Further input may move the machine out of the
+final state, but keep it in the machine. Therefore finishing actions may be
+executed more than once if a machine has any internal transitions out of a
+final state. In the following example the final state has no transitions out
+and the finishing action is executed only once.
+
+% GENERATE: exdoneact
+% OPT: -p
+% %%{
+% machine exdoneact;
+% action A {}
+.code
+# Execute A when the trailing space is seen.
+main := ( lower* ' ' ) @A;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exdoneact
+
+.subsection All Transition Action
+
+.verb|expr $ action|
+
+The all transition operator embeds an action into all transitions of a machine.
+The action is executed whenever a transition of the machine is taken. In the
+following example, A is executed on every character matched.
+
+% GENERATE: exallact
+% OPT: -p
+% %%{
+% machine exallact;
+% action A {}
+.code
+# Execute A on any characters of the machine.
+main := ( 'm1' | 'm2' ) $A;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exallact
+
+.subsection Leaving Actions
+.label{out-actions}
+
+.verb|expr % action|
+
+The leaving action operator queues an action for embedding into the transitions
+that go out of a machine via a final state. The action is first stored in
+the machine's final states and is later transferred to any transitions that are
+made going out of the machine by a kleene star or concatenation operation.
+
+If a final state of the machine is still final when compilation is complete
+then the leaving action is also embedded as an EOF action. Therefore, leaving
+the machine is defined as either leaving on a character or as state machine
+acceptance.
+
+This operator allows one to associate an action with the termination of a
+sequence, without being concerned about what particular character terminates
+the sequence. In the following example, A is executed when leaving the alpha
+machine on the newline character.
+
+% GENERATE: exoutact1
+% OPT: -p
+% %%{
+% machine exoutact1;
+% action A {}
+.code
+# Match a word followed by a newline. Execute A when 
+# finishing the word.
+main := ( lower+ %A ) . '\n';
+.end code
+% }%%
+% END GENERATE
+
+.graphic exoutact1
+
+In the following example, the .verb|term_word| action could be used to register
+the appearance of a word and to clear the buffer that the .verb|lower| action used
+to store the text of it.
+
+% GENERATE: exoutact2
+% OPT: -p
+% %%{
+% machine exoutact2;
+% action lower {}
+% action space {}
+% action term_word {}
+% action newline {}
+.code
+word = ( [a-z] @lower )+ %term_word;
+main := word ( ' ' @space word )* '\n' @newline;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exoutact2
+
+In this final example of the action embedding operators, A is executed upon entering
+the alpha machine, B is executed on all transitions of the
+alpha machine, C is executed when the alpha machine is exited by moving into the
+newline machine and N is executed when the newline machine moves into a final
+state.  
+
+% GENERATE: exaction
+% OPT: -p
+% %%{
+% machine exaction;
+% action A {}
+% action B {}
+% action C {}
+% action N {}
+.code
+# Execute A on starting the alpha machine, B on every transition 
+# moving through it and C upon finishing. Execute N on the newline.
+main := ( lower* >A $B %C ) . '\n' @N;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exaction
+
+
+.section State Action Embedding Operators
+
+The state embedding operators allow one to embed actions into states. Like the
+transition embedding operators, there are several different classes of states
+that the operators access. The meanings of the symbols are similar to the
+meanings of the symbols used for the transition embedding operators. The design
+of the state selections was driven by a need to cover the states of an
+expression with exactly one error action.
+
+Unlike the transition embedding operators, the state embedding operators are
+also distinguished by the different kinds of events that embedded actions can
+be associated with. Therefore the state embedding operators have two
+components.  The first, which is the first one or two characters, specifies the
+class of states that the action will be embedded into. The second component
+specifies the type of event the action will be executed on. The symbols of the
+second component also have equivalent kewords. 
+
+.multicols
+The different classes of states are:
+
+.list
+.li .verb|> | -- the start state
+.li .verb|< | -- any state except the start state
+.li .verb|$ | -- all states
+.li .verb|% | -- final states
+.li .verb|@ | -- any state except final states
+.li .verb|<>| -- any except start and final (middle)
+.end list
+
+.columnbreak
+
+The different kinds of embeddings are:
+
+.list
+.li .verb|~| -- to-state actions (.verb|to|)
+.li .verb|*| -- from-state actions (.verb|from|)
+.li .verb|/| -- EOF actions (.verb|eof|)
+.li .verb|!| -- error actions (.verb|err|)
+.li .verb|^| -- local error actions (.verb|lerr|)
+.end list
+
+.end multicols
+
+.subsection To-State and From-State Actions
+
+.subsubsection To-State Actions
+
+.list
+.li .verb|>~action      >to(name)      >to{...} | -- the start state
+.li .verb|<~action      <to(name)      <to{...} | -- any state except the start state
+.li .verb|$~action      $to(name)      $to{...} | -- all states
+.li .verb|%~action      %to(name)      %to{...} | -- final states
+.li .verb|@~action      @to(name)      @to{...} | -- any state except final states
+.li .verb|<>~action     <>to(name)     <>to{...}| -- any except start and final (middle)
+.end list
+
+
+To-state actions are executed whenever the state machine moves into the
+specified state, either by a natural movement over a transition or by an
+action-based transfer of control such as .verb|fgoto|. They are executed after the
+in-transition's actions but before the current character is advanced and
+tested against the end of the input block. To-state embeddings stay with the
+state. They are irrespective of the state's current set of transitions and any
+future transitions that may be added in or out of the state.
+
+Note that the setting of the current state variable .verb|cs| outside of the
+execute code is not considered by Ragel as moving into a state and consequently
+the to-state actions of the new current state are not executed. This includes
+the initialization of the current state when the machine begins.  This is
+because the entry point into the machine execution code is after the execution
+of to-state actions.
+
+.subsubsection From-State Actions
+
+.list
+.li .verb|>*action     >from(name)     >from{...} | -- the start state
+.li .verb|<*action     <from(name)     <from{...} | -- any state except the start state
+.li .verb|$*action     $from(name)     $from{...} | -- all states
+.li .verb|%*action     %from(name)     %from{...} | -- final states
+.li .verb|@*action     @from(name)     @from{...} | -- any state except final states
+.li .verb|<>*action    <>from(name)    <>from{...}| -- any except start and final (middle)
+.end list
+
+From-state actions are executed whenever the state machine takes a transition from a
+state, either to itself or to some other state. These actions are executed
+immediately after the current character is tested against the input block end
+marker and before the transition to take is sought based on the current
+character. From-state actions are therefore executed even if a transition
+cannot be found and the machine moves into the error state.  Like to-state
+embeddings, from-state embeddings stay with the state.
+
+.subsection EOF Actions
+
+.list
+.li .verb|>/action     >eof(name)     >eof{...} | -- the start state
+.li .verb|</action     <eof(name)     <eof{...} | -- any state except the start state
+.li .verb|$/action     $eof(name)     $eof{...} | -- all states
+.li .verb|%/action     %eof(name)     %eof{...} | -- final states
+.li .verb|@/action     @eof(name)     @eof{...} | -- any state except final states
+.li .verb|<>/action    <>eof(name)    <>eof{...}| -- any except start and final (middle)
+.end list
+
+The EOF action embedding operators enable the user to embed actions that are
+executed at the end of the input stream. EOF actions are stored in states and
+generated in the .verb|write exec| block. They are run when .verb|p == pe == eof|
+as the execute block is finishing. EOF actions are free to adjust .verb|p| and
+jump to another part of the machine to restart execution.
+
+.subsection Handling Errors
+
+In many applications it is useful to be able to react to parsing errors.  The
+user may wish to print an error message that depends on the context.  It
+may also be desirable to consume input in an attempt to return the input stream
+to some known state and resume parsing. To support error handling and recovery,
+Ragel provides error action embedding operators. There are two kinds of error
+actions: global error actions and local error actions.
+Error actions can be used to simply report errors, or by jumping to a machine
+instantiation that consumes input, can attempt to recover from errors.  
+
+.subsubsection Global Error Actions
+
+.list
+.li .verb|>!action     >err(name)     >err{...} | -- the start state
+.li .verb|<!action     <err(name)     <err{...} | -- any state except the start state
+.li .verb|$!action     $err(name)     $err{...} | -- all states
+.li .verb|%!action     %err(name)     %err{...} | -- final states
+.li .verb|@!action     @err(name)     @err{...} | -- any state except final states
+.li .verb|<>!action    <>err(name)    <>err{...}| -- any except start and final (middle)
+.end list
+
+Global error actions are stored in the states they are embedded into until
+compilation is complete. They are then transferred to the transitions that move
+into the error state. These transitions are taken on all input characters that
+are not already covered by the state's transitions. If a state with an error
+action is not final when compilation is complete, then the action is also
+embedded as an EOF action.
+
+Error actions can be used to recover from errors by jumping back into the
+machine with .verb|fgoto| and optionally altering .verb|p|.
+
+.subsubsection Local Error Actions
+
+.list
+.li .verb|>^action     >lerr(name)     >lerr{...} | -- the start state
+.li .verb|<^action     <lerr(name)     <lerr{...} | -- any state except the start state
+.li .verb|$^action     $lerr(name)     $lerr{...} | -- all states
+.li .verb|%^action     %lerr(name)     %lerr{...} | -- final states
+.li .verb|@^action     @lerr(name)     @lerr{...} | -- any state except final states
+.li .verb|<>^action    <>lerr(name)    <>lerr{...}| -- any except start and final (middle)
+.end list
+
+Like global error actions, local error actions are also stored in the states
+they are embedded into until a transfer point. The transfer point is different
+however. Each local error action embedding is associated with a name. When a
+machine definition has been fully constructed, all local error action
+embeddings associated with the same name as the machine definition are
+transferred to the error transitions. At this time they are also embedded as
+EOF actions in the case of non-final states.
+
+Local error actions can be used to specify an action to take when a particular
+section of a larger state machine fails to match. A particular machine
+definition's ``thread'' may die and the local error actions executed, however
+the machine as a whole may continue to match input.
+
+There are two forms of local error action embeddings. In the first form the
+name defaults to the current machine. In the second form the machine name can
+be specified.  This is useful when it is more convenient to specify the local
+error action in a sub-definition that is used to construct the machine
+definition that the local error action is associated with. To embed local 
+error actions and
+explicitly state the machine definition on which the transfer is to happen use
+.verb|(name, action)| as the action.
+
+.subsubsection Example
+
+The following example uses error actions to report an error and jump to a
+machine that consumes the remainder of the line when parsing fails. After
+consuming the line, the error recovery machine returns to the main loop.
+
+% GENERATE: erract
+% %%{
+%   machine erract;
+%   ws = ' ';
+%   address = 'foo AT bar..com';
+%   date = 'Monday May 12';
+.code
+action cmd_err { 
+    printf( "command error\n" ); 
+    fhold; fgoto line;
+}
+action from_err { 
+    printf( "from error\n" ); 
+    fhold; fgoto line; 
+}
+action to_err { 
+    printf( "to error\n" ); 
+    fhold; fgoto line;
+}
+
+line := [^\n]* '\n' @{ fgoto main; };
+
+main := (
+    (
+        'from' @err(cmd_err) 
+            ( ws+ address ws+ date '\n' ) $err(from_err) |
+        'to' @err(cmd_err)
+            ( ws+ address '\n' ) $err(to_err)
+    ) 
+)*;
+.end code
+% }%%
+% %% write data;
+% void f()
+% {
+%   %% write init;
+%   %% write exec;
+% }
+% END GENERATE
+
+
+
+.section Action Ordering and Duplicates
+
+When combining expressions that have embedded actions it is often the case that
+a number of actions must be executed on a single input character. For example,
+following a concatenation the leaving action of the left expression and the
+entering action of the right expression will be embedded into one transition.
+This requires a method of ordering actions that is intuitive and
+predictable for the user, and repeatable for the compiler. 
+
+We associate with the embedding of each action a unique timestamp that is
+used to order actions that appear together on a single transition in the final
+state machine. To accomplish this we recursively traverse the parse tree of
+regular expressions and assign timestamps to action embeddings. References to
+machine definitions are followed in the traversal. When we visit a
+parse tree node we assign timestamps to all .em{entering} action embeddings,
+recurse on the parse tree, then assign timestamps to the remaining .em{all},
+.em{finishing}, and .em{leaving} embeddings in the order in which they
+appear.
+
+By default Ragel does not permit a single action to appear multiple times in an action
+list. When the final machine has been created, actions that appear more than
+once in a single transition, to-state, from-state or EOF action list have their
+duplicates removed.
+The first appearance of the action is preserved. This is useful in a number of
+scenarios. First, it allows us to union machines with common prefixes without
+worrying about the action embeddings in the prefix being duplicated. Second, it
+prevents leaving actions from being transferred multiple times. This can
+happen when a machine is repeated, then followed with another machine that
+begins with a common character. For example:
+
+.verbatim
+word = [a-z]+ %act;
+main := word ( '\n' word )* '\n\n';
+.end verbatim
+
+Note that Ragel does not compare action bodies to determine if they have
+identical program text. It simply checks for duplicates using each action
+block's unique location in the program.
+
+The removal of duplicates can be turned off using the .verb|-d| option.
+
+.section Values and Statements Available in Code Blocks
+.label{vals}
+
+The following values are available in code blocks:
+
+.itemize
+.item .verb|fpc| -- A pointer to the current character. This is equivalent to
+accessing the .verb|p| variable.
+
+.item .verb|fc| -- The current character. This is equivalent to the expression .verb|(*p)|.
+
+.item .verb|fcurs| -- An integer value representing the current state. This
+value should only be read from. To move to a different place in the machine
+from action code use the .verb|fgoto|, .verb|fnext| or .verb|fcall| statements.
+Outside of the machine execution code the .verb|cs| variable may be modified.
+
+.item .verb|ftargs| -- An integer value representing the target state. This
+value should only be read from. Again, .verb|fgoto|, .verb|fnext| and
+.verb|fcall| can be used to move to a specific entry point.
+
+.item .verb|fentry(<label>)| -- Retrieve an integer value representing the
+entry point .verb|label|. The integer value returned will be a compile time
+constant. This number is suitable for later use in control flow transfer
+statements that take an expression. This value should not be compared against
+the current state because any given label can have multiple states representing
+it. The value returned by .verb|fentry| can be any one of the multiple states that
+it represents.
+.end itemize
+
+The following statements are available in code blocks:
+
+.itemize
+
+.item .verb|fhold;| -- Do not advance over the current character. If processing
+data in multiple buffer blocks, the .verb|fhold| statement should only be used
+once in the set of actions executed on a character.  Multiple calls may result
+in backing up over the beginning of the buffer block. The .verb|fhold|
+statement does not imply any transfer of control. It is equivalent to the
+.verb|p--;| statement. 
+
+.item .verb|fexec <expr>;| -- Set the next character to process. This can be
+used to backtrack to previous input or advance ahead.
+Unlike .verb|fhold|, which can be used
+anywhere, .verb|fexec| requires the user to ensure that the target of the
+backtrack is in the current buffer block or is known to be somewhere ahead of
+it. The machine will continue iterating forward until .verb|pe| is arrived at,
+.verb|fbreak| is called or the machine moves into the error state. In actions
+embedded into transitions, the .verb|fexec| statement is equivalent to setting
+.verb|p| to one position ahead of the next character to process.  If the user
+also modifies .verb|pe|, it is possible to change the buffer block entirely.
+
+.item .verb|fgoto <label>;| -- Jump to an entry point defined by
+.verb|<label>|.  The .verb|fgoto| statement immediately transfers control to
+the destination state.
+
+.item .verb|fgoto *<expr>;| -- Jump to an entry point given by .verb|<expr>|.
+The expression must evaluate to an integer value representing a state.
+
+.item .verb|fnext <label>;| -- Set the next state to be the entry point defined
+by .verb|label|.  The .verb|fnext| statement does not immediately jump to the
+specified state. Any action code following the statement is executed.
+
+.item .verb|fnext *<expr>;| -- Set the next state to be the entry point given
+by .verb|<expr>|. The expression must evaluate to an integer value representing
+a state.
+
+.item .verb|fcall <label>;| -- Push the target state and jump to the entry
+point defined by .verb|<label>|.  The next .verb|fret| will jump to the target
+of the transition on which the call was made. Use of .verb|fcall| requires
+the declaration of a call stack. An array of integers named .verb|stack| and a
+single integer named .verb|top| must be declared. With the .verb|fcall|
+construct, control is immediately transferred to the destination state.
+See section .ref{modularization} for more information.
+
+.item .verb|fcall *<expr>;| -- Push the current state and jump to the entry
+point given by .verb|<expr>|. The expression must evaluate to an integer value
+representing a state.
+
+.item .verb|fret;| -- Return to the target state of the transition on which the
+last .verb|fcall| was made.  Use of .verb|fret| requires the declaration of a
+call stack. Control is immediately transferred to the destination state.
+
+.item .verb|fbreak;| -- Advance .verb|p|, save the target state to .verb|cs|
+and immediately break out of the execute loop. This statement is useful
+in conjunction with the .verb|noend| write option. Rather than process input
+until .verb|pe| is arrived at, the fbreak statement
+can be used to stop processing from an action.  After an .verb|fbreak|
+statement the .verb|p| variable will point to the next character in the input. The
+current state will be the target of the current transition. Note that .verb|fbreak|
+causes the target state's to-state actions to be skipped.
+
+.end itemize
+
+Once actions with control-flow commands are embedded into a
+machine, the user must exercise caution when using the machine as the operand
+to other machine construction operators. If an action jumps to another state
+then unioning any transition that executes that action with another transition
+that follows some other path will cause that other path to be lost. Using
+commands that manually jump around a machine takes us out of the domain of
+regular languages because transitions that the
+machine construction operators are not aware of are introduced.  These
+commands should therefore be used with caution.
+
+
+.chapter Controlling Nondeterminism
+.label{controlling-nondeterminism}
+
+Along with the flexibility of arbitrary action embeddings comes a need to
+control nondeterminism in regular expressions. If a regular expression is
+ambiguous, then sub-components of a parser other than the intended parts may become
+active. This means that actions that are irrelevant to the
+current subset of the parser may be executed, causing problems for the
+programmer.
+
+Tools that are based on regular expression engines and that are used for
+recognition tasks will usually function as intended regardless of the presence
+of ambiguities. It is quite common for users of scripting languages to write
+regular expressions that are heavily ambiguous and it generally does not
+matter. As long as one of the potential matches is recognized, there can be any
+number of other matches present.  In some parsing systems the run-time engine
+can employ a strategy for resolving ambiguities, for example always pursuing
+the longest possible match and discarding others.
+
+In Ragel, there is no regular expression run-time engine, just a simple state
+machine execution model. When we begin to embed actions and face the
+possibility of spurious action execution, it becomes clear that controlling
+nondeterminism at the machine construction level is very important. Consider
+the following example.
+
+% GENERATE: lines1
+% OPT: -p
+% %%{
+% machine lines1;
+% action first {}
+% action tail {}
+% word = [a-z]+;
+.code
+ws = [\n\t ];
+line = word $first ( ws word $tail )* '\n';
+lines = line*;
+.end code
+% main := lines;
+% }%%
+% END GENERATE
+
+.graphic lines1 0.53
+
+Since the .verb|ws| expression includes the newline character, we will
+not finish the .verb|line| expression when a newline character is seen. We will
+simultaneously pursue the possibility of matching further words on the same
+line and the possibility of matching a second line. Evidence of this fact is 
+in the state tables. On several transitions both the .verb|first| and
+.verb|tail| actions are executed.  The solution here is simple: exclude
+the newline character from the .verb|ws| expression. 
+
+% GENERATE: lines2
+% OPT: -p
+% %%{
+% machine lines2;
+% action first {}
+% action tail {}
+% word = [a-z]+;
+.code
+ws = [\t ];
+line = word $first ( ws word $tail )* '\n';
+lines = line*;
+.end code
+% main := lines;
+% }%%
+% END GENERATE
+
+.graphic lines2
+
+Solving this kind of problem is straightforward when the ambiguity is created
+by strings that are a single character long.  When the ambiguity is created by
+strings that are multiple characters long we have a more difficult problem.
+The following example is an incorrect attempt at a regular expression for C
+language comments. 
+
+% GENERATE: comments1
+% OPT: -p
+% %%{
+% machine comments1;
+% action comm {}
+.code
+comment = '/*' ( any @comm )* '*/';
+main := comment ' ';
+.end code
+% }%%
+% END GENERATE
+
+.graphic comments1
+
+Using standard concatenation, we will never leave the .verb|any*| expression.
+We will forever entertain the possibility that a .verb|'*/'| string that we see
+is contained in a longer comment and that, simultaneously, the comment has
+ended.  The concatenation of the .verb|comment| machine with .verb|SP| is done
+to show this. When we match space, we are also still matching the comment body.
+
+One way to approach the problem is to exclude the terminating string
+from the .verb|any*| expression using set difference. We must be careful to
+exclude not just the terminating string, but any string that contains it as a
+substring. A verbose, but proper specification of a C comment parser is given
+by the following regular expression. 
+
+% GENERATE: comments2
+% OPT: -p
+% %%{
+% machine comments2;
+% action comm {}
+.code
+comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/';
+.end code
+% main := comment;
+% }%%
+% END GENERATE
+
+.graphic comments2
+
+Note that Ragel's strong subtraction operator .verb|--| can also be used here.
+In doing this subtraction we have phrased the problem of controlling non-determinism in
+terms of excluding strings common to two expressions that interact when
+combined.
+We can also phrase the problem in terms of the transitions of the state
+machines that implement these expressions. During the concatenation of
+.verb|any*| and .verb|'*/'| we will be making transitions that are composed of
+both the loop of the first expression and the final character of the second.
+At this time we want the transition on the .verb|'/'| character to take precedence
+over and disallow the transition that originated in the .verb|any*| loop.
+
+In another parsing problem, we wish to implement a lightweight tokenizer that we can
+utilize in the composition of a larger machine. For example, some HTTP headers
+have a token stream as a sub-language. The following example is an attempt
+at a regular expression-based tokenizer that does not function correctly due to
+unintended nondeterminism.
+
+% GENERATE: smallscanner
+% OPT: -p
+% %%{
+% machine smallscanner;
+% action start_str {}
+% action on_char {}
+% action finish_str {}
+.code
+header_contents = ( 
+    lower+ >start_str $on_char %finish_str | 
+    ' '
+)*;
+.end code
+% main := header_contents;
+% }%%
+% END GENERATE
+
+.graphic smallscanner
+
+In this case, the problem with using a standard kleene star operation is that
+there is an ambiguity between extending a token and wrapping around the machine
+to begin a new token. Using the standard operator, we get an undesirable
+nondeterministic behaviour. Evidence of this can be seen on the transition out
+of state one to itself.  The transition extends the string, and simultaneously,
+finishes the string only to immediately begin a new one.  What is required is
+for the
+transitions that represent an extension of a token to take precedence over the
+transitions that represent the beginning of a new token. For this problem
+there is no simple solution that uses standard regular expression operators.
+
+.section Priorities
+
+A priority mechanism was devised and built into the determinization
+process, specifically for the purpose of allowing the user to control
+nondeterminism.  Priorities are integer values embedded into transitions. When
+the determinization process is combining transitions that have different
+priorities, the transition with the higher priority is preserved and the
+transition with the lower priority is dropped.
+
+Unfortunately, priorities can have unintended side effects because their
+operation requires that they linger in transitions indefinitely. They must linger
+because the Ragel program cannot know when the user is finished with a priority
+embedding.  A solution whereby they are explicitly deleted after use is
+conceivable; however this is not very user-friendly.  Priorities were therefore
+made into named entities. Only priorities with the same name are allowed to
+interact.  This allows any number of priorities to coexist in one machine for
+the purpose of controlling various different regular expression operations and
+eliminates the need to ever delete them. Such a scheme allows the user to
+choose a unique name, embed two different priority values using that name
+and be confident that the priority embedding will be free of any side effects.
+
+In the first form of priority embedding the name defaults to the name of the machine
+definition that the priority is assigned in. In this sense priorities are by
+default local to the current machine definition or instantiation. Beware of
+using this form in a longest-match machine, since there is only one name for
+the entire set of longest match patterns. In the second form the priority's
+name can be specified, allowing priority interaction across machine definition
+boundaries.
+
+.itemize
+.item .verb|expr > int| -- Sets starting transitions to have priority int.
+.item .verb|expr @ int| -- Sets transitions that go into a final state to have priority int. 
+.item .verb|expr $ int| -- Sets all transitions to have priority int.
+.item .verb|expr % int| -- Sets leaving transitions to
+have priority int. When a transition is made going out of the machine (either
+by concatenation or kleene star) its priority is immediately set to the 
+leaving priority.  
+.end itemize
+
+The second form of priority assignment allows the programmer to specify the name
+to which the priority is assigned.
+
+.itemize
+.item .verb|expr > (name, int)| -- Starting transitions.
+.item .verb|expr @ (name, int)| -- Finishing transitions (into a final state).
+.item .verb|expr $ (name, int)| -- All transitions.
+.item .verb|expr % (name, int)| -- Leaving transitions.
+.end itemize
+
+.section Guarded Operators that Encapsulate Priorities
+
+Priority embeddings are a very expressive mechanism. At the same time they
+can be very confusing for the user. They force the user to imagine
+the transitions inside two interacting expressions and work out the precise
+effects of the operations between them. When we consider
+that this problem is worsened by the
+potential for side effects caused by unintended priority name collisions, we
+see that exposing the user to priorities is undesirable.
+
+Fortunately, in practice the use of priorities has been necessary only in a
+small number of scenarios.  This allows us to encapsulate their functionality
+into a small set of operators and fully hide them from the user. This is
+advantageous from a language design point of view because it greatly simplifies
+the design.  
+
+Going back to the C comment example, we can now properly specify
+it using a guarded concatenation operator which we call .em{finish-guarded
+concatenation}. From the user's point of view, this operator terminates the
+first machine when the second machine moves into a final state.  It chooses a
+unique name and uses it to embed a low priority into all
+transitions of the first machine. A higher priority is then embedded into the
+transitions of the second machine that enter into a final state. The following
+example yields a machine identical to the example in Section 
+.ref{controlling-nondeterminism}.
+
+.code
+comment = '/*' ( any @comm )* :>> '*/';
+.end code
+
+.graphic comments2
+
+Another guarded operator is .em{left-guarded concatenation}, given by the
+.verb|<:| compound symbol. This operator places a higher priority on all
+transitions of the first machine. This is useful if one must forcibly separate
+two lists that contain common elements. For example, one may need to tokenize a
+stream, but first consume leading whitespace.
+
+Ragel also includes a .em{longest-match kleene star} operator, given by the
+.verb|**| compound symbol. This 
+guarded operator embeds a high
+priority into all transitions of the machine. 
+A lower priority is then embedded into the leaving transitions.  When the
+kleene star operator makes the epsilon transitions from
+the final states into the new start state, the lower priority will be transferred
+to the epsilon transitions. In cases where following an epsilon transition
+out of a final state conflicts with an existing transition out of a final
+state, the epsilon transition will be dropped.
+
+Other guarded operators are conceivable, such as guards on union that cause one
+alternative to take precedence over another. These may be implemented when it
+is clear they constitute a frequently used operation.
+In the next section we discuss the explicit specification of state machines
+using state charts.
+
+.subsection Entry-Guarded Concatenation
+
+.verb|expr :> expr| 
+
+This operator concatenates two machines, but first assigns a low
+priority to all transitions
+of the first machine and a high priority to the starting transitions of the
+second machine. This operator is useful if from the final states of the first
+machine it is possible to accept the characters in the entering transitions of
+the second machine. This operator effectively terminates the first machine
+immediately upon starting the second machine, where otherwise they would be
+pursued concurrently. In the following example, entry-guarded concatenation is
+used to move out of a machine that matches everything at the first sign of an
+end-of-input marker.
+
+% GENERATE: entryguard
+% OPT: -p
+% %%{
+% machine entryguard;
+.code
+# Leave the catch-all machine on the first character of FIN.
+main := any* :> 'FIN';
+.end code
+% }%%
+% END GENERATE
+
+.graphic entryguard
+
+Entry-guarded concatenation is equivalent to the following:
+
+.verbatim
+expr $(unique_name,0) . expr >(unique_name,1)
+.end verbatim
+
+.subsection Finish-Guarded Concatenation
+
+.verb|expr :>> expr|
+
+This operator is
+like the previous operator, except the higher priority is placed on the final
+transitions of the second machine. This is useful if one wishes to entertain
+the possibility of continuing to match the first machine right up until the
+second machine enters a final state. In other words it terminates the first
+machine only when the second accepts. In the following example, finish-guarded
+concatenation causes the move out of the machine that matches everything to be
+delayed until the full end-of-input marker has been matched.
+
+% GENERATE: finguard
+% OPT: -p
+% %%{
+% machine finguard;
+.code
+# Leave the catch-all machine on the last character of FIN.
+main := any* :>> 'FIN';
+.end code
+% }%%
+% END GENERATE
+
+.graphic finguard
+
+Finish-guarded concatenation is equivalent to the following, with one
+exception. If the right machine's start state is final, the higher priority is
+also embedded into it as a leaving priority. This prevents the left machine
+from persisting via the zero-length string.
+
+.verbatim
+expr $(unique_name,0) . expr @(unique_name,1)
+.end verbatim
+
+.subsection Left-Guarded Concatenation
+
+.verb|expr <: expr| 
+
+This operator places
+a higher priority on the left expression. It is useful if you want to prefix a
+sequence with another sequence composed of some of the same characters. For
+example, one can consume leading whitespace before tokenizing a sequence of
+whitespace-separated words as in:
+
+% GENERATE: leftguard
+% OPT: -p
+% %%{
+% machine leftguard;
+% action alpha {}
+% action ws {}
+% action start {}
+% action fin {}
+.code
+main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*;
+.end code
+% }%%
+% END GENERATE
+
+.graphic leftguard
+
+Left-guarded concatenation is equivalent to the following:
+
+.verbatim
+expr $(unique_name,1) . expr >(unique_name,0)
+.end verbatim
+
+.subsection Longest-Match Kleene Star
+.label{longest_match_kleene_star}
+
+.verb|expr**| 
+
+This version of kleene star puts a higher priority on staying in the
+machine versus wrapping around and starting over. The LM kleene star is useful
+when writing simple tokenizers.  These machines are built by applying the
+longest-match kleene star to an alternation of token patterns, as in the
+following.
+
+% GENERATE: lmkleene
+% OPT: -p
+% %%{
+% machine exfinpri;
+% action A {}
+% action B {}
+.code
+# Repeat tokens, but make sure to get the longest match.
+main := (
+    lower ( lower | digit )* %A | 
+    digit+ %B | 
+    ' '
+)**;
+.end code
+% }%%
+% END GENERATE
+
+.graphic lmkleene
+
+If a regular kleene star were used the machine above would not be able to
+distinguish between extending a word and beginning a new one.  This operator is
+equivalent to:
+
+.verbatim
+( expr $(unique_name,1) %(unique_name,0) )*
+.end verbatim
+
+When the kleene star is applied, transitions that go out of the machine and
+back into it are made. These are assigned a priority of zero by the leaving 
+transition mechanism. This is less than the priority of one assigned to the
+transitions leaving the final states but not leaving the machine. When 
+these transitions clash on the same character, the 
+transition that stays in the machine takes precedence.  The transition
+that wraps around is dropped.
+
+Note that this operator does not build a scanner in the traditional sense
+because there is never any backtracking. To build a scanner with backtracking
+use the Longest-Match machine construction described in Section
+.ref{generating-scanners}.
+
+.chapter Interface to Host Program
+
+The Ragel code generator is very flexible. The generated code has no
+dependencies and can be inserted in any function, perhaps inside a loop if
+desired.  The user is responsible for declaring and initializing a number of
+required variables, including the current state and the pointer to the input
+stream. These can live in any scope. Control of the input processing loop is
+also possible: the user may break out of the processing loop and return to it
+at any time.
+
+In the case of the C, D, and Go host languages, Ragel is able to generate very
+fast-running code that implements state machines as directly executable code.
+Since very large files strain the host language compiler, table-based code
+generation is also supported. In the future we hope to provide a partitioned,
+directly executable format that is able to reduce the burden on the host
+compiler by splitting large machines across multiple functions.
+
+In the case of Java and Ruby, table-based code generation is the only code
+style supported. In the future this may be expanded to include other code
+styles.
+
+Ragel can be used to parse input in one block, or it can be used to parse input
+in a sequence of blocks as it arrives from a file or socket.  Parsing the input
+in a sequence of blocks brings with it a few responsibilities. If the parser
+utilizes a scanner, care must be taken to not break the input stream anywhere
+but token boundaries.  If pointers to the input stream are taken during
+parsing, care must be taken to not use a pointer that has been invalidated by
+movement to a subsequent block.  If the current input data pointer is moved
+backwards it must not be moved past the beginning of the current block.
+
+Figure .ref{basic-example} shows a simple Ragel program that does not have any
+actions. The example tests the first argument of the program against a number
+pattern and then prints the machine's acceptance status.
+
+.figure basic-example
+.verbatim
+#include <stdio.h>
+#include <string.h>
+%%{
+    machine foo;
+    write data;
+}%%
+int main( int argc, char **argv )
+{
+    int cs;
+    if ( argc > 1 ) {
+        char *p = argv[1];
+        char *pe = p + strlen( p );
+        %%{ 
+            main := [0-9]+ ( '.' [0-9]+ )?;
+
+            write init;
+            write exec;
+        }%%
+    }
+    printf("result = %i\n", cs >= foo_first_final );
+    return 0;
+}
+.end verbatim
+.caption A basic Ragel example without any actions.
+.end figure
+
+.section Variables Used by Ragel
+
+There are a number of variables that Ragel expects the user to declare. At a
+very minimum the .verb|cs|, .verb|p| and .verb|pe| variables must be declared.
+In Go, Java and Ruby code the .verb|data| variable must also be declared. If
+EOF actions are used then the .verb|eof| variable is required. If
+stack-based state machine control flow statements are used then the
+.verb|stack| and .verb|top| variables are required. If a scanner is declared
+then the .verb|act|, .verb|ts| and .verb|te| variables must be
+declared.
+
+.itemize
+
+.item .verb|cs| - Current state. This must be an integer and it should persist
+across invocations of the machine when the data is broken into blocks that are
+processed independently. This variable may be modified from outside the
+execution loop, but not from within.
+
+.item .verb|p| - Data pointer. In C/D code this variable is expected to be a
+pointer to the character data to process. It should be initialized to the
+beginning of the data block on every run of the machine. In Go, Java and Ruby it is
+used as an offset to .verb|data| and must be an integer. In this case it should
+be initialized to zero on every run of the machine.
+
+.item .verb|pe| - Data end pointer. This should be initialized to .verb|p| plus
+the data length on every run of the machine. In Go, Java and Ruby code this should
+be initialized to the data length.
+
+.item .verb|eof| - End of file pointer. This should be set to .verb|pe| when
+the buffer block being processed is the last one, otherwise it should be set to
+null. In Go, Java and Ruby code .verb|-1| must be used instead of null. If the EOF
+event can be known only after the final buffer block has been processed, then
+it is possible to set .verb|p = pe = eof| and run the execute block.
+
+.item .verb|data| - This variable is only required in Go, Java and Ruby code. It
+must be an array containting the data to process.
+
+.item .verb|stack| - This must be an array of integers. It is used to store
+integer values representing states. If the stack must resize dynamically the
+Pre-push and Post-Pop statements can be used to do this (Sections
+.ref{prepush} and .ref{postpop}).
+
+.item .verb|top| - This must be an integer value and will be used as an offset
+to .verb|stack|, giving the next available spot on the top of the stack.
+
+.item .verb|act| - This must be an integer value. It is a variable sometimes
+used by scanner code to keep track of the most recent successful pattern match.
+
+.item .verb|ts| - This must be a pointer to character data. In Go, Java and
+Ruby code this must be an integer. See Section .ref{generating-scanners} for
+more information.
+
+.item .verb|te| - Also a pointer to character data.
+
+.end itemize
+
+.section Alphtype Statement
+
+.verbatim
+alphtype unsigned int;
+.end verbatim
+
+The alphtype statement specifies the alphabet data type that the machine
+operates on. During the compilation of the machine, integer literals are
+expected to be in the range of possible values of the alphtype. The default
+is .verb|char| for all languages except Go where the default is .verb|byte|.
+
+.multicols
+C/C++/Objective-C:
+.verbatim
+          char      unsigned char      
+          short     unsigned short
+          int       unsigned int
+          long      unsigned long
+.end verbatim
+
+Go:
+.verbatim
+          byte
+          int8      uint8
+          int16     uint16
+          int32     uint32
+          int
+.end verbatim
+
+Ruby: 
+.verbatim
+          char 
+          int
+.end verbatim
+
+.columnbreak
+
+Java:
+.verbatim
+          char 
+          byte 
+          short 
+          int
+.end verbatim
+
+D:
+.verbatim
+          char 
+          byte      ubyte   
+          short     ushort 
+          wchar 
+          int       uint 
+          dchar
+.end verbatim
+
+.end multicols
+
+.section Getkey Statement
+
+.verbatim
+getkey fpc->id;
+.end verbatim
+
+This statement specifies to Ragel how to retrieve the current character from 
+from the pointer to the current element (.verb|p|). Any expression that returns
+a value of the alphabet type
+may be used. The getkey statement may be used for looking into element
+structures or for translating the character to process. The getkey expression
+defaults to .verb|(*p)|. In goto-driven machines the getkey expression may be
+evaluated more than once per element processed, therefore it should not incur a
+large cost nor preclude optimization.
+
+.section Access Statement
+
+.verbatim
+access fsm->;
+.end verbatim
+
+The access statement specifies how the generated code should
+access the machine data that is persistent across processing buffer blocks.
+This applies to all variables except .verb|p|, .verb|pe| and .verb|eof|. This includes
+.verb|cs|, .verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|.
+The access statement is useful if a machine is to be encapsulated inside a
+structure in C code. It can be used to give the name of
+a pointer to the structure.
+
+.section Variable Statement
+
+.verbatim
+variable p fsm->p;
+.end verbatim
+
+The variable statement specifies how to access a specific
+variable. All of the variables that are declared by the user and
+used by Ragel can be changed. This includes .verb|p|, .verb|pe|, .verb|eof|, .verb|cs|,
+.verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|.
+In Go, Ruby and Java code generation the .verb|data| variable can also be changed.
+
+.section Pre-Push Statement
+.label{prepush}
+
+.verbatim
+prepush { 
+    /* stack growing code */
+}
+.end verbatim
+
+The prepush statement allows the user to supply stack management code that is
+written out during the generation of fcall, immediately before the current
+state is pushed to the stack. This statement can be used to test the number of
+available spaces and dynamically grow the stack if necessary.
+
+.section Post-Pop Statement
+.label{postpop}
+
+.verbatim
+postpop { 
+    /* stack shrinking code */
+}
+.end verbatim
+
+The postpop statement allows the user to supply stack management code that is
+written out during the generation of fret, immediately after the next state is
+popped from the stack. This statement can be used to dynamically shrink the
+stack.
+
+.section Write Statement
+.label{write-statement}
+
+.verbatim
+write <component> [options];
+.end verbatim
+
+The write statement is used to generate parts of the machine. 
+There are seven
+components that can be generated by a write statement. These components make up the
+state machine's data, initialization code, execution code, and export definitions.
+A write statement may appear before a machine is fully defined.
+This allows one to write out the data first then later define the machine where
+it is used. An example of this is shown in Figure .ref{fbreak-example}.
+
+.subsection Write Data
+.verbatim
+write data [options];
+.end verbatim
+
+The write data statement causes Ragel to emit the constant static data needed
+by the machine. In table-driven output styles (see Section .ref{genout}) this
+is a collection of arrays that represent the states and transitions of the
+machine.  In goto-driven machines much less data is emitted. At the very
+minimum a start state .verb|name_start| is generated.  All variables written
+out in machine data have both the .verb|static| and .verb|const| properties and
+are prefixed with the name of the machine and an
+underscore. The data can be placed inside a class, inside a function, or it can
+be defined as global data.
+
+Two variables are written that may be used to test the state of the machine
+after a buffer block has been processed. The .verb|name_error| variable gives
+the id of the state that the machine moves into when it cannot find a valid
+transition to take. The machine immediately breaks out of the processing loop when
+it finds itself in the error state. The error variable can be compared to the
+current state to determine if the machine has failed to parse the input. If the
+machine is complete, that is from every state there is a transition to a proper
+state on every possible character of the alphabet, then no error state is required
+and this variable will be set to -1.
+
+The .verb|name_first_final| variable stores the id of the first final state.
+All of the machine's states are sorted by their final state status before
+having their ids assigned. Checking if the machine has accepted its input can
+then be done by checking if the current state is greater-than or equal to the
+first final state.
+
+Data generation has several options:
+
+.list
+.li .verb|noerror  | - Do not generate the integer variable that gives the id of the error state.
+.li .verb|nofinal  | - Do not generate the integer variable that gives the id of the first final state.
+.li .verb|noprefix | - Do not prefix the variable names with the name of the machine.
+.end list
+
+.figure fbreak-example
+.verbatim
+#include <stdio.h>
+%% machine foo;
+%% write data;
+int main( int argc, char **argv )
+{
+    int cs, res = 0;
+    if ( argc > 1 ) {
+        char *p = argv[1];
+        %%{ 
+            main := 
+                [a-z]+ 
+                0 @{ res = 1; fbreak; };
+            write init;
+            write exec noend;
+        }%%
+    }
+    printf("execute = %i\n", res );
+    return 0;
+}
+.end verbatim
+.caption Use of .tt{noend} write option and the .tt{fbreak} statement for
+processing a string.
+.end figure
+
+.subsection Write Start, First Final and Error
+
+.verbatim
+write start;
+write first_final;
+write error;
+.end verbatim
+
+These three write statements provide an alternative means of accessing the
+.verb|start|, .verb|first_final| and .verb|error| states. If there are many
+different machine specifications in one file it is easy to get the prefix for
+these wrong. This is especially true if the state machine boilerplate is
+frequently made by a copy-paste-edit process. These write statements allow the
+problem to be avoided. They can be used as follows:
+
+.verbatim
+/* Did parsing succeed? */
+if ( cs < %%{ write first_final; }%% ) {
+    result = ERR_PARSE_ERROR;
+    goto fail;
+}
+.end verbatim
+
+.subsection Write Init
+.verbatim
+write init [options];
+.end verbatim
+
+The write init statement causes Ragel to emit initialization code. This should
+be executed once before the machine is started. At a very minimum this sets the
+current state to the start state. If other variables are needed by the
+generated code, such as call stack variables or scanner management
+variables, they are also initialized here.
+
+The .verb|nocs| option to the write init statement will cause ragel to skip
+intialization of the cs variable. This is useful if the user wishes to use
+custom logic to decide which state the specification should start in.
+
+.subsection Write Exec
+.verbatim
+write exec [options];
+.end verbatim
+
+The write exec statement causes Ragel to emit the state machine's execution code.
+Ragel expects several variables to be available to this code. At a very minimum, the
+generated code needs access to the current character position .verb|p|, the ending
+position .verb|pe| and the current state .verb|cs| (though .verb|pe|
+can be omitted using the .verb|noend| write option).
+The .verb|p| variable is the cursor that the execute code will
+used to traverse the input. The .verb|pe| variable should be set up to point to one
+position past the last valid character in the buffer.
+
+Other variables are needed when certain features are used. For example using
+the .verb|fcall| or .verb|fret| statements requires .verb|stack| and
+.verb|top| variables to be defined. If a longest-match construction is used,
+variables for managing backtracking are required.
+
+The write exec statement has one option. The .verb|noend| option tells Ragel
+to generate code that ignores the end position .verb|pe|. In this
+case the user must explicitly break out of the processing loop using
+.verb|fbreak|, otherwise the machine will continue to process characters until
+it moves into the error state. This option is useful if one wishes to process a
+null terminated string. Rather than traverse the string to discover then length
+before processing the input, the user can break out when the null character is
+seen.  The example in Figure .ref{fbreak-example} shows the use of the
+.verb|noend| write option and the .verb|fbreak| statement for processing a string.
+
+.subsection Write Exports
+.label{export}
+
+.verbatim
+write exports;
+.end verbatim
+
+The export feature can be used to export simple machine definitions. Machine definitions
+are marked for export using the .verb|export| keyword.
+
+.verbatim
+export machine_to_export = 0x44;
+.end verbatim
+
+When the write exports statement is used these machines are 
+written out in the generated code. Defines are used for C and constant integers
+are used for D, Java and Ruby. See Section .ref{import} for a description of the
+import statement.
+
+.section Maintaining Pointers to Input Data
+
+In the creation of any parser it is not uncommon to require the collection of
+the data being parsed.  It is always possible to collect data into a growable
+buffer as the machine moves over it, however the copying of data is a somewhat
+wasteful use of processor cycles. The most efficient way to collect data from
+the parser is to set pointers into the input then later reference them.  This
+poses a problem for uses of Ragel where the input data arrives in blocks, such
+as over a socket or from a file. If a pointer is set in one buffer block but
+must be used while parsing a following buffer block, some extra consideration
+to correctness must be made.
+
+The scanner constructions exhibit this problem, requiring the maintenance
+code described in Section .ref{generating-scanners}. If a longest-match
+construction has been used somewhere in the machine then it is possible to
+take advantage of the required prefix maintenance code in the driver program to
+ensure pointers to the input are always valid. If laying down a pointer one can
+set .verb|ts| at the same spot or ahead of it. When data is shifted in
+between loops the user must also shift the pointer.  In this way it is possible
+to maintain pointers to the input that will always be consistent.
+
+.figure line-oriented
+.verbatim
+    int have = 0;
+    while ( 1 ) {
+        char *p, *pe, *data = buf + have;
+        int len, space = BUFSIZE - have;
+
+        if ( space == 0 ) { 
+            fprintf(stderr, "BUFFER OUT OF SPACE\n");
+            exit(1);
+        }
+
+        len = fread( data, 1, space, stdin );
+        if ( len == 0 )
+            break;
+
+        /* Find the last newline by searching backwards. */
+        p = buf;
+        pe = data + len - 1;
+        while ( *pe != '\n' && pe >= buf )
+            pe--;
+        pe += 1;
+
+        %% write exec;
+
+        /* How much is still in the buffer? */
+        have = data + len - pe;
+        if ( have > 0 )
+            memmove( buf, pe, have );
+
+        if ( len < space )
+            break;
+    }
+.end verbatim
+.caption An example of line-oriented processing.
+.end figure
+
+In general, there are two approaches for guaranteeing the consistency of
+pointers to input data. The first approach is the one just described;
+lay down a marker from an action,
+then later ensure that the data the marker points to is preserved ahead of
+the buffer on the next execute invocation. This approach is good because it
+allows the parser to decide on the pointer-use boundaries, which can be
+arbitrarily complex parsing conditions. A downside is that it requires any
+pointers that are set to be corrected in between execute invocations.
+
+The alternative is to find the pointer-use boundaries before invoking the execute
+routine, then pass in the data using these boundaries. For example, if the
+program must perform line-oriented processing, the user can scan backwards from
+the end of an input block that has just been read in and process only up to the
+first found newline. On the next input read, the new data is placed after the
+partially read line and processing continues from the beginning of the line.
+An example of line-oriented processing is given in Figure .ref{line-oriented}.
+
+.section Specifying the Host Language
+
+The .verb|ragel| program has a number of options for specifying the host
+language. The host-language options are:
+
+.itemize
+.item .verb|-C  | for C/C++/Objective-C code (default)
+.item .verb|-D  | for D code.
+.item .verb|-Z  | for Go code.
+.item .verb|-J  | for Java code.
+.item .verb|-R  | for Ruby code.
+.item .verb|-A  | for C\# code.
+.end itemize
+
+.section Choosing a Generated Code Style
+.label{genout}
+
+There are three styles of code output to choose from. Code style affects the
+size and speed of the compiled binary. Changing code style does not require any
+change to the Ragel program. There are two table-driven formats and a goto
+driven format.
+
+In addition to choosing a style to emit, there are various levels of action
+code reuse to choose from.  The maximum reuse levels (.verb|-T0|, .verb|-F0|
+and .verb|-G0|) ensure that no FSM action code is ever duplicated by encoding
+each transition's action list as static data and iterating
+through the lists on every transition. This will normally result in a smaller
+binary. The less action reuse options (.verb|-T1|, .verb|-F1| and .verb|-G1|)
+will usually produce faster running code by expanding each transition's action
+list into a single block of code, eliminating the need to iterate through the
+lists. This duplicates action code instead of generating the logic necessary
+for reuse. Consequently the binary will be larger. However, this tradeoff applies to
+machines with moderate to dense action lists only. If a machine's transitions
+frequently have less than two actions then the less reuse options will actually
+produce both a smaller and a faster running binary due to less action sharing
+overhead. The best way to choose the appropriate code style for your
+application is to perform your own tests.
+
+The table-driven FSM represents the state machine as constant static data. There are
+tables of states, transitions, indices and actions. The current state is
+stored in a variable. The execution is simply a loop that looks up the current
+state, looks up the transition to take, executes any actions and moves to the
+target state. In general, the table-driven FSM can handle any machine, produces
+a smaller binary and requires a less expensive host language compile, but
+results in slower running code.  Since the table-driven format is the most
+flexible it is the default code style.
+
+The flat table-driven machine is a table-based machine that is optimized for
+small alphabets. Where the regular table machine uses the current character as
+the key in a binary search for the transition to take, the flat table machine
+uses the current character as an index into an array of transitions. This is
+faster in general, however is only suitable if the span of possible characters
+is small.
+
+The goto-driven FSM represents the state machine using goto and switch
+statements. The execution is a flat code block where the transition to take is
+computed using switch statements and directly executable binary searches.  In
+general, the goto FSM produces faster code but results in a larger binary and a
+more expensive host language compile.
+
+The goto-driven format has an additional action reuse level (.verb|-G2|) that
+writes actions directly into the state transitioning logic rather than putting
+all the actions together into a single switch. Generally this produces faster
+running code because it allows the machine to encode the current state using
+the processor's instruction pointer. Again, sparse machines may actually
+compile to smaller binaries when .verb|-G2| is used due to less state and
+action management overhead. For many parsing applications .verb|-G2| is the
+preferred output format.
+
+.center
+
+Code Output Style Options
+
+.tabular
+.row .verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#
+.row .verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#
+.row .verb|-F0|&flat table-driven&C/D/Ruby/C\#
+.row .verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#
+.row .verb|-G0|&goto-driven&C/D/C\#
+.row .verb|-G1|&goto, expanded actions&C/D/C\#
+.row .verb|-G2|&goto, in-place actions&C/D/Go
+.end tabular
+.end center
+
+.chapter Beyond the Basic Model
+
+.section Parser Modularization
+.label{modularization}
+
+It is possible to use Ragel's machine construction and action embedding
+operators to specify an entire parser using a single regular expression. In
+many cases this is the desired way to specify a parser in Ragel. However, in
+some scenarios the language to parse may be so large that it is difficult to
+think about it as a single regular expression. It may also shift between distinct
+parsing strategies, in which case modularization into several coherent blocks
+of the language may be appropriate.
+
+It may also be the case that patterns that compile to a large number of states
+must be used in a number of different contexts and referencing them in each
+context results in a very large state machine. In this case, an ability to reuse
+parsers would reduce code size.
+
+To address this, distinct regular expressions may be instantiated and linked
+together by means of a jumping and calling mechanism. This mechanism is
+analogous to the jumping to and calling of processor instructions. A jump
+command, given in action code, causes control to be immediately passed to
+another portion of the machine by way of setting the current state variable. A
+call command causes the target state of the current transition to be pushed to
+a state stack before control is transferred.  Later on, the original location
+may be returned to with a return statement. In the following example, distinct
+state machines are used to handle the parsing of two types of headers.
+
+% GENERATE: call
+% %%{
+%   machine call;
+.code
+action return { fret; }
+action call_date { fcall date; }
+action call_name { fcall name; }
+
+# A parser for date strings.
+date := [0-9][0-9] '/' 
+        [0-9][0-9] '/' 
+        [0-9][0-9][0-9][0-9] '\n' @return;
+
+# A parser for name strings.
+name := ( [a-zA-Z]+ | ' ' )** '\n' @return;
+
+# The main parser.
+headers = 
+    ( 'from' | 'to' ) ':' @call_name | 
+    ( 'departed' | 'arrived' ) ':' @call_date;
+
+main := headers*;
+.end code
+% }%%
+% %% write data;
+% void f()
+% {
+%   %% write init;
+%   %% write exec;
+% }
+% END GENERATE
+
+Calling and jumping should be used carefully as they are operations that take
+one out of the domain of regular languages. A machine that contains a call or
+jump statement in one of its actions should be used as an argument to a machine
+construction operator only with considerable care. Since DFA transitions may
+actually represent several NFA transitions, a call or jump embedded in one
+machine can inadvertently terminate another machine that it shares prefixes
+with. Despite this danger, theses statements have proven useful for tying
+together sub-parsers of a language into a parser for the full language,
+especially for the purpose of modularizing code and reducing the number of
+states when the machine contains frequently recurring patterns.
+
+Section .ref{vals} describes the jump and call statements that are used to
+transfer control. These statements make use of two variables that must be
+declared by the user, .verb|stack| and .verb|top|. The .verb|stack| variable
+must be an array of integers and .verb|top| must be a single integer, which
+will point to the next available space in .verb|stack|. Sections .ref{prepush}
+and .ref{postpop} describe the Pre-Push and Post-Pop statements which can be
+used to implement a dynamically resizable array.
+
+.section Referencing Names
+.label{labels}
+
+This section describes how to reference names in epsilon transitions (Section
+.ref{state-charts}) and
+action-based control-flow statements such as .verb|fgoto|. There is a hierarchy
+of names implied in a Ragel specification.  At the top level are the machine
+instantiations. Beneath the instantiations are labels and references to machine
+definitions. Beneath those are more labels and references to definitions, and
+so on.
+
+Any name reference may contain multiple components separated with the .verb|::|
+compound symbol.  The search for the first component of a name reference is
+rooted at the join expression that the epsilon transition or action embedding
+is contained in. If the name reference is not contained in a join,
+the search is rooted at the machine definition that the epsilon transition or
+action embedding is contained in. Each component after the first is searched
+for beginning at the location in the name tree that the previous reference
+component refers to.
+
+In the case of action-based references, if the action is embedded more than
+once, the local search is performed for each embedding and the result is the
+union of all the searches. If no result is found for action-based references then
+the search is repeated at the root of the name tree.  Any action-based name
+search may be forced into a strictly global search by prefixing the name
+reference with .verb|::|.
+
+The final component of the name reference must resolve to a unique entry point.
+If a name is unique in the entire name tree it can be referenced as is. If it
+is not unique it can be specified by qualifying it with names above it in the
+name tree. However, it can always be renamed.
+
+% FIXME: Should fit this in somewhere.
+% Some kinds of name references are illegal. Cannot call into longest-match
+% machine, can only call its start state. Cannot make a call to anywhere from
+% any part of a longest-match machine except a rule's action. This would result
+% in an eventual return to some point inside a longest-match other than the
+% start state. This is banned for the same reason a call into the LM machine is
+% banned.
+
+
+.section Scanners
+.label{generating-scanners}
+
+Scanners are very much intertwined with regular-languages and their
+corresponding processors. For this reason Ragel supports the definition of
+scanners.  The generated code will repeatedly attempt to match patterns from a
+list, favouring longer patterns over shorter patterns.  In the case of
+equal-length matches, the generated code will favour patterns that appear ahead
+of others. When a scanner makes a match it executes the user code associated
+with the match, consumes the input then resumes scanning.
+
+.verbatim
+<machine_name> := |* 
+        pattern1 => action1;
+        pattern2 => action2;
+        ...
+    *|;
+.end verbatim
+
+On the surface, Ragel scanners are similar to those defined by Lex. Though
+there is a key distinguishing feature: patterns may be arbitrary Ragel
+expressions and can therefore contain embedded code. With a Ragel-based scanner
+the user need not wait until the end of a pattern before user code can be
+executed.
+
+Scanners can be used to process sub-languages, as well as for tokenizing
+programming languages. In the following example a scanner is used to tokenize
+the contents of a header field.
+
+.code
+word = [a-z]+;
+head_name = 'Header';
+
+header := |*
+    word;
+    ' ';
+    '\n' => { fret; };
+*|;
+
+main := ( head_name ':' @{ fcall header; } )*;
+.end code
+
+The scanner construction has a purpose similar to the longest-match kleene star
+operator .verb|**|. The key
+difference is that a scanner is able to backtrack to match a previously matched
+shorter string when the pursuit of a longer string fails.  For this reason the
+scanner construction operator is not a pure state machine construction
+operator. It relies on several variables that enable it to backtrack and make
+pointers to the matched input text available to the user.  For this reason
+scanners must be immediately instantiated. They cannot be defined inline or
+referenced by another expression. Scanners must be jumped to or called.
+
+Scanners rely on the .verb|ts|, .verb|te| and .verb|act|
+variables to be present so that they can backtrack and make pointers to the
+matched text available to the user. If input is processed using multiple calls
+to the execute code then the user must ensure that when a token is only
+partially matched that the prefix is preserved on the subsequent invocation of
+the execute code.
+
+The .verb|ts| variable must be defined as a pointer to the input data.
+It is used for recording where the current token match begins. This variable
+may be used in action code for retrieving the text of the current match.  Ragel
+ensures that in between tokens and outside of the longest-match machines that
+this pointer is set to null. In between calls to the execute code the user must
+check if .verb|ts| is set and if so, ensure that the data it points to is
+preserved ahead of the next buffer block. This is described in more detail
+below.
+
+The .verb|te| variable must also be defined as a pointer to the input data.
+It is used for recording where a match ends and where scanning of the next
+token should begin. This can also be used in action code for retrieving the
+text of the current match.
+
+The .verb|act| variable must be defined as an integer type. It is used for
+recording the identity of the last pattern matched when the scanner must go
+past a matched pattern in an attempt to make a longer match. If the longer
+match fails it may need to consult the .verb|act| variable. In some cases, use 
+of the .verb|act|
+variable can be avoided because the value of the current state is enough
+information to determine which token to accept, however in other cases this is
+not enough and so the .verb|act| variable is used. 
+
+When the longest-match operator is in use, the user's driver code must take on
+some buffer management functions. The following algorithm gives an overview of
+the steps that should be taken to properly use the longest-match operator.
+
+.itemize
+.item Read a block of input data.
+.item Run the execute code.
+.item If .verb|ts| is set, the execute code will expect the incomplete
+token to be preserved ahead of the buffer on the next invocation of the execute
+code.  
+.itemize
+.item Shift the data beginning at .verb|ts| and ending at .verb|pe| to the
+beginning of the input buffer.
+.item Reset .verb|ts| to the beginning of the buffer. 
+.item Shift .verb|te| by the distance from the old value of .verb|ts|
+to the new value. The .verb|te| variable may or may not be valid.  There is
+no way to know if it holds a meaningful value because it is not kept at null
+when it is not in use. It can be shifted regardless.
+.end itemize
+.item Read another block of data into the buffer, immediately following any
+preserved data.
+.item Run the scanner on the new data.
+.end itemize
+
+Figure .ref{preserve_example} shows the required handling of an input stream in
+which a token is broken by the input block boundaries. After processing up to
+and including the ``t'' of ``characters'', the prefix of the string token must be
+retained and processing should resume at the ``e'' on the next iteration of
+the execute code.
+
+If one uses a large input buffer for collecting input then the number of times
+the shifting must be done will be small. Furthermore, if one takes care not to
+define tokens that are allowed to be very long and instead processes these
+items using pure state machines or sub-scanners, then only a small amount of
+data will ever need to be shifted.
+
+.figure preserve_example
+.verbatim
+      a)           A stream "of characters" to be scanned.
+                   |        |          |
+                   p        ts         pe
+
+      b)           "of characters" to be scanned.
+                   |          |        |
+                   ts         p        pe
+.end verbatim
+.caption Following an invocation of the execute code there may be a partially
+matched token (a). The data of the partially matched token 
+must be preserved ahead of the new data on the next invocation (b).
+.end figure
+
+Since scanners attempt to make the longest possible match of input, patterns
+such as identifiers require one character of lookahead in order to trigger a
+match. In the case of the last token in the input stream the user must ensure
+that the .verb|eof| variable is set so that the final token is flushed out.
+
+An example scanner processing loop is given in Figure .ref{scanner-loop}.
+
+.figure scanner-loop
+.verbatim
+    int have = 0;
+    bool done = false;
+    while ( !done ) {
+        /* How much space is in the buffer? */
+        int space = BUFSIZE - have;
+        if ( space == 0 ) {
+            /* Buffer is full. */
+            cerr << "TOKEN TOO BIG" << endl;
+            exit(1);
+        }
+
+        /* Read in a block after any data we already have. */
+        char *p = inbuf + have;
+        cin.read( p, space );
+        int len = cin.gcount();
+
+        char *pe = p + len;
+        char *eof = 0;
+
+        /* If no data was read indicate EOF. */
+        if ( len == 0 ) {
+            eof = pe;
+            done = true;
+        }
+
+        %% write exec;
+
+        if ( cs == Scanner_error ) {
+            /* Machine failed before finding a token. */
+            cerr << "PARSE ERROR" << endl;
+            exit(1);
+        }
+
+        if ( ts == 0 )
+            have = 0;
+        else {
+            /* There is a prefix to preserve, shift it over. */
+            have = pe - ts;
+            memmove( inbuf, ts, have );
+            te = inbuf + (te-ts);
+            ts = inbuf;
+        }
+    }
+.end verbatim
+.caption A processing loop for a scanner.
+.end figure
+
+.section State Charts
+.label{state-charts}
+
+In addition to supporting the construction of state machines using regular
+languages, Ragel provides a way to manually specify state machines using
+state charts.  The comma operator combines machines together without any
+implied transitions. The user can then manually link machines by specifying
+epsilon transitions with the .verb|->| operator.  Epsilon transitions are drawn
+between the final states of a machine and entry points defined by labels.  This
+makes it possible to build machines using the explicit state-chart method while
+making minimal changes to the Ragel language. 
+
+An interesting feature of Ragel's state chart construction method is that it
+can be mixed freely with regular expression constructions. A state chart may be
+referenced from within a regular expression, or a regular expression may be
+used in the definition of a state chart transition.
+
+.subsection Join
+
+.verb|expr , expr , ...|
+
+Join a list of machines together without
+drawing any transitions, without setting up a start state, and without
+designating any final states. Transitions between the machines may be specified
+using labels and epsilon transitions. The start state must be explicity
+specified with the ``start'' label. Final states may be specified with an
+epsilon transition to the implicitly created ``final'' state. The join
+operation allows one to build machines using a state chart model.
+
+.subsection Label
+
+.verb|label: expr| 
+
+Attaches a label to an expression. Labels can be
+used as the target of epsilon transitions and explicit control transfer
+statements such as .verb|fgoto| and .verb|fnext| in action
+code.
+
+.subsection Epsilon
+
+.verb|expr -> label| 
+
+Draws an epsilon transition to the state defined
+by .verb|label|.  Epsilon transitions are made deterministic when join
+operators are evaluated. Epsilon transitions that are not in a join operation
+are made deterministic when the machine definition that contains the epsilon is
+complete. See Section .ref{labels} for information on referencing labels.
+
+.subsection Simplifying State Charts
+
+There are two benefits to providing state charts in Ragel. The first is that it
+allows us to take a state chart with a full listing of states and transitions
+and simplify it in selective places using regular expressions.
+
+The state chart method of specifying parsers is very common.  It is an
+effective programming technique for producing robust code. The key disadvantage
+becomes clear when one attempts to comprehend a large parser specified in this
+way.  These programs usually require many lines, causing logic to be spread out
+over large distances in the source file. Remembering the function of a large
+number of states can be difficult and organizing the parser in a sensible way
+requires discipline because branches and repetition present many file layout
+options.  This kind of programming takes a specification with inherent
+structure such as looping, alternation and concatenation and expresses it in a
+flat form. 
+
+If we could take an isolated component of a manually programmed state chart,
+that is, a subset of states that has only one entry point, and implement it
+using regular language operators then we could eliminate all the explicit
+naming of the states contained in it. By eliminating explicitly named states
+and replacing them with higher-level specifications we simplify a state machine
+specification.
+
+For example, sometimes chains of states are needed, with only a small number of
+possible characters appearing along the chain. These can easily be replaced
+with a concatenation of characters. Sometimes a group of common states
+implement a loop back to another single portion of the machine. Rather than
+manually duplicate all the transitions that loop back, we may be able to
+express the loop using a kleene star operator.
+
+Ragel allows one to take this state map simplification approach. We can build
+state machines using a state map model and implement portions of the state map
+using regular languages. In place of any transition in the state machine,
+entire sub-machines can be given. These can encapsulate functionality
+defined elsewhere. An important aspect of the Ragel approach is that when we
+wrap up a collection of states using a regular expression we do not lose
+access to the states and transitions. We can still execute code on the
+transitions that we have encapsulated.
+
+.subsection Dropping Down One Level of Abstraction
+.label{down}
+
+The second benefit of incorporating state charts into Ragel is that it permits
+us to bypass the regular language abstraction if we need to. Ragel's action
+embedding operators are sometimes insufficient for expressing certain parsing
+tasks.  In the same way that is useful for C language programmers to drop down
+to assembly language programming using embedded assembler, it is sometimes
+useful for the Ragel programmer to drop down to programming with state charts.
+
+In the following example, we wish to buffer the characters of an XML CDATA
+sequence. The sequence is terminated by the string .verb|]]>|. The challenge
+in our application is that we do not wish the terminating characters to be
+buffered. An expression of the form .verb|any* @buffer :>> ']]>'| will not work
+because the buffer will always contain the characters .verb|]]| on the end.
+Instead, what we need is to delay the buffering of .verb|]|
+characters until a time when we
+abandon the terminating sequence and go back into the main loop. There is no
+easy way to express this using Ragel's regular expression and action embedding
+operators, and so an ability to drop down to the state chart method is useful.
+
+% GENERATE: dropdown
+% OPT: -p
+% %%{
+% machine dropdown;
+.code
+action bchar { buff( fpc ); }    # Buffer the current character.
+action bbrack1 { buff( "]" ); }
+action bbrack2 { buff( "]]" ); }
+
+CDATA_body =
+start: (
+     ']' -> one |
+     (any-']') @bchar ->start
+),
+one: (
+     ']' -> two |
+     [^\]] @bbrack1 @bchar ->start
+),
+two: (
+     '>' -> final |
+     ']' @bbrack1 -> two |
+     [^>\]] @bbrack2 @bchar ->start
+);
+.end code
+% main := CDATA_body;
+% }%%
+% END GENERATE
+
+.graphic dropdown
+
+
+.section Semantic Conditions
+.label{semantic}
+
+Many communication protocols contain variable-length fields, where the length
+of the field is given ahead of the field as a value. This
+problem cannot be expressed using regular languages because of its
+context-dependent nature. The prevalence of variable-length fields in
+communication protocols motivated us to introduce semantic conditions into
+the Ragel language.
+
+A semantic condition is a block of user code that is interpreted as an
+expression and evaluated immediately
+before a transition is taken. If the code returns a value of true, the
+transition may be taken.  We can now embed code that extracts the length of a
+field, then proceed to match $n$ data values.
+
+% GENERATE: conds1
+% OPT: -p
+% %%{
+% machine conds1;
+% number = digit+;
+.code
+action rec_num { i = 0; n = getnumber(); }
+action test_len { i++ < n }
+data_fields = (
+    'd' 
+    [0-9]+ %rec_num 
+    ':'
+    ( [a-z] when test_len )*
+)**;
+.end code
+% main := data_fields;
+% }%%
+% END GENERATE
+
+.graphic conds1
+
+The Ragel implementation of semantic conditions does not force us to give up the
+compositional property of Ragel definitions. For example, a machine that tests
+the length of a field using conditions can be unioned with another machine
+that accepts some of the same strings, without the two machines interfering with
+one another. The user need not be concerned about whether or not the result of the
+semantic condition will affect the matching of the second machine.
+
+To see this, first consider that when a user associates a condition with an
+existing transition, the transition's label is translated from the base character
+to its corresponding value in the space that represents ``condition $c$ true''. Should
+the determinization process combine a state that has a conditional transition
+with another state that has a transition on the same input character but
+without a condition, then the condition-less transition first has its label
+translated into two values, one to its corresponding value in the space that
+represents ``condition $c$ true'' and another to its corresponding value in the
+space that represents ``condition $c$ false''. It
+is then safe to combine the two transitions. This is shown in the following
+example.  Two intersecting patterns are unioned, one with a condition and one
+without. The condition embedded in the first pattern does not affect the second
+pattern.
+
+% GENERATE: conds2
+% OPT: -p
+% %%{
+% machine conds2;
+% number = digit+;
+.code
+action test_len { i++ < n }
+action one { /* accept pattern one */ }
+action two { /* accept pattern two */ }
+patterns = 
+    ( [a-z] when test_len )+ %one |
+    [a-z][a-z0-9]* %two;
+main := patterns '\n';
+.end code
+% }%%
+% END GENERATE
+
+.graphic conds2
+
+There are many more potential uses for semantic conditions. The user is free to
+use arbitrary code and may therefore perform actions such as looking up names
+in dictionaries, validating input using external parsing mechanisms or
+performing checks on the semantic structure of input seen so far. In the next
+section we describe how Ragel accommodates several common parser engineering
+problems.
+
+The semantic condition feature works only with alphabet types that are smaller
+in width than the .verb|long| type. To implement semantic conditions Ragel
+needs to be able to allocate characters from the alphabet space. Ragel uses
+these allocated characters to express "character C with condition P true" or "C
+with P false." Since internally Ragel uses longs to store characters there is
+no room left in the alphabet space unless an alphabet type smaller than long is
+used.
+
+.section Implementing Lookahead
+
+There are a few strategies for implementing lookahead in Ragel programs.
+Leaving actions, which are described in Section .ref{out-actions}, can be
+used as a form of lookahead.  Ragel also provides the .verb|fhold| directive
+which can be used in actions to prevent the machine from advancing over the
+current character. It is also possible to manually adjust the current character
+position by shifting it backwards using .verb|fexec|, however when this is
+done, care must be taken not to overstep the beginning of the current buffer
+block. In both the use of .verb|fhold| and .verb|fexec| the user must be
+cautious of combining the resulting machine with another in such a way that the
+transition on which the current position is adjusted is not combined with a
+transition from the other machine.
+
+.section Parsing Recursive Language Structures
+
+In general Ragel cannot handle recursive structures because the grammar is
+interpreted as a regular language. However, depending on what needs to be
+parsed it is sometimes practical to implement the recursive parts using manual
+coding techniques. This often works in cases where the recursive structures are
+simple and easy to recognize, such as in the balancing of parentheses
+
+One approach to parsing recursive structures is to use actions that increment
+and decrement counters or otherwise recognize the entry to and exit from
+recursive structures and then jump to the appropriate machine defnition using
+.verb|fcall| and .verb|fret|. Alternatively, semantic conditions can be used to
+test counter variables.
+
+A more traditional approach is to call a separate parsing function (expressed
+in the host language) when a recursive structure is entered, then later return
+when the end is recognized.
+##### EXP #####
+\documentclass[letterpaper,11pt,oneside]{book}
+\usepackage{graphicx}
+\usepackage{comment}
+\usepackage{multicol}
+\usepackage[
+	colorlinks=true,
+	linkcolor=black,
+	citecolor=green,
+	filecolor=black,
+	urlcolor=black]{hyperref}
+
+\topmargin -0.20in
+\oddsidemargin 0in
+\textwidth 6.5in
+\textheight 9in
+
+\setlength{\parskip}{0pt}
+\setlength{\topsep}{0pt}
+\setlength{\partopsep}{0pt}
+\setlength{\itemsep}{0pt}
+
+\input{version}
+
+\newcommand{\verbspace}{\vspace{10pt}}
+\newcommand{\graphspace}{\vspace{10pt}}
+
+\renewcommand\floatpagefraction{.99}
+\renewcommand\topfraction{.99}
+\renewcommand\bottomfraction{.99}
+\renewcommand\textfraction{.01}   
+\setcounter{totalnumber}{50}
+\setcounter{topnumber}{50}
+\setcounter{bottomnumber}{50}
+
+\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{}
+
+\begin{document}
+
+\thispagestyle{empty}
+\begin{center}
+\vspace*{3in}
+{\huge Ragel State Machine Compiler}\\
+\vspace*{12pt}
+{\Large User Guide}\\
+\vspace{1in}
+by\\
+\vspace{12pt}
+{\large Adrian Thurston}\\
+\end{center}
+\clearpage
+
+\pagenumbering{roman}
+
+\chapter*{License}
+Ragel version \version, \pubdate\\
+Copyright \copyright\ 2003-2012 Adrian D. Thurston
+\vspace{6mm}
+
+{\bf\it\noindent This document is part of Ragel, and as such, this document is
+released under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+}
+
+\vspace{5pt}
+
+{\bf\it\noindent Ragel is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+details.
+}
+
+\vspace{5pt}
+
+{\bf\it\noindent You should have received a copy of the GNU General Public
+License along with Ragel; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+}
+
+\clearpage
+\tableofcontents
+\clearpage
+
+\pagenumbering{arabic}
+\chapter{Introduction}
+
+\section{Abstract}
+
+Regular expressions are used heavily in practice for the purpose of specifying
+parsers. They are normally used as black boxes linked together with program
+logic.  User actions are executed in between invocations of the regular
+expression engine. Adding actions before a pattern terminates requires patterns
+to be broken and pasted back together with program logic. The more user actions
+are needed, the less the advantages of regular expressions are seen. 
+
+Ragel is a software development tool that allows user actions to be 
+embedded into the transitions of a regular expression's corresponding state
+machine, eliminating the need to switch from the regular expression engine and
+user code execution environment and back again. As a result, expressions can be
+maximally continuous.  One is free to specify an entire parser using a single
+regular expression.  The single-expression model affords concise and elegant
+descriptions of languages and the generation of very simple, fast and robust
+code.  Ragel compiles executable finite state machines from a high level regular language
+notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby.
+
+In addition to building state machines from regular expressions, Ragel allows
+the programmer to directly specify state machines with state charts. These two
+notations may be freely combined. There are also facilities for controlling
+nondeterminism in the resulting machines and building scanners using patterns
+that themselves have embedded actions. Ragel can produce code that is small and
+runs very fast. Ragel can handle integer-sized alphabets and can compile very
+large state machines.
+
+\section{Motivation}
+
+When a programmer is faced with the task of producing a parser for a
+context-free language there are many tools to choose from. It is quite common
+to generate useful and efficient parsers for programming languages from a
+formal grammar. It is also quite common for programmers to avoid such tools
+when making parsers for simple computer languages, such as file formats and
+communication protocols.  Such languages are often regular and tools for
+processing the context-free languages are viewed as too heavyweight for the
+purpose of parsing regular languages. The extra run-time effort required for
+supporting the recursive nature of context-free languages is wasted.
+
+When we turn to the regular expression-based parsing tools, such as Lex, Re2C,
+and scripting languages such as Sed, Awk and Perl we find that they are split
+into two levels: a regular expression matching engine and some kind of program
+logic for linking patterns together.  For example, a Lex program is composed of
+sets of regular expressions. The implied program logic repeatedly attempts to
+match a pattern in the current set. When a match is found the associated user
+code executed. It requires the user to consider a language as a sequence of
+independent tokens. Scripting languages and regular expression libraries allow
+one to link patterns together using arbitrary program code.  This is very
+flexible and powerful, however we can be more concise and clear if we avoid
+gluing together regular expressions with if statements and while loops.
+
+This model of execution, where the runtime alternates between regular
+expression matching and user code exectution places restrictions on when
+action code may be executed. Since action code can only be associated with
+complete patterns, any action code that must be executed before an entire
+pattern is matched requires that the pattern be broken into smaller units.
+Instead of being forced to disrupt the regular expression syntax and write
+smaller expressions, it is desirable to retain a single expression and embed
+code for performing actions directly into the transitions that move over the
+characters. After all, capable programmers are astutely aware of the machinery
+underlying their programs, so why not provide them with access to that
+machinery? To achieve this we require an action execution model for associating
+code with the sub-expressions of a regular expression in a way that does not
+disrupt its syntax.
+
+The primary goal of Ragel is to provide developers with an ability to embed
+actions into the transitions and states of a regular expression's state machine
+in support of the definition of entire parsers or large sections of parsers
+using a single regular expression.  From the regular expression we gain a clear
+and concise statement of our language. From the state machine we obtain a very
+fast and robust executable that lends itself to many kinds of analysis and
+visualization.
+
+\section{Overview}
+
+Ragel is a language for specifying state machines. The Ragel program is a
+compiler that assembles a state machine definition to executable code.  Ragel
+is based on the principle that any regular language can be converted to a
+deterministic finite state automaton. Since every regular language has a state
+machine representation and vice versa, the terms regular language and state
+machine (or just machine) will be used interchangeably in this document.
+
+Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is
+designed to be generic and is not bound to any particular input or processing
+method. A Ragel machine expects to have data passed to it in buffer blocks.
+When there is no more input, the machine can be queried for acceptance.  In
+this way, a Ragel machine can be used to simply recognize a regular language
+like a regular expression library. By embedding code into the regular language,
+a Ragel machine can also be used to parse input.
+
+The Ragel language has many operators for constructing and manipulating
+machines. Machines are built up from smaller machines, to bigger ones, to the
+final machine representing the language that needs to be recognized or parsed.
+
+The core state machine construction operators are those found in most theory
+of computation textbooks. They date back to the 1950s and are widely studied.
+They are based on set operations and permit one to think of languages as a set
+of strings. They are Union, Intersection, Difference, Concatenation and Kleene
+Star. Put together, these operators make up what most people know as regular
+expressions. Ragel also provides a scanner construction operator 
+and provides operators for explicitly constructing machines
+using a state chart method. In the state chart method, one joins machines
+together without any implied transitions and then explicitly specifies where
+epsilon transitions should be drawn.
+
+The state machine manipulation operators are specific to Ragel. They allow the
+programmer to access the states and transitions of regular language's
+corresponding machine. There are two uses of the manipulation operators. The
+first and primary use is to embed code into transitions and states, allowing
+the programmer to specify the actions of the state machine.
+
+Ragel attempts to make the action embedding facility as intuitive as possible.
+To do so, a number of issues need to be addressed.  For example, when making a
+nondeterministic specification into a DFA using machines that have embedded
+actions, new transitions are often made that have the combined actions of
+several source transitions. Ragel ensures that multiple actions associated with
+a single transition are ordered consistently with respect to the order of
+reference and the natural ordering implied by the construction operators.
+
+The second use of the manipulation operators is to assign priorities to
+transitions. Priorities provide a convenient way of controlling any
+nondeterminism introduced by the construction operators. Suppose two
+transitions leave from the same state and go to distinct target states on the
+same character. If these transitions are assigned conflicting priorities, then
+during the determinization process the transition with the higher priority will
+take precedence over the transition with the lower priority. The lower priority
+transition gets abandoned. The transitions would otherwise be combined into a new
+transition that goes to a new state that is a combination of the original
+target states. Priorities are often required for segmenting machines. The most
+common uses of priorities have been encoded into a set of simple operators
+that should be used instead of priority embeddings whenever possible.
+
+For the purposes of embedding, Ragel divides transitions and states into
+different classes. There are four operators for embedding actions and
+priorities into the transitions of a state machine. It is possible to embed
+into entering transitions, finishing transitions, all transitions and leaving
+transitions. The embedding into leaving transitions is a special case.
+These transition embeddings get stored in the final states of a machine.  They
+are transferred to any transitions that are made going out of the machine by
+future concatenation or kleene star operations.
+
+There are several more operators for embedding actions into states. Like the
+transition embeddings, there are various different classes of states that the
+embedding operators access. For example, one can access start states, final
+states or all states, among others. Unlike the transition embeddings, there are
+several different types of state action embeddings. These are executed at
+various different times during the processing of input. It is possible to embed
+actions that are exectued on transitions into a state, on transitions out of a
+state, on transitions taken on the error event, or on transitions taken on the
+EOF event.
+
+Within actions, it is possible to influence the behaviour of the state machine.
+The user can write action code that jumps or calls to another portion of the
+machine, changes the current character being processed, or breaks out of the
+processing loop. With the state machine calling feature Ragel can be used to
+parse languages that are not regular. For example, one can parse balanced
+parentheses by calling into a parser when an open parenthesis character is seen
+and returning to the state on the top of the stack when the corresponding
+closing parenthesis character is seen. More complicated context-free languages
+such as expressions in C are out of the scope of Ragel. 
+
+Ragel also provides a scanner construction operator that can be used to build
+scanners much the same way that Lex is used. The Ragel generated code, which
+relies on user-defined variables for backtracking, repeatedly tries to match
+patterns to the input, favouring longer patterns over shorter ones and patterns
+that appear ahead of others when the lengths of the possible matches are
+identical. When a pattern is matched the associated action is executed. 
+
+The key distinguishing feature between scanners in Ragel and scanners in Lex is
+that Ragel patterns may be arbitrary Ragel expressions and can therefore
+contain embedded code. With a Ragel-based scanner the user need not wait until
+the end of a pattern before user code can be executed.
+
+Scanners do take Ragel out of the domain of pure state machines and require the
+user to maintain the backtracking related variables.  However, scanners
+integrate well with regular state machine instantiations. They can be called to
+or jumped to only when needed, or they can be called out of or jumped out of
+when a simpler, pure state machine model is appropriate.
+
+Two types of output code style are available. Ragel can produce a table-driven
+machine or a directly executable machine. The directly executable machine is
+much faster than the table-driven. On the other hand, the table-driven machine
+is more compact and less demanding on the host language compiler. It is better
+suited to compiling large state machines.
+
+\section{Related Work}
+
+Lex is perhaps the best-known tool for constructing parsers from regular
+expressions. In the Lex processing model, generated code attempts to match one
+of the user's regular expression patterns, favouring longer matches over
+shorter ones. Once a match is made it then executes the code associated with
+the pattern and consumes the matching string.  This process is repeated until
+the input is fully consumed. 
+
+Through the use of start conditions, related sets of patterns may be defined.
+The active set may be changed at any time.  This allows the user to define
+different lexical regions. It also allows the user to link patterns together by
+requiring that some patterns come before others.  This is quite like a
+concatenation operation. However, use of Lex for languages that require a
+considerable amount of pattern concatenation is inappropriate. In such cases a
+Lex program deteriorates into a manually specified state machine, where start
+conditions define the states and pattern actions define the transitions.  Lex
+is therefore best suited to parsing tasks where the language to be parsed can
+be described in terms of regions of tokens. 
+
+Lex is useful in many scenarios and has undoubtedly stood the test of time.
+There are, however, several drawbacks to using Lex.  Lex can impose too much
+overhead for parsing applications where buffering is not required because all
+the characters are available in a single string.  In these cases there is
+structure to the language to be parsed and a parser specification tool can
+help, but employing a heavyweight processing loop that imposes a stream
+``pull'' model and dynamic input buffer allocation is inappropriate.  An
+example of this kind of scenario is the conversion of floating point numbers
+contained in a string to their corresponding numerical values.
+
+Another drawback is the very issue that Ragel attempts to solve.
+It is not possible to execute a user action while
+matching a character contained inside a pattern. For example, if scanning a
+programming language and string literals can contain newlines which must be
+counted, a Lex user must break up a string literal pattern so as to associate
+an action with newlines. This forces the definition of a new start condition.
+Alternatively the user can reprocess the text of the matched string literal to
+count newlines. 
+
+
+The Re2C program defines an input processing model similar to that of Lex.
+Re2C focuses on making generated state machines run very fast and
+integrate easily into any program, free of dependencies.  Re2C generates
+directly executable code and is able to claim that generated parsers run nearly
+as fast as their hand-coded equivalents.  This is very important for user
+adoption, as programmers are reluctant to use a tool when a faster alternative
+exists.  A consideration to ease of use is also important because developers
+need the freedom to integrate the generated code as they see fit. 
+
+Many scripting languages provide ways of composing parsers by linking regular
+expressions using program logic. For example, Sed and Awk are two established
+Unix scripting tools that allow the programmer to exploit regular expressions
+for the purpose of locating and extracting text of interest. High-level
+programming languages such as Perl, Python, PHP and Ruby all provide regular
+expression libraries that allow the user to combine regular expressions with
+arbitrary code.
+
+In addition to supporting the linking of regular expressions with arbitrary
+program logic, the Perl programming language permits the embedding of code into
+regular expressions. Perl embeddings do not translate into the embedding of
+code into deterministic state machines. Perl regular expressions are in fact
+not fully compiled to deterministic machines when embedded code is involved.
+They are instead interpreted and involve backtracking. This is shown by the
+following Perl program. When it is fed the input \verb|abcd| the interpretor
+attempts to match the first alternative, printing \verb|a1 b1|.  When this
+possibility fails it backtracks and tries the second possibility, printing
+\verb|a2 b2|, at which point it succeeds.
+
+\begin{inline_code}
+\begin{verbatim}
+print "YES\n" if ( <STDIN> =~
+        /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) |
+         ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x )
+\end{verbatim}
+\end{inline_code}
+\verbspace
+
+In Ragel there is no regular expression interpretor. Aside from the scanner
+operator, all Ragel expressions are made into deterministic machines and the
+run time simply moves from state to state as it consumes input. An equivalent
+parser expressed in Ragel would attempt both of the alternatives concurrently,
+printing \verb|a1 a2 b1 b2|.
+
+\section{Development Status}
+
+Ragel is a relatively new tool and is under continuous development. As a rough
+release guide, minor revision number changes are for implementation
+improvements and feature additions. Major revision number changes are for
+implementation and language changes that do not preserve backwards
+compatibility. Though in the past this has not always held true: changes that
+break code have crept into minor version number changes. Typically, the
+documentation lags behind the development in the interest of documenting only
+the lasting features. The latest changes are always documented in the ChangeLog
+file. 
+
+\chapter{Constructing State Machines}
+
+\section{Ragel State Machine Specifications}
+
+A Ragel input file consists of a program in the host language that contains embedded machine
+specifications.  Ragel normally passes input straight to output.  When it sees
+a machine specification it stops to read the Ragel statements and possibly generate
+code in place of the specification.
+Afterwards it continues to pass input through.  There
+can be any number of FSM specifications in an input file. A multi-line FSM spec
+starts with \verb|%%{| and ends with \verb|}%%|. A single-line FSM spec starts
+with \verb|%%| and ends at the first newline.  
+
+While Ragel is looking for FSM specifications it does basic lexical analysis on
+the surrounding input. It interprets literal strings and comments so a
+\verb|%%| sequence in either of those will not trigger the parsing of an FSM
+specification. Ragel does not pass the input through any preprocessor nor does it
+interpret preprocessor directives itself so includes, defines and ifdef logic
+cannot be used to alter the parse of a Ragel input file. It is therefore not
+possible to use an \verb|#if 0| directive to comment out a machine as is
+commonly done in C code. As an alternative, a machine can be prevented from
+causing any generated output by commenting out write statements.
+
+In Figure \ref{cmd-line-parsing}, a multi-line specification is used to define the
+machine and single line specifications are used to trigger the writing of the machine
+data and execution code.
+
+\begin{figure}
+\small
+\begin{multicols}{2}
+\begin{verbatim}
+#include <string.h>
+#include <stdio.h>
+
+%%{ 
+    machine foo;
+    main := 
+        ( 'foo' | 'bar' ) 
+        0 @{ res = 1; };
+}%%
+
+%% write data;
+\end{verbatim}
+\verbspace
+\columnbreak
+\begin{verbatim}
+int main( int argc, char **argv )
+{
+    int cs, res = 0;
+    if ( argc > 1 ) {
+        char *p = argv[1];
+        char *pe = p + strlen(p) + 1;
+        %% write init;
+        %% write exec;
+    }
+    printf("result = %i\n", res );
+    return 0;
+}
+\end{verbatim}
+\verbspace
+\end{multicols}
+\caption{Parsing a command line argument.
+}
+\label{cmd-line-parsing}
+\end{figure}
+
+\subsection{Naming Ragel Blocks}
+
+\begin{verbatim}
+machine fsm_name;
+\end{verbatim}
+\verbspace
+
+The \verb|machine| statement gives the name of the FSM. If present in a
+specification, this statement must appear first. If a machine specification
+does not have a name then Ragel uses the previous specification name.  If no
+previous specification name exists then this is an error. Because FSM
+specifications persist in memory, a machine's statements can be spread across
+multiple machine specifications.  This allows one to break up a machine across
+several files or draw in statements that are common to multiple machines using
+the \verb|include| statement.
+
+\subsection{Machine Definition}
+\label{definition}
+
+\begin{verbatim}
+<name> = <expression>;
+\end{verbatim}
+\verbspace
+
+The machine definition statement associates an FSM expression with a name. Machine
+expressions assigned to names can later be referenced in other expressions. A
+definition statement on its own does not cause any states to be generated. It is simply a
+description of a machine to be used later. States are generated only when a definition is
+instantiated, which happens when a definition is referenced in an instantiated
+expression. 
+
+\subsection{Machine Instantiation}
+\label{instantiation}
+
+\begin{verbatim}
+<name> := <expression>;
+\end{verbatim}
+\verbspace
+
+The machine instantiation statement generates a set of states representing an
+expression. Each instantiation generates a distinct set of states.  The starting
+state of the instantiation is written in the data section of the generated code
+using the instantiation name.  If a machine named
+\verb|main| is instantiated, its start state is used as the
+specification's start state and is assigned to the \verb|cs| variable by the
+\verb|write init| command. If no \verb|main| machine is given, the start state
+of the last machine instantiation to appear is used as the specification's
+start state.
+
+From outside the execution loop, control may be passed to any machine by
+assigning the entry point to the \verb|cs| variable.  From inside the execution
+loop, control may be passed to any machine instantiation using \verb|fcall|,
+\verb|fgoto| or \verb|fnext| statements.
+
+\subsection{Including Ragel Code}
+
+\begin{verbatim}
+include FsmName "inputfile.rl";
+\end{verbatim}
+\verbspace
+
+The \verb|include| statement can be used to draw in the statements of another FSM
+specification. Both the name and input file are optional, however at least one
+must be given. Without an FSM name, the given input file is searched for an FSM
+of the same name as the current specification. Without an input file the
+current file is searched for a machine of the given name. If both are present,
+the given input file is searched for a machine of the given name.
+
+Ragel searches for included files from the location of the current file.
+Additional directories can be added to the search path using the \verb|-I|
+option.
+
+\subsection{Importing Definitions}
+\label{import}
+
+\begin{verbatim}
+import "inputfile.h";
+\end{verbatim}
+\verbspace
+
+The \verb|import| statement scrapes a file for sequences of tokens that match
+the following forms. Ragel treats these forms as state machine definitions.
+
+\noindent\hspace*{24pt}\verb|name '=' number|\\
+\noindent\hspace*{24pt}\verb|name '=' lit_string|\\
+\noindent\hspace*{24pt}\verb|'define' name number|\\
+\noindent\hspace*{24pt}\verb|'define' name lit_string|
+\vspace{12pt}
+
+If the input file is a Ragel program then tokens inside any Ragel
+specifications are ignored. See Section \ref{export} for a description of
+exporting machine definitions.
+
+Ragel searches for imported files from the location of the current file.
+Additional directories can be added to the search path using the \verb|-I|
+option.
+
+\section{Lexical Analysis of a Ragel Block}
+\label{lexing}
+
+Within a machine specification the following lexical rules apply to the input.
+
+\begin{itemize}
+
+\item The \verb|#| symbol begins a comment that terminates at the next newline.
+
+\item The symbols \verb|""|, \verb|''|, \verb|//|, \verb|[]| behave as the
+delimiters of literal strings. Within them, the following escape sequences 
+are interpreted: 
+
+\verb|    \0 \a \b \t \n \v \f \r|
+
+A backslash at the end of a line joins the following line onto the current. A
+backslash preceding any other character removes special meaning. This applies
+to terminating characters and to special characters in regular expression
+literals. As an exception, regular expression literals do not support escape
+sequences as the operands of a range within a list. See the bullet on regular
+expressions in Section \ref{basic}.
+
+\item The symbols \verb|{}| delimit a block of host language code that will be
+embedded into the machine as an action.  Within the block of host language
+code, basic lexical analysis of comments and strings is done in order to
+correctly find the closing brace of the block. With the exception of FSM
+commands embedded in code blocks, the entire block is preserved as is for
+identical reproduction in the output code.
+
+\item The pattern \verb|[+-]?[0-9]+| denotes an integer in decimal format.
+Integers used for specifying machines may be negative only if the alphabet type
+is signed. Integers used for specifying priorities may be positive or negative.
+
+\item The pattern \verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal
+format.
+
+\item The keywords are \verb|access|, \verb|action|, \verb|alphtype|,
+\verb|getkey|, \verb|write|, \verb|machine| and \verb|include|.
+
+\item The pattern \verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier.
+
+
+\item Any amount of whitespace may separate tokens.
+
+\end{itemize}
+
+
+\section{Basic Machines}
+\label{basic}
+
+The basic machines are the base operands of regular language expressions. They
+are the smallest unit to which machine construction and manipulation operators
+can be applied.
+
+\begin{itemize}
+
+\item \verb|'hello'| -- Concatenation Literal. Produces a machine that matches
+the sequence of characters in the quoted string. If there are 5 characters
+there will be 6 states chained together with the characters in the string. See
+Section \ref{lexing} for information on valid escape sequences. 
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmconcat}
+\end{center}
+\graphspace
+
+It is possible
+to make a concatenation literal case-insensitive by appending an \verb|i| to
+the string, for example \verb|'cmd'i|.
+
+\item \verb|"hello"| -- Identical to the single quoted version.
+
+\item \verb|[hello]| -- Or Expression. Produces a union of characters.  There
+will be two states with a transition for each unique character between the two states.
+The \verb|[]| delimiters behave like the quotes of a literal string. For example, 
+\verb|[ \t]| means tab or space. The \verb|or| expression supports character ranges
+with the \verb|-| symbol as a separator. The meaning of the union can be negated
+using an initial \verb|^| character as in standard regular expressions. 
+See Section \ref{lexing} for information on valid escape sequences
+in \verb|or| expressions.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmor}
+\end{center}
+\graphspace
+
+\item \verb|''|, \verb|""|, and \verb|[]| -- Zero Length Machine.  Produces a machine
+that matches the zero length string. Zero length machines have one state that is both
+a start state and a final state.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmnull}
+\end{center}
+\graphspace
+
+% FIXME: More on the range of values here.
+\item \verb|42| -- Numerical Literal. Produces a two state machine with one
+transition on the given number. The number may be in decimal or hexadecimal
+format and should be in the range allowed by the alphabet type. The minimum and
+maximum values permitted are defined by the host machine that Ragel is compiled
+on. For example, numbers in a \verb|short| alphabet on an i386 machine should
+be in the range \verb|-32768| to \verb|32767|.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmnum}
+\end{center}
+\graphspace
+
+\item \verb|/simple_regex/| -- Regular Expression. Regular expressions are
+parsed as a series of expressions that are concatenated together. Each
+concatenated expression
+may be a literal character, the ``any'' character specified by the \verb|.|
+symbol, or a union of characters specified by the \verb|[]| delimiters. If the
+first character of a union is \verb|^| then it matches any character not in the
+list. Within a union, a range of characters can be given by separating the first
+and last characters of the range with the \verb|-| symbol. Each
+concatenated machine may have repetition specified by following it with the
+\verb|*| symbol. The standard escape sequences described in Section
+\ref{lexing} are supported everywhere in regular expressions except as the
+operands of a range within in a list. This notation also supports the \verb|i|
+trailing option. Use it to produce case-insensitive machines, as in \verb|/GET/i|.
+
+Ragel does not support very complex regular expressions because the desired
+results can always be achieved using the more general machine construction
+operators listed in Section \ref{machconst}. The following diagram shows the
+result of compiling \verb|/ab*[c-z].*[123]/|. \verb|DEF| represents the default
+transition, which is taken if no other transition can be taken. 
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmregex}
+\end{center}
+\graphspace
+
+\item \verb|'a' .. 'z'| -- Range. Produces a machine that matches any
+characters in the specified range.  Allowable upper and lower bounds of the
+range are concatenation literals of length one and numerical literals.  For
+example, \verb|0x10..0x20|, \verb|0..63|, and \verb|'a'..'z'| are valid ranges.
+The bounds should be in the range allowed by the alphabet type.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmrange}
+\end{center}
+\graphspace
+
+\item \verb|variable_name| -- Lookup the machine definition assigned to the
+variable name given and use an instance of it. See Section \ref{definition} for
+an important note on what it means to reference a variable name.
+
+\item \verb|builtin_machine| -- There are several built-in machines available
+for use. They are all two state machines for the purpose of matching common
+classes of characters. They are:
+
+\begin{itemize}
+
+\item \verb|any   | -- Any character in the alphabet.
+
+\item \verb|ascii | -- Ascii characters. \verb|0..127|
+
+\item \verb|extend| -- Ascii extended characters. This is the range
+\verb|-128..127| for signed alphabets and the range \verb|0..255| for unsigned
+alphabets.
+
+\item \verb|alpha | -- Alphabetic characters. \verb|[A-Za-z]|
+
+\item \verb|digit | -- Digits. \verb|[0-9]|
+
+\item \verb|alnum | -- Alpha numerics. \verb|[0-9A-Za-z]|
+
+\item \verb|lower | -- Lowercase characters. \verb|[a-z]|
+
+\item \verb|upper | -- Uppercase characters. \verb|[A-Z]|
+
+\item \verb|xdigit| -- Hexadecimal digits. \verb|[0-9A-Fa-f]|
+
+\item \verb|cntrl | -- Control characters. \verb|0..31|
+
+\item \verb|graph | -- Graphical characters. \verb|[!-~]|
+
+\item \verb|print | -- Printable characters. \verb|[ -~]|
+
+\item \verb|punct | -- Punctuation. Graphical characters that are not alphanumerics.
+\verb|[!-/:-@[-`{-~]|
+
+\item \verb|space | -- Whitespace. \verb|[\t\v\f\n\r ]|
+
+\item \verb|zlen  | -- Zero length string. \verb|""|
+
+\item \verb|empty | -- Empty set. Matches nothing. \verb|^any|
+
+\end{itemize}
+\end{itemize}
+
+\section{Operator Precedence}
+The following table shows operator precedence from lowest to highest. Operators
+in the same precedence group are evaluated from left to right.
+
+\begin{tabular}{|c|c|c|}
+\hline
+1&\verb| , |&Join\\
+\hline
+2&\verb/ | & - --/&Union, Intersection and Subtraction\\
+\hline
+3&\verb| . <: :> :>> |&Concatenation\\
+\hline
+4&\verb| : |&Label\\
+\hline
+5&\verb| -> |&Epsilon Transition\\
+\hline
+6&\verb| >  @  $  % |&Transitions Actions and Priorities\\
+\hline
+6&\verb| >/  $/  %/  </  @/  <>/ |&EOF Actions\\
+\hline
+6&\verb| >!  $!  %!  <!  @!  <>! |&Global Error Actions\\
+\hline
+6&\verb| >^  $^  %^  <^  @^  <>^ |&Local Error Actions\\
+\hline
+6&\verb| >~  $~  %~  <~  @~  <>~ |&To-State Actions\\
+\hline
+6&\verb| >*  $*  %*  <*  @*  <>* |&From-State Action\\
+\hline
+7&\verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition\\
+\hline
+8&\verb| ! ^ |&Negation and Character-Level Negation\\
+\hline
+9&\verb| ( <expr> ) |&Grouping\\
+\hline
+\end{tabular}
+
+\section{Regular Language Operators}
+\label{machconst}
+
+When using Ragel it is helpful to have a sense of how it constructs machines.
+The determinization process can produce results that seem unusual to someone
+not familiar with the NFA to DFA conversion algorithm. In this section we
+describe Ragel's state machine operators. Though the operators are defined
+using epsilon transitions, it should be noted that this is for discussion only.
+The epsilon transitions described in this section do not persist, but are
+immediately removed by the determinization process which is executed at every
+operation. Ragel does not make use of any nondeterministic intermediate state
+machines. 
+
+To create an epsilon transition between two states \verb|x| and \verb|y| is to
+copy all of the properties of \verb|y| into \verb|x|. This involves drawing in
+all of \verb|y|'s to-state actions, EOF actions, etc., in addition to its
+transitions. If \verb|x| and \verb|y| both have a transition out on the same
+character, then the transitions must be combined.  During transition
+combination a new transition is made that goes to a new state that is the
+combination of both target states. The new combination state is created using
+the same epsilon transition method.  The new state has an epsilon transition
+drawn to all the states that compose it. Since the creation of new epsilon
+transitions may be triggered every time an epsilon transition is drawn, the
+process of drawing epsilon transitions is repeated until there are no more
+epsilon transitions to be made.
+
+A very common error that is made when using Ragel is to make machines that do
+too much. That is, to create machines that have unintentional
+nondetermistic properties. This usually results from being unaware of the common strings
+between machines that are combined together using the regular language
+operators. This can involve never leaving a machine, causing its actions to be
+propagated through all the following states. Or it can involve an alternation
+where both branches are unintentionally taken simultaneously.
+
+This problem forces one to think hard about the language that needs to be
+matched. To guard against this kind of problem one must ensure that the machine
+specification is divided up using boundaries that do not allow ambiguities from
+one portion of the machine to the next. See Chapter
+\ref{controlling-nondeterminism} for more on this problem and how to solve it.
+
+The Graphviz tool is an immense help when debugging improperly compiled
+machines or otherwise learning how to use Ragel. Graphviz Dot files can be
+generated from Ragel programs using the \verb|-V| option. See Section
+\ref{visualization} for more information.
+
+
+\subsection{Union}
+
+\verb/expr | expr/
+
+The union operation produces a machine that matches any string in machine one
+or machine two. The operation first creates a new start state. Epsilon
+transitions are drawn from the new start state to the start states of both
+input machines.  The resulting machine has a final state set equivalent to the
+union of the final state sets of both input machines. In this operation, there
+is the opportunity for nondeterminism among both branches. If there are
+strings, or prefixes of strings that are matched by both machines then the new
+machine will follow both parts of the alternation at once. The union operation is
+shown below.
+
+\graphspace
+\begin{center}
+\includegraphics[scale=1.0]{opor}
+\end{center}
+\graphspace
+
+The following example demonstrates the union of three machines representing
+common tokens.
+
+% GENERATE: exor
+% OPT: -p
+% %%{
+% machine exor;
+\begin{inline_code}
+\begin{verbatim}
+# Hex digits, decimal digits, or identifiers
+main := '0x' xdigit+ | digit+ | alpha alnum*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exor}
+\end{center}
+\graphspace
+
+\subsection{Intersection}
+
+\verb|expr & expr|
+
+Intersection produces a machine that matches any
+string that is in both machine one and machine two. To achieve intersection, a
+union is performed on the two machines. After the result has been made
+deterministic, any final state that is not a combination of final states from
+both machines has its final state status revoked. To complete the operation,
+paths that do not lead to a final state are pruned from the machine. Therefore,
+if there are any such paths in either of the expressions they will be removed
+by the intersection operator.  Intersection can be used to require that two
+independent patterns be simultaneously satisfied as in the following example.
+
+% GENERATE: exinter
+% OPT: -p
+% %%{
+% machine exinter;
+\begin{inline_code}
+\begin{verbatim}
+# Match lines four characters wide that contain 
+# words separated by whitespace.
+main :=
+    /[^\n][^\n][^\n][^\n]\n/* &
+    (/[a-z][a-z]*/ | [ \n])**;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exinter}
+\end{center}
+\graphspace
+
+\subsection{Difference}
+
+\verb|expr - expr|
+
+The difference operation produces a machine that matches
+strings that are in machine one but are not in machine two. To achieve subtraction,
+a union is performed on the two machines. After the result has been made
+deterministic, any final state that came from machine two or is a combination
+of states involving a final state from machine two has its final state status
+revoked. As with intersection, the operation is completed by pruning any path
+that does not lead to a final state.  The following example demonstrates the
+use of subtraction to exclude specific cases from a set.
+
+% GENERATE: exsubtr
+% OPT: -p
+% %%{
+% machine exsubtr;
+\begin{inline_code}
+\begin{verbatim}
+# Subtract keywords from identifiers.
+main := /[a-z][a-z]*/ - ( 'for' | 'int' );
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exsubtr}
+\end{center}
+\graphspace
+
+\subsection{Strong Difference}
+\label{strong_difference}
+
+\verb|expr -- expr|
+
+Strong difference produces a machine that matches any string of the first
+machine that does not have any string of the second machine as a substring. In
+the following example, strong subtraction is used to excluded \verb|CRLF| from
+a sequence. In the corresponding visualization, the label \verb|DEF| is short
+for default. The default transition is taken if no other transition can be
+taken.
+
+% GENERATE: exstrongsubtr
+% OPT: -p
+% %%{
+% machine exstrongsubtr;
+\begin{inline_code}
+\begin{verbatim}
+crlf = '\r\n';
+main := [a-z]+ ':' ( any* -- crlf ) crlf;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exstrongsubtr}
+\end{center}
+\graphspace
+
+This operator is equivalent to the following.
+
+\begin{verbatim}
+expr - ( any* expr any* )
+\end{verbatim}
+\verbspace
+
+\subsection{Concatenation}
+
+\verb|expr . expr|
+
+Concatenation produces a machine that matches all the strings in machine one followed by all
+the strings in machine two.  Concatenation draws epsilon transitions from the
+final states of the first machine to the start state of the second machine. The
+final states of the first machine lose their final state status, unless the
+start state of the second machine is final as well. 
+Concatenation is the default operator. Two machines next to each other with no
+operator between them results in concatenation.
+
+\graphspace
+\begin{center}
+\includegraphics[scale=1.0]{opconcat}
+\end{center}
+\graphspace
+
+The opportunity for nondeterministic behaviour results from the possibility of
+the final states of the first machine accepting a string that is also accepted
+by the start state of the second machine.
+The most common scenario in which this happens is the
+concatenation of a machine that repeats some pattern with a machine that gives
+a terminating string, but the repetition machine does not exclude the
+terminating string. The example in Section \ref{strong_difference}
+guards against this. Another example is the expression \verb|("'" any* "'")|.
+When executed the thread of control will
+never leave the \verb|any*| machine.  This is a problem especially if actions
+are embedded to process the characters of the \verb|any*| component.
+
+In the following example, the first machine is always active due to the
+nondeterministic nature of concatenation. This particular nondeterminism is intended
+however because we wish to permit EOF strings before the end of the input.
+
+% GENERATE: exconcat
+% OPT: -p
+% %%{
+% machine exconcat;
+\begin{inline_code}
+\begin{verbatim}
+# Require an eof marker on the last line.
+main := /[^\n]*\n/* . 'EOF\n';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exconcat}
+\end{center}
+\graphspace
+
+There is a language
+ambiguity involving concatenation and subtraction. Because concatenation is the 
+default operator for two
+adjacent machines there is an ambiguity between subtraction of
+a positive numerical literal and concatenation of a negative numerical literal.
+For example, \verb|(x-7)| could be interpreted as \verb|(x . -7)| or 
+\verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence
+over concatenation of a negative literal. We adhere to the rule that the default
+concatenation operator takes effect only when there are no other operators between
+two machines. Beware of writing machines such as \verb|(any -1)| when what is
+desired is a concatenation of \verb|any| and \verb|-1|. Instead write 
+\verb|(any . -1)| or \verb|(any (-1))|. If in doubt of the meaning of your program do not
+rely on the default concatenation operator; always use the \verb|.| symbol.
+
+
+\subsection{Kleene Star}
+
+\verb|expr*|
+
+The machine resulting from the Kleene Star operator will match zero or more
+repetitions of the machine it is applied to.
+It creates a new start state and an additional final
+state.  Epsilon transitions are drawn between the new start state and the old start
+state, between the new start state and the new final state, and
+between the final states of the machine and the new start state.  After the
+machine is made deterministic the effect is of the final states getting all the
+transitions of the start state. 
+
+\graphspace
+\begin{center}
+\includegraphics[scale=1.0]{opstar}
+\end{center}
+\graphspace
+
+The possibility for nondeterministic behaviour arises if the final states have
+transitions on any of the same characters as the start state.  This is common
+when applying kleene star to an alternation of tokens. Like the other problems
+arising from nondeterministic behavior, this is discussed in more detail in Chapter
+\ref{controlling-nondeterminism}. This particular problem can also be solved
+by using the longest-match construction discussed in Section 
+\ref{generating-scanners} on scanners.
+
+In this 
+example, there is no nondeterminism introduced by the exterior kleene star due to
+the newline at the end of the regular expression. Without the newline the
+exterior kleene star would be redundant and there would be ambiguity between
+repeating the inner range of the regular expression and the entire regular
+expression. Though it would not cause a problem in this case, unnecessary
+nondeterminism in the kleene star operator often causes undesired results for
+new Ragel users and must be guarded against.
+
+% GENERATE: exstar
+% OPT: -p
+% %%{
+% machine exstar;
+\begin{inline_code}
+\begin{verbatim}
+# Match any number of lines with only lowercase letters.
+main := /[a-z]*\n/*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exstar}
+\end{center}
+\graphspace
+
+\subsection{One Or More Repetition}
+
+\verb|expr+|
+
+This operator produces the concatenation of the machine with the kleene star of
+itself. The result will match one or more repetitions of the machine. The plus
+operator is equivalent to \verb|(expr . expr*)|.  
+
+% GENERATE: explus
+% OPT: -p
+% %%{
+% machine explus;
+\begin{inline_code}
+\begin{verbatim}
+# Match alpha-numeric words.
+main := alnum+;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{explus}
+\end{center}
+\graphspace
+
+\subsection{Optional}
+
+\verb|expr?|
+
+The {\em optional} operator produces a machine that accepts the machine
+given or the zero length string. The optional operator is equivalent to
+\verb/(expr | '' )/. In the following example the optional operator is used to
+possibly extend a token.
+
+% GENERATE: exoption
+% OPT: -p
+% %%{
+% machine exoption;
+\begin{inline_code}
+\begin{verbatim}
+# Match integers or floats.
+main := digit+ ('.' digit+)?;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exoption}
+\end{center}
+\graphspace
+
+\subsection{Repetition}
+
+\noindent\hspace*{24pt}\verb|expr {n}| -- Exactly N copies of expr.\\
+\noindent\hspace*{24pt}\verb|expr {,n}| -- Zero to N copies of expr.\\
+\noindent\hspace*{24pt}\verb|expr {n,}| -- N or more copies of expr.\\
+\noindent\hspace*{24pt}\verb|expr {n,m}| -- N to M copies of expr.
+\vspace{12pt}
+
+\subsection{Negation}
+
+\verb|!expr|
+
+Negation produces a machine that matches any string not matched by the given
+machine. Negation is equivalent to \verb|(any* - expr)|.
+
+% GENERATE: exnegate
+% OPT: -p
+% %%{
+% machine exnegate;
+\begin{inline_code}
+\begin{verbatim}
+# Accept anything but a string beginning with a digit.
+main := ! ( digit any* );
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exnegate}
+\end{center}
+\graphspace
+
+\subsection{Character-Level Negation}
+
+\verb|^expr|
+
+Character-level negation produces a machine that matches any single character
+not matched by the given machine. Character-Level Negation is equivalent to
+\verb|(any - expr)|. It must be applied only to machines that match strings of
+length one.
+
+\section{State Machine Minimization}
+
+State machine minimization is the process of finding the minimal equivalent FSM accepting
+the language. Minimization reduces the number of states in machines
+by merging equivalent states. It does not change the behaviour of the machine
+in any way. It will cause some states to be merged into one because they are
+functionally equivalent. State minimization is on by default. It can be turned
+off with the \verb|-n| option.
+
+The algorithm implemented is similar to Hopcroft's state minimization
+algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in
+memory, whereas Ragel supports arbitrary integer alphabets that cannot be
+listed in memory. Though exact analysis is very difficult, Ragel minimization
+runs close to O(n * log(n)) and requires O(n) temporary storage where
+$n$ is the number of states.
+
+\section{Visualization}
+\label{visualization}
+
+%In many cases, practical
+%parsing programs will be too large to completely visualize with Graphviz.  The
+%proper approach is to reduce the language to the smallest subset possible that
+%still exhibits the characteristics that one wishes to learn about or to fix.
+%This can be done without modifying the source code using the \verb|-M| and
+%\verb|-S| options. If a machine cannot be easily reduced,
+%embeddings of unique actions can be very useful for tracing a
+%particular component of a larger machine specification, since action names are
+%written out on transition labels.
+
+Ragel is able to emit compiled state machines in Graphviz's Dot file format.
+This is done using the \verb|-V| option.
+Graphviz support allows users to perform
+incremental visualization of their parsers. User actions are displayed on
+transition labels of the graph. 
+
+If the final graph is too large to be
+meaningful, or even drawn, the user is able to inspect portions of the parser
+by naming particular regular expression definitions with the \verb|-S| and
+\verb|-M| options to the \verb|ragel| program. Use of Graphviz greatly
+improves the Ragel programming experience. It allows users to learn Ragel by
+experimentation and also to track down bugs caused by unintended
+nondeterminism.
+
+Ragel has another option to help debugging. The \verb|-x| option causes Ragel
+to emit the compiled machine in an XML format.
+
+\chapter{User Actions}
+
+Ragel permits the user to embed actions into the transitions of a regular
+expression's corresponding state machine. These actions are executed when the
+generated code moves over a transition.  Like the regular expression operators,
+the action embedding operators are fully compositional. They take a state
+machine and an action as input, embed the action and yield a new state machine
+that can be used in the construction of other machines. Due to the
+compositional nature of embeddings, the user has complete freedom in the
+placement of actions.
+
+A machine's transitions are categorized into four classes. The action embedding
+operators access the transitions defined by these classes.  The {\em entering
+transition} operator \verb|>| isolates the start state, then embeds an action
+into all transitions leaving it. The {\em finishing transition} operator
+\verb|@| embeds an action into all transitions going into a final state.  The
+{\em all transition} operator \verb|$| embeds an action into all transitions of
+an expression. The {\em leaving transition} operator \verb|%| provides access
+to the yet-unmade transitions moving out of the machine via the final states. 
+
+\section{Embedding Actions}
+
+\begin{verbatim}
+action ActionName {
+    /* Code an action here. */
+    count += 1;
+}
+\end{verbatim}
+\verbspace
+
+The action statement defines a block of code that can be embedded into an FSM.
+Action names can be referenced by the action embedding operators in
+expressions. Though actions need not be named in this way (literal blocks
+of code can be embedded directly when building machines), defining reusable
+blocks of code whenever possible is good practice because it potentially increases the
+degree to which the machine can be minimized. 
+
+Within an action some Ragel expressions and statements are parsed and
+translated. These allow the user to interact with the machine from action code.
+See Section \ref{vals} for a complete list of statements and values available
+in code blocks. 
+
+\subsection{Entering Action}
+
+\verb|expr > action| 
+
+The entering action operator embeds an action into all transitions
+that enter into the machine from the start state. If the start state is final,
+then the action is also embedded into the start state as a leaving action. This
+means that if a machine accepts the zero-length string and control passes
+through the start state then the entering action is executed. Note
+that this can happen on both a following character and on the EOF event.
+
+In some machines the start state has transtions coming in from within the
+machine. In these cases the start state is first isolated from the rest of the
+machine ensuring that the entering actions are exected once only.
+
+% GENERATE: exstact
+% OPT: -p
+% %%{
+% machine exstact;
+\begin{inline_code}
+\begin{verbatim}
+# Execute A at the beginning of a string of alpha.
+action A {}
+main := ( lower* >A ) . ' ';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exstact}
+\end{center}
+\graphspace
+
+\subsection{Finishing Action}
+
+\verb|expr @ action|
+
+The finishing action operator embeds an action into any transitions that move
+the machine into a final state. Further input may move the machine out of the
+final state, but keep it in the machine. Therefore finishing actions may be
+executed more than once if a machine has any internal transitions out of a
+final state. In the following example the final state has no transitions out
+and the finishing action is executed only once.
+
+% GENERATE: exdoneact
+% OPT: -p
+% %%{
+% machine exdoneact;
+% action A {}
+\begin{inline_code}
+\begin{verbatim}
+# Execute A when the trailing space is seen.
+main := ( lower* ' ' ) @A;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exdoneact}
+\end{center}
+\graphspace
+
+\subsection{All Transition Action}
+
+\verb|expr $ action|
+
+The all transition operator embeds an action into all transitions of a machine.
+The action is executed whenever a transition of the machine is taken. In the
+following example, A is executed on every character matched.
+
+% GENERATE: exallact
+% OPT: -p
+% %%{
+% machine exallact;
+% action A {}
+\begin{inline_code}
+\begin{verbatim}
+# Execute A on any characters of the machine.
+main := ( 'm1' | 'm2' ) $A;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exallact}
+\end{center}
+\graphspace
+
+\subsection{Leaving Actions}
+\label{out-actions}
+
+\verb|expr % action|
+
+The leaving action operator queues an action for embedding into the transitions
+that go out of a machine via a final state. The action is first stored in
+the machine's final states and is later transferred to any transitions that are
+made going out of the machine by a kleene star or concatenation operation.
+
+If a final state of the machine is still final when compilation is complete
+then the leaving action is also embedded as an EOF action. Therefore, leaving
+the machine is defined as either leaving on a character or as state machine
+acceptance.
+
+This operator allows one to associate an action with the termination of a
+sequence, without being concerned about what particular character terminates
+the sequence. In the following example, A is executed when leaving the alpha
+machine on the newline character.
+
+% GENERATE: exoutact1
+% OPT: -p
+% %%{
+% machine exoutact1;
+% action A {}
+\begin{inline_code}
+\begin{verbatim}
+# Match a word followed by a newline. Execute A when 
+# finishing the word.
+main := ( lower+ %A ) . '\n';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exoutact1}
+\end{center}
+\graphspace
+
+In the following example, the \verb|term_word| action could be used to register
+the appearance of a word and to clear the buffer that the \verb|lower| action used
+to store the text of it.
+
+% GENERATE: exoutact2
+% OPT: -p
+% %%{
+% machine exoutact2;
+% action lower {}
+% action space {}
+% action term_word {}
+% action newline {}
+\begin{inline_code}
+\begin{verbatim}
+word = ( [a-z] @lower )+ %term_word;
+main := word ( ' ' @space word )* '\n' @newline;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exoutact2}
+\end{center}
+\graphspace
+
+In this final example of the action embedding operators, A is executed upon entering
+the alpha machine, B is executed on all transitions of the
+alpha machine, C is executed when the alpha machine is exited by moving into the
+newline machine and N is executed when the newline machine moves into a final
+state.  
+
+% GENERATE: exaction
+% OPT: -p
+% %%{
+% machine exaction;
+% action A {}
+% action B {}
+% action C {}
+% action N {}
+\begin{inline_code}
+\begin{verbatim}
+# Execute A on starting the alpha machine, B on every transition 
+# moving through it and C upon finishing. Execute N on the newline.
+main := ( lower* >A $B %C ) . '\n' @N;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exaction}
+\end{center}
+\graphspace
+
+
+\section{State Action Embedding Operators}
+
+The state embedding operators allow one to embed actions into states. Like the
+transition embedding operators, there are several different classes of states
+that the operators access. The meanings of the symbols are similar to the
+meanings of the symbols used for the transition embedding operators. The design
+of the state selections was driven by a need to cover the states of an
+expression with exactly one error action.
+
+Unlike the transition embedding operators, the state embedding operators are
+also distinguished by the different kinds of events that embedded actions can
+be associated with. Therefore the state embedding operators have two
+components.  The first, which is the first one or two characters, specifies the
+class of states that the action will be embedded into. The second component
+specifies the type of event the action will be executed on. The symbols of the
+second component also have equivalent kewords. 
+
+\begin{multicols}{2}
+The different classes of states are:
+
+\noindent\hspace*{24pt}\verb|> | -- the start state\\
+\noindent\hspace*{24pt}\verb|< | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$ | -- all states\\
+\noindent\hspace*{24pt}\verb|% | -- final states\\
+\noindent\hspace*{24pt}\verb|@ | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>| -- any except start and final (middle)
+\vspace{12pt}
+
+\columnbreak
+
+The different kinds of embeddings are:
+
+\noindent\hspace*{24pt}\verb|~| -- to-state actions (\verb|to|)\\
+\noindent\hspace*{24pt}\verb|*| -- from-state actions (\verb|from|)\\
+\noindent\hspace*{24pt}\verb|/| -- EOF actions (\verb|eof|)\\
+\noindent\hspace*{24pt}\verb|!| -- error actions (\verb|err|)\\
+\noindent\hspace*{24pt}\verb|^| -- local error actions (\verb|lerr|)
+\vspace{12pt}
+
+\end{multicols}
+
+\subsection{To-State and From-State Actions}
+
+\subsubsection{To-State Actions}
+
+\noindent\hspace*{24pt}\verb|>~action      >to(name)      >to{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|<~action      <to(name)      <to{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$~action      $to(name)      $to{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%~action      %to(name)      %to{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@~action      @to(name)      @to{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>~action     <>to(name)     <>to{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+
+To-state actions are executed whenever the state machine moves into the
+specified state, either by a natural movement over a transition or by an
+action-based transfer of control such as \verb|fgoto|. They are executed after the
+in-transition's actions but before the current character is advanced and
+tested against the end of the input block. To-state embeddings stay with the
+state. They are irrespective of the state's current set of transitions and any
+future transitions that may be added in or out of the state.
+
+Note that the setting of the current state variable \verb|cs| outside of the
+execute code is not considered by Ragel as moving into a state and consequently
+the to-state actions of the new current state are not executed. This includes
+the initialization of the current state when the machine begins.  This is
+because the entry point into the machine execution code is after the execution
+of to-state actions.
+
+\subsubsection{From-State Actions}
+
+\noindent\hspace*{24pt}\verb|>*action     >from(name)     >from{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|<*action     <from(name)     <from{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$*action     $from(name)     $from{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%*action     %from(name)     %from{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@*action     @from(name)     @from{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>*action    <>from(name)    <>from{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+From-state actions are executed whenever the state machine takes a transition from a
+state, either to itself or to some other state. These actions are executed
+immediately after the current character is tested against the input block end
+marker and before the transition to take is sought based on the current
+character. From-state actions are therefore executed even if a transition
+cannot be found and the machine moves into the error state.  Like to-state
+embeddings, from-state embeddings stay with the state.
+
+\subsection{EOF Actions}
+
+\noindent\hspace*{24pt}\verb|>/action     >eof(name)     >eof{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|</action     <eof(name)     <eof{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$/action     $eof(name)     $eof{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%/action     %eof(name)     %eof{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@/action     @eof(name)     @eof{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>/action    <>eof(name)    <>eof{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+The EOF action embedding operators enable the user to embed actions that are
+executed at the end of the input stream. EOF actions are stored in states and
+generated in the \verb|write exec| block. They are run when \verb|p == pe == eof|
+as the execute block is finishing. EOF actions are free to adjust \verb|p| and
+jump to another part of the machine to restart execution.
+
+\subsection{Handling Errors}
+
+In many applications it is useful to be able to react to parsing errors.  The
+user may wish to print an error message that depends on the context.  It
+may also be desirable to consume input in an attempt to return the input stream
+to some known state and resume parsing. To support error handling and recovery,
+Ragel provides error action embedding operators. There are two kinds of error
+actions: global error actions and local error actions.
+Error actions can be used to simply report errors, or by jumping to a machine
+instantiation that consumes input, can attempt to recover from errors.  
+
+\subsubsection{Global Error Actions}
+
+\noindent\hspace*{24pt}\verb|>!action     >err(name)     >err{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|<!action     <err(name)     <err{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$!action     $err(name)     $err{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%!action     %err(name)     %err{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@!action     @err(name)     @err{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>!action    <>err(name)    <>err{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+Global error actions are stored in the states they are embedded into until
+compilation is complete. They are then transferred to the transitions that move
+into the error state. These transitions are taken on all input characters that
+are not already covered by the state's transitions. If a state with an error
+action is not final when compilation is complete, then the action is also
+embedded as an EOF action.
+
+Error actions can be used to recover from errors by jumping back into the
+machine with \verb|fgoto| and optionally altering \verb|p|.
+
+\subsubsection{Local Error Actions}
+
+\noindent\hspace*{24pt}\verb|>^action     >lerr(name)     >lerr{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|<^action     <lerr(name)     <lerr{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$^action     $lerr(name)     $lerr{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%^action     %lerr(name)     %lerr{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@^action     @lerr(name)     @lerr{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>^action    <>lerr(name)    <>lerr{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+Like global error actions, local error actions are also stored in the states
+they are embedded into until a transfer point. The transfer point is different
+however. Each local error action embedding is associated with a name. When a
+machine definition has been fully constructed, all local error action
+embeddings associated with the same name as the machine definition are
+transferred to the error transitions. At this time they are also embedded as
+EOF actions in the case of non-final states.
+
+Local error actions can be used to specify an action to take when a particular
+section of a larger state machine fails to match. A particular machine
+definition's ``thread'' may die and the local error actions executed, however
+the machine as a whole may continue to match input.
+
+There are two forms of local error action embeddings. In the first form the
+name defaults to the current machine. In the second form the machine name can
+be specified.  This is useful when it is more convenient to specify the local
+error action in a sub-definition that is used to construct the machine
+definition that the local error action is associated with. To embed local 
+error actions and
+explicitly state the machine definition on which the transfer is to happen use
+\verb|(name, action)| as the action.
+
+\subsubsection{Example}
+
+The following example uses error actions to report an error and jump to a
+machine that consumes the remainder of the line when parsing fails. After
+consuming the line, the error recovery machine returns to the main loop.
+
+% GENERATE: erract
+% %%{
+%   machine erract;
+%   ws = ' ';
+%   address = 'foo AT bar..com';
+%   date = 'Monday May 12';
+\begin{inline_code}
+\begin{verbatim}
+action cmd_err { 
+    printf( "command error\n" ); 
+    fhold; fgoto line;
+}
+action from_err { 
+    printf( "from error\n" ); 
+    fhold; fgoto line; 
+}
+action to_err { 
+    printf( "to error\n" ); 
+    fhold; fgoto line;
+}
+
+line := [^\n]* '\n' @{ fgoto main; };
+
+main := (
+    (
+        'from' @err(cmd_err) 
+            ( ws+ address ws+ date '\n' ) $err(from_err) |
+        'to' @err(cmd_err)
+            ( ws+ address '\n' ) $err(to_err)
+    ) 
+)*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% %% write data;
+% void f()
+% {
+%   %% write init;
+%   %% write exec;
+% }
+% END GENERATE
+
+
+
+\section{Action Ordering and Duplicates}
+
+When combining expressions that have embedded actions it is often the case that
+a number of actions must be executed on a single input character. For example,
+following a concatenation the leaving action of the left expression and the
+entering action of the right expression will be embedded into one transition.
+This requires a method of ordering actions that is intuitive and
+predictable for the user, and repeatable for the compiler. 
+
+We associate with the embedding of each action a unique timestamp that is
+used to order actions that appear together on a single transition in the final
+state machine. To accomplish this we recursively traverse the parse tree of
+regular expressions and assign timestamps to action embeddings. References to
+machine definitions are followed in the traversal. When we visit a
+parse tree node we assign timestamps to all {\em entering} action embeddings,
+recurse on the parse tree, then assign timestamps to the remaining {\em all},
+{\em finishing}, and {\em leaving} embeddings in the order in which they
+appear.
+
+By default Ragel does not permit a single action to appear multiple times in an action
+list. When the final machine has been created, actions that appear more than
+once in a single transition, to-state, from-state or EOF action list have their
+duplicates removed.
+The first appearance of the action is preserved. This is useful in a number of
+scenarios. First, it allows us to union machines with common prefixes without
+worrying about the action embeddings in the prefix being duplicated. Second, it
+prevents leaving actions from being transferred multiple times. This can
+happen when a machine is repeated, then followed with another machine that
+begins with a common character. For example:
+
+\begin{verbatim}
+word = [a-z]+ %act;
+main := word ( '\n' word )* '\n\n';
+\end{verbatim}
+\verbspace
+
+Note that Ragel does not compare action bodies to determine if they have
+identical program text. It simply checks for duplicates using each action
+block's unique location in the program.
+
+The removal of duplicates can be turned off using the \verb|-d| option.
+
+\section{Values and Statements Available in Code Blocks}
+\label{vals}
+
+The following values are available in code blocks:
+
+\begin{itemize}
+\item \verb|fpc| -- A pointer to the current character. This is equivalent to
+accessing the \verb|p| variable.
+
+\item \verb|fc| -- The current character. This is equivalent to the expression \verb|(*p)|.
+
+\item \verb|fcurs| -- An integer value representing the current state. This
+value should only be read from. To move to a different place in the machine
+from action code use the \verb|fgoto|, \verb|fnext| or \verb|fcall| statements.
+Outside of the machine execution code the \verb|cs| variable may be modified.
+
+\item \verb|ftargs| -- An integer value representing the target state. This
+value should only be read from. Again, \verb|fgoto|, \verb|fnext| and
+\verb|fcall| can be used to move to a specific entry point.
+
+\item \verb|fentry(<label>)| -- Retrieve an integer value representing the
+entry point \verb|label|. The integer value returned will be a compile time
+constant. This number is suitable for later use in control flow transfer
+statements that take an expression. This value should not be compared against
+the current state because any given label can have multiple states representing
+it. The value returned by \verb|fentry| can be any one of the multiple states that
+it represents.
+\end{itemize}
+
+The following statements are available in code blocks:
+
+\begin{itemize}
+
+\item \verb|fhold;| -- Do not advance over the current character. If processing
+data in multiple buffer blocks, the \verb|fhold| statement should only be used
+once in the set of actions executed on a character.  Multiple calls may result
+in backing up over the beginning of the buffer block. The \verb|fhold|
+statement does not imply any transfer of control. It is equivalent to the
+\verb|p--;| statement. 
+
+\item \verb|fexec <expr>;| -- Set the next character to process. This can be
+used to backtrack to previous input or advance ahead.
+Unlike \verb|fhold|, which can be used
+anywhere, \verb|fexec| requires the user to ensure that the target of the
+backtrack is in the current buffer block or is known to be somewhere ahead of
+it. The machine will continue iterating forward until \verb|pe| is arrived at,
+\verb|fbreak| is called or the machine moves into the error state. In actions
+embedded into transitions, the \verb|fexec| statement is equivalent to setting
+\verb|p| to one position ahead of the next character to process.  If the user
+also modifies \verb|pe|, it is possible to change the buffer block entirely.
+
+\item \verb|fgoto <label>;| -- Jump to an entry point defined by
+\verb|<label>|.  The \verb|fgoto| statement immediately transfers control to
+the destination state.
+
+\item \verb|fgoto *<expr>;| -- Jump to an entry point given by \verb|<expr>|.
+The expression must evaluate to an integer value representing a state.
+
+\item \verb|fnext <label>;| -- Set the next state to be the entry point defined
+by \verb|label|.  The \verb|fnext| statement does not immediately jump to the
+specified state. Any action code following the statement is executed.
+
+\item \verb|fnext *<expr>;| -- Set the next state to be the entry point given
+by \verb|<expr>|. The expression must evaluate to an integer value representing
+a state.
+
+\item \verb|fcall <label>;| -- Push the target state and jump to the entry
+point defined by \verb|<label>|.  The next \verb|fret| will jump to the target
+of the transition on which the call was made. Use of \verb|fcall| requires
+the declaration of a call stack. An array of integers named \verb|stack| and a
+single integer named \verb|top| must be declared. With the \verb|fcall|
+construct, control is immediately transferred to the destination state.
+See section \ref{modularization} for more information.
+
+\item \verb|fcall *<expr>;| -- Push the current state and jump to the entry
+point given by \verb|<expr>|. The expression must evaluate to an integer value
+representing a state.
+
+\item \verb|fret;| -- Return to the target state of the transition on which the
+last \verb|fcall| was made.  Use of \verb|fret| requires the declaration of a
+call stack. Control is immediately transferred to the destination state.
+
+\item \verb|fbreak;| -- Advance \verb|p|, save the target state to \verb|cs|
+and immediately break out of the execute loop. This statement is useful
+in conjunction with the \verb|noend| write option. Rather than process input
+until \verb|pe| is arrived at, the fbreak statement
+can be used to stop processing from an action.  After an \verb|fbreak|
+statement the \verb|p| variable will point to the next character in the input. The
+current state will be the target of the current transition. Note that \verb|fbreak|
+causes the target state's to-state actions to be skipped.
+
+\end{itemize}
+
+Once actions with control-flow commands are embedded into a
+machine, the user must exercise caution when using the machine as the operand
+to other machine construction operators. If an action jumps to another state
+then unioning any transition that executes that action with another transition
+that follows some other path will cause that other path to be lost. Using
+commands that manually jump around a machine takes us out of the domain of
+regular languages because transitions that the
+machine construction operators are not aware of are introduced.  These
+commands should therefore be used with caution.
+
+
+\chapter{Controlling Nondeterminism}
+\label{controlling-nondeterminism}
+
+Along with the flexibility of arbitrary action embeddings comes a need to
+control nondeterminism in regular expressions. If a regular expression is
+ambiguous, then sub-components of a parser other than the intended parts may become
+active. This means that actions that are irrelevant to the
+current subset of the parser may be executed, causing problems for the
+programmer.
+
+Tools that are based on regular expression engines and that are used for
+recognition tasks will usually function as intended regardless of the presence
+of ambiguities. It is quite common for users of scripting languages to write
+regular expressions that are heavily ambiguous and it generally does not
+matter. As long as one of the potential matches is recognized, there can be any
+number of other matches present.  In some parsing systems the run-time engine
+can employ a strategy for resolving ambiguities, for example always pursuing
+the longest possible match and discarding others.
+
+In Ragel, there is no regular expression run-time engine, just a simple state
+machine execution model. When we begin to embed actions and face the
+possibility of spurious action execution, it becomes clear that controlling
+nondeterminism at the machine construction level is very important. Consider
+the following example.
+
+% GENERATE: lines1
+% OPT: -p
+% %%{
+% machine lines1;
+% action first {}
+% action tail {}
+% word = [a-z]+;
+\begin{inline_code}
+\begin{verbatim}
+ws = [\n\t ];
+line = word $first ( ws word $tail )* '\n';
+lines = line*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := lines;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.53]{lines1}
+\end{center}
+\graphspace
+
+Since the \verb|ws| expression includes the newline character, we will
+not finish the \verb|line| expression when a newline character is seen. We will
+simultaneously pursue the possibility of matching further words on the same
+line and the possibility of matching a second line. Evidence of this fact is 
+in the state tables. On several transitions both the \verb|first| and
+\verb|tail| actions are executed.  The solution here is simple: exclude
+the newline character from the \verb|ws| expression. 
+
+% GENERATE: lines2
+% OPT: -p
+% %%{
+% machine lines2;
+% action first {}
+% action tail {}
+% word = [a-z]+;
+\begin{inline_code}
+\begin{verbatim}
+ws = [\t ];
+line = word $first ( ws word $tail )* '\n';
+lines = line*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := lines;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{lines2}
+\end{center}
+\graphspace
+
+Solving this kind of problem is straightforward when the ambiguity is created
+by strings that are a single character long.  When the ambiguity is created by
+strings that are multiple characters long we have a more difficult problem.
+The following example is an incorrect attempt at a regular expression for C
+language comments. 
+
+% GENERATE: comments1
+% OPT: -p
+% %%{
+% machine comments1;
+% action comm {}
+\begin{inline_code}
+\begin{verbatim}
+comment = '/*' ( any @comm )* '*/';
+main := comment ' ';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{comments1}
+\end{center}
+\graphspace
+
+Using standard concatenation, we will never leave the \verb|any*| expression.
+We will forever entertain the possibility that a \verb|'*/'| string that we see
+is contained in a longer comment and that, simultaneously, the comment has
+ended.  The concatenation of the \verb|comment| machine with \verb|SP| is done
+to show this. When we match space, we are also still matching the comment body.
+
+One way to approach the problem is to exclude the terminating string
+from the \verb|any*| expression using set difference. We must be careful to
+exclude not just the terminating string, but any string that contains it as a
+substring. A verbose, but proper specification of a C comment parser is given
+by the following regular expression. 
+
+% GENERATE: comments2
+% OPT: -p
+% %%{
+% machine comments2;
+% action comm {}
+\begin{inline_code}
+\begin{verbatim}
+comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := comment;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{comments2}
+\end{center}
+\graphspace
+
+Note that Ragel's strong subtraction operator \verb|--| can also be used here.
+In doing this subtraction we have phrased the problem of controlling non-determinism in
+terms of excluding strings common to two expressions that interact when
+combined.
+We can also phrase the problem in terms of the transitions of the state
+machines that implement these expressions. During the concatenation of
+\verb|any*| and \verb|'*/'| we will be making transitions that are composed of
+both the loop of the first expression and the final character of the second.
+At this time we want the transition on the \verb|'/'| character to take precedence
+over and disallow the transition that originated in the \verb|any*| loop.
+
+In another parsing problem, we wish to implement a lightweight tokenizer that we can
+utilize in the composition of a larger machine. For example, some HTTP headers
+have a token stream as a sub-language. The following example is an attempt
+at a regular expression-based tokenizer that does not function correctly due to
+unintended nondeterminism.
+
+% GENERATE: smallscanner
+% OPT: -p
+% %%{
+% machine smallscanner;
+% action start_str {}
+% action on_char {}
+% action finish_str {}
+\begin{inline_code}
+\begin{verbatim}
+header_contents = ( 
+    lower+ >start_str $on_char %finish_str | 
+    ' '
+)*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := header_contents;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{smallscanner}
+\end{center}
+\graphspace
+
+In this case, the problem with using a standard kleene star operation is that
+there is an ambiguity between extending a token and wrapping around the machine
+to begin a new token. Using the standard operator, we get an undesirable
+nondeterministic behaviour. Evidence of this can be seen on the transition out
+of state one to itself.  The transition extends the string, and simultaneously,
+finishes the string only to immediately begin a new one.  What is required is
+for the
+transitions that represent an extension of a token to take precedence over the
+transitions that represent the beginning of a new token. For this problem
+there is no simple solution that uses standard regular expression operators.
+
+\section{Priorities}
+
+A priority mechanism was devised and built into the determinization
+process, specifically for the purpose of allowing the user to control
+nondeterminism.  Priorities are integer values embedded into transitions. When
+the determinization process is combining transitions that have different
+priorities, the transition with the higher priority is preserved and the
+transition with the lower priority is dropped.
+
+Unfortunately, priorities can have unintended side effects because their
+operation requires that they linger in transitions indefinitely. They must linger
+because the Ragel program cannot know when the user is finished with a priority
+embedding.  A solution whereby they are explicitly deleted after use is
+conceivable; however this is not very user-friendly.  Priorities were therefore
+made into named entities. Only priorities with the same name are allowed to
+interact.  This allows any number of priorities to coexist in one machine for
+the purpose of controlling various different regular expression operations and
+eliminates the need to ever delete them. Such a scheme allows the user to
+choose a unique name, embed two different priority values using that name
+and be confident that the priority embedding will be free of any side effects.
+
+In the first form of priority embedding the name defaults to the name of the machine
+definition that the priority is assigned in. In this sense priorities are by
+default local to the current machine definition or instantiation. Beware of
+using this form in a longest-match machine, since there is only one name for
+the entire set of longest match patterns. In the second form the priority's
+name can be specified, allowing priority interaction across machine definition
+boundaries.
+
+\begin{itemize}
+\item \verb|expr > int| -- Sets starting transitions to have priority int.
+\item \verb|expr @ int| -- Sets transitions that go into a final state to have priority int. 
+\item \verb|expr $ int| -- Sets all transitions to have priority int.
+\item \verb|expr % int| -- Sets leaving transitions to
+have priority int. When a transition is made going out of the machine (either
+by concatenation or kleene star) its priority is immediately set to the 
+leaving priority.  
+\end{itemize}
+
+The second form of priority assignment allows the programmer to specify the name
+to which the priority is assigned.
+
+\begin{itemize}
+\item \verb|expr > (name, int)| -- Starting transitions.
+\item \verb|expr @ (name, int)| -- Finishing transitions (into a final state).
+\item \verb|expr $ (name, int)| -- All transitions.
+\item \verb|expr % (name, int)| -- Leaving transitions.
+\end{itemize}
+
+\section{Guarded Operators that Encapsulate Priorities}
+
+Priority embeddings are a very expressive mechanism. At the same time they
+can be very confusing for the user. They force the user to imagine
+the transitions inside two interacting expressions and work out the precise
+effects of the operations between them. When we consider
+that this problem is worsened by the
+potential for side effects caused by unintended priority name collisions, we
+see that exposing the user to priorities is undesirable.
+
+Fortunately, in practice the use of priorities has been necessary only in a
+small number of scenarios.  This allows us to encapsulate their functionality
+into a small set of operators and fully hide them from the user. This is
+advantageous from a language design point of view because it greatly simplifies
+the design.  
+
+Going back to the C comment example, we can now properly specify
+it using a guarded concatenation operator which we call {\em finish-guarded
+concatenation}. From the user's point of view, this operator terminates the
+first machine when the second machine moves into a final state.  It chooses a
+unique name and uses it to embed a low priority into all
+transitions of the first machine. A higher priority is then embedded into the
+transitions of the second machine that enter into a final state. The following
+example yields a machine identical to the example in Section 
+\ref{controlling-nondeterminism}.
+
+\begin{inline_code}
+\begin{verbatim}
+comment = '/*' ( any @comm )* :>> '*/';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{comments2}
+\end{center}
+\graphspace
+
+Another guarded operator is {\em left-guarded concatenation}, given by the
+\verb|<:| compound symbol. This operator places a higher priority on all
+transitions of the first machine. This is useful if one must forcibly separate
+two lists that contain common elements. For example, one may need to tokenize a
+stream, but first consume leading whitespace.
+
+Ragel also includes a {\em longest-match kleene star} operator, given by the
+\verb|**| compound symbol. This 
+guarded operator embeds a high
+priority into all transitions of the machine. 
+A lower priority is then embedded into the leaving transitions.  When the
+kleene star operator makes the epsilon transitions from
+the final states into the new start state, the lower priority will be transferred
+to the epsilon transitions. In cases where following an epsilon transition
+out of a final state conflicts with an existing transition out of a final
+state, the epsilon transition will be dropped.
+
+Other guarded operators are conceivable, such as guards on union that cause one
+alternative to take precedence over another. These may be implemented when it
+is clear they constitute a frequently used operation.
+In the next section we discuss the explicit specification of state machines
+using state charts.
+
+\subsection{Entry-Guarded Concatenation}
+
+\verb|expr :> expr| 
+
+This operator concatenates two machines, but first assigns a low
+priority to all transitions
+of the first machine and a high priority to the starting transitions of the
+second machine. This operator is useful if from the final states of the first
+machine it is possible to accept the characters in the entering transitions of
+the second machine. This operator effectively terminates the first machine
+immediately upon starting the second machine, where otherwise they would be
+pursued concurrently. In the following example, entry-guarded concatenation is
+used to move out of a machine that matches everything at the first sign of an
+end-of-input marker.
+
+% GENERATE: entryguard
+% OPT: -p
+% %%{
+% machine entryguard;
+\begin{inline_code}
+\begin{verbatim}
+# Leave the catch-all machine on the first character of FIN.
+main := any* :> 'FIN';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{entryguard}
+\end{center}
+\graphspace
+
+Entry-guarded concatenation is equivalent to the following:
+
+\begin{verbatim}
+expr $(unique_name,0) . expr >(unique_name,1)
+\end{verbatim}
+\verbspace
+
+\subsection{Finish-Guarded Concatenation}
+
+\verb|expr :>> expr|
+
+This operator is
+like the previous operator, except the higher priority is placed on the final
+transitions of the second machine. This is useful if one wishes to entertain
+the possibility of continuing to match the first machine right up until the
+second machine enters a final state. In other words it terminates the first
+machine only when the second accepts. In the following example, finish-guarded
+concatenation causes the move out of the machine that matches everything to be
+delayed until the full end-of-input marker has been matched.
+
+% GENERATE: finguard
+% OPT: -p
+% %%{
+% machine finguard;
+\begin{inline_code}
+\begin{verbatim}
+# Leave the catch-all machine on the last character of FIN.
+main := any* :>> 'FIN';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{finguard}
+\end{center}
+\graphspace
+
+Finish-guarded concatenation is equivalent to the following, with one
+exception. If the right machine's start state is final, the higher priority is
+also embedded into it as a leaving priority. This prevents the left machine
+from persisting via the zero-length string.
+
+\begin{verbatim}
+expr $(unique_name,0) . expr @(unique_name,1)
+\end{verbatim}
+\verbspace
+
+\subsection{Left-Guarded Concatenation}
+
+\verb|expr <: expr| 
+
+This operator places
+a higher priority on the left expression. It is useful if you want to prefix a
+sequence with another sequence composed of some of the same characters. For
+example, one can consume leading whitespace before tokenizing a sequence of
+whitespace-separated words as in:
+
+% GENERATE: leftguard
+% OPT: -p
+% %%{
+% machine leftguard;
+% action alpha {}
+% action ws {}
+% action start {}
+% action fin {}
+\begin{inline_code}
+\begin{verbatim}
+main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{leftguard}
+\end{center}
+\graphspace
+
+Left-guarded concatenation is equivalent to the following:
+
+\begin{verbatim}
+expr $(unique_name,1) . expr >(unique_name,0)
+\end{verbatim}
+\verbspace
+
+\subsection{Longest-Match Kleene Star}
+\label{longest_match_kleene_star}
+
+\verb|expr**| 
+
+This version of kleene star puts a higher priority on staying in the
+machine versus wrapping around and starting over. The LM kleene star is useful
+when writing simple tokenizers.  These machines are built by applying the
+longest-match kleene star to an alternation of token patterns, as in the
+following.
+
+% GENERATE: lmkleene
+% OPT: -p
+% %%{
+% machine exfinpri;
+% action A {}
+% action B {}
+\begin{inline_code}
+\begin{verbatim}
+# Repeat tokens, but make sure to get the longest match.
+main := (
+    lower ( lower | digit )* %A | 
+    digit+ %B | 
+    ' '
+)**;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{lmkleene}
+\end{center}
+\graphspace
+
+If a regular kleene star were used the machine above would not be able to
+distinguish between extending a word and beginning a new one.  This operator is
+equivalent to:
+
+\begin{verbatim}
+( expr $(unique_name,1) %(unique_name,0) )*
+\end{verbatim}
+\verbspace
+
+When the kleene star is applied, transitions that go out of the machine and
+back into it are made. These are assigned a priority of zero by the leaving 
+transition mechanism. This is less than the priority of one assigned to the
+transitions leaving the final states but not leaving the machine. When 
+these transitions clash on the same character, the 
+transition that stays in the machine takes precedence.  The transition
+that wraps around is dropped.
+
+Note that this operator does not build a scanner in the traditional sense
+because there is never any backtracking. To build a scanner with backtracking
+use the Longest-Match machine construction described in Section
+\ref{generating-scanners}.
+
+\chapter{Interface to Host Program}
+
+The Ragel code generator is very flexible. The generated code has no
+dependencies and can be inserted in any function, perhaps inside a loop if
+desired.  The user is responsible for declaring and initializing a number of
+required variables, including the current state and the pointer to the input
+stream. These can live in any scope. Control of the input processing loop is
+also possible: the user may break out of the processing loop and return to it
+at any time.
+
+In the case of the C, D, and Go host languages, Ragel is able to generate very
+fast-running code that implements state machines as directly executable code.
+Since very large files strain the host language compiler, table-based code
+generation is also supported. In the future we hope to provide a partitioned,
+directly executable format that is able to reduce the burden on the host
+compiler by splitting large machines across multiple functions.
+
+In the case of Java and Ruby, table-based code generation is the only code
+style supported. In the future this may be expanded to include other code
+styles.
+
+Ragel can be used to parse input in one block, or it can be used to parse input
+in a sequence of blocks as it arrives from a file or socket.  Parsing the input
+in a sequence of blocks brings with it a few responsibilities. If the parser
+utilizes a scanner, care must be taken to not break the input stream anywhere
+but token boundaries.  If pointers to the input stream are taken during
+parsing, care must be taken to not use a pointer that has been invalidated by
+movement to a subsequent block.  If the current input data pointer is moved
+backwards it must not be moved past the beginning of the current block.
+
+Figure \ref{basic-example} shows a simple Ragel program that does not have any
+actions. The example tests the first argument of the program against a number
+pattern and then prints the machine's acceptance status.
+
+\begin{figure}
+\small
+\begin{verbatim}
+#include <stdio.h>
+#include <string.h>
+%%{
+    machine foo;
+    write data;
+}%%
+int main( int argc, char **argv )
+{
+    int cs;
+    if ( argc > 1 ) {
+        char *p = argv[1];
+        char *pe = p + strlen( p );
+        %%{ 
+            main := [0-9]+ ( '.' [0-9]+ )?;
+
+            write init;
+            write exec;
+        }%%
+    }
+    printf("result = %i\n", cs >= foo_first_final );
+    return 0;
+}
+\end{verbatim}
+\verbspace
+\caption{A basic Ragel example without any actions.
+}
+\label{basic-example}
+\end{figure}
+
+\section{Variables Used by Ragel}
+
+There are a number of variables that Ragel expects the user to declare. At a
+very minimum the \verb|cs|, \verb|p| and \verb|pe| variables must be declared.
+In Go, Java and Ruby code the \verb|data| variable must also be declared. If
+EOF actions are used then the \verb|eof| variable is required. If
+stack-based state machine control flow statements are used then the
+\verb|stack| and \verb|top| variables are required. If a scanner is declared
+then the \verb|act|, \verb|ts| and \verb|te| variables must be
+declared.
+
+\begin{itemize}
+
+\item \verb|cs| - Current state. This must be an integer and it should persist
+across invocations of the machine when the data is broken into blocks that are
+processed independently. This variable may be modified from outside the
+execution loop, but not from within.
+
+\item \verb|p| - Data pointer. In C/D code this variable is expected to be a
+pointer to the character data to process. It should be initialized to the
+beginning of the data block on every run of the machine. In Go, Java and Ruby it is
+used as an offset to \verb|data| and must be an integer. In this case it should
+be initialized to zero on every run of the machine.
+
+\item \verb|pe| - Data end pointer. This should be initialized to \verb|p| plus
+the data length on every run of the machine. In Go, Java and Ruby code this should
+be initialized to the data length.
+
+\item \verb|eof| - End of file pointer. This should be set to \verb|pe| when
+the buffer block being processed is the last one, otherwise it should be set to
+null. In Go, Java and Ruby code \verb|-1| must be used instead of null. If the EOF
+event can be known only after the final buffer block has been processed, then
+it is possible to set \verb|p = pe = eof| and run the execute block.
+
+\item \verb|data| - This variable is only required in Go, Java and Ruby code. It
+must be an array containting the data to process.
+
+\item \verb|stack| - This must be an array of integers. It is used to store
+integer values representing states. If the stack must resize dynamically the
+Pre-push and Post-Pop statements can be used to do this (Sections
+\ref{prepush} and \ref{postpop}).
+
+\item \verb|top| - This must be an integer value and will be used as an offset
+to \verb|stack|, giving the next available spot on the top of the stack.
+
+\item \verb|act| - This must be an integer value. It is a variable sometimes
+used by scanner code to keep track of the most recent successful pattern match.
+
+\item \verb|ts| - This must be a pointer to character data. In Go, Java and
+Ruby code this must be an integer. See Section \ref{generating-scanners} for
+more information.
+
+\item \verb|te| - Also a pointer to character data.
+
+\end{itemize}
+
+\section{Alphtype Statement}
+
+\begin{verbatim}
+alphtype unsigned int;
+\end{verbatim}
+\verbspace
+
+The alphtype statement specifies the alphabet data type that the machine
+operates on. During the compilation of the machine, integer literals are
+expected to be in the range of possible values of the alphtype. The default
+is \verb|char| for all languages except Go where the default is \verb|byte|.
+
+\begin{multicols}{2}
+C/C++/Objective-C:
+\begin{verbatim}
+          char      unsigned char      
+          short     unsigned short
+          int       unsigned int
+          long      unsigned long
+\end{verbatim}
+\verbspace
+
+Go:
+\begin{verbatim}
+          byte
+          int8      uint8
+          int16     uint16
+          int32     uint32
+          int
+\end{verbatim}
+\verbspace
+
+Ruby: 
+\begin{verbatim}
+          char 
+          int
+\end{verbatim}
+\verbspace
+
+\columnbreak
+
+Java:
+\begin{verbatim}
+          char 
+          byte 
+          short 
+          int
+\end{verbatim}
+\verbspace
+
+D:
+\begin{verbatim}
+          char 
+          byte      ubyte   
+          short     ushort 
+          wchar 
+          int       uint 
+          dchar
+\end{verbatim}
+\verbspace
+
+\end{multicols}
+
+\section{Getkey Statement}
+
+\begin{verbatim}
+getkey fpc->id;
+\end{verbatim}
+\verbspace
+
+This statement specifies to Ragel how to retrieve the current character from 
+from the pointer to the current element (\verb|p|). Any expression that returns
+a value of the alphabet type
+may be used. The getkey statement may be used for looking into element
+structures or for translating the character to process. The getkey expression
+defaults to \verb|(*p)|. In goto-driven machines the getkey expression may be
+evaluated more than once per element processed, therefore it should not incur a
+large cost nor preclude optimization.
+
+\section{Access Statement}
+
+\begin{verbatim}
+access fsm->;
+\end{verbatim}
+\verbspace
+
+The access statement specifies how the generated code should
+access the machine data that is persistent across processing buffer blocks.
+This applies to all variables except \verb|p|, \verb|pe| and \verb|eof|. This includes
+\verb|cs|, \verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|.
+The access statement is useful if a machine is to be encapsulated inside a
+structure in C code. It can be used to give the name of
+a pointer to the structure.
+
+\section{Variable Statement}
+
+\begin{verbatim}
+variable p fsm->p;
+\end{verbatim}
+\verbspace
+
+The variable statement specifies how to access a specific
+variable. All of the variables that are declared by the user and
+used by Ragel can be changed. This includes \verb|p|, \verb|pe|, \verb|eof|, \verb|cs|,
+\verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|.
+In Go, Ruby and Java code generation the \verb|data| variable can also be changed.
+
+\section{Pre-Push Statement}
+\label{prepush}
+
+\begin{verbatim}
+prepush { 
+    /* stack growing code */
+}
+\end{verbatim}
+\verbspace
+
+The prepush statement allows the user to supply stack management code that is
+written out during the generation of fcall, immediately before the current
+state is pushed to the stack. This statement can be used to test the number of
+available spaces and dynamically grow the stack if necessary.
+
+\section{Post-Pop Statement}
+\label{postpop}
+
+\begin{verbatim}
+postpop { 
+    /* stack shrinking code */
+}
+\end{verbatim}
+\verbspace
+
+The postpop statement allows the user to supply stack management code that is
+written out during the generation of fret, immediately after the next state is
+popped from the stack. This statement can be used to dynamically shrink the
+stack.
+
+\section{Write Statement}
+\label{write-statement}
+
+\begin{verbatim}
+write <component> [options];
+\end{verbatim}
+\verbspace
+
+The write statement is used to generate parts of the machine. 
+There are seven
+components that can be generated by a write statement. These components make up the
+state machine's data, initialization code, execution code, and export definitions.
+A write statement may appear before a machine is fully defined.
+This allows one to write out the data first then later define the machine where
+it is used. An example of this is shown in Figure \ref{fbreak-example}.
+
+\subsection{Write Data}
+\begin{verbatim}
+write data [options];
+\end{verbatim}
+\verbspace
+
+The write data statement causes Ragel to emit the constant static data needed
+by the machine. In table-driven output styles (see Section \ref{genout}) this
+is a collection of arrays that represent the states and transitions of the
+machine.  In goto-driven machines much less data is emitted. At the very
+minimum a start state \verb|name_start| is generated.  All variables written
+out in machine data have both the \verb|static| and \verb|const| properties and
+are prefixed with the name of the machine and an
+underscore. The data can be placed inside a class, inside a function, or it can
+be defined as global data.
+
+Two variables are written that may be used to test the state of the machine
+after a buffer block has been processed. The \verb|name_error| variable gives
+the id of the state that the machine moves into when it cannot find a valid
+transition to take. The machine immediately breaks out of the processing loop when
+it finds itself in the error state. The error variable can be compared to the
+current state to determine if the machine has failed to parse the input. If the
+machine is complete, that is from every state there is a transition to a proper
+state on every possible character of the alphabet, then no error state is required
+and this variable will be set to -1.
+
+The \verb|name_first_final| variable stores the id of the first final state.
+All of the machine's states are sorted by their final state status before
+having their ids assigned. Checking if the machine has accepted its input can
+then be done by checking if the current state is greater-than or equal to the
+first final state.
+
+Data generation has several options:
+
+\noindent\hspace*{24pt}\verb|noerror  | - Do not generate the integer variable that gives the id of the error state.\\
+\noindent\hspace*{24pt}\verb|nofinal  | - Do not generate the integer variable that gives the id of the first final state.\\
+\noindent\hspace*{24pt}\verb|noprefix | - Do not prefix the variable names with the name of the machine.
+\vspace{12pt}
+
+\begin{figure}
+\small
+\begin{verbatim}
+#include <stdio.h>
+%% machine foo;
+%% write data;
+int main( int argc, char **argv )
+{
+    int cs, res = 0;
+    if ( argc > 1 ) {
+        char *p = argv[1];
+        %%{ 
+            main := 
+                [a-z]+ 
+                0 @{ res = 1; fbreak; };
+            write init;
+            write exec noend;
+        }%%
+    }
+    printf("execute = %i\n", res );
+    return 0;
+}
+\end{verbatim}
+\verbspace
+\caption{Use of {\tt noend} write option and the {\tt fbreak} statement for
+processing a string.
+}
+\label{fbreak-example}
+\end{figure}
+
+\subsection{Write Start, First Final and Error}
+
+\begin{verbatim}
+write start;
+write first_final;
+write error;
+\end{verbatim}
+\verbspace
+
+These three write statements provide an alternative means of accessing the
+\verb|start|, \verb|first_final| and \verb|error| states. If there are many
+different machine specifications in one file it is easy to get the prefix for
+these wrong. This is especially true if the state machine boilerplate is
+frequently made by a copy-paste-edit process. These write statements allow the
+problem to be avoided. They can be used as follows:
+
+\begin{verbatim}
+/* Did parsing succeed? */
+if ( cs < %%{ write first_final; }%% ) {
+    result = ERR_PARSE_ERROR;
+    goto fail;
+}
+\end{verbatim}
+\verbspace
+
+\subsection{Write Init}
+\begin{verbatim}
+write init [options];
+\end{verbatim}
+\verbspace
+
+The write init statement causes Ragel to emit initialization code. This should
+be executed once before the machine is started. At a very minimum this sets the
+current state to the start state. If other variables are needed by the
+generated code, such as call stack variables or scanner management
+variables, they are also initialized here.
+
+The \verb|nocs| option to the write init statement will cause ragel to skip
+intialization of the cs variable. This is useful if the user wishes to use
+custom logic to decide which state the specification should start in.
+
+\subsection{Write Exec}
+\begin{verbatim}
+write exec [options];
+\end{verbatim}
+\verbspace
+
+The write exec statement causes Ragel to emit the state machine's execution code.
+Ragel expects several variables to be available to this code. At a very minimum, the
+generated code needs access to the current character position \verb|p|, the ending
+position \verb|pe| and the current state \verb|cs| (though \verb|pe|
+can be omitted using the \verb|noend| write option).
+The \verb|p| variable is the cursor that the execute code will
+used to traverse the input. The \verb|pe| variable should be set up to point to one
+position past the last valid character in the buffer.
+
+Other variables are needed when certain features are used. For example using
+the \verb|fcall| or \verb|fret| statements requires \verb|stack| and
+\verb|top| variables to be defined. If a longest-match construction is used,
+variables for managing backtracking are required.
+
+The write exec statement has one option. The \verb|noend| option tells Ragel
+to generate code that ignores the end position \verb|pe|. In this
+case the user must explicitly break out of the processing loop using
+\verb|fbreak|, otherwise the machine will continue to process characters until
+it moves into the error state. This option is useful if one wishes to process a
+null terminated string. Rather than traverse the string to discover then length
+before processing the input, the user can break out when the null character is
+seen.  The example in Figure \ref{fbreak-example} shows the use of the
+\verb|noend| write option and the \verb|fbreak| statement for processing a string.
+
+\subsection{Write Exports}
+\label{export}
+
+\begin{verbatim}
+write exports;
+\end{verbatim}
+\verbspace
+
+The export feature can be used to export simple machine definitions. Machine definitions
+are marked for export using the \verb|export| keyword.
+
+\begin{verbatim}
+export machine_to_export = 0x44;
+\end{verbatim}
+\verbspace
+
+When the write exports statement is used these machines are 
+written out in the generated code. Defines are used for C and constant integers
+are used for D, Java and Ruby. See Section \ref{import} for a description of the
+import statement.
+
+\section{Maintaining Pointers to Input Data}
+
+In the creation of any parser it is not uncommon to require the collection of
+the data being parsed.  It is always possible to collect data into a growable
+buffer as the machine moves over it, however the copying of data is a somewhat
+wasteful use of processor cycles. The most efficient way to collect data from
+the parser is to set pointers into the input then later reference them.  This
+poses a problem for uses of Ragel where the input data arrives in blocks, such
+as over a socket or from a file. If a pointer is set in one buffer block but
+must be used while parsing a following buffer block, some extra consideration
+to correctness must be made.
+
+The scanner constructions exhibit this problem, requiring the maintenance
+code described in Section \ref{generating-scanners}. If a longest-match
+construction has been used somewhere in the machine then it is possible to
+take advantage of the required prefix maintenance code in the driver program to
+ensure pointers to the input are always valid. If laying down a pointer one can
+set \verb|ts| at the same spot or ahead of it. When data is shifted in
+between loops the user must also shift the pointer.  In this way it is possible
+to maintain pointers to the input that will always be consistent.
+
+\begin{figure}
+\small
+\begin{verbatim}
+    int have = 0;
+    while ( 1 ) {
+        char *p, *pe, *data = buf + have;
+        int len, space = BUFSIZE - have;
+
+        if ( space == 0 ) { 
+            fprintf(stderr, "BUFFER OUT OF SPACE\n");
+            exit(1);
+        }
+
+        len = fread( data, 1, space, stdin );
+        if ( len == 0 )
+            break;
+
+        /* Find the last newline by searching backwards. */
+        p = buf;
+        pe = data + len - 1;
+        while ( *pe != '\n' && pe >= buf )
+            pe--;
+        pe += 1;
+
+        %% write exec;
+
+        /* How much is still in the buffer? */
+        have = data + len - pe;
+        if ( have > 0 )
+            memmove( buf, pe, have );
+
+        if ( len < space )
+            break;
+    }
+\end{verbatim}
+\verbspace
+\caption{An example of line-oriented processing.
+}
+\label{line-oriented}
+\end{figure}
+
+In general, there are two approaches for guaranteeing the consistency of
+pointers to input data. The first approach is the one just described;
+lay down a marker from an action,
+then later ensure that the data the marker points to is preserved ahead of
+the buffer on the next execute invocation. This approach is good because it
+allows the parser to decide on the pointer-use boundaries, which can be
+arbitrarily complex parsing conditions. A downside is that it requires any
+pointers that are set to be corrected in between execute invocations.
+
+The alternative is to find the pointer-use boundaries before invoking the execute
+routine, then pass in the data using these boundaries. For example, if the
+program must perform line-oriented processing, the user can scan backwards from
+the end of an input block that has just been read in and process only up to the
+first found newline. On the next input read, the new data is placed after the
+partially read line and processing continues from the beginning of the line.
+An example of line-oriented processing is given in Figure \ref{line-oriented}.
+
+\section{Specifying the Host Language}
+
+The \verb|ragel| program has a number of options for specifying the host
+language. The host-language options are:
+
+\begin{itemize}
+\item \verb|-C  | for C/C++/Objective-C code (default)
+\item \verb|-D  | for D code.
+\item \verb|-Z  | for Go code.
+\item \verb|-J  | for Java code.
+\item \verb|-R  | for Ruby code.
+\item \verb|-A  | for C\# code.
+\end{itemize}
+
+\section{Choosing a Generated Code Style}
+\label{genout}
+
+There are three styles of code output to choose from. Code style affects the
+size and speed of the compiled binary. Changing code style does not require any
+change to the Ragel program. There are two table-driven formats and a goto
+driven format.
+
+In addition to choosing a style to emit, there are various levels of action
+code reuse to choose from.  The maximum reuse levels (\verb|-T0|, \verb|-F0|
+and \verb|-G0|) ensure that no FSM action code is ever duplicated by encoding
+each transition's action list as static data and iterating
+through the lists on every transition. This will normally result in a smaller
+binary. The less action reuse options (\verb|-T1|, \verb|-F1| and \verb|-G1|)
+will usually produce faster running code by expanding each transition's action
+list into a single block of code, eliminating the need to iterate through the
+lists. This duplicates action code instead of generating the logic necessary
+for reuse. Consequently the binary will be larger. However, this tradeoff applies to
+machines with moderate to dense action lists only. If a machine's transitions
+frequently have less than two actions then the less reuse options will actually
+produce both a smaller and a faster running binary due to less action sharing
+overhead. The best way to choose the appropriate code style for your
+application is to perform your own tests.
+
+The table-driven FSM represents the state machine as constant static data. There are
+tables of states, transitions, indices and actions. The current state is
+stored in a variable. The execution is simply a loop that looks up the current
+state, looks up the transition to take, executes any actions and moves to the
+target state. In general, the table-driven FSM can handle any machine, produces
+a smaller binary and requires a less expensive host language compile, but
+results in slower running code.  Since the table-driven format is the most
+flexible it is the default code style.
+
+The flat table-driven machine is a table-based machine that is optimized for
+small alphabets. Where the regular table machine uses the current character as
+the key in a binary search for the transition to take, the flat table machine
+uses the current character as an index into an array of transitions. This is
+faster in general, however is only suitable if the span of possible characters
+is small.
+
+The goto-driven FSM represents the state machine using goto and switch
+statements. The execution is a flat code block where the transition to take is
+computed using switch statements and directly executable binary searches.  In
+general, the goto FSM produces faster code but results in a larger binary and a
+more expensive host language compile.
+
+The goto-driven format has an additional action reuse level (\verb|-G2|) that
+writes actions directly into the state transitioning logic rather than putting
+all the actions together into a single switch. Generally this produces faster
+running code because it allows the machine to encode the current state using
+the processor's instruction pointer. Again, sparse machines may actually
+compile to smaller binaries when \verb|-G2| is used due to less state and
+action management overhead. For many parsing applications \verb|-G2| is the
+preferred output format.
+
+\begin{center}
+
+Code Output Style Options
+
+\begin{tabular}{|c|c|c|}
+\hline
+\verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#\\
+\hline
+\verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#\\
+\hline
+\verb|-F0|&flat table-driven&C/D/Ruby/C\#\\
+\hline
+\verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#\\
+\hline
+\verb|-G0|&goto-driven&C/D/C\#\\
+\hline
+\verb|-G1|&goto, expanded actions&C/D/C\#\\
+\hline
+\verb|-G2|&goto, in-place actions&C/D/Go\\
+\hline
+\end{tabular}
+\end{center}
+
+\chapter{Beyond the Basic Model}
+
+\section{Parser Modularization}
+\label{modularization}
+
+It is possible to use Ragel's machine construction and action embedding
+operators to specify an entire parser using a single regular expression. In
+many cases this is the desired way to specify a parser in Ragel. However, in
+some scenarios the language to parse may be so large that it is difficult to
+think about it as a single regular expression. It may also shift between distinct
+parsing strategies, in which case modularization into several coherent blocks
+of the language may be appropriate.
+
+It may also be the case that patterns that compile to a large number of states
+must be used in a number of different contexts and referencing them in each
+context results in a very large state machine. In this case, an ability to reuse
+parsers would reduce code size.
+
+To address this, distinct regular expressions may be instantiated and linked
+together by means of a jumping and calling mechanism. This mechanism is
+analogous to the jumping to and calling of processor instructions. A jump
+command, given in action code, causes control to be immediately passed to
+another portion of the machine by way of setting the current state variable. A
+call command causes the target state of the current transition to be pushed to
+a state stack before control is transferred.  Later on, the original location
+may be returned to with a return statement. In the following example, distinct
+state machines are used to handle the parsing of two types of headers.
+
+% GENERATE: call
+% %%{
+%   machine call;
+\begin{inline_code}
+\begin{verbatim}
+action return { fret; }
+action call_date { fcall date; }
+action call_name { fcall name; }
+
+# A parser for date strings.
+date := [0-9][0-9] '/' 
+        [0-9][0-9] '/' 
+        [0-9][0-9][0-9][0-9] '\n' @return;
+
+# A parser for name strings.
+name := ( [a-zA-Z]+ | ' ' )** '\n' @return;
+
+# The main parser.
+headers = 
+    ( 'from' | 'to' ) ':' @call_name | 
+    ( 'departed' | 'arrived' ) ':' @call_date;
+
+main := headers*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% %% write data;
+% void f()
+% {
+%   %% write init;
+%   %% write exec;
+% }
+% END GENERATE
+
+Calling and jumping should be used carefully as they are operations that take
+one out of the domain of regular languages. A machine that contains a call or
+jump statement in one of its actions should be used as an argument to a machine
+construction operator only with considerable care. Since DFA transitions may
+actually represent several NFA transitions, a call or jump embedded in one
+machine can inadvertently terminate another machine that it shares prefixes
+with. Despite this danger, theses statements have proven useful for tying
+together sub-parsers of a language into a parser for the full language,
+especially for the purpose of modularizing code and reducing the number of
+states when the machine contains frequently recurring patterns.
+
+Section \ref{vals} describes the jump and call statements that are used to
+transfer control. These statements make use of two variables that must be
+declared by the user, \verb|stack| and \verb|top|. The \verb|stack| variable
+must be an array of integers and \verb|top| must be a single integer, which
+will point to the next available space in \verb|stack|. Sections \ref{prepush}
+and \ref{postpop} describe the Pre-Push and Post-Pop statements which can be
+used to implement a dynamically resizable array.
+
+\section{Referencing Names}
+\label{labels}
+
+This section describes how to reference names in epsilon transitions (Section
+\ref{state-charts}) and
+action-based control-flow statements such as \verb|fgoto|. There is a hierarchy
+of names implied in a Ragel specification.  At the top level are the machine
+instantiations. Beneath the instantiations are labels and references to machine
+definitions. Beneath those are more labels and references to definitions, and
+so on.
+
+Any name reference may contain multiple components separated with the \verb|::|
+compound symbol.  The search for the first component of a name reference is
+rooted at the join expression that the epsilon transition or action embedding
+is contained in. If the name reference is not contained in a join,
+the search is rooted at the machine definition that the epsilon transition or
+action embedding is contained in. Each component after the first is searched
+for beginning at the location in the name tree that the previous reference
+component refers to.
+
+In the case of action-based references, if the action is embedded more than
+once, the local search is performed for each embedding and the result is the
+union of all the searches. If no result is found for action-based references then
+the search is repeated at the root of the name tree.  Any action-based name
+search may be forced into a strictly global search by prefixing the name
+reference with \verb|::|.
+
+The final component of the name reference must resolve to a unique entry point.
+If a name is unique in the entire name tree it can be referenced as is. If it
+is not unique it can be specified by qualifying it with names above it in the
+name tree. However, it can always be renamed.
+
+% FIXME: Should fit this in somewhere.
+% Some kinds of name references are illegal. Cannot call into longest-match
+% machine, can only call its start state. Cannot make a call to anywhere from
+% any part of a longest-match machine except a rule's action. This would result
+% in an eventual return to some point inside a longest-match other than the
+% start state. This is banned for the same reason a call into the LM machine is
+% banned.
+
+
+\section{Scanners}
+\label{generating-scanners}
+
+Scanners are very much intertwined with regular-languages and their
+corresponding processors. For this reason Ragel supports the definition of
+scanners.  The generated code will repeatedly attempt to match patterns from a
+list, favouring longer patterns over shorter patterns.  In the case of
+equal-length matches, the generated code will favour patterns that appear ahead
+of others. When a scanner makes a match it executes the user code associated
+with the match, consumes the input then resumes scanning.
+
+\begin{verbatim}
+<machine_name> := |* 
+        pattern1 => action1;
+        pattern2 => action2;
+        ...
+    *|;
+\end{verbatim}
+\verbspace
+
+On the surface, Ragel scanners are similar to those defined by Lex. Though
+there is a key distinguishing feature: patterns may be arbitrary Ragel
+expressions and can therefore contain embedded code. With a Ragel-based scanner
+the user need not wait until the end of a pattern before user code can be
+executed.
+
+Scanners can be used to process sub-languages, as well as for tokenizing
+programming languages. In the following example a scanner is used to tokenize
+the contents of a header field.
+
+\begin{inline_code}
+\begin{verbatim}
+word = [a-z]+;
+head_name = 'Header';
+
+header := |*
+    word;
+    ' ';
+    '\n' => { fret; };
+*|;
+
+main := ( head_name ':' @{ fcall header; } )*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+
+The scanner construction has a purpose similar to the longest-match kleene star
+operator \verb|**|. The key
+difference is that a scanner is able to backtrack to match a previously matched
+shorter string when the pursuit of a longer string fails.  For this reason the
+scanner construction operator is not a pure state machine construction
+operator. It relies on several variables that enable it to backtrack and make
+pointers to the matched input text available to the user.  For this reason
+scanners must be immediately instantiated. They cannot be defined inline or
+referenced by another expression. Scanners must be jumped to or called.
+
+Scanners rely on the \verb|ts|, \verb|te| and \verb|act|
+variables to be present so that they can backtrack and make pointers to the
+matched text available to the user. If input is processed using multiple calls
+to the execute code then the user must ensure that when a token is only
+partially matched that the prefix is preserved on the subsequent invocation of
+the execute code.
+
+The \verb|ts| variable must be defined as a pointer to the input data.
+It is used for recording where the current token match begins. This variable
+may be used in action code for retrieving the text of the current match.  Ragel
+ensures that in between tokens and outside of the longest-match machines that
+this pointer is set to null. In between calls to the execute code the user must
+check if \verb|ts| is set and if so, ensure that the data it points to is
+preserved ahead of the next buffer block. This is described in more detail
+below.
+
+The \verb|te| variable must also be defined as a pointer to the input data.
+It is used for recording where a match ends and where scanning of the next
+token should begin. This can also be used in action code for retrieving the
+text of the current match.
+
+The \verb|act| variable must be defined as an integer type. It is used for
+recording the identity of the last pattern matched when the scanner must go
+past a matched pattern in an attempt to make a longer match. If the longer
+match fails it may need to consult the \verb|act| variable. In some cases, use 
+of the \verb|act|
+variable can be avoided because the value of the current state is enough
+information to determine which token to accept, however in other cases this is
+not enough and so the \verb|act| variable is used. 
+
+When the longest-match operator is in use, the user's driver code must take on
+some buffer management functions. The following algorithm gives an overview of
+the steps that should be taken to properly use the longest-match operator.
+
+\begin{itemize}
+\item Read a block of input data.
+\item Run the execute code.
+\item If \verb|ts| is set, the execute code will expect the incomplete
+token to be preserved ahead of the buffer on the next invocation of the execute
+code.  
+\begin{itemize}
+\item Shift the data beginning at \verb|ts| and ending at \verb|pe| to the
+beginning of the input buffer.
+\item Reset \verb|ts| to the beginning of the buffer. 
+\item Shift \verb|te| by the distance from the old value of \verb|ts|
+to the new value. The \verb|te| variable may or may not be valid.  There is
+no way to know if it holds a meaningful value because it is not kept at null
+when it is not in use. It can be shifted regardless.
+\end{itemize}
+\item Read another block of data into the buffer, immediately following any
+preserved data.
+\item Run the scanner on the new data.
+\end{itemize}
+
+Figure \ref{preserve_example} shows the required handling of an input stream in
+which a token is broken by the input block boundaries. After processing up to
+and including the ``t'' of ``characters'', the prefix of the string token must be
+retained and processing should resume at the ``e'' on the next iteration of
+the execute code.
+
+If one uses a large input buffer for collecting input then the number of times
+the shifting must be done will be small. Furthermore, if one takes care not to
+define tokens that are allowed to be very long and instead processes these
+items using pure state machines or sub-scanners, then only a small amount of
+data will ever need to be shifted.
+
+\begin{figure}
+\small
+\begin{verbatim}
+      a)           A stream "of characters" to be scanned.
+                   |        |          |
+                   p        ts         pe
+
+      b)           "of characters" to be scanned.
+                   |          |        |
+                   ts         p        pe
+\end{verbatim}
+\verbspace
+\caption{Following an invocation of the execute code there may be a partially
+matched token (a). The data of the partially matched token 
+must be preserved ahead of the new data on the next invocation (b).
+}
+\label{preserve_example}
+\end{figure}
+
+Since scanners attempt to make the longest possible match of input, patterns
+such as identifiers require one character of lookahead in order to trigger a
+match. In the case of the last token in the input stream the user must ensure
+that the \verb|eof| variable is set so that the final token is flushed out.
+
+An example scanner processing loop is given in Figure \ref{scanner-loop}.
+
+\begin{figure}
+\small
+\begin{verbatim}
+    int have = 0;
+    bool done = false;
+    while ( !done ) {
+        /* How much space is in the buffer? */
+        int space = BUFSIZE - have;
+        if ( space == 0 ) {
+            /* Buffer is full. */
+            cerr << "TOKEN TOO BIG" << endl;
+            exit(1);
+        }
+
+        /* Read in a block after any data we already have. */
+        char *p = inbuf + have;
+        cin.read( p, space );
+        int len = cin.gcount();
+
+        char *pe = p + len;
+        char *eof = 0;
+
+        /* If no data was read indicate EOF. */
+        if ( len == 0 ) {
+            eof = pe;
+            done = true;
+        }
+
+        %% write exec;
+
+        if ( cs == Scanner_error ) {
+            /* Machine failed before finding a token. */
+            cerr << "PARSE ERROR" << endl;
+            exit(1);
+        }
+
+        if ( ts == 0 )
+            have = 0;
+        else {
+            /* There is a prefix to preserve, shift it over. */
+            have = pe - ts;
+            memmove( inbuf, ts, have );
+            te = inbuf + (te-ts);
+            ts = inbuf;
+        }
+    }
+\end{verbatim}
+\verbspace
+\caption{A processing loop for a scanner.
+}
+\label{scanner-loop}
+\end{figure}
+
+\section{State Charts}
+\label{state-charts}
+
+In addition to supporting the construction of state machines using regular
+languages, Ragel provides a way to manually specify state machines using
+state charts.  The comma operator combines machines together without any
+implied transitions. The user can then manually link machines by specifying
+epsilon transitions with the \verb|->| operator.  Epsilon transitions are drawn
+between the final states of a machine and entry points defined by labels.  This
+makes it possible to build machines using the explicit state-chart method while
+making minimal changes to the Ragel language. 
+
+An interesting feature of Ragel's state chart construction method is that it
+can be mixed freely with regular expression constructions. A state chart may be
+referenced from within a regular expression, or a regular expression may be
+used in the definition of a state chart transition.
+
+\subsection{Join}
+
+\verb|expr , expr , ...|
+
+Join a list of machines together without
+drawing any transitions, without setting up a start state, and without
+designating any final states. Transitions between the machines may be specified
+using labels and epsilon transitions. The start state must be explicity
+specified with the ``start'' label. Final states may be specified with an
+epsilon transition to the implicitly created ``final'' state. The join
+operation allows one to build machines using a state chart model.
+
+\subsection{Label}
+
+\verb|label: expr| 
+
+Attaches a label to an expression. Labels can be
+used as the target of epsilon transitions and explicit control transfer
+statements such as \verb|fgoto| and \verb|fnext| in action
+code.
+
+\subsection{Epsilon}
+
+\verb|expr -> label| 
+
+Draws an epsilon transition to the state defined
+by \verb|label|.  Epsilon transitions are made deterministic when join
+operators are evaluated. Epsilon transitions that are not in a join operation
+are made deterministic when the machine definition that contains the epsilon is
+complete. See Section \ref{labels} for information on referencing labels.
+
+\subsection{Simplifying State Charts}
+
+There are two benefits to providing state charts in Ragel. The first is that it
+allows us to take a state chart with a full listing of states and transitions
+and simplify it in selective places using regular expressions.
+
+The state chart method of specifying parsers is very common.  It is an
+effective programming technique for producing robust code. The key disadvantage
+becomes clear when one attempts to comprehend a large parser specified in this
+way.  These programs usually require many lines, causing logic to be spread out
+over large distances in the source file. Remembering the function of a large
+number of states can be difficult and organizing the parser in a sensible way
+requires discipline because branches and repetition present many file layout
+options.  This kind of programming takes a specification with inherent
+structure such as looping, alternation and concatenation and expresses it in a
+flat form. 
+
+If we could take an isolated component of a manually programmed state chart,
+that is, a subset of states that has only one entry point, and implement it
+using regular language operators then we could eliminate all the explicit
+naming of the states contained in it. By eliminating explicitly named states
+and replacing them with higher-level specifications we simplify a state machine
+specification.
+
+For example, sometimes chains of states are needed, with only a small number of
+possible characters appearing along the chain. These can easily be replaced
+with a concatenation of characters. Sometimes a group of common states
+implement a loop back to another single portion of the machine. Rather than
+manually duplicate all the transitions that loop back, we may be able to
+express the loop using a kleene star operator.
+
+Ragel allows one to take this state map simplification approach. We can build
+state machines using a state map model and implement portions of the state map
+using regular languages. In place of any transition in the state machine,
+entire sub-machines can be given. These can encapsulate functionality
+defined elsewhere. An important aspect of the Ragel approach is that when we
+wrap up a collection of states using a regular expression we do not lose
+access to the states and transitions. We can still execute code on the
+transitions that we have encapsulated.
+
+\subsection{Dropping Down One Level of Abstraction}
+\label{down}
+
+The second benefit of incorporating state charts into Ragel is that it permits
+us to bypass the regular language abstraction if we need to. Ragel's action
+embedding operators are sometimes insufficient for expressing certain parsing
+tasks.  In the same way that is useful for C language programmers to drop down
+to assembly language programming using embedded assembler, it is sometimes
+useful for the Ragel programmer to drop down to programming with state charts.
+
+In the following example, we wish to buffer the characters of an XML CDATA
+sequence. The sequence is terminated by the string \verb|]]>|. The challenge
+in our application is that we do not wish the terminating characters to be
+buffered. An expression of the form \verb|any* @buffer :>> ']]>'| will not work
+because the buffer will always contain the characters \verb|]]| on the end.
+Instead, what we need is to delay the buffering of \verb|]|
+characters until a time when we
+abandon the terminating sequence and go back into the main loop. There is no
+easy way to express this using Ragel's regular expression and action embedding
+operators, and so an ability to drop down to the state chart method is useful.
+
+% GENERATE: dropdown
+% OPT: -p
+% %%{
+% machine dropdown;
+\begin{inline_code}
+\begin{verbatim}
+action bchar { buff( fpc ); }    # Buffer the current character.
+action bbrack1 { buff( "]" ); }
+action bbrack2 { buff( "]]" ); }
+
+CDATA_body =
+start: (
+     ']' -> one |
+     (any-']') @bchar ->start
+),
+one: (
+     ']' -> two |
+     [^\]] @bbrack1 @bchar ->start
+),
+two: (
+     '>' -> final |
+     ']' @bbrack1 -> two |
+     [^>\]] @bbrack2 @bchar ->start
+);
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := CDATA_body;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{dropdown}
+\end{center}
+\graphspace
+
+
+\section{Semantic Conditions}
+\label{semantic}
+
+Many communication protocols contain variable-length fields, where the length
+of the field is given ahead of the field as a value. This
+problem cannot be expressed using regular languages because of its
+context-dependent nature. The prevalence of variable-length fields in
+communication protocols motivated us to introduce semantic conditions into
+the Ragel language.
+
+A semantic condition is a block of user code that is interpreted as an
+expression and evaluated immediately
+before a transition is taken. If the code returns a value of true, the
+transition may be taken.  We can now embed code that extracts the length of a
+field, then proceed to match $n$ data values.
+
+% GENERATE: conds1
+% OPT: -p
+% %%{
+% machine conds1;
+% number = digit+;
+\begin{inline_code}
+\begin{verbatim}
+action rec_num { i = 0; n = getnumber(); }
+action test_len { i++ < n }
+data_fields = (
+    'd' 
+    [0-9]+ %rec_num 
+    ':'
+    ( [a-z] when test_len )*
+)**;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := data_fields;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{conds1}
+\end{center}
+\graphspace
+
+The Ragel implementation of semantic conditions does not force us to give up the
+compositional property of Ragel definitions. For example, a machine that tests
+the length of a field using conditions can be unioned with another machine
+that accepts some of the same strings, without the two machines interfering with
+one another. The user need not be concerned about whether or not the result of the
+semantic condition will affect the matching of the second machine.
+
+To see this, first consider that when a user associates a condition with an
+existing transition, the transition's label is translated from the base character
+to its corresponding value in the space that represents ``condition $c$ true''. Should
+the determinization process combine a state that has a conditional transition
+with another state that has a transition on the same input character but
+without a condition, then the condition-less transition first has its label
+translated into two values, one to its corresponding value in the space that
+represents ``condition $c$ true'' and another to its corresponding value in the
+space that represents ``condition $c$ false''. It
+is then safe to combine the two transitions. This is shown in the following
+example.  Two intersecting patterns are unioned, one with a condition and one
+without. The condition embedded in the first pattern does not affect the second
+pattern.
+
+% GENERATE: conds2
+% OPT: -p
+% %%{
+% machine conds2;
+% number = digit+;
+\begin{inline_code}
+\begin{verbatim}
+action test_len { i++ < n }
+action one { /* accept pattern one */ }
+action two { /* accept pattern two */ }
+patterns = 
+    ( [a-z] when test_len )+ %one |
+    [a-z][a-z0-9]* %two;
+main := patterns '\n';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{conds2}
+\end{center}
+\graphspace
+
+There are many more potential uses for semantic conditions. The user is free to
+use arbitrary code and may therefore perform actions such as looking up names
+in dictionaries, validating input using external parsing mechanisms or
+performing checks on the semantic structure of input seen so far. In the next
+section we describe how Ragel accommodates several common parser engineering
+problems.
+
+The semantic condition feature works only with alphabet types that are smaller
+in width than the \verb|long| type. To implement semantic conditions Ragel
+needs to be able to allocate characters from the alphabet space. Ragel uses
+these allocated characters to express "character C with condition P true" or "C
+with P false." Since internally Ragel uses longs to store characters there is
+no room left in the alphabet space unless an alphabet type smaller than long is
+used.
+
+\section{Implementing Lookahead}
+
+There are a few strategies for implementing lookahead in Ragel programs.
+Leaving actions, which are described in Section \ref{out-actions}, can be
+used as a form of lookahead.  Ragel also provides the \verb|fhold| directive
+which can be used in actions to prevent the machine from advancing over the
+current character. It is also possible to manually adjust the current character
+position by shifting it backwards using \verb|fexec|, however when this is
+done, care must be taken not to overstep the beginning of the current buffer
+block. In both the use of \verb|fhold| and \verb|fexec| the user must be
+cautious of combining the resulting machine with another in such a way that the
+transition on which the current position is adjusted is not combined with a
+transition from the other machine.
+
+\section{Parsing Recursive Language Structures}
+
+In general Ragel cannot handle recursive structures because the grammar is
+interpreted as a regular language. However, depending on what needs to be
+parsed it is sometimes practical to implement the recursive parts using manual
+coding techniques. This often works in cases where the recursive structures are
+simple and easy to recognize, such as in the balancing of parentheses
+
+One approach to parsing recursive structures is to use actions that increment
+and decrement counters or otherwise recognize the entry to and exit from
+recursive structures and then jump to the appropriate machine defnition using
+\verb|fcall| and \verb|fret|. Alternatively, semantic conditions can be used to
+test counter variables.
+
+A more traditional approach is to call a separate parsing function (expressed
+in the host language) when a recursive structure is entered, then later return
+when the end is recognized.
+
+\end{document}
author	Adrian Thurston <thurston@complang.org>	2013-03-22 23:39:49 -0400
committer	Adrian Thurston <thurston@complang.org>	2013-03-22 23:39:49 -0400
commit	9ddc931cbacf4d270a7113d07092203bec8aaaae (patch)
tree	b5591d9c8632de3f67e18aebc02216b4cd9a4663 /test
parent	56a51921c90303d65fd8a4b3fee99240676d43dd (diff)
download	colm-9ddc931cbacf4d270a7113d07092203bec8aaaae.tar.gz