summaryrefslogtreecommitdiff
path: root/test/repeat2.lm
diff options
context:
space:
mode:
Diffstat (limited to 'test/repeat2.lm')
-rw-r--r--test/repeat2.lm7408
1 files changed, 7408 insertions, 0 deletions
diff --git a/test/repeat2.lm b/test/repeat2.lm
new file mode 100644
index 0000000..e001d8f
--- /dev/null
+++ b/test/repeat2.lm
@@ -0,0 +1,7408 @@
+##### LM #####
+#
+# Copyright 2012 Adrian Thurston <thurston@complang.org>
+#
+
+# This file is part of Ragel.
+#
+# Ragel is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Ragel is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Ragel; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+lex
+ token word /( [^. \t\n]+ | '.' )/
+ token lws /[ \t]+/
+ token nl / '\n'/
+
+ token cmd_verb1 /'.verb|'/
+ token cmd_verb2 /'.verb/'/
+ token cmd_label /'.label{'/
+ token cmd_ref /'.ref{'/
+ token cmd_em /'.em{'/
+ token cmd_tt /'.tt{'/
+
+ token cmd_title /'.title' lws/
+ token cmd_sub_title /'.subtitle' lws/
+ token cmd_author /'.author' lws/
+
+ token cmd_chapter /'.chapter' lws/
+ token cmd_section /'.section' lws/
+ token cmd_sub_section /'.subsection' lws/
+ token cmd_sub_sub_section /'.subsubsection' lws/
+
+ token cmd_graphic /'.graphic' lws/
+ token cmd_comment /'.comment' lws? '\n'/
+ token cmd_verbatim /'.verbatim' lws? '\n'/
+ token cmd_code /'.code' lws? '\n'/
+
+ token cmd_itemize /'.itemize' lws? '\n'/
+ token end_itemize /'.end' lws 'itemize' lws? '\n'/
+ token cmd_item /'.item' lws/
+
+ token cmd_center /'.center' lws? '\n'/
+ token end_center /'.end' lws 'center' lws? '\n'/
+
+ token cmd_tabular /'.tabular' lws? '\n'/
+ token cmd_row /'.row' lws/
+ token end_tabular /'.end' lws 'tabular' lws? '\n'/
+
+ token cmd_multicols /'.multicols' lws? '\n'/
+ token cmd_columnbreak /'.columnbreak' lws? '\n'/
+ token end_multicols /'.end' lws 'multicols' lws? '\n'/
+
+ token cmd_figure / '.figure' lws?/
+ token cmd_caption / '.caption' lws/
+ token end_figure / '.end' lws 'figure' lws? '\n'/
+
+ token cmd_list /'.list' lws? '\n'/
+ token end_list /'.end' lws 'list' lws? '\n'/
+ token cmd_li /'.li' lws/
+
+ token cmd_license /'.license' lws? '\n'/
+end
+
+lex
+ token bar_data /[^|]*/
+ token end_bar /'|'/
+end
+
+lex
+ token slash_data /[^/]*/
+ token end_slash /'/'/
+end
+
+lex
+ token curly_data /[^}]*/
+ token end_curly /'}'/
+end
+
+def cmd_il
+ [cmd_verb1 bar_data end_bar]
+| [cmd_verb2 slash_data end_slash]
+| [cmd_label curly_data end_curly]
+| [cmd_ref curly_data end_curly]
+| [cmd_em curly_data end_curly]
+| [cmd_tt curly_data end_curly]
+
+def text
+ [word]
+| [lws]
+| [cmd_il]
+
+lex
+ token end_verbatim /lws? '.' lws? 'end' lws 'verbatim' lws? '\n'/
+ token verbatim_line /[^\n]* '\n'/
+end
+
+def verbatim
+ [cmd_verbatim verbatim_line* end_verbatim]
+
+lex
+ token end_code /lws? '.' lws? 'end' lws 'code' lws? '\n'/
+ token code_line /[^\n]* '\n'/
+end
+
+def code
+ [cmd_code code_line* end_code]
+
+lex
+ token end_comment /lws? '.' lws? 'end' lws 'comment' lws? '\n'/
+ token comment_line /[^\n]* '\n'/
+end
+
+def comment
+ [cmd_comment comment_line* end_comment]
+
+def figure
+ [cmd_figure text nl line* caption? end_figure]
+
+def li
+ [cmd_li text* nl]
+
+def _list
+ [cmd_list li* end_list]
+
+def scale
+ [lws word word*]
+
+def graphic
+ [cmd_graphic word scale? nl]
+
+def itemize
+ [cmd_itemize line* item* end_itemize]
+
+def center
+ [cmd_center line* end_center]
+
+def row
+ [cmd_row text* nl]
+
+def tabular
+ [cmd_tabular row* end_tabular]
+
+def multicols_line
+ [cmd_columnbreak]
+| [line]
+
+def multicols
+ [cmd_multicols multicols_line* end_multicols]
+
+def item
+ [cmd_item line*]
+
+def caption
+ [cmd_caption line*]
+
+def line
+ [text]
+| [nl]
+| [comment]
+| [verbatim]
+| [code]
+| [graphic]
+| [itemize]
+| [center]
+| [tabular]
+| [multicols]
+| [figure]
+| [_list]
+
+def sub_sub_section
+ [cmd_sub_sub_section text* nl line*]
+
+def sub_section
+ [cmd_sub_section text* nl line* sub_sub_section*]
+
+def section
+ [cmd_section text* nl line* sub_section*]
+
+def chapter
+ [cmd_chapter text* nl line* section*]
+
+def title
+ [cmd_title text* nl]
+
+def subtitle
+ [cmd_sub_title text* nl]
+
+def author
+ [cmd_author text* nl]
+
+#
+# Paragraphs.
+#
+
+def pline
+ [text text* nl]
+
+def paragraph
+ [pline pline*]
+
+def pextra
+ [nl paragraph]
+
+def block
+ [paragraph pextra*]
+
+def license
+ [cmd_license nl* block nl*]
+
+#
+# Preamble.
+#
+
+def preamble_item
+ [text]
+| [nl]
+| [title]
+| [subtitle]
+| [author]
+
+def preamble
+ [preamble_item* license]
+
+def start
+ [preamble chapter*]
+
+parse Start: start[ stdin ]
+if ( ! Start ) {
+ print( error '\n' )
+ exit( 1 )
+}
+
+int printPlData( Pld: cmd_il )
+{
+ if match Pld [ cmd_verb1 V: bar_data end_bar] {
+ print( '\\verb|' )
+ print( V )
+ print( '|' )
+ }
+ else if match Pld [cmd_verb2 V: slash_data end_slash] {
+ print( '\\verb/' )
+ print( V )
+ print( '/' )
+ }
+ else if match Pld [cmd_label L: curly_data end_curly] {
+ print( '\\label{' )
+ print( L )
+ print( '}' )
+ }
+ else if match Pld [cmd_ref L: curly_data end_curly] {
+ print( '\\ref{' )
+ print( L )
+ print( '}' )
+ }
+ else if match Pld [cmd_em L: curly_data end_curly] {
+ print( '{\\em ' )
+ print( L )
+ print( '}' )
+ }
+ else if match Pld [cmd_tt L: curly_data end_curly] {
+ print( '{\\tt ' )
+ print( L )
+ print( '}' )
+ }
+ else {
+ print( Pld )
+ }
+}
+
+int printText( Lines: text* )
+{
+ for L: text in repeat(Lines) {
+ if match L [PlData: cmd_il] {
+ printPlData( PlData )
+ }
+ else {
+ print( L )
+ }
+ }
+}
+
+int printLines( Lines: line* )
+{
+ for L: line in repeat(Lines) {
+ if match L [word] {
+ print( L )
+ }
+ if match L [lws] {
+ print( L )
+ }
+ if match L [nl] {
+ print( L )
+ }
+ if match L [PlData: cmd_il] {
+ printPlData( PlData )
+ }
+ if match L [cmd_verbatim Lines: verbatim_line* end_verbatim] {
+ print( '\\begin{verbatim}\n' )
+ print( Lines )
+ print( '\\end{verbatim}\n' )
+ print( '\\verbspace\n' )
+ }
+ if match L [cmd_code Lines: code_line* end_code] {
+ print( '\\begin{inline_code}\n' )
+ print( '\\begin{verbatim}\n' )
+ print( Lines )
+ print( '\\end{verbatim}\n' )
+ print( '\\end{inline_code}\n' )
+ print( '\\verbspace\n' )
+ }
+ if match L [cmd_graphic Name: word Scale: scale? nl] {
+ print( '\\graphspace\n' )
+ print( '\\begin{center}\n' )
+ print( '\\includegraphics' )
+ if match Scale [lws Spd: word Spd2: word*]
+ print( '[scale=' Spd Spd2 ']' )
+ else
+ print( '[scale=0.55]' )
+ print( '{' Name '}\n' )
+ print( '\\end{center}\n' )
+ print( '\\graphspace\n' )
+ }
+ if match L [cmd_itemize Lines: line* Items: item* end_itemize] {
+ print( '\\begin{itemize}\n' )
+ printLines( Lines )
+ for Item: item in repeat(Items) {
+ match Item [cmd_item Lines: line*]
+ print( '\\item ' )
+ printLines( Lines )
+ }
+ print( '\\end{itemize}\n' )
+ }
+ if match L [cmd_figure DirData: text nl Lines: line* Caption: caption? end_figure] {
+ print( '\\begin{figure}\n' )
+ print( '\\small\n' )
+ printLines( Lines )
+ if match Caption [cmd_caption CL: line*] {
+ print( '\\caption{' )
+ printLines( CL )
+ print( '}\n' )
+ }
+ print( '\\label{' DirData '}\n' )
+ print( '\\end{figure}\n' )
+ }
+ if match L [cmd_list LiList: li* end_list] {
+ for Li: li* in LiList {
+ if match Li [cmd_li Lines: text* nl Rest: li*] {
+ print( '\\noindent\\\hspace*{24pt}' )
+ printText( Lines )
+ if match Rest [ li li* ]
+ print( '\\\\' )
+ print( '\n' )
+ }
+ }
+ print( '\\vspace{12pt}\n' )
+ }
+ if match L [cmd_center Lines: line* end_center] {
+ print( '\\begin{center}\n' )
+ printLines( Lines )
+ print( '\\end{center}\n' )
+ }
+ if match L [cmd_tabular Rows: row* end_tabular] {
+ print( '\\begin{tabular}{|c|c|c|}\n' )
+ print( '\\hline\n' )
+ for Row: row in repeat(Rows) {
+ if match Row [cmd_row Lines: text* nl ] {
+ printText( Lines )
+ print( '\\\\' '\n' )
+ print( '\\hline\n' )
+ }
+ }
+ print( '\\end{tabular}\n' )
+ }
+ if match L [cmd_multicols Lines: multicols_line* end_multicols] {
+ print( '\\begin{multicols}{2}\n' )
+ for McLine: multicols_line in repeat( Lines ) {
+ if match McLine [Line: line]
+ printLines( cons line* [Line] )
+ else if match McLine [cmd_columnbreak] {
+ print( '\\columnbreak\n' )
+ }
+ }
+ print( '\\end{multicols}\n' )
+ }
+ }
+}
+
+match Start
+ [Preamble: preamble Chapters: chapter*]
+
+Title: title = title in Preamble
+match Title [cmd_title TitleData: text* nl]
+
+SubTitle: subtitle = subtitle in Preamble
+match SubTitle [cmd_sub_title SubTitleData: text* nl]
+
+Author: author = author in Preamble
+match Author [cmd_author AuthorData: text* nl]
+
+License: license = license in Preamble
+
+print(
+ ~\documentclass[letterpaper,11pt,oneside]{book}
+ ~\usepackage{graphicx}
+ ~\usepackage{comment}
+ ~\usepackage{multicol}
+ ~\usepackage[
+ ~ colorlinks=true,
+ ~ linkcolor=black,
+ ~ citecolor=green,
+ ~ filecolor=black,
+ ~ urlcolor=black]{hyperref}
+ ~
+ ~\topmargin -0.20in
+ ~\oddsidemargin 0in
+ ~\textwidth 6.5in
+ ~\textheight 9in
+ ~
+ ~\setlength{\parskip}{0pt}
+ ~\setlength{\topsep}{0pt}
+ ~\setlength{\partopsep}{0pt}
+ ~\setlength{\itemsep}{0pt}
+ ~
+ ~\input{version}
+ ~
+ ~\newcommand{\verbspace}{\vspace{10pt}}
+ ~\newcommand{\graphspace}{\vspace{10pt}}
+ ~
+ ~\renewcommand\floatpagefraction{.99}
+ ~\renewcommand\topfraction{.99}
+ ~\renewcommand\bottomfraction{.99}
+ ~\renewcommand\textfraction{.01}
+ ~\setcounter{totalnumber}{50}
+ ~\setcounter{topnumber}{50}
+ ~\setcounter{bottomnumber}{50}
+ ~
+ ~\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{}
+ ~
+ ~\begin{document}
+ ~
+ ~\thispagestyle{empty}
+ ~\begin{center}
+ ~\vspace*{3in}
+)
+
+print( '{\\huge ' TitleData '}\\\\\n' )
+
+print( '\\vspace*{12pt}\n' )
+
+print( '{\\Large ' SubTitleData '}\\\\\n' )
+
+print(
+ ~\vspace{1in}
+ ~by\\
+ ~\vspace{12pt}
+)
+
+print( '{\\large ' AuthorData '}\\\\\n' )
+
+print(
+ ~\end{center}
+ ~\clearpage
+ ~
+ ~\pagenumbering{roman}
+ ~
+ ~\chapter*{License}
+)
+
+print(
+ ~Ragel version \version, \pubdate\\
+ ~Copyright \copyright\ 2003-2012 Adrian D. Thurston
+ ~\vspace{6mm}
+ ~
+)
+
+i: int = 0
+for P: paragraph in License {
+ if ( i != 0 ) {
+ print(
+ ~
+ ~\vspace{5pt}
+ ~
+ )
+ }
+ print( "{\\bf\\it\\noindent " )
+ print( P )
+ print( "}\n" )
+ i = i + 1
+}
+
+print(
+ ~
+ ~\clearpage
+ ~\tableofcontents
+ ~\clearpage
+ ~
+ ~\pagenumbering{arabic}
+)
+
+
+for Chapter: chapter in repeat(Chapters) {
+ match Chapter
+ [cmd_chapter DirData: text* nl Lines: line* SectionList: section*]
+
+ print( '\\chapter{' DirData '}\n' )
+ printLines( Lines )
+
+ for Section: section in repeat(SectionList) {
+ match Section
+ [cmd_section DirData: text* nl Lines: line* SubSectionList: sub_section*]
+
+ print( '\\section{' DirData '}\n' )
+ printLines( Lines )
+ for SubSection: sub_section in repeat(SubSectionList) {
+ match SubSection
+ [cmd_sub_section DirData: text* nl Lines: line*
+ SubSubSectionList: sub_sub_section*]
+
+ print( '\\subsection{' DirData '}\n' )
+ printLines( Lines )
+
+ for SubSubSection: sub_sub_section in repeat(SubSubSectionList) {
+ match SubSubSection
+ [cmd_sub_sub_section DirData: text* nl Lines: line*]
+
+ print( '\\subsubsection{' DirData '}\n' )
+ printLines( Lines )
+ }
+ }
+ }
+}
+
+print(
+ ~
+ ~\end{document}
+)
+##### IN #####
+.title Ragel State Machine Compiler
+
+.subtitle User Guide
+
+.author Adrian Thurston
+
+.license
+
+This document is part of Ragel, and as such, this document is
+released under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+Ragel is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public
+License along with Ragel; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+.chapter Introduction
+
+.section Abstract
+
+Regular expressions are used heavily in practice for the purpose of specifying
+parsers. They are normally used as black boxes linked together with program
+logic. User actions are executed in between invocations of the regular
+expression engine. Adding actions before a pattern terminates requires patterns
+to be broken and pasted back together with program logic. The more user actions
+are needed, the less the advantages of regular expressions are seen.
+
+Ragel is a software development tool that allows user actions to be
+embedded into the transitions of a regular expression's corresponding state
+machine, eliminating the need to switch from the regular expression engine and
+user code execution environment and back again. As a result, expressions can be
+maximally continuous. One is free to specify an entire parser using a single
+regular expression. The single-expression model affords concise and elegant
+descriptions of languages and the generation of very simple, fast and robust
+code. Ragel compiles executable finite state machines from a high level regular language
+notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby.
+
+In addition to building state machines from regular expressions, Ragel allows
+the programmer to directly specify state machines with state charts. These two
+notations may be freely combined. There are also facilities for controlling
+nondeterminism in the resulting machines and building scanners using patterns
+that themselves have embedded actions. Ragel can produce code that is small and
+runs very fast. Ragel can handle integer-sized alphabets and can compile very
+large state machines.
+
+.section Motivation
+
+When a programmer is faced with the task of producing a parser for a
+context-free language there are many tools to choose from. It is quite common
+to generate useful and efficient parsers for programming languages from a
+formal grammar. It is also quite common for programmers to avoid such tools
+when making parsers for simple computer languages, such as file formats and
+communication protocols. Such languages are often regular and tools for
+processing the context-free languages are viewed as too heavyweight for the
+purpose of parsing regular languages. The extra run-time effort required for
+supporting the recursive nature of context-free languages is wasted.
+
+When we turn to the regular expression-based parsing tools, such as Lex, Re2C,
+and scripting languages such as Sed, Awk and Perl we find that they are split
+into two levels: a regular expression matching engine and some kind of program
+logic for linking patterns together. For example, a Lex program is composed of
+sets of regular expressions. The implied program logic repeatedly attempts to
+match a pattern in the current set. When a match is found the associated user
+code executed. It requires the user to consider a language as a sequence of
+independent tokens. Scripting languages and regular expression libraries allow
+one to link patterns together using arbitrary program code. This is very
+flexible and powerful, however we can be more concise and clear if we avoid
+gluing together regular expressions with if statements and while loops.
+
+This model of execution, where the runtime alternates between regular
+expression matching and user code exectution places restrictions on when
+action code may be executed. Since action code can only be associated with
+complete patterns, any action code that must be executed before an entire
+pattern is matched requires that the pattern be broken into smaller units.
+Instead of being forced to disrupt the regular expression syntax and write
+smaller expressions, it is desirable to retain a single expression and embed
+code for performing actions directly into the transitions that move over the
+characters. After all, capable programmers are astutely aware of the machinery
+underlying their programs, so why not provide them with access to that
+machinery? To achieve this we require an action execution model for associating
+code with the sub-expressions of a regular expression in a way that does not
+disrupt its syntax.
+
+The primary goal of Ragel is to provide developers with an ability to embed
+actions into the transitions and states of a regular expression's state machine
+in support of the definition of entire parsers or large sections of parsers
+using a single regular expression. From the regular expression we gain a clear
+and concise statement of our language. From the state machine we obtain a very
+fast and robust executable that lends itself to many kinds of analysis and
+visualization.
+
+.section Overview
+
+Ragel is a language for specifying state machines. The Ragel program is a
+compiler that assembles a state machine definition to executable code. Ragel
+is based on the principle that any regular language can be converted to a
+deterministic finite state automaton. Since every regular language has a state
+machine representation and vice versa, the terms regular language and state
+machine (or just machine) will be used interchangeably in this document.
+
+Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is
+designed to be generic and is not bound to any particular input or processing
+method. A Ragel machine expects to have data passed to it in buffer blocks.
+When there is no more input, the machine can be queried for acceptance. In
+this way, a Ragel machine can be used to simply recognize a regular language
+like a regular expression library. By embedding code into the regular language,
+a Ragel machine can also be used to parse input.
+
+The Ragel language has many operators for constructing and manipulating
+machines. Machines are built up from smaller machines, to bigger ones, to the
+final machine representing the language that needs to be recognized or parsed.
+
+The core state machine construction operators are those found in most theory
+of computation textbooks. They date back to the 1950s and are widely studied.
+They are based on set operations and permit one to think of languages as a set
+of strings. They are Union, Intersection, Difference, Concatenation and Kleene
+Star. Put together, these operators make up what most people know as regular
+expressions. Ragel also provides a scanner construction operator
+and provides operators for explicitly constructing machines
+using a state chart method. In the state chart method, one joins machines
+together without any implied transitions and then explicitly specifies where
+epsilon transitions should be drawn.
+
+The state machine manipulation operators are specific to Ragel. They allow the
+programmer to access the states and transitions of regular language's
+corresponding machine. There are two uses of the manipulation operators. The
+first and primary use is to embed code into transitions and states, allowing
+the programmer to specify the actions of the state machine.
+
+Ragel attempts to make the action embedding facility as intuitive as possible.
+To do so, a number of issues need to be addressed. For example, when making a
+nondeterministic specification into a DFA using machines that have embedded
+actions, new transitions are often made that have the combined actions of
+several source transitions. Ragel ensures that multiple actions associated with
+a single transition are ordered consistently with respect to the order of
+reference and the natural ordering implied by the construction operators.
+
+The second use of the manipulation operators is to assign priorities to
+transitions. Priorities provide a convenient way of controlling any
+nondeterminism introduced by the construction operators. Suppose two
+transitions leave from the same state and go to distinct target states on the
+same character. If these transitions are assigned conflicting priorities, then
+during the determinization process the transition with the higher priority will
+take precedence over the transition with the lower priority. The lower priority
+transition gets abandoned. The transitions would otherwise be combined into a new
+transition that goes to a new state that is a combination of the original
+target states. Priorities are often required for segmenting machines. The most
+common uses of priorities have been encoded into a set of simple operators
+that should be used instead of priority embeddings whenever possible.
+
+For the purposes of embedding, Ragel divides transitions and states into
+different classes. There are four operators for embedding actions and
+priorities into the transitions of a state machine. It is possible to embed
+into entering transitions, finishing transitions, all transitions and leaving
+transitions. The embedding into leaving transitions is a special case.
+These transition embeddings get stored in the final states of a machine. They
+are transferred to any transitions that are made going out of the machine by
+future concatenation or kleene star operations.
+
+There are several more operators for embedding actions into states. Like the
+transition embeddings, there are various different classes of states that the
+embedding operators access. For example, one can access start states, final
+states or all states, among others. Unlike the transition embeddings, there are
+several different types of state action embeddings. These are executed at
+various different times during the processing of input. It is possible to embed
+actions that are exectued on transitions into a state, on transitions out of a
+state, on transitions taken on the error event, or on transitions taken on the
+EOF event.
+
+Within actions, it is possible to influence the behaviour of the state machine.
+The user can write action code that jumps or calls to another portion of the
+machine, changes the current character being processed, or breaks out of the
+processing loop. With the state machine calling feature Ragel can be used to
+parse languages that are not regular. For example, one can parse balanced
+parentheses by calling into a parser when an open parenthesis character is seen
+and returning to the state on the top of the stack when the corresponding
+closing parenthesis character is seen. More complicated context-free languages
+such as expressions in C are out of the scope of Ragel.
+
+Ragel also provides a scanner construction operator that can be used to build
+scanners much the same way that Lex is used. The Ragel generated code, which
+relies on user-defined variables for backtracking, repeatedly tries to match
+patterns to the input, favouring longer patterns over shorter ones and patterns
+that appear ahead of others when the lengths of the possible matches are
+identical. When a pattern is matched the associated action is executed.
+
+The key distinguishing feature between scanners in Ragel and scanners in Lex is
+that Ragel patterns may be arbitrary Ragel expressions and can therefore
+contain embedded code. With a Ragel-based scanner the user need not wait until
+the end of a pattern before user code can be executed.
+
+Scanners do take Ragel out of the domain of pure state machines and require the
+user to maintain the backtracking related variables. However, scanners
+integrate well with regular state machine instantiations. They can be called to
+or jumped to only when needed, or they can be called out of or jumped out of
+when a simpler, pure state machine model is appropriate.
+
+Two types of output code style are available. Ragel can produce a table-driven
+machine or a directly executable machine. The directly executable machine is
+much faster than the table-driven. On the other hand, the table-driven machine
+is more compact and less demanding on the host language compiler. It is better
+suited to compiling large state machines.
+
+.section Related Work
+
+Lex is perhaps the best-known tool for constructing parsers from regular
+expressions. In the Lex processing model, generated code attempts to match one
+of the user's regular expression patterns, favouring longer matches over
+shorter ones. Once a match is made it then executes the code associated with
+the pattern and consumes the matching string. This process is repeated until
+the input is fully consumed.
+
+Through the use of start conditions, related sets of patterns may be defined.
+The active set may be changed at any time. This allows the user to define
+different lexical regions. It also allows the user to link patterns together by
+requiring that some patterns come before others. This is quite like a
+concatenation operation. However, use of Lex for languages that require a
+considerable amount of pattern concatenation is inappropriate. In such cases a
+Lex program deteriorates into a manually specified state machine, where start
+conditions define the states and pattern actions define the transitions. Lex
+is therefore best suited to parsing tasks where the language to be parsed can
+be described in terms of regions of tokens.
+
+Lex is useful in many scenarios and has undoubtedly stood the test of time.
+There are, however, several drawbacks to using Lex. Lex can impose too much
+overhead for parsing applications where buffering is not required because all
+the characters are available in a single string. In these cases there is
+structure to the language to be parsed and a parser specification tool can
+help, but employing a heavyweight processing loop that imposes a stream
+``pull'' model and dynamic input buffer allocation is inappropriate. An
+example of this kind of scenario is the conversion of floating point numbers
+contained in a string to their corresponding numerical values.
+
+Another drawback is the very issue that Ragel attempts to solve.
+It is not possible to execute a user action while
+matching a character contained inside a pattern. For example, if scanning a
+programming language and string literals can contain newlines which must be
+counted, a Lex user must break up a string literal pattern so as to associate
+an action with newlines. This forces the definition of a new start condition.
+Alternatively the user can reprocess the text of the matched string literal to
+count newlines.
+
+.comment
+
+How ragel is different from Lex.
+
+Like Re2c, Ragel provides a simple execution model that does not make any
+assumptions as to how the input is collected. Also, Ragel does not do any
+buffering in the generated code. Consequently there are no dependencies on
+external functions such as .verb|malloc|.
+
+If buffering is required it can be manually implemented by embedding actions
+that copy the current character to a buffer, or data can be passed to the
+parser using known block boundaries. If the longest-match operator is used,
+Ragel requires the user to ensure that the ending portion of the input buffer
+is preserved when the buffer is exhaused before a token is fully matched. The
+user should move the token prefix to a new memory location, such as back to the
+beginning of the input buffer, then place the subsequently read input
+immediately after the prefix.
+
+These properties of Ragel make it more work to write a program that requires
+the longest-match operator or buffering of input, however they make Ragel a
+more flexible tool that can produce very simple and fast-running programs under
+a variety of input acquisition arrangements.
+
+In Ragel, it is not necessary
+to introduce start conditions to concatenate tokens and retain action
+execution. Ragel allows one to structure a parser as a series of tokens, but
+does not require it.
+
+Like Lex and Re2C, Ragel is able to process input using a longest-match
+execution model, however the core of the Ragel language specifies parsers at a
+much lower level. This core is built around a pure state machine model. When
+building basic machines there is no implied algorithm for processing input
+other than to move from state to state on the transitions of the machine. This
+core of pure state machine operations makes Ragel well suited to handling
+parsing problems not based on token scanning. Should one need to use a
+longest-match model, the functionality is available and the lower level state
+machine construction facilities can be used to specify the patterns of a
+longest-match machine.
+
+This is not possible in Ragel. One can only program
+a longest-match instantiation with a fixed set of rules. One can jump to
+another longest-match machine that employs the same machine definitions in the
+construction of its rules, however no states will be shared.
+
+In Ragel, input may be re-parsed using a
+different machine, but since the action to be executed is associated with
+transitions of the compiled state machine, the longest-match construction does
+not permit a single rule to be excluded from the active set. It cannot be done
+ahead of time nor in the excluded rule's action.
+
+.end comment
+
+The Re2C program defines an input processing model similar to that of Lex.
+Re2C focuses on making generated state machines run very fast and
+integrate easily into any program, free of dependencies. Re2C generates
+directly executable code and is able to claim that generated parsers run nearly
+as fast as their hand-coded equivalents. This is very important for user
+adoption, as programmers are reluctant to use a tool when a faster alternative
+exists. A consideration to ease of use is also important because developers
+need the freedom to integrate the generated code as they see fit.
+
+Many scripting languages provide ways of composing parsers by linking regular
+expressions using program logic. For example, Sed and Awk are two established
+Unix scripting tools that allow the programmer to exploit regular expressions
+for the purpose of locating and extracting text of interest. High-level
+programming languages such as Perl, Python, PHP and Ruby all provide regular
+expression libraries that allow the user to combine regular expressions with
+arbitrary code.
+
+In addition to supporting the linking of regular expressions with arbitrary
+program logic, the Perl programming language permits the embedding of code into
+regular expressions. Perl embeddings do not translate into the embedding of
+code into deterministic state machines. Perl regular expressions are in fact
+not fully compiled to deterministic machines when embedded code is involved.
+They are instead interpreted and involve backtracking. This is shown by the
+following Perl program. When it is fed the input .verb|abcd| the interpretor
+attempts to match the first alternative, printing .verb|a1 b1|. When this
+possibility fails it backtracks and tries the second possibility, printing
+.verb|a2 b2|, at which point it succeeds.
+
+.code
+print "YES\n" if ( <STDIN> =~
+ /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) |
+ ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x )
+.end code
+
+In Ragel there is no regular expression interpretor. Aside from the scanner
+operator, all Ragel expressions are made into deterministic machines and the
+run time simply moves from state to state as it consumes input. An equivalent
+parser expressed in Ragel would attempt both of the alternatives concurrently,
+printing .verb|a1 a2 b1 b2|.
+
+.section Development Status
+
+Ragel is a relatively new tool and is under continuous development. As a rough
+release guide, minor revision number changes are for implementation
+improvements and feature additions. Major revision number changes are for
+implementation and language changes that do not preserve backwards
+compatibility. Though in the past this has not always held true: changes that
+break code have crept into minor version number changes. Typically, the
+documentation lags behind the development in the interest of documenting only
+the lasting features. The latest changes are always documented in the ChangeLog
+file.
+
+.chapter Constructing State Machines
+
+.section Ragel State Machine Specifications
+
+A Ragel input file consists of a program in the host language that contains embedded machine
+specifications. Ragel normally passes input straight to output. When it sees
+a machine specification it stops to read the Ragel statements and possibly generate
+code in place of the specification.
+Afterwards it continues to pass input through. There
+can be any number of FSM specifications in an input file. A multi-line FSM spec
+starts with .verb|%%{| and ends with .verb|}%%|. A single-line FSM spec starts
+with .verb|%%| and ends at the first newline.
+
+While Ragel is looking for FSM specifications it does basic lexical analysis on
+the surrounding input. It interprets literal strings and comments so a
+.verb|%%| sequence in either of those will not trigger the parsing of an FSM
+specification. Ragel does not pass the input through any preprocessor nor does it
+interpret preprocessor directives itself so includes, defines and ifdef logic
+cannot be used to alter the parse of a Ragel input file. It is therefore not
+possible to use an .verb|#if 0| directive to comment out a machine as is
+commonly done in C code. As an alternative, a machine can be prevented from
+causing any generated output by commenting out write statements.
+
+In Figure .ref{cmd-line-parsing}, a multi-line specification is used to define the
+machine and single line specifications are used to trigger the writing of the machine
+data and execution code.
+
+.figure cmd-line-parsing
+.multicols
+.verbatim
+#include <string.h>
+#include <stdio.h>
+
+%%{
+ machine foo;
+ main :=
+ ( 'foo' | 'bar' )
+ 0 @{ res = 1; };
+}%%
+
+%% write data;
+.end verbatim
+.columnbreak
+.verbatim
+int main( int argc, char **argv )
+{
+ int cs, res = 0;
+ if ( argc > 1 ) {
+ char *p = argv[1];
+ char *pe = p + strlen(p) + 1;
+ %% write init;
+ %% write exec;
+ }
+ printf("result = %i\n", res );
+ return 0;
+}
+.end verbatim
+.end multicols
+.caption Parsing a command line argument.
+.end figure
+
+.subsection Naming Ragel Blocks
+
+.verbatim
+machine fsm_name;
+.end verbatim
+
+The .verb|machine| statement gives the name of the FSM. If present in a
+specification, this statement must appear first. If a machine specification
+does not have a name then Ragel uses the previous specification name. If no
+previous specification name exists then this is an error. Because FSM
+specifications persist in memory, a machine's statements can be spread across
+multiple machine specifications. This allows one to break up a machine across
+several files or draw in statements that are common to multiple machines using
+the .verb|include| statement.
+
+.subsection Machine Definition
+.label{definition}
+
+.verbatim
+<name> = <expression>;
+.end verbatim
+
+The machine definition statement associates an FSM expression with a name. Machine
+expressions assigned to names can later be referenced in other expressions. A
+definition statement on its own does not cause any states to be generated. It is simply a
+description of a machine to be used later. States are generated only when a definition is
+instantiated, which happens when a definition is referenced in an instantiated
+expression.
+
+.subsection Machine Instantiation
+.label{instantiation}
+
+.verbatim
+<name> := <expression>;
+.end verbatim
+
+The machine instantiation statement generates a set of states representing an
+expression. Each instantiation generates a distinct set of states. The starting
+state of the instantiation is written in the data section of the generated code
+using the instantiation name. If a machine named
+.verb|main| is instantiated, its start state is used as the
+specification's start state and is assigned to the .verb|cs| variable by the
+.verb|write init| command. If no .verb|main| machine is given, the start state
+of the last machine instantiation to appear is used as the specification's
+start state.
+
+From outside the execution loop, control may be passed to any machine by
+assigning the entry point to the .verb|cs| variable. From inside the execution
+loop, control may be passed to any machine instantiation using .verb|fcall|,
+.verb|fgoto| or .verb|fnext| statements.
+
+.subsection Including Ragel Code
+
+.verbatim
+include FsmName "inputfile.rl";
+.end verbatim
+
+The .verb|include| statement can be used to draw in the statements of another FSM
+specification. Both the name and input file are optional, however at least one
+must be given. Without an FSM name, the given input file is searched for an FSM
+of the same name as the current specification. Without an input file the
+current file is searched for a machine of the given name. If both are present,
+the given input file is searched for a machine of the given name.
+
+Ragel searches for included files from the location of the current file.
+Additional directories can be added to the search path using the .verb|-I|
+option.
+
+.subsection Importing Definitions
+.label{import}
+
+.verbatim
+import "inputfile.h";
+.end verbatim
+
+The .verb|import| statement scrapes a file for sequences of tokens that match
+the following forms. Ragel treats these forms as state machine definitions.
+
+.list
+.li .verb|name '=' number|
+.li .verb|name '=' lit_string|
+.li .verb|'define' name number|
+.li .verb|'define' name lit_string|
+.end list
+
+If the input file is a Ragel program then tokens inside any Ragel
+specifications are ignored. See Section .ref{export} for a description of
+exporting machine definitions.
+
+Ragel searches for imported files from the location of the current file.
+Additional directories can be added to the search path using the .verb|-I|
+option.
+
+.section Lexical Analysis of a Ragel Block
+.label{lexing}
+
+Within a machine specification the following lexical rules apply to the input.
+
+.itemize
+
+.item The .verb|#| symbol begins a comment that terminates at the next newline.
+
+.item The symbols .verb|""|, .verb|''|, .verb|//|, .verb|[]| behave as the
+delimiters of literal strings. Within them, the following escape sequences
+are interpreted:
+
+.verb| \0 \a \b \t \n \v \f \r|
+
+A backslash at the end of a line joins the following line onto the current. A
+backslash preceding any other character removes special meaning. This applies
+to terminating characters and to special characters in regular expression
+literals. As an exception, regular expression literals do not support escape
+sequences as the operands of a range within a list. See the bullet on regular
+expressions in Section .ref{basic}.
+
+.item The symbols .verb|{}| delimit a block of host language code that will be
+embedded into the machine as an action. Within the block of host language
+code, basic lexical analysis of comments and strings is done in order to
+correctly find the closing brace of the block. With the exception of FSM
+commands embedded in code blocks, the entire block is preserved as is for
+identical reproduction in the output code.
+
+.item The pattern .verb|[+-]?[0-9]+| denotes an integer in decimal format.
+Integers used for specifying machines may be negative only if the alphabet type
+is signed. Integers used for specifying priorities may be positive or negative.
+
+.item The pattern .verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal
+format.
+
+.item The keywords are .verb|access|, .verb|action|, .verb|alphtype|,
+.verb|getkey|, .verb|write|, .verb|machine| and .verb|include|.
+
+.item The pattern .verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier.
+
+.comment
+.item The allowable symbols are:
+
+.verb/ ( ) ! ^ * ? + : -> - | & . , := = ; > @ $ % /\\
+.verb| >/ $/ %/ </ @/ <>/ >! $! %! <! @! <>!|\\
+.verb| >^ $^ %^ <^ @^ <>^ >~ $~ %~ <~ @~ <>~|\\
+.verb| >* $* %* <* @* <>*|
+.end comment
+
+.item Any amount of whitespace may separate tokens.
+
+.end itemize
+
+.comment
+.section Parse of an FSM Specification
+
+The following statements are possible within an FSM specification. The
+requirements for trailing semicolons loosely follow that of C.
+A block
+specifying code does not require a trailing semicolon. An expression
+statement does require a trailing semicolon.
+.end comment
+
+.section Basic Machines
+.label{basic}
+
+The basic machines are the base operands of regular language expressions. They
+are the smallest unit to which machine construction and manipulation operators
+can be applied.
+
+.itemize
+
+.item .verb|'hello'| -- Concatenation Literal. Produces a machine that matches
+the sequence of characters in the quoted string. If there are 5 characters
+there will be 6 states chained together with the characters in the string. See
+Section .ref{lexing} for information on valid escape sequences.
+
+.comment
+% GENERATE: bmconcat
+% OPT: -p
+% %%{
+% machine bmconcat;
+.verbatim
+main := 'hello';
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmconcat
+
+It is possible
+to make a concatenation literal case-insensitive by appending an .verb|i| to
+the string, for example .verb|'cmd'i|.
+
+.item .verb|"hello"| -- Identical to the single quoted version.
+
+.item .verb|[hello]| -- Or Expression. Produces a union of characters. There
+will be two states with a transition for each unique character between the two states.
+The .verb|[]| delimiters behave like the quotes of a literal string. For example,
+.verb|[ \t]| means tab or space. The .verb|or| expression supports character ranges
+with the .verb|-| symbol as a separator. The meaning of the union can be negated
+using an initial .verb|^| character as in standard regular expressions.
+See Section .ref{lexing} for information on valid escape sequences
+in .verb|or| expressions.
+
+.comment
+% GENERATE: bmor
+% OPT: -p
+% %%{
+% machine bmor;
+.verbatim
+main := [hello];
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmor
+
+.item .verb|''|, .verb|""|, and .verb|[]| -- Zero Length Machine. Produces a machine
+that matches the zero length string. Zero length machines have one state that is both
+a start state and a final state.
+
+.comment
+% GENERATE: bmnull
+% OPT: -p
+% %%{
+% machine bmnull;
+.verbatim
+main := '';
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmnull
+
+% FIXME: More on the range of values here.
+.item .verb|42| -- Numerical Literal. Produces a two state machine with one
+transition on the given number. The number may be in decimal or hexadecimal
+format and should be in the range allowed by the alphabet type. The minimum and
+maximum values permitted are defined by the host machine that Ragel is compiled
+on. For example, numbers in a .verb|short| alphabet on an i386 machine should
+be in the range .verb|-32768| to .verb|32767|.
+
+.comment
+% GENERATE: bmnum
+% %%{
+% machine bmnum;
+.verbatim
+main := 42;
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmnum
+
+.item .verb|/simple_regex/| -- Regular Expression. Regular expressions are
+parsed as a series of expressions that are concatenated together. Each
+concatenated expression
+may be a literal character, the ``any'' character specified by the .verb|.|
+symbol, or a union of characters specified by the .verb|[]| delimiters. If the
+first character of a union is .verb|^| then it matches any character not in the
+list. Within a union, a range of characters can be given by separating the first
+and last characters of the range with the .verb|-| symbol. Each
+concatenated machine may have repetition specified by following it with the
+.verb|*| symbol. The standard escape sequences described in Section
+.ref{lexing} are supported everywhere in regular expressions except as the
+operands of a range within in a list. This notation also supports the .verb|i|
+trailing option. Use it to produce case-insensitive machines, as in .verb|/GET/i|.
+
+Ragel does not support very complex regular expressions because the desired
+results can always be achieved using the more general machine construction
+operators listed in Section .ref{machconst}. The following diagram shows the
+result of compiling .verb|/ab*[c-z].*[123]/|. .verb|DEF| represents the default
+transition, which is taken if no other transition can be taken.
+
+.comment
+% GENERATE: bmregex
+% OPT: -p
+% %%{
+% machine bmregex;
+.verbatim
+main := /ab*[c-z].*[123]/;
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmregex
+
+.item .verb|'a' .. 'z'| -- Range. Produces a machine that matches any
+characters in the specified range. Allowable upper and lower bounds of the
+range are concatenation literals of length one and numerical literals. For
+example, .verb|0x10..0x20|, .verb|0..63|, and .verb|'a'..'z'| are valid ranges.
+The bounds should be in the range allowed by the alphabet type.
+
+.comment
+% GENERATE: bmrange
+% OPT: -p
+% %%{
+% machine bmrange;
+.verbatim
+main := 'a' .. 'z';
+.end verbatim
+% }%%
+% END GENERATE
+.end comment
+
+.graphic bmrange
+
+.item .verb|variable_name| -- Lookup the machine definition assigned to the
+variable name given and use an instance of it. See Section .ref{definition} for
+an important note on what it means to reference a variable name.
+
+.item .verb|builtin_machine| -- There are several built-in machines available
+for use. They are all two state machines for the purpose of matching common
+classes of characters. They are:
+
+.itemize
+
+.item .verb|any | -- Any character in the alphabet.
+
+.item .verb|ascii | -- Ascii characters. .verb|0..127|
+
+.item .verb|extend| -- Ascii extended characters. This is the range
+.verb|-128..127| for signed alphabets and the range .verb|0..255| for unsigned
+alphabets.
+
+.item .verb|alpha | -- Alphabetic characters. .verb|[A-Za-z]|
+
+.item .verb|digit | -- Digits. .verb|[0-9]|
+
+.item .verb|alnum | -- Alpha numerics. .verb|[0-9A-Za-z]|
+
+.item .verb|lower | -- Lowercase characters. .verb|[a-z]|
+
+.item .verb|upper | -- Uppercase characters. .verb|[A-Z]|
+
+.item .verb|xdigit| -- Hexadecimal digits. .verb|[0-9A-Fa-f]|
+
+.item .verb|cntrl | -- Control characters. .verb|0..31|
+
+.item .verb|graph | -- Graphical characters. .verb|[!-~]|
+
+.item .verb|print | -- Printable characters. .verb|[ -~]|
+
+.item .verb|punct | -- Punctuation. Graphical characters that are not alphanumerics.
+.verb|[!-/:-@[-`{-~]|
+
+.item .verb|space | -- Whitespace. .verb|[\t\v\f\n\r ]|
+
+.item .verb|zlen | -- Zero length string. .verb|""|
+
+.item .verb|empty | -- Empty set. Matches nothing. .verb|^any|
+
+.end itemize
+.end itemize
+
+.section Operator Precedence
+The following table shows operator precedence from lowest to highest. Operators
+in the same precedence group are evaluated from left to right.
+
+.tabular
+.row 1&.verb| , |&Join
+.row 2&.verb/ | & - --/&Union, Intersection and Subtraction
+.row 3&.verb| . <: :> :>> |&Concatenation
+.row 4&.verb| : |&Label
+.row 5&.verb| -> |&Epsilon Transition
+.row 6&.verb| > @ $ % |&Transitions Actions and Priorities
+.row 6&.verb| >/ $/ %/ </ @/ <>/ |&EOF Actions
+.row 6&.verb| >! $! %! <! @! <>! |&Global Error Actions
+.row 6&.verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions
+.row 6&.verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions
+.row 6&.verb| >* $* %* <* @* <>* |&From-State Action
+.row 7&.verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition
+.row 8&.verb| ! ^ |&Negation and Character-Level Negation
+.row 9&.verb| ( <expr> ) |&Grouping
+.end tabular
+
+.section Regular Language Operators
+.label{machconst}
+
+When using Ragel it is helpful to have a sense of how it constructs machines.
+The determinization process can produce results that seem unusual to someone
+not familiar with the NFA to DFA conversion algorithm. In this section we
+describe Ragel's state machine operators. Though the operators are defined
+using epsilon transitions, it should be noted that this is for discussion only.
+The epsilon transitions described in this section do not persist, but are
+immediately removed by the determinization process which is executed at every
+operation. Ragel does not make use of any nondeterministic intermediate state
+machines.
+
+To create an epsilon transition between two states .verb|x| and .verb|y| is to
+copy all of the properties of .verb|y| into .verb|x|. This involves drawing in
+all of .verb|y|'s to-state actions, EOF actions, etc., in addition to its
+transitions. If .verb|x| and .verb|y| both have a transition out on the same
+character, then the transitions must be combined. During transition
+combination a new transition is made that goes to a new state that is the
+combination of both target states. The new combination state is created using
+the same epsilon transition method. The new state has an epsilon transition
+drawn to all the states that compose it. Since the creation of new epsilon
+transitions may be triggered every time an epsilon transition is drawn, the
+process of drawing epsilon transitions is repeated until there are no more
+epsilon transitions to be made.
+
+A very common error that is made when using Ragel is to make machines that do
+too much. That is, to create machines that have unintentional
+nondetermistic properties. This usually results from being unaware of the common strings
+between machines that are combined together using the regular language
+operators. This can involve never leaving a machine, causing its actions to be
+propagated through all the following states. Or it can involve an alternation
+where both branches are unintentionally taken simultaneously.
+
+This problem forces one to think hard about the language that needs to be
+matched. To guard against this kind of problem one must ensure that the machine
+specification is divided up using boundaries that do not allow ambiguities from
+one portion of the machine to the next. See Chapter
+.ref{controlling-nondeterminism} for more on this problem and how to solve it.
+
+The Graphviz tool is an immense help when debugging improperly compiled
+machines or otherwise learning how to use Ragel. Graphviz Dot files can be
+generated from Ragel programs using the .verb|-V| option. See Section
+.ref{visualization} for more information.
+
+
+.subsection Union
+
+.verb/expr | expr/
+
+The union operation produces a machine that matches any string in machine one
+or machine two. The operation first creates a new start state. Epsilon
+transitions are drawn from the new start state to the start states of both
+input machines. The resulting machine has a final state set equivalent to the
+union of the final state sets of both input machines. In this operation, there
+is the opportunity for nondeterminism among both branches. If there are
+strings, or prefixes of strings that are matched by both machines then the new
+machine will follow both parts of the alternation at once. The union operation is
+shown below.
+
+.graphic opor 1.0
+
+The following example demonstrates the union of three machines representing
+common tokens.
+
+% GENERATE: exor
+% OPT: -p
+% %%{
+% machine exor;
+.code
+# Hex digits, decimal digits, or identifiers
+main := '0x' xdigit+ | digit+ | alpha alnum*;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exor
+
+.subsection Intersection
+
+.verb|expr & expr|
+
+Intersection produces a machine that matches any
+string that is in both machine one and machine two. To achieve intersection, a
+union is performed on the two machines. After the result has been made
+deterministic, any final state that is not a combination of final states from
+both machines has its final state status revoked. To complete the operation,
+paths that do not lead to a final state are pruned from the machine. Therefore,
+if there are any such paths in either of the expressions they will be removed
+by the intersection operator. Intersection can be used to require that two
+independent patterns be simultaneously satisfied as in the following example.
+
+% GENERATE: exinter
+% OPT: -p
+% %%{
+% machine exinter;
+.code
+# Match lines four characters wide that contain
+# words separated by whitespace.
+main :=
+ /[^\n][^\n][^\n][^\n]\n/* &
+ (/[a-z][a-z]*/ | [ \n])**;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exinter
+
+.subsection Difference
+
+.verb|expr - expr|
+
+The difference operation produces a machine that matches
+strings that are in machine one but are not in machine two. To achieve subtraction,
+a union is performed on the two machines. After the result has been made
+deterministic, any final state that came from machine two or is a combination
+of states involving a final state from machine two has its final state status
+revoked. As with intersection, the operation is completed by pruning any path
+that does not lead to a final state. The following example demonstrates the
+use of subtraction to exclude specific cases from a set.
+
+% GENERATE: exsubtr
+% OPT: -p
+% %%{
+% machine exsubtr;
+.code
+# Subtract keywords from identifiers.
+main := /[a-z][a-z]*/ - ( 'for' | 'int' );
+.end code
+% }%%
+% END GENERATE
+
+.graphic exsubtr
+
+.subsection Strong Difference
+.label{strong_difference}
+
+.verb|expr -- expr|
+
+Strong difference produces a machine that matches any string of the first
+machine that does not have any string of the second machine as a substring. In
+the following example, strong subtraction is used to excluded .verb|CRLF| from
+a sequence. In the corresponding visualization, the label .verb|DEF| is short
+for default. The default transition is taken if no other transition can be
+taken.
+
+% GENERATE: exstrongsubtr
+% OPT: -p
+% %%{
+% machine exstrongsubtr;
+.code
+crlf = '\r\n';
+main := [a-z]+ ':' ( any* -- crlf ) crlf;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exstrongsubtr
+
+This operator is equivalent to the following.
+
+.verbatim
+expr - ( any* expr any* )
+.end verbatim
+
+.subsection Concatenation
+
+.verb|expr . expr|
+
+Concatenation produces a machine that matches all the strings in machine one followed by all
+the strings in machine two. Concatenation draws epsilon transitions from the
+final states of the first machine to the start state of the second machine. The
+final states of the first machine lose their final state status, unless the
+start state of the second machine is final as well.
+Concatenation is the default operator. Two machines next to each other with no
+operator between them results in concatenation.
+
+.graphic opconcat 1.0
+
+The opportunity for nondeterministic behaviour results from the possibility of
+the final states of the first machine accepting a string that is also accepted
+by the start state of the second machine.
+The most common scenario in which this happens is the
+concatenation of a machine that repeats some pattern with a machine that gives
+a terminating string, but the repetition machine does not exclude the
+terminating string. The example in Section .ref{strong_difference}
+guards against this. Another example is the expression .verb|("'" any* "'")|.
+When executed the thread of control will
+never leave the .verb|any*| machine. This is a problem especially if actions
+are embedded to process the characters of the .verb|any*| component.
+
+In the following example, the first machine is always active due to the
+nondeterministic nature of concatenation. This particular nondeterminism is intended
+however because we wish to permit EOF strings before the end of the input.
+
+% GENERATE: exconcat
+% OPT: -p
+% %%{
+% machine exconcat;
+.code
+# Require an eof marker on the last line.
+main := /[^\n]*\n/* . 'EOF\n';
+.end code
+% }%%
+% END GENERATE
+
+.graphic exconcat
+
+There is a language
+ambiguity involving concatenation and subtraction. Because concatenation is the
+default operator for two
+adjacent machines there is an ambiguity between subtraction of
+a positive numerical literal and concatenation of a negative numerical literal.
+For example, .verb|(x-7)| could be interpreted as .verb|(x . -7)| or
+.verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence
+over concatenation of a negative literal. We adhere to the rule that the default
+concatenation operator takes effect only when there are no other operators between
+two machines. Beware of writing machines such as .verb|(any -1)| when what is
+desired is a concatenation of .verb|any| and .verb|-1|. Instead write
+.verb|(any . -1)| or .verb|(any (-1))|. If in doubt of the meaning of your program do not
+rely on the default concatenation operator; always use the .verb|.| symbol.
+
+
+.subsection Kleene Star
+
+.verb|expr*|
+
+The machine resulting from the Kleene Star operator will match zero or more
+repetitions of the machine it is applied to.
+It creates a new start state and an additional final
+state. Epsilon transitions are drawn between the new start state and the old start
+state, between the new start state and the new final state, and
+between the final states of the machine and the new start state. After the
+machine is made deterministic the effect is of the final states getting all the
+transitions of the start state.
+
+.graphic opstar 1.0
+
+The possibility for nondeterministic behaviour arises if the final states have
+transitions on any of the same characters as the start state. This is common
+when applying kleene star to an alternation of tokens. Like the other problems
+arising from nondeterministic behavior, this is discussed in more detail in Chapter
+.ref{controlling-nondeterminism}. This particular problem can also be solved
+by using the longest-match construction discussed in Section
+.ref{generating-scanners} on scanners.
+
+In this
+example, there is no nondeterminism introduced by the exterior kleene star due to
+the newline at the end of the regular expression. Without the newline the
+exterior kleene star would be redundant and there would be ambiguity between
+repeating the inner range of the regular expression and the entire regular
+expression. Though it would not cause a problem in this case, unnecessary
+nondeterminism in the kleene star operator often causes undesired results for
+new Ragel users and must be guarded against.
+
+% GENERATE: exstar
+% OPT: -p
+% %%{
+% machine exstar;
+.code
+# Match any number of lines with only lowercase letters.
+main := /[a-z]*\n/*;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exstar
+
+.subsection One Or More Repetition
+
+.verb|expr+|
+
+This operator produces the concatenation of the machine with the kleene star of
+itself. The result will match one or more repetitions of the machine. The plus
+operator is equivalent to .verb|(expr . expr*)|.
+
+% GENERATE: explus
+% OPT: -p
+% %%{
+% machine explus;
+.code
+# Match alpha-numeric words.
+main := alnum+;
+.end code
+% }%%
+% END GENERATE
+
+.graphic explus
+
+.subsection Optional
+
+.verb|expr?|
+
+The .em{optional} operator produces a machine that accepts the machine
+given or the zero length string. The optional operator is equivalent to
+.verb/(expr | '' )/. In the following example the optional operator is used to
+possibly extend a token.
+
+% GENERATE: exoption
+% OPT: -p
+% %%{
+% machine exoption;
+.code
+# Match integers or floats.
+main := digit+ ('.' digit+)?;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exoption
+
+.subsection Repetition
+
+.list
+.li .verb|expr {n}| -- Exactly N copies of expr.
+.li .verb|expr {,n}| -- Zero to N copies of expr.
+.li .verb|expr {n,}| -- N or more copies of expr.
+.li .verb|expr {n,m}| -- N to M copies of expr.
+.end list
+
+.subsection Negation
+
+.verb|!expr|
+
+Negation produces a machine that matches any string not matched by the given
+machine. Negation is equivalent to .verb|(any* - expr)|.
+
+% GENERATE: exnegate
+% OPT: -p
+% %%{
+% machine exnegate;
+.code
+# Accept anything but a string beginning with a digit.
+main := ! ( digit any* );
+.end code
+% }%%
+% END GENERATE
+
+.graphic exnegate
+
+.subsection Character-Level Negation
+
+.verb|^expr|
+
+Character-level negation produces a machine that matches any single character
+not matched by the given machine. Character-Level Negation is equivalent to
+.verb|(any - expr)|. It must be applied only to machines that match strings of
+length one.
+
+.section State Machine Minimization
+
+State machine minimization is the process of finding the minimal equivalent FSM accepting
+the language. Minimization reduces the number of states in machines
+by merging equivalent states. It does not change the behaviour of the machine
+in any way. It will cause some states to be merged into one because they are
+functionally equivalent. State minimization is on by default. It can be turned
+off with the .verb|-n| option.
+
+The algorithm implemented is similar to Hopcroft's state minimization
+algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in
+memory, whereas Ragel supports arbitrary integer alphabets that cannot be
+listed in memory. Though exact analysis is very difficult, Ragel minimization
+runs close to O(n * log(n)) and requires O(n) temporary storage where
+$n$ is the number of states.
+
+.section Visualization
+.label{visualization}
+
+%In many cases, practical
+%parsing programs will be too large to completely visualize with Graphviz. The
+%proper approach is to reduce the language to the smallest subset possible that
+%still exhibits the characteristics that one wishes to learn about or to fix.
+%This can be done without modifying the source code using the .verb|-M| and
+%.verb|-S| options. If a machine cannot be easily reduced,
+%embeddings of unique actions can be very useful for tracing a
+%particular component of a larger machine specification, since action names are
+%written out on transition labels.
+
+Ragel is able to emit compiled state machines in Graphviz's Dot file format.
+This is done using the .verb|-V| option.
+Graphviz support allows users to perform
+incremental visualization of their parsers. User actions are displayed on
+transition labels of the graph.
+
+If the final graph is too large to be
+meaningful, or even drawn, the user is able to inspect portions of the parser
+by naming particular regular expression definitions with the .verb|-S| and
+.verb|-M| options to the .verb|ragel| program. Use of Graphviz greatly
+improves the Ragel programming experience. It allows users to learn Ragel by
+experimentation and also to track down bugs caused by unintended
+nondeterminism.
+
+Ragel has another option to help debugging. The .verb|-x| option causes Ragel
+to emit the compiled machine in an XML format.
+
+.chapter User Actions
+
+Ragel permits the user to embed actions into the transitions of a regular
+expression's corresponding state machine. These actions are executed when the
+generated code moves over a transition. Like the regular expression operators,
+the action embedding operators are fully compositional. They take a state
+machine and an action as input, embed the action and yield a new state machine
+that can be used in the construction of other machines. Due to the
+compositional nature of embeddings, the user has complete freedom in the
+placement of actions.
+
+A machine's transitions are categorized into four classes. The action embedding
+operators access the transitions defined by these classes. The .em{entering
+transition} operator .verb|>| isolates the start state, then embeds an action
+into all transitions leaving it. The .em{finishing transition} operator
+.verb|@| embeds an action into all transitions going into a final state. The
+.em{all transition} operator .verb|$| embeds an action into all transitions of
+an expression. The .em{leaving transition} operator .verb|%| provides access
+to the yet-unmade transitions moving out of the machine via the final states.
+
+.section Embedding Actions
+
+.verbatim
+action ActionName {
+ /* Code an action here. */
+ count += 1;
+}
+.end verbatim
+
+The action statement defines a block of code that can be embedded into an FSM.
+Action names can be referenced by the action embedding operators in
+expressions. Though actions need not be named in this way (literal blocks
+of code can be embedded directly when building machines), defining reusable
+blocks of code whenever possible is good practice because it potentially increases the
+degree to which the machine can be minimized.
+
+Within an action some Ragel expressions and statements are parsed and
+translated. These allow the user to interact with the machine from action code.
+See Section .ref{vals} for a complete list of statements and values available
+in code blocks.
+
+.subsection Entering Action
+
+.verb|expr > action|
+
+The entering action operator embeds an action into all transitions
+that enter into the machine from the start state. If the start state is final,
+then the action is also embedded into the start state as a leaving action. This
+means that if a machine accepts the zero-length string and control passes
+through the start state then the entering action is executed. Note
+that this can happen on both a following character and on the EOF event.
+
+In some machines the start state has transtions coming in from within the
+machine. In these cases the start state is first isolated from the rest of the
+machine ensuring that the entering actions are exected once only.
+
+% GENERATE: exstact
+% OPT: -p
+% %%{
+% machine exstact;
+.code
+# Execute A at the beginning of a string of alpha.
+action A {}
+main := ( lower* >A ) . ' ';
+.end code
+% }%%
+% END GENERATE
+
+.graphic exstact
+
+.subsection Finishing Action
+
+.verb|expr @ action|
+
+The finishing action operator embeds an action into any transitions that move
+the machine into a final state. Further input may move the machine out of the
+final state, but keep it in the machine. Therefore finishing actions may be
+executed more than once if a machine has any internal transitions out of a
+final state. In the following example the final state has no transitions out
+and the finishing action is executed only once.
+
+% GENERATE: exdoneact
+% OPT: -p
+% %%{
+% machine exdoneact;
+% action A {}
+.code
+# Execute A when the trailing space is seen.
+main := ( lower* ' ' ) @A;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exdoneact
+
+.subsection All Transition Action
+
+.verb|expr $ action|
+
+The all transition operator embeds an action into all transitions of a machine.
+The action is executed whenever a transition of the machine is taken. In the
+following example, A is executed on every character matched.
+
+% GENERATE: exallact
+% OPT: -p
+% %%{
+% machine exallact;
+% action A {}
+.code
+# Execute A on any characters of the machine.
+main := ( 'm1' | 'm2' ) $A;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exallact
+
+.subsection Leaving Actions
+.label{out-actions}
+
+.verb|expr % action|
+
+The leaving action operator queues an action for embedding into the transitions
+that go out of a machine via a final state. The action is first stored in
+the machine's final states and is later transferred to any transitions that are
+made going out of the machine by a kleene star or concatenation operation.
+
+If a final state of the machine is still final when compilation is complete
+then the leaving action is also embedded as an EOF action. Therefore, leaving
+the machine is defined as either leaving on a character or as state machine
+acceptance.
+
+This operator allows one to associate an action with the termination of a
+sequence, without being concerned about what particular character terminates
+the sequence. In the following example, A is executed when leaving the alpha
+machine on the newline character.
+
+% GENERATE: exoutact1
+% OPT: -p
+% %%{
+% machine exoutact1;
+% action A {}
+.code
+# Match a word followed by a newline. Execute A when
+# finishing the word.
+main := ( lower+ %A ) . '\n';
+.end code
+% }%%
+% END GENERATE
+
+.graphic exoutact1
+
+In the following example, the .verb|term_word| action could be used to register
+the appearance of a word and to clear the buffer that the .verb|lower| action used
+to store the text of it.
+
+% GENERATE: exoutact2
+% OPT: -p
+% %%{
+% machine exoutact2;
+% action lower {}
+% action space {}
+% action term_word {}
+% action newline {}
+.code
+word = ( [a-z] @lower )+ %term_word;
+main := word ( ' ' @space word )* '\n' @newline;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exoutact2
+
+In this final example of the action embedding operators, A is executed upon entering
+the alpha machine, B is executed on all transitions of the
+alpha machine, C is executed when the alpha machine is exited by moving into the
+newline machine and N is executed when the newline machine moves into a final
+state.
+
+% GENERATE: exaction
+% OPT: -p
+% %%{
+% machine exaction;
+% action A {}
+% action B {}
+% action C {}
+% action N {}
+.code
+# Execute A on starting the alpha machine, B on every transition
+# moving through it and C upon finishing. Execute N on the newline.
+main := ( lower* >A $B %C ) . '\n' @N;
+.end code
+% }%%
+% END GENERATE
+
+.graphic exaction
+
+
+.section State Action Embedding Operators
+
+The state embedding operators allow one to embed actions into states. Like the
+transition embedding operators, there are several different classes of states
+that the operators access. The meanings of the symbols are similar to the
+meanings of the symbols used for the transition embedding operators. The design
+of the state selections was driven by a need to cover the states of an
+expression with exactly one error action.
+
+Unlike the transition embedding operators, the state embedding operators are
+also distinguished by the different kinds of events that embedded actions can
+be associated with. Therefore the state embedding operators have two
+components. The first, which is the first one or two characters, specifies the
+class of states that the action will be embedded into. The second component
+specifies the type of event the action will be executed on. The symbols of the
+second component also have equivalent kewords.
+
+.multicols
+The different classes of states are:
+
+.list
+.li .verb|> | -- the start state
+.li .verb|< | -- any state except the start state
+.li .verb|$ | -- all states
+.li .verb|% | -- final states
+.li .verb|@ | -- any state except final states
+.li .verb|<>| -- any except start and final (middle)
+.end list
+
+.columnbreak
+
+The different kinds of embeddings are:
+
+.list
+.li .verb|~| -- to-state actions (.verb|to|)
+.li .verb|*| -- from-state actions (.verb|from|)
+.li .verb|/| -- EOF actions (.verb|eof|)
+.li .verb|!| -- error actions (.verb|err|)
+.li .verb|^| -- local error actions (.verb|lerr|)
+.end list
+
+.end multicols
+
+.subsection To-State and From-State Actions
+
+.subsubsection To-State Actions
+
+.list
+.li .verb|>~action >to(name) >to{...} | -- the start state
+.li .verb|<~action <to(name) <to{...} | -- any state except the start state
+.li .verb|$~action $to(name) $to{...} | -- all states
+.li .verb|%~action %to(name) %to{...} | -- final states
+.li .verb|@~action @to(name) @to{...} | -- any state except final states
+.li .verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle)
+.end list
+
+
+To-state actions are executed whenever the state machine moves into the
+specified state, either by a natural movement over a transition or by an
+action-based transfer of control such as .verb|fgoto|. They are executed after the
+in-transition's actions but before the current character is advanced and
+tested against the end of the input block. To-state embeddings stay with the
+state. They are irrespective of the state's current set of transitions and any
+future transitions that may be added in or out of the state.
+
+Note that the setting of the current state variable .verb|cs| outside of the
+execute code is not considered by Ragel as moving into a state and consequently
+the to-state actions of the new current state are not executed. This includes
+the initialization of the current state when the machine begins. This is
+because the entry point into the machine execution code is after the execution
+of to-state actions.
+
+.subsubsection From-State Actions
+
+.list
+.li .verb|>*action >from(name) >from{...} | -- the start state
+.li .verb|<*action <from(name) <from{...} | -- any state except the start state
+.li .verb|$*action $from(name) $from{...} | -- all states
+.li .verb|%*action %from(name) %from{...} | -- final states
+.li .verb|@*action @from(name) @from{...} | -- any state except final states
+.li .verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle)
+.end list
+
+From-state actions are executed whenever the state machine takes a transition from a
+state, either to itself or to some other state. These actions are executed
+immediately after the current character is tested against the input block end
+marker and before the transition to take is sought based on the current
+character. From-state actions are therefore executed even if a transition
+cannot be found and the machine moves into the error state. Like to-state
+embeddings, from-state embeddings stay with the state.
+
+.subsection EOF Actions
+
+.list
+.li .verb|>/action >eof(name) >eof{...} | -- the start state
+.li .verb|</action <eof(name) <eof{...} | -- any state except the start state
+.li .verb|$/action $eof(name) $eof{...} | -- all states
+.li .verb|%/action %eof(name) %eof{...} | -- final states
+.li .verb|@/action @eof(name) @eof{...} | -- any state except final states
+.li .verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle)
+.end list
+
+The EOF action embedding operators enable the user to embed actions that are
+executed at the end of the input stream. EOF actions are stored in states and
+generated in the .verb|write exec| block. They are run when .verb|p == pe == eof|
+as the execute block is finishing. EOF actions are free to adjust .verb|p| and
+jump to another part of the machine to restart execution.
+
+.subsection Handling Errors
+
+In many applications it is useful to be able to react to parsing errors. The
+user may wish to print an error message that depends on the context. It
+may also be desirable to consume input in an attempt to return the input stream
+to some known state and resume parsing. To support error handling and recovery,
+Ragel provides error action embedding operators. There are two kinds of error
+actions: global error actions and local error actions.
+Error actions can be used to simply report errors, or by jumping to a machine
+instantiation that consumes input, can attempt to recover from errors.
+
+.subsubsection Global Error Actions
+
+.list
+.li .verb|>!action >err(name) >err{...} | -- the start state
+.li .verb|<!action <err(name) <err{...} | -- any state except the start state
+.li .verb|$!action $err(name) $err{...} | -- all states
+.li .verb|%!action %err(name) %err{...} | -- final states
+.li .verb|@!action @err(name) @err{...} | -- any state except final states
+.li .verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle)
+.end list
+
+Global error actions are stored in the states they are embedded into until
+compilation is complete. They are then transferred to the transitions that move
+into the error state. These transitions are taken on all input characters that
+are not already covered by the state's transitions. If a state with an error
+action is not final when compilation is complete, then the action is also
+embedded as an EOF action.
+
+Error actions can be used to recover from errors by jumping back into the
+machine with .verb|fgoto| and optionally altering .verb|p|.
+
+.subsubsection Local Error Actions
+
+.list
+.li .verb|>^action >lerr(name) >lerr{...} | -- the start state
+.li .verb|<^action <lerr(name) <lerr{...} | -- any state except the start state
+.li .verb|$^action $lerr(name) $lerr{...} | -- all states
+.li .verb|%^action %lerr(name) %lerr{...} | -- final states
+.li .verb|@^action @lerr(name) @lerr{...} | -- any state except final states
+.li .verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle)
+.end list
+
+Like global error actions, local error actions are also stored in the states
+they are embedded into until a transfer point. The transfer point is different
+however. Each local error action embedding is associated with a name. When a
+machine definition has been fully constructed, all local error action
+embeddings associated with the same name as the machine definition are
+transferred to the error transitions. At this time they are also embedded as
+EOF actions in the case of non-final states.
+
+Local error actions can be used to specify an action to take when a particular
+section of a larger state machine fails to match. A particular machine
+definition's ``thread'' may die and the local error actions executed, however
+the machine as a whole may continue to match input.
+
+There are two forms of local error action embeddings. In the first form the
+name defaults to the current machine. In the second form the machine name can
+be specified. This is useful when it is more convenient to specify the local
+error action in a sub-definition that is used to construct the machine
+definition that the local error action is associated with. To embed local
+error actions and
+explicitly state the machine definition on which the transfer is to happen use
+.verb|(name, action)| as the action.
+
+.subsubsection Example
+
+The following example uses error actions to report an error and jump to a
+machine that consumes the remainder of the line when parsing fails. After
+consuming the line, the error recovery machine returns to the main loop.
+
+% GENERATE: erract
+% %%{
+% machine erract;
+% ws = ' ';
+% address = 'foo AT bar..com';
+% date = 'Monday May 12';
+.code
+action cmd_err {
+ printf( "command error\n" );
+ fhold; fgoto line;
+}
+action from_err {
+ printf( "from error\n" );
+ fhold; fgoto line;
+}
+action to_err {
+ printf( "to error\n" );
+ fhold; fgoto line;
+}
+
+line := [^\n]* '\n' @{ fgoto main; };
+
+main := (
+ (
+ 'from' @err(cmd_err)
+ ( ws+ address ws+ date '\n' ) $err(from_err) |
+ 'to' @err(cmd_err)
+ ( ws+ address '\n' ) $err(to_err)
+ )
+)*;
+.end code
+% }%%
+% %% write data;
+% void f()
+% {
+% %% write init;
+% %% write exec;
+% }
+% END GENERATE
+
+
+
+.section Action Ordering and Duplicates
+
+When combining expressions that have embedded actions it is often the case that
+a number of actions must be executed on a single input character. For example,
+following a concatenation the leaving action of the left expression and the
+entering action of the right expression will be embedded into one transition.
+This requires a method of ordering actions that is intuitive and
+predictable for the user, and repeatable for the compiler.
+
+We associate with the embedding of each action a unique timestamp that is
+used to order actions that appear together on a single transition in the final
+state machine. To accomplish this we recursively traverse the parse tree of
+regular expressions and assign timestamps to action embeddings. References to
+machine definitions are followed in the traversal. When we visit a
+parse tree node we assign timestamps to all .em{entering} action embeddings,
+recurse on the parse tree, then assign timestamps to the remaining .em{all},
+.em{finishing}, and .em{leaving} embeddings in the order in which they
+appear.
+
+By default Ragel does not permit a single action to appear multiple times in an action
+list. When the final machine has been created, actions that appear more than
+once in a single transition, to-state, from-state or EOF action list have their
+duplicates removed.
+The first appearance of the action is preserved. This is useful in a number of
+scenarios. First, it allows us to union machines with common prefixes without
+worrying about the action embeddings in the prefix being duplicated. Second, it
+prevents leaving actions from being transferred multiple times. This can
+happen when a machine is repeated, then followed with another machine that
+begins with a common character. For example:
+
+.verbatim
+word = [a-z]+ %act;
+main := word ( '\n' word )* '\n\n';
+.end verbatim
+
+Note that Ragel does not compare action bodies to determine if they have
+identical program text. It simply checks for duplicates using each action
+block's unique location in the program.
+
+The removal of duplicates can be turned off using the .verb|-d| option.
+
+.section Values and Statements Available in Code Blocks
+.label{vals}
+
+The following values are available in code blocks:
+
+.itemize
+.item .verb|fpc| -- A pointer to the current character. This is equivalent to
+accessing the .verb|p| variable.
+
+.item .verb|fc| -- The current character. This is equivalent to the expression .verb|(*p)|.
+
+.item .verb|fcurs| -- An integer value representing the current state. This
+value should only be read from. To move to a different place in the machine
+from action code use the .verb|fgoto|, .verb|fnext| or .verb|fcall| statements.
+Outside of the machine execution code the .verb|cs| variable may be modified.
+
+.item .verb|ftargs| -- An integer value representing the target state. This
+value should only be read from. Again, .verb|fgoto|, .verb|fnext| and
+.verb|fcall| can be used to move to a specific entry point.
+
+.item .verb|fentry(<label>)| -- Retrieve an integer value representing the
+entry point .verb|label|. The integer value returned will be a compile time
+constant. This number is suitable for later use in control flow transfer
+statements that take an expression. This value should not be compared against
+the current state because any given label can have multiple states representing
+it. The value returned by .verb|fentry| can be any one of the multiple states that
+it represents.
+.end itemize
+
+The following statements are available in code blocks:
+
+.itemize
+
+.item .verb|fhold;| -- Do not advance over the current character. If processing
+data in multiple buffer blocks, the .verb|fhold| statement should only be used
+once in the set of actions executed on a character. Multiple calls may result
+in backing up over the beginning of the buffer block. The .verb|fhold|
+statement does not imply any transfer of control. It is equivalent to the
+.verb|p--;| statement.
+
+.item .verb|fexec <expr>;| -- Set the next character to process. This can be
+used to backtrack to previous input or advance ahead.
+Unlike .verb|fhold|, which can be used
+anywhere, .verb|fexec| requires the user to ensure that the target of the
+backtrack is in the current buffer block or is known to be somewhere ahead of
+it. The machine will continue iterating forward until .verb|pe| is arrived at,
+.verb|fbreak| is called or the machine moves into the error state. In actions
+embedded into transitions, the .verb|fexec| statement is equivalent to setting
+.verb|p| to one position ahead of the next character to process. If the user
+also modifies .verb|pe|, it is possible to change the buffer block entirely.
+
+.item .verb|fgoto <label>;| -- Jump to an entry point defined by
+.verb|<label>|. The .verb|fgoto| statement immediately transfers control to
+the destination state.
+
+.item .verb|fgoto *<expr>;| -- Jump to an entry point given by .verb|<expr>|.
+The expression must evaluate to an integer value representing a state.
+
+.item .verb|fnext <label>;| -- Set the next state to be the entry point defined
+by .verb|label|. The .verb|fnext| statement does not immediately jump to the
+specified state. Any action code following the statement is executed.
+
+.item .verb|fnext *<expr>;| -- Set the next state to be the entry point given
+by .verb|<expr>|. The expression must evaluate to an integer value representing
+a state.
+
+.item .verb|fcall <label>;| -- Push the target state and jump to the entry
+point defined by .verb|<label>|. The next .verb|fret| will jump to the target
+of the transition on which the call was made. Use of .verb|fcall| requires
+the declaration of a call stack. An array of integers named .verb|stack| and a
+single integer named .verb|top| must be declared. With the .verb|fcall|
+construct, control is immediately transferred to the destination state.
+See section .ref{modularization} for more information.
+
+.item .verb|fcall *<expr>;| -- Push the current state and jump to the entry
+point given by .verb|<expr>|. The expression must evaluate to an integer value
+representing a state.
+
+.item .verb|fret;| -- Return to the target state of the transition on which the
+last .verb|fcall| was made. Use of .verb|fret| requires the declaration of a
+call stack. Control is immediately transferred to the destination state.
+
+.item .verb|fbreak;| -- Advance .verb|p|, save the target state to .verb|cs|
+and immediately break out of the execute loop. This statement is useful
+in conjunction with the .verb|noend| write option. Rather than process input
+until .verb|pe| is arrived at, the fbreak statement
+can be used to stop processing from an action. After an .verb|fbreak|
+statement the .verb|p| variable will point to the next character in the input. The
+current state will be the target of the current transition. Note that .verb|fbreak|
+causes the target state's to-state actions to be skipped.
+
+.end itemize
+
+Once actions with control-flow commands are embedded into a
+machine, the user must exercise caution when using the machine as the operand
+to other machine construction operators. If an action jumps to another state
+then unioning any transition that executes that action with another transition
+that follows some other path will cause that other path to be lost. Using
+commands that manually jump around a machine takes us out of the domain of
+regular languages because transitions that the
+machine construction operators are not aware of are introduced. These
+commands should therefore be used with caution.
+
+
+.chapter Controlling Nondeterminism
+.label{controlling-nondeterminism}
+
+Along with the flexibility of arbitrary action embeddings comes a need to
+control nondeterminism in regular expressions. If a regular expression is
+ambiguous, then sub-components of a parser other than the intended parts may become
+active. This means that actions that are irrelevant to the
+current subset of the parser may be executed, causing problems for the
+programmer.
+
+Tools that are based on regular expression engines and that are used for
+recognition tasks will usually function as intended regardless of the presence
+of ambiguities. It is quite common for users of scripting languages to write
+regular expressions that are heavily ambiguous and it generally does not
+matter. As long as one of the potential matches is recognized, there can be any
+number of other matches present. In some parsing systems the run-time engine
+can employ a strategy for resolving ambiguities, for example always pursuing
+the longest possible match and discarding others.
+
+In Ragel, there is no regular expression run-time engine, just a simple state
+machine execution model. When we begin to embed actions and face the
+possibility of spurious action execution, it becomes clear that controlling
+nondeterminism at the machine construction level is very important. Consider
+the following example.
+
+% GENERATE: lines1
+% OPT: -p
+% %%{
+% machine lines1;
+% action first {}
+% action tail {}
+% word = [a-z]+;
+.code
+ws = [\n\t ];
+line = word $first ( ws word $tail )* '\n';
+lines = line*;
+.end code
+% main := lines;
+% }%%
+% END GENERATE
+
+.graphic lines1 0.53
+
+Since the .verb|ws| expression includes the newline character, we will
+not finish the .verb|line| expression when a newline character is seen. We will
+simultaneously pursue the possibility of matching further words on the same
+line and the possibility of matching a second line. Evidence of this fact is
+in the state tables. On several transitions both the .verb|first| and
+.verb|tail| actions are executed. The solution here is simple: exclude
+the newline character from the .verb|ws| expression.
+
+% GENERATE: lines2
+% OPT: -p
+% %%{
+% machine lines2;
+% action first {}
+% action tail {}
+% word = [a-z]+;
+.code
+ws = [\t ];
+line = word $first ( ws word $tail )* '\n';
+lines = line*;
+.end code
+% main := lines;
+% }%%
+% END GENERATE
+
+.graphic lines2
+
+Solving this kind of problem is straightforward when the ambiguity is created
+by strings that are a single character long. When the ambiguity is created by
+strings that are multiple characters long we have a more difficult problem.
+The following example is an incorrect attempt at a regular expression for C
+language comments.
+
+% GENERATE: comments1
+% OPT: -p
+% %%{
+% machine comments1;
+% action comm {}
+.code
+comment = '/*' ( any @comm )* '*/';
+main := comment ' ';
+.end code
+% }%%
+% END GENERATE
+
+.graphic comments1
+
+Using standard concatenation, we will never leave the .verb|any*| expression.
+We will forever entertain the possibility that a .verb|'*/'| string that we see
+is contained in a longer comment and that, simultaneously, the comment has
+ended. The concatenation of the .verb|comment| machine with .verb|SP| is done
+to show this. When we match space, we are also still matching the comment body.
+
+One way to approach the problem is to exclude the terminating string
+from the .verb|any*| expression using set difference. We must be careful to
+exclude not just the terminating string, but any string that contains it as a
+substring. A verbose, but proper specification of a C comment parser is given
+by the following regular expression.
+
+% GENERATE: comments2
+% OPT: -p
+% %%{
+% machine comments2;
+% action comm {}
+.code
+comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/';
+.end code
+% main := comment;
+% }%%
+% END GENERATE
+
+.graphic comments2
+
+Note that Ragel's strong subtraction operator .verb|--| can also be used here.
+In doing this subtraction we have phrased the problem of controlling non-determinism in
+terms of excluding strings common to two expressions that interact when
+combined.
+We can also phrase the problem in terms of the transitions of the state
+machines that implement these expressions. During the concatenation of
+.verb|any*| and .verb|'*/'| we will be making transitions that are composed of
+both the loop of the first expression and the final character of the second.
+At this time we want the transition on the .verb|'/'| character to take precedence
+over and disallow the transition that originated in the .verb|any*| loop.
+
+In another parsing problem, we wish to implement a lightweight tokenizer that we can
+utilize in the composition of a larger machine. For example, some HTTP headers
+have a token stream as a sub-language. The following example is an attempt
+at a regular expression-based tokenizer that does not function correctly due to
+unintended nondeterminism.
+
+% GENERATE: smallscanner
+% OPT: -p
+% %%{
+% machine smallscanner;
+% action start_str {}
+% action on_char {}
+% action finish_str {}
+.code
+header_contents = (
+ lower+ >start_str $on_char %finish_str |
+ ' '
+)*;
+.end code
+% main := header_contents;
+% }%%
+% END GENERATE
+
+.graphic smallscanner
+
+In this case, the problem with using a standard kleene star operation is that
+there is an ambiguity between extending a token and wrapping around the machine
+to begin a new token. Using the standard operator, we get an undesirable
+nondeterministic behaviour. Evidence of this can be seen on the transition out
+of state one to itself. The transition extends the string, and simultaneously,
+finishes the string only to immediately begin a new one. What is required is
+for the
+transitions that represent an extension of a token to take precedence over the
+transitions that represent the beginning of a new token. For this problem
+there is no simple solution that uses standard regular expression operators.
+
+.section Priorities
+
+A priority mechanism was devised and built into the determinization
+process, specifically for the purpose of allowing the user to control
+nondeterminism. Priorities are integer values embedded into transitions. When
+the determinization process is combining transitions that have different
+priorities, the transition with the higher priority is preserved and the
+transition with the lower priority is dropped.
+
+Unfortunately, priorities can have unintended side effects because their
+operation requires that they linger in transitions indefinitely. They must linger
+because the Ragel program cannot know when the user is finished with a priority
+embedding. A solution whereby they are explicitly deleted after use is
+conceivable; however this is not very user-friendly. Priorities were therefore
+made into named entities. Only priorities with the same name are allowed to
+interact. This allows any number of priorities to coexist in one machine for
+the purpose of controlling various different regular expression operations and
+eliminates the need to ever delete them. Such a scheme allows the user to
+choose a unique name, embed two different priority values using that name
+and be confident that the priority embedding will be free of any side effects.
+
+In the first form of priority embedding the name defaults to the name of the machine
+definition that the priority is assigned in. In this sense priorities are by
+default local to the current machine definition or instantiation. Beware of
+using this form in a longest-match machine, since there is only one name for
+the entire set of longest match patterns. In the second form the priority's
+name can be specified, allowing priority interaction across machine definition
+boundaries.
+
+.itemize
+.item .verb|expr > int| -- Sets starting transitions to have priority int.
+.item .verb|expr @ int| -- Sets transitions that go into a final state to have priority int.
+.item .verb|expr $ int| -- Sets all transitions to have priority int.
+.item .verb|expr % int| -- Sets leaving transitions to
+have priority int. When a transition is made going out of the machine (either
+by concatenation or kleene star) its priority is immediately set to the
+leaving priority.
+.end itemize
+
+The second form of priority assignment allows the programmer to specify the name
+to which the priority is assigned.
+
+.itemize
+.item .verb|expr > (name, int)| -- Starting transitions.
+.item .verb|expr @ (name, int)| -- Finishing transitions (into a final state).
+.item .verb|expr $ (name, int)| -- All transitions.
+.item .verb|expr % (name, int)| -- Leaving transitions.
+.end itemize
+
+.section Guarded Operators that Encapsulate Priorities
+
+Priority embeddings are a very expressive mechanism. At the same time they
+can be very confusing for the user. They force the user to imagine
+the transitions inside two interacting expressions and work out the precise
+effects of the operations between them. When we consider
+that this problem is worsened by the
+potential for side effects caused by unintended priority name collisions, we
+see that exposing the user to priorities is undesirable.
+
+Fortunately, in practice the use of priorities has been necessary only in a
+small number of scenarios. This allows us to encapsulate their functionality
+into a small set of operators and fully hide them from the user. This is
+advantageous from a language design point of view because it greatly simplifies
+the design.
+
+Going back to the C comment example, we can now properly specify
+it using a guarded concatenation operator which we call .em{finish-guarded
+concatenation}. From the user's point of view, this operator terminates the
+first machine when the second machine moves into a final state. It chooses a
+unique name and uses it to embed a low priority into all
+transitions of the first machine. A higher priority is then embedded into the
+transitions of the second machine that enter into a final state. The following
+example yields a machine identical to the example in Section
+.ref{controlling-nondeterminism}.
+
+.code
+comment = '/*' ( any @comm )* :>> '*/';
+.end code
+
+.graphic comments2
+
+Another guarded operator is .em{left-guarded concatenation}, given by the
+.verb|<:| compound symbol. This operator places a higher priority on all
+transitions of the first machine. This is useful if one must forcibly separate
+two lists that contain common elements. For example, one may need to tokenize a
+stream, but first consume leading whitespace.
+
+Ragel also includes a .em{longest-match kleene star} operator, given by the
+.verb|**| compound symbol. This
+guarded operator embeds a high
+priority into all transitions of the machine.
+A lower priority is then embedded into the leaving transitions. When the
+kleene star operator makes the epsilon transitions from
+the final states into the new start state, the lower priority will be transferred
+to the epsilon transitions. In cases where following an epsilon transition
+out of a final state conflicts with an existing transition out of a final
+state, the epsilon transition will be dropped.
+
+Other guarded operators are conceivable, such as guards on union that cause one
+alternative to take precedence over another. These may be implemented when it
+is clear they constitute a frequently used operation.
+In the next section we discuss the explicit specification of state machines
+using state charts.
+
+.subsection Entry-Guarded Concatenation
+
+.verb|expr :> expr|
+
+This operator concatenates two machines, but first assigns a low
+priority to all transitions
+of the first machine and a high priority to the starting transitions of the
+second machine. This operator is useful if from the final states of the first
+machine it is possible to accept the characters in the entering transitions of
+the second machine. This operator effectively terminates the first machine
+immediately upon starting the second machine, where otherwise they would be
+pursued concurrently. In the following example, entry-guarded concatenation is
+used to move out of a machine that matches everything at the first sign of an
+end-of-input marker.
+
+% GENERATE: entryguard
+% OPT: -p
+% %%{
+% machine entryguard;
+.code
+# Leave the catch-all machine on the first character of FIN.
+main := any* :> 'FIN';
+.end code
+% }%%
+% END GENERATE
+
+.graphic entryguard
+
+Entry-guarded concatenation is equivalent to the following:
+
+.verbatim
+expr $(unique_name,0) . expr >(unique_name,1)
+.end verbatim
+
+.subsection Finish-Guarded Concatenation
+
+.verb|expr :>> expr|
+
+This operator is
+like the previous operator, except the higher priority is placed on the final
+transitions of the second machine. This is useful if one wishes to entertain
+the possibility of continuing to match the first machine right up until the
+second machine enters a final state. In other words it terminates the first
+machine only when the second accepts. In the following example, finish-guarded
+concatenation causes the move out of the machine that matches everything to be
+delayed until the full end-of-input marker has been matched.
+
+% GENERATE: finguard
+% OPT: -p
+% %%{
+% machine finguard;
+.code
+# Leave the catch-all machine on the last character of FIN.
+main := any* :>> 'FIN';
+.end code
+% }%%
+% END GENERATE
+
+.graphic finguard
+
+Finish-guarded concatenation is equivalent to the following, with one
+exception. If the right machine's start state is final, the higher priority is
+also embedded into it as a leaving priority. This prevents the left machine
+from persisting via the zero-length string.
+
+.verbatim
+expr $(unique_name,0) . expr @(unique_name,1)
+.end verbatim
+
+.subsection Left-Guarded Concatenation
+
+.verb|expr <: expr|
+
+This operator places
+a higher priority on the left expression. It is useful if you want to prefix a
+sequence with another sequence composed of some of the same characters. For
+example, one can consume leading whitespace before tokenizing a sequence of
+whitespace-separated words as in:
+
+% GENERATE: leftguard
+% OPT: -p
+% %%{
+% machine leftguard;
+% action alpha {}
+% action ws {}
+% action start {}
+% action fin {}
+.code
+main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*;
+.end code
+% }%%
+% END GENERATE
+
+.graphic leftguard
+
+Left-guarded concatenation is equivalent to the following:
+
+.verbatim
+expr $(unique_name,1) . expr >(unique_name,0)
+.end verbatim
+
+.subsection Longest-Match Kleene Star
+.label{longest_match_kleene_star}
+
+.verb|expr**|
+
+This version of kleene star puts a higher priority on staying in the
+machine versus wrapping around and starting over. The LM kleene star is useful
+when writing simple tokenizers. These machines are built by applying the
+longest-match kleene star to an alternation of token patterns, as in the
+following.
+
+% GENERATE: lmkleene
+% OPT: -p
+% %%{
+% machine exfinpri;
+% action A {}
+% action B {}
+.code
+# Repeat tokens, but make sure to get the longest match.
+main := (
+ lower ( lower | digit )* %A |
+ digit+ %B |
+ ' '
+)**;
+.end code
+% }%%
+% END GENERATE
+
+.graphic lmkleene
+
+If a regular kleene star were used the machine above would not be able to
+distinguish between extending a word and beginning a new one. This operator is
+equivalent to:
+
+.verbatim
+( expr $(unique_name,1) %(unique_name,0) )*
+.end verbatim
+
+When the kleene star is applied, transitions that go out of the machine and
+back into it are made. These are assigned a priority of zero by the leaving
+transition mechanism. This is less than the priority of one assigned to the
+transitions leaving the final states but not leaving the machine. When
+these transitions clash on the same character, the
+transition that stays in the machine takes precedence. The transition
+that wraps around is dropped.
+
+Note that this operator does not build a scanner in the traditional sense
+because there is never any backtracking. To build a scanner with backtracking
+use the Longest-Match machine construction described in Section
+.ref{generating-scanners}.
+
+.chapter Interface to Host Program
+
+The Ragel code generator is very flexible. The generated code has no
+dependencies and can be inserted in any function, perhaps inside a loop if
+desired. The user is responsible for declaring and initializing a number of
+required variables, including the current state and the pointer to the input
+stream. These can live in any scope. Control of the input processing loop is
+also possible: the user may break out of the processing loop and return to it
+at any time.
+
+In the case of the C, D, and Go host languages, Ragel is able to generate very
+fast-running code that implements state machines as directly executable code.
+Since very large files strain the host language compiler, table-based code
+generation is also supported. In the future we hope to provide a partitioned,
+directly executable format that is able to reduce the burden on the host
+compiler by splitting large machines across multiple functions.
+
+In the case of Java and Ruby, table-based code generation is the only code
+style supported. In the future this may be expanded to include other code
+styles.
+
+Ragel can be used to parse input in one block, or it can be used to parse input
+in a sequence of blocks as it arrives from a file or socket. Parsing the input
+in a sequence of blocks brings with it a few responsibilities. If the parser
+utilizes a scanner, care must be taken to not break the input stream anywhere
+but token boundaries. If pointers to the input stream are taken during
+parsing, care must be taken to not use a pointer that has been invalidated by
+movement to a subsequent block. If the current input data pointer is moved
+backwards it must not be moved past the beginning of the current block.
+
+Figure .ref{basic-example} shows a simple Ragel program that does not have any
+actions. The example tests the first argument of the program against a number
+pattern and then prints the machine's acceptance status.
+
+.figure basic-example
+.verbatim
+#include <stdio.h>
+#include <string.h>
+%%{
+ machine foo;
+ write data;
+}%%
+int main( int argc, char **argv )
+{
+ int cs;
+ if ( argc > 1 ) {
+ char *p = argv[1];
+ char *pe = p + strlen( p );
+ %%{
+ main := [0-9]+ ( '.' [0-9]+ )?;
+
+ write init;
+ write exec;
+ }%%
+ }
+ printf("result = %i\n", cs >= foo_first_final );
+ return 0;
+}
+.end verbatim
+.caption A basic Ragel example without any actions.
+.end figure
+
+.section Variables Used by Ragel
+
+There are a number of variables that Ragel expects the user to declare. At a
+very minimum the .verb|cs|, .verb|p| and .verb|pe| variables must be declared.
+In Go, Java and Ruby code the .verb|data| variable must also be declared. If
+EOF actions are used then the .verb|eof| variable is required. If
+stack-based state machine control flow statements are used then the
+.verb|stack| and .verb|top| variables are required. If a scanner is declared
+then the .verb|act|, .verb|ts| and .verb|te| variables must be
+declared.
+
+.itemize
+
+.item .verb|cs| - Current state. This must be an integer and it should persist
+across invocations of the machine when the data is broken into blocks that are
+processed independently. This variable may be modified from outside the
+execution loop, but not from within.
+
+.item .verb|p| - Data pointer. In C/D code this variable is expected to be a
+pointer to the character data to process. It should be initialized to the
+beginning of the data block on every run of the machine. In Go, Java and Ruby it is
+used as an offset to .verb|data| and must be an integer. In this case it should
+be initialized to zero on every run of the machine.
+
+.item .verb|pe| - Data end pointer. This should be initialized to .verb|p| plus
+the data length on every run of the machine. In Go, Java and Ruby code this should
+be initialized to the data length.
+
+.item .verb|eof| - End of file pointer. This should be set to .verb|pe| when
+the buffer block being processed is the last one, otherwise it should be set to
+null. In Go, Java and Ruby code .verb|-1| must be used instead of null. If the EOF
+event can be known only after the final buffer block has been processed, then
+it is possible to set .verb|p = pe = eof| and run the execute block.
+
+.item .verb|data| - This variable is only required in Go, Java and Ruby code. It
+must be an array containting the data to process.
+
+.item .verb|stack| - This must be an array of integers. It is used to store
+integer values representing states. If the stack must resize dynamically the
+Pre-push and Post-Pop statements can be used to do this (Sections
+.ref{prepush} and .ref{postpop}).
+
+.item .verb|top| - This must be an integer value and will be used as an offset
+to .verb|stack|, giving the next available spot on the top of the stack.
+
+.item .verb|act| - This must be an integer value. It is a variable sometimes
+used by scanner code to keep track of the most recent successful pattern match.
+
+.item .verb|ts| - This must be a pointer to character data. In Go, Java and
+Ruby code this must be an integer. See Section .ref{generating-scanners} for
+more information.
+
+.item .verb|te| - Also a pointer to character data.
+
+.end itemize
+
+.section Alphtype Statement
+
+.verbatim
+alphtype unsigned int;
+.end verbatim
+
+The alphtype statement specifies the alphabet data type that the machine
+operates on. During the compilation of the machine, integer literals are
+expected to be in the range of possible values of the alphtype. The default
+is .verb|char| for all languages except Go where the default is .verb|byte|.
+
+.multicols
+C/C++/Objective-C:
+.verbatim
+ char unsigned char
+ short unsigned short
+ int unsigned int
+ long unsigned long
+.end verbatim
+
+Go:
+.verbatim
+ byte
+ int8 uint8
+ int16 uint16
+ int32 uint32
+ int
+.end verbatim
+
+Ruby:
+.verbatim
+ char
+ int
+.end verbatim
+
+.columnbreak
+
+Java:
+.verbatim
+ char
+ byte
+ short
+ int
+.end verbatim
+
+D:
+.verbatim
+ char
+ byte ubyte
+ short ushort
+ wchar
+ int uint
+ dchar
+.end verbatim
+
+.end multicols
+
+.section Getkey Statement
+
+.verbatim
+getkey fpc->id;
+.end verbatim
+
+This statement specifies to Ragel how to retrieve the current character from
+from the pointer to the current element (.verb|p|). Any expression that returns
+a value of the alphabet type
+may be used. The getkey statement may be used for looking into element
+structures or for translating the character to process. The getkey expression
+defaults to .verb|(*p)|. In goto-driven machines the getkey expression may be
+evaluated more than once per element processed, therefore it should not incur a
+large cost nor preclude optimization.
+
+.section Access Statement
+
+.verbatim
+access fsm->;
+.end verbatim
+
+The access statement specifies how the generated code should
+access the machine data that is persistent across processing buffer blocks.
+This applies to all variables except .verb|p|, .verb|pe| and .verb|eof|. This includes
+.verb|cs|, .verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|.
+The access statement is useful if a machine is to be encapsulated inside a
+structure in C code. It can be used to give the name of
+a pointer to the structure.
+
+.section Variable Statement
+
+.verbatim
+variable p fsm->p;
+.end verbatim
+
+The variable statement specifies how to access a specific
+variable. All of the variables that are declared by the user and
+used by Ragel can be changed. This includes .verb|p|, .verb|pe|, .verb|eof|, .verb|cs|,
+.verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|.
+In Go, Ruby and Java code generation the .verb|data| variable can also be changed.
+
+.section Pre-Push Statement
+.label{prepush}
+
+.verbatim
+prepush {
+ /* stack growing code */
+}
+.end verbatim
+
+The prepush statement allows the user to supply stack management code that is
+written out during the generation of fcall, immediately before the current
+state is pushed to the stack. This statement can be used to test the number of
+available spaces and dynamically grow the stack if necessary.
+
+.section Post-Pop Statement
+.label{postpop}
+
+.verbatim
+postpop {
+ /* stack shrinking code */
+}
+.end verbatim
+
+The postpop statement allows the user to supply stack management code that is
+written out during the generation of fret, immediately after the next state is
+popped from the stack. This statement can be used to dynamically shrink the
+stack.
+
+.section Write Statement
+.label{write-statement}
+
+.verbatim
+write <component> [options];
+.end verbatim
+
+The write statement is used to generate parts of the machine.
+There are seven
+components that can be generated by a write statement. These components make up the
+state machine's data, initialization code, execution code, and export definitions.
+A write statement may appear before a machine is fully defined.
+This allows one to write out the data first then later define the machine where
+it is used. An example of this is shown in Figure .ref{fbreak-example}.
+
+.subsection Write Data
+.verbatim
+write data [options];
+.end verbatim
+
+The write data statement causes Ragel to emit the constant static data needed
+by the machine. In table-driven output styles (see Section .ref{genout}) this
+is a collection of arrays that represent the states and transitions of the
+machine. In goto-driven machines much less data is emitted. At the very
+minimum a start state .verb|name_start| is generated. All variables written
+out in machine data have both the .verb|static| and .verb|const| properties and
+are prefixed with the name of the machine and an
+underscore. The data can be placed inside a class, inside a function, or it can
+be defined as global data.
+
+Two variables are written that may be used to test the state of the machine
+after a buffer block has been processed. The .verb|name_error| variable gives
+the id of the state that the machine moves into when it cannot find a valid
+transition to take. The machine immediately breaks out of the processing loop when
+it finds itself in the error state. The error variable can be compared to the
+current state to determine if the machine has failed to parse the input. If the
+machine is complete, that is from every state there is a transition to a proper
+state on every possible character of the alphabet, then no error state is required
+and this variable will be set to -1.
+
+The .verb|name_first_final| variable stores the id of the first final state.
+All of the machine's states are sorted by their final state status before
+having their ids assigned. Checking if the machine has accepted its input can
+then be done by checking if the current state is greater-than or equal to the
+first final state.
+
+Data generation has several options:
+
+.list
+.li .verb|noerror | - Do not generate the integer variable that gives the id of the error state.
+.li .verb|nofinal | - Do not generate the integer variable that gives the id of the first final state.
+.li .verb|noprefix | - Do not prefix the variable names with the name of the machine.
+.end list
+
+.figure fbreak-example
+.verbatim
+#include <stdio.h>
+%% machine foo;
+%% write data;
+int main( int argc, char **argv )
+{
+ int cs, res = 0;
+ if ( argc > 1 ) {
+ char *p = argv[1];
+ %%{
+ main :=
+ [a-z]+
+ 0 @{ res = 1; fbreak; };
+ write init;
+ write exec noend;
+ }%%
+ }
+ printf("execute = %i\n", res );
+ return 0;
+}
+.end verbatim
+.caption Use of .tt{noend} write option and the .tt{fbreak} statement for
+processing a string.
+.end figure
+
+.subsection Write Start, First Final and Error
+
+.verbatim
+write start;
+write first_final;
+write error;
+.end verbatim
+
+These three write statements provide an alternative means of accessing the
+.verb|start|, .verb|first_final| and .verb|error| states. If there are many
+different machine specifications in one file it is easy to get the prefix for
+these wrong. This is especially true if the state machine boilerplate is
+frequently made by a copy-paste-edit process. These write statements allow the
+problem to be avoided. They can be used as follows:
+
+.verbatim
+/* Did parsing succeed? */
+if ( cs < %%{ write first_final; }%% ) {
+ result = ERR_PARSE_ERROR;
+ goto fail;
+}
+.end verbatim
+
+.subsection Write Init
+.verbatim
+write init [options];
+.end verbatim
+
+The write init statement causes Ragel to emit initialization code. This should
+be executed once before the machine is started. At a very minimum this sets the
+current state to the start state. If other variables are needed by the
+generated code, such as call stack variables or scanner management
+variables, they are also initialized here.
+
+The .verb|nocs| option to the write init statement will cause ragel to skip
+intialization of the cs variable. This is useful if the user wishes to use
+custom logic to decide which state the specification should start in.
+
+.subsection Write Exec
+.verbatim
+write exec [options];
+.end verbatim
+
+The write exec statement causes Ragel to emit the state machine's execution code.
+Ragel expects several variables to be available to this code. At a very minimum, the
+generated code needs access to the current character position .verb|p|, the ending
+position .verb|pe| and the current state .verb|cs| (though .verb|pe|
+can be omitted using the .verb|noend| write option).
+The .verb|p| variable is the cursor that the execute code will
+used to traverse the input. The .verb|pe| variable should be set up to point to one
+position past the last valid character in the buffer.
+
+Other variables are needed when certain features are used. For example using
+the .verb|fcall| or .verb|fret| statements requires .verb|stack| and
+.verb|top| variables to be defined. If a longest-match construction is used,
+variables for managing backtracking are required.
+
+The write exec statement has one option. The .verb|noend| option tells Ragel
+to generate code that ignores the end position .verb|pe|. In this
+case the user must explicitly break out of the processing loop using
+.verb|fbreak|, otherwise the machine will continue to process characters until
+it moves into the error state. This option is useful if one wishes to process a
+null terminated string. Rather than traverse the string to discover then length
+before processing the input, the user can break out when the null character is
+seen. The example in Figure .ref{fbreak-example} shows the use of the
+.verb|noend| write option and the .verb|fbreak| statement for processing a string.
+
+.subsection Write Exports
+.label{export}
+
+.verbatim
+write exports;
+.end verbatim
+
+The export feature can be used to export simple machine definitions. Machine definitions
+are marked for export using the .verb|export| keyword.
+
+.verbatim
+export machine_to_export = 0x44;
+.end verbatim
+
+When the write exports statement is used these machines are
+written out in the generated code. Defines are used for C and constant integers
+are used for D, Java and Ruby. See Section .ref{import} for a description of the
+import statement.
+
+.section Maintaining Pointers to Input Data
+
+In the creation of any parser it is not uncommon to require the collection of
+the data being parsed. It is always possible to collect data into a growable
+buffer as the machine moves over it, however the copying of data is a somewhat
+wasteful use of processor cycles. The most efficient way to collect data from
+the parser is to set pointers into the input then later reference them. This
+poses a problem for uses of Ragel where the input data arrives in blocks, such
+as over a socket or from a file. If a pointer is set in one buffer block but
+must be used while parsing a following buffer block, some extra consideration
+to correctness must be made.
+
+The scanner constructions exhibit this problem, requiring the maintenance
+code described in Section .ref{generating-scanners}. If a longest-match
+construction has been used somewhere in the machine then it is possible to
+take advantage of the required prefix maintenance code in the driver program to
+ensure pointers to the input are always valid. If laying down a pointer one can
+set .verb|ts| at the same spot or ahead of it. When data is shifted in
+between loops the user must also shift the pointer. In this way it is possible
+to maintain pointers to the input that will always be consistent.
+
+.figure line-oriented
+.verbatim
+ int have = 0;
+ while ( 1 ) {
+ char *p, *pe, *data = buf + have;
+ int len, space = BUFSIZE - have;
+
+ if ( space == 0 ) {
+ fprintf(stderr, "BUFFER OUT OF SPACE\n");
+ exit(1);
+ }
+
+ len = fread( data, 1, space, stdin );
+ if ( len == 0 )
+ break;
+
+ /* Find the last newline by searching backwards. */
+ p = buf;
+ pe = data + len - 1;
+ while ( *pe != '\n' && pe >= buf )
+ pe--;
+ pe += 1;
+
+ %% write exec;
+
+ /* How much is still in the buffer? */
+ have = data + len - pe;
+ if ( have > 0 )
+ memmove( buf, pe, have );
+
+ if ( len < space )
+ break;
+ }
+.end verbatim
+.caption An example of line-oriented processing.
+.end figure
+
+In general, there are two approaches for guaranteeing the consistency of
+pointers to input data. The first approach is the one just described;
+lay down a marker from an action,
+then later ensure that the data the marker points to is preserved ahead of
+the buffer on the next execute invocation. This approach is good because it
+allows the parser to decide on the pointer-use boundaries, which can be
+arbitrarily complex parsing conditions. A downside is that it requires any
+pointers that are set to be corrected in between execute invocations.
+
+The alternative is to find the pointer-use boundaries before invoking the execute
+routine, then pass in the data using these boundaries. For example, if the
+program must perform line-oriented processing, the user can scan backwards from
+the end of an input block that has just been read in and process only up to the
+first found newline. On the next input read, the new data is placed after the
+partially read line and processing continues from the beginning of the line.
+An example of line-oriented processing is given in Figure .ref{line-oriented}.
+
+.section Specifying the Host Language
+
+The .verb|ragel| program has a number of options for specifying the host
+language. The host-language options are:
+
+.itemize
+.item .verb|-C | for C/C++/Objective-C code (default)
+.item .verb|-D | for D code.
+.item .verb|-Z | for Go code.
+.item .verb|-J | for Java code.
+.item .verb|-R | for Ruby code.
+.item .verb|-A | for C\# code.
+.end itemize
+
+.section Choosing a Generated Code Style
+.label{genout}
+
+There are three styles of code output to choose from. Code style affects the
+size and speed of the compiled binary. Changing code style does not require any
+change to the Ragel program. There are two table-driven formats and a goto
+driven format.
+
+In addition to choosing a style to emit, there are various levels of action
+code reuse to choose from. The maximum reuse levels (.verb|-T0|, .verb|-F0|
+and .verb|-G0|) ensure that no FSM action code is ever duplicated by encoding
+each transition's action list as static data and iterating
+through the lists on every transition. This will normally result in a smaller
+binary. The less action reuse options (.verb|-T1|, .verb|-F1| and .verb|-G1|)
+will usually produce faster running code by expanding each transition's action
+list into a single block of code, eliminating the need to iterate through the
+lists. This duplicates action code instead of generating the logic necessary
+for reuse. Consequently the binary will be larger. However, this tradeoff applies to
+machines with moderate to dense action lists only. If a machine's transitions
+frequently have less than two actions then the less reuse options will actually
+produce both a smaller and a faster running binary due to less action sharing
+overhead. The best way to choose the appropriate code style for your
+application is to perform your own tests.
+
+The table-driven FSM represents the state machine as constant static data. There are
+tables of states, transitions, indices and actions. The current state is
+stored in a variable. The execution is simply a loop that looks up the current
+state, looks up the transition to take, executes any actions and moves to the
+target state. In general, the table-driven FSM can handle any machine, produces
+a smaller binary and requires a less expensive host language compile, but
+results in slower running code. Since the table-driven format is the most
+flexible it is the default code style.
+
+The flat table-driven machine is a table-based machine that is optimized for
+small alphabets. Where the regular table machine uses the current character as
+the key in a binary search for the transition to take, the flat table machine
+uses the current character as an index into an array of transitions. This is
+faster in general, however is only suitable if the span of possible characters
+is small.
+
+The goto-driven FSM represents the state machine using goto and switch
+statements. The execution is a flat code block where the transition to take is
+computed using switch statements and directly executable binary searches. In
+general, the goto FSM produces faster code but results in a larger binary and a
+more expensive host language compile.
+
+The goto-driven format has an additional action reuse level (.verb|-G2|) that
+writes actions directly into the state transitioning logic rather than putting
+all the actions together into a single switch. Generally this produces faster
+running code because it allows the machine to encode the current state using
+the processor's instruction pointer. Again, sparse machines may actually
+compile to smaller binaries when .verb|-G2| is used due to less state and
+action management overhead. For many parsing applications .verb|-G2| is the
+preferred output format.
+
+.center
+
+Code Output Style Options
+
+.tabular
+.row .verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#
+.row .verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#
+.row .verb|-F0|&flat table-driven&C/D/Ruby/C\#
+.row .verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#
+.row .verb|-G0|&goto-driven&C/D/C\#
+.row .verb|-G1|&goto, expanded actions&C/D/C\#
+.row .verb|-G2|&goto, in-place actions&C/D/Go
+.end tabular
+.end center
+
+.chapter Beyond the Basic Model
+
+.section Parser Modularization
+.label{modularization}
+
+It is possible to use Ragel's machine construction and action embedding
+operators to specify an entire parser using a single regular expression. In
+many cases this is the desired way to specify a parser in Ragel. However, in
+some scenarios the language to parse may be so large that it is difficult to
+think about it as a single regular expression. It may also shift between distinct
+parsing strategies, in which case modularization into several coherent blocks
+of the language may be appropriate.
+
+It may also be the case that patterns that compile to a large number of states
+must be used in a number of different contexts and referencing them in each
+context results in a very large state machine. In this case, an ability to reuse
+parsers would reduce code size.
+
+To address this, distinct regular expressions may be instantiated and linked
+together by means of a jumping and calling mechanism. This mechanism is
+analogous to the jumping to and calling of processor instructions. A jump
+command, given in action code, causes control to be immediately passed to
+another portion of the machine by way of setting the current state variable. A
+call command causes the target state of the current transition to be pushed to
+a state stack before control is transferred. Later on, the original location
+may be returned to with a return statement. In the following example, distinct
+state machines are used to handle the parsing of two types of headers.
+
+% GENERATE: call
+% %%{
+% machine call;
+.code
+action return { fret; }
+action call_date { fcall date; }
+action call_name { fcall name; }
+
+# A parser for date strings.
+date := [0-9][0-9] '/'
+ [0-9][0-9] '/'
+ [0-9][0-9][0-9][0-9] '\n' @return;
+
+# A parser for name strings.
+name := ( [a-zA-Z]+ | ' ' )** '\n' @return;
+
+# The main parser.
+headers =
+ ( 'from' | 'to' ) ':' @call_name |
+ ( 'departed' | 'arrived' ) ':' @call_date;
+
+main := headers*;
+.end code
+% }%%
+% %% write data;
+% void f()
+% {
+% %% write init;
+% %% write exec;
+% }
+% END GENERATE
+
+Calling and jumping should be used carefully as they are operations that take
+one out of the domain of regular languages. A machine that contains a call or
+jump statement in one of its actions should be used as an argument to a machine
+construction operator only with considerable care. Since DFA transitions may
+actually represent several NFA transitions, a call or jump embedded in one
+machine can inadvertently terminate another machine that it shares prefixes
+with. Despite this danger, theses statements have proven useful for tying
+together sub-parsers of a language into a parser for the full language,
+especially for the purpose of modularizing code and reducing the number of
+states when the machine contains frequently recurring patterns.
+
+Section .ref{vals} describes the jump and call statements that are used to
+transfer control. These statements make use of two variables that must be
+declared by the user, .verb|stack| and .verb|top|. The .verb|stack| variable
+must be an array of integers and .verb|top| must be a single integer, which
+will point to the next available space in .verb|stack|. Sections .ref{prepush}
+and .ref{postpop} describe the Pre-Push and Post-Pop statements which can be
+used to implement a dynamically resizable array.
+
+.section Referencing Names
+.label{labels}
+
+This section describes how to reference names in epsilon transitions (Section
+.ref{state-charts}) and
+action-based control-flow statements such as .verb|fgoto|. There is a hierarchy
+of names implied in a Ragel specification. At the top level are the machine
+instantiations. Beneath the instantiations are labels and references to machine
+definitions. Beneath those are more labels and references to definitions, and
+so on.
+
+Any name reference may contain multiple components separated with the .verb|::|
+compound symbol. The search for the first component of a name reference is
+rooted at the join expression that the epsilon transition or action embedding
+is contained in. If the name reference is not contained in a join,
+the search is rooted at the machine definition that the epsilon transition or
+action embedding is contained in. Each component after the first is searched
+for beginning at the location in the name tree that the previous reference
+component refers to.
+
+In the case of action-based references, if the action is embedded more than
+once, the local search is performed for each embedding and the result is the
+union of all the searches. If no result is found for action-based references then
+the search is repeated at the root of the name tree. Any action-based name
+search may be forced into a strictly global search by prefixing the name
+reference with .verb|::|.
+
+The final component of the name reference must resolve to a unique entry point.
+If a name is unique in the entire name tree it can be referenced as is. If it
+is not unique it can be specified by qualifying it with names above it in the
+name tree. However, it can always be renamed.
+
+% FIXME: Should fit this in somewhere.
+% Some kinds of name references are illegal. Cannot call into longest-match
+% machine, can only call its start state. Cannot make a call to anywhere from
+% any part of a longest-match machine except a rule's action. This would result
+% in an eventual return to some point inside a longest-match other than the
+% start state. This is banned for the same reason a call into the LM machine is
+% banned.
+
+
+.section Scanners
+.label{generating-scanners}
+
+Scanners are very much intertwined with regular-languages and their
+corresponding processors. For this reason Ragel supports the definition of
+scanners. The generated code will repeatedly attempt to match patterns from a
+list, favouring longer patterns over shorter patterns. In the case of
+equal-length matches, the generated code will favour patterns that appear ahead
+of others. When a scanner makes a match it executes the user code associated
+with the match, consumes the input then resumes scanning.
+
+.verbatim
+<machine_name> := |*
+ pattern1 => action1;
+ pattern2 => action2;
+ ...
+ *|;
+.end verbatim
+
+On the surface, Ragel scanners are similar to those defined by Lex. Though
+there is a key distinguishing feature: patterns may be arbitrary Ragel
+expressions and can therefore contain embedded code. With a Ragel-based scanner
+the user need not wait until the end of a pattern before user code can be
+executed.
+
+Scanners can be used to process sub-languages, as well as for tokenizing
+programming languages. In the following example a scanner is used to tokenize
+the contents of a header field.
+
+.code
+word = [a-z]+;
+head_name = 'Header';
+
+header := |*
+ word;
+ ' ';
+ '\n' => { fret; };
+*|;
+
+main := ( head_name ':' @{ fcall header; } )*;
+.end code
+
+The scanner construction has a purpose similar to the longest-match kleene star
+operator .verb|**|. The key
+difference is that a scanner is able to backtrack to match a previously matched
+shorter string when the pursuit of a longer string fails. For this reason the
+scanner construction operator is not a pure state machine construction
+operator. It relies on several variables that enable it to backtrack and make
+pointers to the matched input text available to the user. For this reason
+scanners must be immediately instantiated. They cannot be defined inline or
+referenced by another expression. Scanners must be jumped to or called.
+
+Scanners rely on the .verb|ts|, .verb|te| and .verb|act|
+variables to be present so that they can backtrack and make pointers to the
+matched text available to the user. If input is processed using multiple calls
+to the execute code then the user must ensure that when a token is only
+partially matched that the prefix is preserved on the subsequent invocation of
+the execute code.
+
+The .verb|ts| variable must be defined as a pointer to the input data.
+It is used for recording where the current token match begins. This variable
+may be used in action code for retrieving the text of the current match. Ragel
+ensures that in between tokens and outside of the longest-match machines that
+this pointer is set to null. In between calls to the execute code the user must
+check if .verb|ts| is set and if so, ensure that the data it points to is
+preserved ahead of the next buffer block. This is described in more detail
+below.
+
+The .verb|te| variable must also be defined as a pointer to the input data.
+It is used for recording where a match ends and where scanning of the next
+token should begin. This can also be used in action code for retrieving the
+text of the current match.
+
+The .verb|act| variable must be defined as an integer type. It is used for
+recording the identity of the last pattern matched when the scanner must go
+past a matched pattern in an attempt to make a longer match. If the longer
+match fails it may need to consult the .verb|act| variable. In some cases, use
+of the .verb|act|
+variable can be avoided because the value of the current state is enough
+information to determine which token to accept, however in other cases this is
+not enough and so the .verb|act| variable is used.
+
+When the longest-match operator is in use, the user's driver code must take on
+some buffer management functions. The following algorithm gives an overview of
+the steps that should be taken to properly use the longest-match operator.
+
+.itemize
+.item Read a block of input data.
+.item Run the execute code.
+.item If .verb|ts| is set, the execute code will expect the incomplete
+token to be preserved ahead of the buffer on the next invocation of the execute
+code.
+.itemize
+.item Shift the data beginning at .verb|ts| and ending at .verb|pe| to the
+beginning of the input buffer.
+.item Reset .verb|ts| to the beginning of the buffer.
+.item Shift .verb|te| by the distance from the old value of .verb|ts|
+to the new value. The .verb|te| variable may or may not be valid. There is
+no way to know if it holds a meaningful value because it is not kept at null
+when it is not in use. It can be shifted regardless.
+.end itemize
+.item Read another block of data into the buffer, immediately following any
+preserved data.
+.item Run the scanner on the new data.
+.end itemize
+
+Figure .ref{preserve_example} shows the required handling of an input stream in
+which a token is broken by the input block boundaries. After processing up to
+and including the ``t'' of ``characters'', the prefix of the string token must be
+retained and processing should resume at the ``e'' on the next iteration of
+the execute code.
+
+If one uses a large input buffer for collecting input then the number of times
+the shifting must be done will be small. Furthermore, if one takes care not to
+define tokens that are allowed to be very long and instead processes these
+items using pure state machines or sub-scanners, then only a small amount of
+data will ever need to be shifted.
+
+.figure preserve_example
+.verbatim
+ a) A stream "of characters" to be scanned.
+ | | |
+ p ts pe
+
+ b) "of characters" to be scanned.
+ | | |
+ ts p pe
+.end verbatim
+.caption Following an invocation of the execute code there may be a partially
+matched token (a). The data of the partially matched token
+must be preserved ahead of the new data on the next invocation (b).
+.end figure
+
+Since scanners attempt to make the longest possible match of input, patterns
+such as identifiers require one character of lookahead in order to trigger a
+match. In the case of the last token in the input stream the user must ensure
+that the .verb|eof| variable is set so that the final token is flushed out.
+
+An example scanner processing loop is given in Figure .ref{scanner-loop}.
+
+.figure scanner-loop
+.verbatim
+ int have = 0;
+ bool done = false;
+ while ( !done ) {
+ /* How much space is in the buffer? */
+ int space = BUFSIZE - have;
+ if ( space == 0 ) {
+ /* Buffer is full. */
+ cerr << "TOKEN TOO BIG" << endl;
+ exit(1);
+ }
+
+ /* Read in a block after any data we already have. */
+ char *p = inbuf + have;
+ cin.read( p, space );
+ int len = cin.gcount();
+
+ char *pe = p + len;
+ char *eof = 0;
+
+ /* If no data was read indicate EOF. */
+ if ( len == 0 ) {
+ eof = pe;
+ done = true;
+ }
+
+ %% write exec;
+
+ if ( cs == Scanner_error ) {
+ /* Machine failed before finding a token. */
+ cerr << "PARSE ERROR" << endl;
+ exit(1);
+ }
+
+ if ( ts == 0 )
+ have = 0;
+ else {
+ /* There is a prefix to preserve, shift it over. */
+ have = pe - ts;
+ memmove( inbuf, ts, have );
+ te = inbuf + (te-ts);
+ ts = inbuf;
+ }
+ }
+.end verbatim
+.caption A processing loop for a scanner.
+.end figure
+
+.section State Charts
+.label{state-charts}
+
+In addition to supporting the construction of state machines using regular
+languages, Ragel provides a way to manually specify state machines using
+state charts. The comma operator combines machines together without any
+implied transitions. The user can then manually link machines by specifying
+epsilon transitions with the .verb|->| operator. Epsilon transitions are drawn
+between the final states of a machine and entry points defined by labels. This
+makes it possible to build machines using the explicit state-chart method while
+making minimal changes to the Ragel language.
+
+An interesting feature of Ragel's state chart construction method is that it
+can be mixed freely with regular expression constructions. A state chart may be
+referenced from within a regular expression, or a regular expression may be
+used in the definition of a state chart transition.
+
+.subsection Join
+
+.verb|expr , expr , ...|
+
+Join a list of machines together without
+drawing any transitions, without setting up a start state, and without
+designating any final states. Transitions between the machines may be specified
+using labels and epsilon transitions. The start state must be explicity
+specified with the ``start'' label. Final states may be specified with an
+epsilon transition to the implicitly created ``final'' state. The join
+operation allows one to build machines using a state chart model.
+
+.subsection Label
+
+.verb|label: expr|
+
+Attaches a label to an expression. Labels can be
+used as the target of epsilon transitions and explicit control transfer
+statements such as .verb|fgoto| and .verb|fnext| in action
+code.
+
+.subsection Epsilon
+
+.verb|expr -> label|
+
+Draws an epsilon transition to the state defined
+by .verb|label|. Epsilon transitions are made deterministic when join
+operators are evaluated. Epsilon transitions that are not in a join operation
+are made deterministic when the machine definition that contains the epsilon is
+complete. See Section .ref{labels} for information on referencing labels.
+
+.subsection Simplifying State Charts
+
+There are two benefits to providing state charts in Ragel. The first is that it
+allows us to take a state chart with a full listing of states and transitions
+and simplify it in selective places using regular expressions.
+
+The state chart method of specifying parsers is very common. It is an
+effective programming technique for producing robust code. The key disadvantage
+becomes clear when one attempts to comprehend a large parser specified in this
+way. These programs usually require many lines, causing logic to be spread out
+over large distances in the source file. Remembering the function of a large
+number of states can be difficult and organizing the parser in a sensible way
+requires discipline because branches and repetition present many file layout
+options. This kind of programming takes a specification with inherent
+structure such as looping, alternation and concatenation and expresses it in a
+flat form.
+
+If we could take an isolated component of a manually programmed state chart,
+that is, a subset of states that has only one entry point, and implement it
+using regular language operators then we could eliminate all the explicit
+naming of the states contained in it. By eliminating explicitly named states
+and replacing them with higher-level specifications we simplify a state machine
+specification.
+
+For example, sometimes chains of states are needed, with only a small number of
+possible characters appearing along the chain. These can easily be replaced
+with a concatenation of characters. Sometimes a group of common states
+implement a loop back to another single portion of the machine. Rather than
+manually duplicate all the transitions that loop back, we may be able to
+express the loop using a kleene star operator.
+
+Ragel allows one to take this state map simplification approach. We can build
+state machines using a state map model and implement portions of the state map
+using regular languages. In place of any transition in the state machine,
+entire sub-machines can be given. These can encapsulate functionality
+defined elsewhere. An important aspect of the Ragel approach is that when we
+wrap up a collection of states using a regular expression we do not lose
+access to the states and transitions. We can still execute code on the
+transitions that we have encapsulated.
+
+.subsection Dropping Down One Level of Abstraction
+.label{down}
+
+The second benefit of incorporating state charts into Ragel is that it permits
+us to bypass the regular language abstraction if we need to. Ragel's action
+embedding operators are sometimes insufficient for expressing certain parsing
+tasks. In the same way that is useful for C language programmers to drop down
+to assembly language programming using embedded assembler, it is sometimes
+useful for the Ragel programmer to drop down to programming with state charts.
+
+In the following example, we wish to buffer the characters of an XML CDATA
+sequence. The sequence is terminated by the string .verb|]]>|. The challenge
+in our application is that we do not wish the terminating characters to be
+buffered. An expression of the form .verb|any* @buffer :>> ']]>'| will not work
+because the buffer will always contain the characters .verb|]]| on the end.
+Instead, what we need is to delay the buffering of .verb|]|
+characters until a time when we
+abandon the terminating sequence and go back into the main loop. There is no
+easy way to express this using Ragel's regular expression and action embedding
+operators, and so an ability to drop down to the state chart method is useful.
+
+% GENERATE: dropdown
+% OPT: -p
+% %%{
+% machine dropdown;
+.code
+action bchar { buff( fpc ); } # Buffer the current character.
+action bbrack1 { buff( "]" ); }
+action bbrack2 { buff( "]]" ); }
+
+CDATA_body =
+start: (
+ ']' -> one |
+ (any-']') @bchar ->start
+),
+one: (
+ ']' -> two |
+ [^\]] @bbrack1 @bchar ->start
+),
+two: (
+ '>' -> final |
+ ']' @bbrack1 -> two |
+ [^>\]] @bbrack2 @bchar ->start
+);
+.end code
+% main := CDATA_body;
+% }%%
+% END GENERATE
+
+.graphic dropdown
+
+
+.section Semantic Conditions
+.label{semantic}
+
+Many communication protocols contain variable-length fields, where the length
+of the field is given ahead of the field as a value. This
+problem cannot be expressed using regular languages because of its
+context-dependent nature. The prevalence of variable-length fields in
+communication protocols motivated us to introduce semantic conditions into
+the Ragel language.
+
+A semantic condition is a block of user code that is interpreted as an
+expression and evaluated immediately
+before a transition is taken. If the code returns a value of true, the
+transition may be taken. We can now embed code that extracts the length of a
+field, then proceed to match $n$ data values.
+
+% GENERATE: conds1
+% OPT: -p
+% %%{
+% machine conds1;
+% number = digit+;
+.code
+action rec_num { i = 0; n = getnumber(); }
+action test_len { i++ < n }
+data_fields = (
+ 'd'
+ [0-9]+ %rec_num
+ ':'
+ ( [a-z] when test_len )*
+)**;
+.end code
+% main := data_fields;
+% }%%
+% END GENERATE
+
+.graphic conds1
+
+The Ragel implementation of semantic conditions does not force us to give up the
+compositional property of Ragel definitions. For example, a machine that tests
+the length of a field using conditions can be unioned with another machine
+that accepts some of the same strings, without the two machines interfering with
+one another. The user need not be concerned about whether or not the result of the
+semantic condition will affect the matching of the second machine.
+
+To see this, first consider that when a user associates a condition with an
+existing transition, the transition's label is translated from the base character
+to its corresponding value in the space that represents ``condition $c$ true''. Should
+the determinization process combine a state that has a conditional transition
+with another state that has a transition on the same input character but
+without a condition, then the condition-less transition first has its label
+translated into two values, one to its corresponding value in the space that
+represents ``condition $c$ true'' and another to its corresponding value in the
+space that represents ``condition $c$ false''. It
+is then safe to combine the two transitions. This is shown in the following
+example. Two intersecting patterns are unioned, one with a condition and one
+without. The condition embedded in the first pattern does not affect the second
+pattern.
+
+% GENERATE: conds2
+% OPT: -p
+% %%{
+% machine conds2;
+% number = digit+;
+.code
+action test_len { i++ < n }
+action one { /* accept pattern one */ }
+action two { /* accept pattern two */ }
+patterns =
+ ( [a-z] when test_len )+ %one |
+ [a-z][a-z0-9]* %two;
+main := patterns '\n';
+.end code
+% }%%
+% END GENERATE
+
+.graphic conds2
+
+There are many more potential uses for semantic conditions. The user is free to
+use arbitrary code and may therefore perform actions such as looking up names
+in dictionaries, validating input using external parsing mechanisms or
+performing checks on the semantic structure of input seen so far. In the next
+section we describe how Ragel accommodates several common parser engineering
+problems.
+
+The semantic condition feature works only with alphabet types that are smaller
+in width than the .verb|long| type. To implement semantic conditions Ragel
+needs to be able to allocate characters from the alphabet space. Ragel uses
+these allocated characters to express "character C with condition P true" or "C
+with P false." Since internally Ragel uses longs to store characters there is
+no room left in the alphabet space unless an alphabet type smaller than long is
+used.
+
+.section Implementing Lookahead
+
+There are a few strategies for implementing lookahead in Ragel programs.
+Leaving actions, which are described in Section .ref{out-actions}, can be
+used as a form of lookahead. Ragel also provides the .verb|fhold| directive
+which can be used in actions to prevent the machine from advancing over the
+current character. It is also possible to manually adjust the current character
+position by shifting it backwards using .verb|fexec|, however when this is
+done, care must be taken not to overstep the beginning of the current buffer
+block. In both the use of .verb|fhold| and .verb|fexec| the user must be
+cautious of combining the resulting machine with another in such a way that the
+transition on which the current position is adjusted is not combined with a
+transition from the other machine.
+
+.section Parsing Recursive Language Structures
+
+In general Ragel cannot handle recursive structures because the grammar is
+interpreted as a regular language. However, depending on what needs to be
+parsed it is sometimes practical to implement the recursive parts using manual
+coding techniques. This often works in cases where the recursive structures are
+simple and easy to recognize, such as in the balancing of parentheses
+
+One approach to parsing recursive structures is to use actions that increment
+and decrement counters or otherwise recognize the entry to and exit from
+recursive structures and then jump to the appropriate machine defnition using
+.verb|fcall| and .verb|fret|. Alternatively, semantic conditions can be used to
+test counter variables.
+
+A more traditional approach is to call a separate parsing function (expressed
+in the host language) when a recursive structure is entered, then later return
+when the end is recognized.
+##### EXP #####
+\documentclass[letterpaper,11pt,oneside]{book}
+\usepackage{graphicx}
+\usepackage{comment}
+\usepackage{multicol}
+\usepackage[
+ colorlinks=true,
+ linkcolor=black,
+ citecolor=green,
+ filecolor=black,
+ urlcolor=black]{hyperref}
+
+\topmargin -0.20in
+\oddsidemargin 0in
+\textwidth 6.5in
+\textheight 9in
+
+\setlength{\parskip}{0pt}
+\setlength{\topsep}{0pt}
+\setlength{\partopsep}{0pt}
+\setlength{\itemsep}{0pt}
+
+\input{version}
+
+\newcommand{\verbspace}{\vspace{10pt}}
+\newcommand{\graphspace}{\vspace{10pt}}
+
+\renewcommand\floatpagefraction{.99}
+\renewcommand\topfraction{.99}
+\renewcommand\bottomfraction{.99}
+\renewcommand\textfraction{.01}
+\setcounter{totalnumber}{50}
+\setcounter{topnumber}{50}
+\setcounter{bottomnumber}{50}
+
+\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{}
+
+\begin{document}
+
+\thispagestyle{empty}
+\begin{center}
+\vspace*{3in}
+{\huge Ragel State Machine Compiler}\\
+\vspace*{12pt}
+{\Large User Guide}\\
+\vspace{1in}
+by\\
+\vspace{12pt}
+{\large Adrian Thurston}\\
+\end{center}
+\clearpage
+
+\pagenumbering{roman}
+
+\chapter*{License}
+Ragel version \version, \pubdate\\
+Copyright \copyright\ 2003-2012 Adrian D. Thurston
+\vspace{6mm}
+
+{\bf\it\noindent This document is part of Ragel, and as such, this document is
+released under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+}
+
+\vspace{5pt}
+
+{\bf\it\noindent Ragel is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+}
+
+\vspace{5pt}
+
+{\bf\it\noindent You should have received a copy of the GNU General Public
+License along with Ragel; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+}
+
+\clearpage
+\tableofcontents
+\clearpage
+
+\pagenumbering{arabic}
+\chapter{Introduction}
+
+\section{Abstract}
+
+Regular expressions are used heavily in practice for the purpose of specifying
+parsers. They are normally used as black boxes linked together with program
+logic. User actions are executed in between invocations of the regular
+expression engine. Adding actions before a pattern terminates requires patterns
+to be broken and pasted back together with program logic. The more user actions
+are needed, the less the advantages of regular expressions are seen.
+
+Ragel is a software development tool that allows user actions to be
+embedded into the transitions of a regular expression's corresponding state
+machine, eliminating the need to switch from the regular expression engine and
+user code execution environment and back again. As a result, expressions can be
+maximally continuous. One is free to specify an entire parser using a single
+regular expression. The single-expression model affords concise and elegant
+descriptions of languages and the generation of very simple, fast and robust
+code. Ragel compiles executable finite state machines from a high level regular language
+notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby.
+
+In addition to building state machines from regular expressions, Ragel allows
+the programmer to directly specify state machines with state charts. These two
+notations may be freely combined. There are also facilities for controlling
+nondeterminism in the resulting machines and building scanners using patterns
+that themselves have embedded actions. Ragel can produce code that is small and
+runs very fast. Ragel can handle integer-sized alphabets and can compile very
+large state machines.
+
+\section{Motivation}
+
+When a programmer is faced with the task of producing a parser for a
+context-free language there are many tools to choose from. It is quite common
+to generate useful and efficient parsers for programming languages from a
+formal grammar. It is also quite common for programmers to avoid such tools
+when making parsers for simple computer languages, such as file formats and
+communication protocols. Such languages are often regular and tools for
+processing the context-free languages are viewed as too heavyweight for the
+purpose of parsing regular languages. The extra run-time effort required for
+supporting the recursive nature of context-free languages is wasted.
+
+When we turn to the regular expression-based parsing tools, such as Lex, Re2C,
+and scripting languages such as Sed, Awk and Perl we find that they are split
+into two levels: a regular expression matching engine and some kind of program
+logic for linking patterns together. For example, a Lex program is composed of
+sets of regular expressions. The implied program logic repeatedly attempts to
+match a pattern in the current set. When a match is found the associated user
+code executed. It requires the user to consider a language as a sequence of
+independent tokens. Scripting languages and regular expression libraries allow
+one to link patterns together using arbitrary program code. This is very
+flexible and powerful, however we can be more concise and clear if we avoid
+gluing together regular expressions with if statements and while loops.
+
+This model of execution, where the runtime alternates between regular
+expression matching and user code exectution places restrictions on when
+action code may be executed. Since action code can only be associated with
+complete patterns, any action code that must be executed before an entire
+pattern is matched requires that the pattern be broken into smaller units.
+Instead of being forced to disrupt the regular expression syntax and write
+smaller expressions, it is desirable to retain a single expression and embed
+code for performing actions directly into the transitions that move over the
+characters. After all, capable programmers are astutely aware of the machinery
+underlying their programs, so why not provide them with access to that
+machinery? To achieve this we require an action execution model for associating
+code with the sub-expressions of a regular expression in a way that does not
+disrupt its syntax.
+
+The primary goal of Ragel is to provide developers with an ability to embed
+actions into the transitions and states of a regular expression's state machine
+in support of the definition of entire parsers or large sections of parsers
+using a single regular expression. From the regular expression we gain a clear
+and concise statement of our language. From the state machine we obtain a very
+fast and robust executable that lends itself to many kinds of analysis and
+visualization.
+
+\section{Overview}
+
+Ragel is a language for specifying state machines. The Ragel program is a
+compiler that assembles a state machine definition to executable code. Ragel
+is based on the principle that any regular language can be converted to a
+deterministic finite state automaton. Since every regular language has a state
+machine representation and vice versa, the terms regular language and state
+machine (or just machine) will be used interchangeably in this document.
+
+Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is
+designed to be generic and is not bound to any particular input or processing
+method. A Ragel machine expects to have data passed to it in buffer blocks.
+When there is no more input, the machine can be queried for acceptance. In
+this way, a Ragel machine can be used to simply recognize a regular language
+like a regular expression library. By embedding code into the regular language,
+a Ragel machine can also be used to parse input.
+
+The Ragel language has many operators for constructing and manipulating
+machines. Machines are built up from smaller machines, to bigger ones, to the
+final machine representing the language that needs to be recognized or parsed.
+
+The core state machine construction operators are those found in most theory
+of computation textbooks. They date back to the 1950s and are widely studied.
+They are based on set operations and permit one to think of languages as a set
+of strings. They are Union, Intersection, Difference, Concatenation and Kleene
+Star. Put together, these operators make up what most people know as regular
+expressions. Ragel also provides a scanner construction operator
+and provides operators for explicitly constructing machines
+using a state chart method. In the state chart method, one joins machines
+together without any implied transitions and then explicitly specifies where
+epsilon transitions should be drawn.
+
+The state machine manipulation operators are specific to Ragel. They allow the
+programmer to access the states and transitions of regular language's
+corresponding machine. There are two uses of the manipulation operators. The
+first and primary use is to embed code into transitions and states, allowing
+the programmer to specify the actions of the state machine.
+
+Ragel attempts to make the action embedding facility as intuitive as possible.
+To do so, a number of issues need to be addressed. For example, when making a
+nondeterministic specification into a DFA using machines that have embedded
+actions, new transitions are often made that have the combined actions of
+several source transitions. Ragel ensures that multiple actions associated with
+a single transition are ordered consistently with respect to the order of
+reference and the natural ordering implied by the construction operators.
+
+The second use of the manipulation operators is to assign priorities to
+transitions. Priorities provide a convenient way of controlling any
+nondeterminism introduced by the construction operators. Suppose two
+transitions leave from the same state and go to distinct target states on the
+same character. If these transitions are assigned conflicting priorities, then
+during the determinization process the transition with the higher priority will
+take precedence over the transition with the lower priority. The lower priority
+transition gets abandoned. The transitions would otherwise be combined into a new
+transition that goes to a new state that is a combination of the original
+target states. Priorities are often required for segmenting machines. The most
+common uses of priorities have been encoded into a set of simple operators
+that should be used instead of priority embeddings whenever possible.
+
+For the purposes of embedding, Ragel divides transitions and states into
+different classes. There are four operators for embedding actions and
+priorities into the transitions of a state machine. It is possible to embed
+into entering transitions, finishing transitions, all transitions and leaving
+transitions. The embedding into leaving transitions is a special case.
+These transition embeddings get stored in the final states of a machine. They
+are transferred to any transitions that are made going out of the machine by
+future concatenation or kleene star operations.
+
+There are several more operators for embedding actions into states. Like the
+transition embeddings, there are various different classes of states that the
+embedding operators access. For example, one can access start states, final
+states or all states, among others. Unlike the transition embeddings, there are
+several different types of state action embeddings. These are executed at
+various different times during the processing of input. It is possible to embed
+actions that are exectued on transitions into a state, on transitions out of a
+state, on transitions taken on the error event, or on transitions taken on the
+EOF event.
+
+Within actions, it is possible to influence the behaviour of the state machine.
+The user can write action code that jumps or calls to another portion of the
+machine, changes the current character being processed, or breaks out of the
+processing loop. With the state machine calling feature Ragel can be used to
+parse languages that are not regular. For example, one can parse balanced
+parentheses by calling into a parser when an open parenthesis character is seen
+and returning to the state on the top of the stack when the corresponding
+closing parenthesis character is seen. More complicated context-free languages
+such as expressions in C are out of the scope of Ragel.
+
+Ragel also provides a scanner construction operator that can be used to build
+scanners much the same way that Lex is used. The Ragel generated code, which
+relies on user-defined variables for backtracking, repeatedly tries to match
+patterns to the input, favouring longer patterns over shorter ones and patterns
+that appear ahead of others when the lengths of the possible matches are
+identical. When a pattern is matched the associated action is executed.
+
+The key distinguishing feature between scanners in Ragel and scanners in Lex is
+that Ragel patterns may be arbitrary Ragel expressions and can therefore
+contain embedded code. With a Ragel-based scanner the user need not wait until
+the end of a pattern before user code can be executed.
+
+Scanners do take Ragel out of the domain of pure state machines and require the
+user to maintain the backtracking related variables. However, scanners
+integrate well with regular state machine instantiations. They can be called to
+or jumped to only when needed, or they can be called out of or jumped out of
+when a simpler, pure state machine model is appropriate.
+
+Two types of output code style are available. Ragel can produce a table-driven
+machine or a directly executable machine. The directly executable machine is
+much faster than the table-driven. On the other hand, the table-driven machine
+is more compact and less demanding on the host language compiler. It is better
+suited to compiling large state machines.
+
+\section{Related Work}
+
+Lex is perhaps the best-known tool for constructing parsers from regular
+expressions. In the Lex processing model, generated code attempts to match one
+of the user's regular expression patterns, favouring longer matches over
+shorter ones. Once a match is made it then executes the code associated with
+the pattern and consumes the matching string. This process is repeated until
+the input is fully consumed.
+
+Through the use of start conditions, related sets of patterns may be defined.
+The active set may be changed at any time. This allows the user to define
+different lexical regions. It also allows the user to link patterns together by
+requiring that some patterns come before others. This is quite like a
+concatenation operation. However, use of Lex for languages that require a
+considerable amount of pattern concatenation is inappropriate. In such cases a
+Lex program deteriorates into a manually specified state machine, where start
+conditions define the states and pattern actions define the transitions. Lex
+is therefore best suited to parsing tasks where the language to be parsed can
+be described in terms of regions of tokens.
+
+Lex is useful in many scenarios and has undoubtedly stood the test of time.
+There are, however, several drawbacks to using Lex. Lex can impose too much
+overhead for parsing applications where buffering is not required because all
+the characters are available in a single string. In these cases there is
+structure to the language to be parsed and a parser specification tool can
+help, but employing a heavyweight processing loop that imposes a stream
+``pull'' model and dynamic input buffer allocation is inappropriate. An
+example of this kind of scenario is the conversion of floating point numbers
+contained in a string to their corresponding numerical values.
+
+Another drawback is the very issue that Ragel attempts to solve.
+It is not possible to execute a user action while
+matching a character contained inside a pattern. For example, if scanning a
+programming language and string literals can contain newlines which must be
+counted, a Lex user must break up a string literal pattern so as to associate
+an action with newlines. This forces the definition of a new start condition.
+Alternatively the user can reprocess the text of the matched string literal to
+count newlines.
+
+
+The Re2C program defines an input processing model similar to that of Lex.
+Re2C focuses on making generated state machines run very fast and
+integrate easily into any program, free of dependencies. Re2C generates
+directly executable code and is able to claim that generated parsers run nearly
+as fast as their hand-coded equivalents. This is very important for user
+adoption, as programmers are reluctant to use a tool when a faster alternative
+exists. A consideration to ease of use is also important because developers
+need the freedom to integrate the generated code as they see fit.
+
+Many scripting languages provide ways of composing parsers by linking regular
+expressions using program logic. For example, Sed and Awk are two established
+Unix scripting tools that allow the programmer to exploit regular expressions
+for the purpose of locating and extracting text of interest. High-level
+programming languages such as Perl, Python, PHP and Ruby all provide regular
+expression libraries that allow the user to combine regular expressions with
+arbitrary code.
+
+In addition to supporting the linking of regular expressions with arbitrary
+program logic, the Perl programming language permits the embedding of code into
+regular expressions. Perl embeddings do not translate into the embedding of
+code into deterministic state machines. Perl regular expressions are in fact
+not fully compiled to deterministic machines when embedded code is involved.
+They are instead interpreted and involve backtracking. This is shown by the
+following Perl program. When it is fed the input \verb|abcd| the interpretor
+attempts to match the first alternative, printing \verb|a1 b1|. When this
+possibility fails it backtracks and tries the second possibility, printing
+\verb|a2 b2|, at which point it succeeds.
+
+\begin{inline_code}
+\begin{verbatim}
+print "YES\n" if ( <STDIN> =~
+ /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) |
+ ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x )
+\end{verbatim}
+\end{inline_code}
+\verbspace
+
+In Ragel there is no regular expression interpretor. Aside from the scanner
+operator, all Ragel expressions are made into deterministic machines and the
+run time simply moves from state to state as it consumes input. An equivalent
+parser expressed in Ragel would attempt both of the alternatives concurrently,
+printing \verb|a1 a2 b1 b2|.
+
+\section{Development Status}
+
+Ragel is a relatively new tool and is under continuous development. As a rough
+release guide, minor revision number changes are for implementation
+improvements and feature additions. Major revision number changes are for
+implementation and language changes that do not preserve backwards
+compatibility. Though in the past this has not always held true: changes that
+break code have crept into minor version number changes. Typically, the
+documentation lags behind the development in the interest of documenting only
+the lasting features. The latest changes are always documented in the ChangeLog
+file.
+
+\chapter{Constructing State Machines}
+
+\section{Ragel State Machine Specifications}
+
+A Ragel input file consists of a program in the host language that contains embedded machine
+specifications. Ragel normally passes input straight to output. When it sees
+a machine specification it stops to read the Ragel statements and possibly generate
+code in place of the specification.
+Afterwards it continues to pass input through. There
+can be any number of FSM specifications in an input file. A multi-line FSM spec
+starts with \verb|%%{| and ends with \verb|}%%|. A single-line FSM spec starts
+with \verb|%%| and ends at the first newline.
+
+While Ragel is looking for FSM specifications it does basic lexical analysis on
+the surrounding input. It interprets literal strings and comments so a
+\verb|%%| sequence in either of those will not trigger the parsing of an FSM
+specification. Ragel does not pass the input through any preprocessor nor does it
+interpret preprocessor directives itself so includes, defines and ifdef logic
+cannot be used to alter the parse of a Ragel input file. It is therefore not
+possible to use an \verb|#if 0| directive to comment out a machine as is
+commonly done in C code. As an alternative, a machine can be prevented from
+causing any generated output by commenting out write statements.
+
+In Figure \ref{cmd-line-parsing}, a multi-line specification is used to define the
+machine and single line specifications are used to trigger the writing of the machine
+data and execution code.
+
+\begin{figure}
+\small
+\begin{multicols}{2}
+\begin{verbatim}
+#include <string.h>
+#include <stdio.h>
+
+%%{
+ machine foo;
+ main :=
+ ( 'foo' | 'bar' )
+ 0 @{ res = 1; };
+}%%
+
+%% write data;
+\end{verbatim}
+\verbspace
+\columnbreak
+\begin{verbatim}
+int main( int argc, char **argv )
+{
+ int cs, res = 0;
+ if ( argc > 1 ) {
+ char *p = argv[1];
+ char *pe = p + strlen(p) + 1;
+ %% write init;
+ %% write exec;
+ }
+ printf("result = %i\n", res );
+ return 0;
+}
+\end{verbatim}
+\verbspace
+\end{multicols}
+\caption{Parsing a command line argument.
+}
+\label{cmd-line-parsing}
+\end{figure}
+
+\subsection{Naming Ragel Blocks}
+
+\begin{verbatim}
+machine fsm_name;
+\end{verbatim}
+\verbspace
+
+The \verb|machine| statement gives the name of the FSM. If present in a
+specification, this statement must appear first. If a machine specification
+does not have a name then Ragel uses the previous specification name. If no
+previous specification name exists then this is an error. Because FSM
+specifications persist in memory, a machine's statements can be spread across
+multiple machine specifications. This allows one to break up a machine across
+several files or draw in statements that are common to multiple machines using
+the \verb|include| statement.
+
+\subsection{Machine Definition}
+\label{definition}
+
+\begin{verbatim}
+<name> = <expression>;
+\end{verbatim}
+\verbspace
+
+The machine definition statement associates an FSM expression with a name. Machine
+expressions assigned to names can later be referenced in other expressions. A
+definition statement on its own does not cause any states to be generated. It is simply a
+description of a machine to be used later. States are generated only when a definition is
+instantiated, which happens when a definition is referenced in an instantiated
+expression.
+
+\subsection{Machine Instantiation}
+\label{instantiation}
+
+\begin{verbatim}
+<name> := <expression>;
+\end{verbatim}
+\verbspace
+
+The machine instantiation statement generates a set of states representing an
+expression. Each instantiation generates a distinct set of states. The starting
+state of the instantiation is written in the data section of the generated code
+using the instantiation name. If a machine named
+\verb|main| is instantiated, its start state is used as the
+specification's start state and is assigned to the \verb|cs| variable by the
+\verb|write init| command. If no \verb|main| machine is given, the start state
+of the last machine instantiation to appear is used as the specification's
+start state.
+
+From outside the execution loop, control may be passed to any machine by
+assigning the entry point to the \verb|cs| variable. From inside the execution
+loop, control may be passed to any machine instantiation using \verb|fcall|,
+\verb|fgoto| or \verb|fnext| statements.
+
+\subsection{Including Ragel Code}
+
+\begin{verbatim}
+include FsmName "inputfile.rl";
+\end{verbatim}
+\verbspace
+
+The \verb|include| statement can be used to draw in the statements of another FSM
+specification. Both the name and input file are optional, however at least one
+must be given. Without an FSM name, the given input file is searched for an FSM
+of the same name as the current specification. Without an input file the
+current file is searched for a machine of the given name. If both are present,
+the given input file is searched for a machine of the given name.
+
+Ragel searches for included files from the location of the current file.
+Additional directories can be added to the search path using the \verb|-I|
+option.
+
+\subsection{Importing Definitions}
+\label{import}
+
+\begin{verbatim}
+import "inputfile.h";
+\end{verbatim}
+\verbspace
+
+The \verb|import| statement scrapes a file for sequences of tokens that match
+the following forms. Ragel treats these forms as state machine definitions.
+
+\noindent\hspace*{24pt}\verb|name '=' number|\\
+\noindent\hspace*{24pt}\verb|name '=' lit_string|\\
+\noindent\hspace*{24pt}\verb|'define' name number|\\
+\noindent\hspace*{24pt}\verb|'define' name lit_string|
+\vspace{12pt}
+
+If the input file is a Ragel program then tokens inside any Ragel
+specifications are ignored. See Section \ref{export} for a description of
+exporting machine definitions.
+
+Ragel searches for imported files from the location of the current file.
+Additional directories can be added to the search path using the \verb|-I|
+option.
+
+\section{Lexical Analysis of a Ragel Block}
+\label{lexing}
+
+Within a machine specification the following lexical rules apply to the input.
+
+\begin{itemize}
+
+\item The \verb|#| symbol begins a comment that terminates at the next newline.
+
+\item The symbols \verb|""|, \verb|''|, \verb|//|, \verb|[]| behave as the
+delimiters of literal strings. Within them, the following escape sequences
+are interpreted:
+
+\verb| \0 \a \b \t \n \v \f \r|
+
+A backslash at the end of a line joins the following line onto the current. A
+backslash preceding any other character removes special meaning. This applies
+to terminating characters and to special characters in regular expression
+literals. As an exception, regular expression literals do not support escape
+sequences as the operands of a range within a list. See the bullet on regular
+expressions in Section \ref{basic}.
+
+\item The symbols \verb|{}| delimit a block of host language code that will be
+embedded into the machine as an action. Within the block of host language
+code, basic lexical analysis of comments and strings is done in order to
+correctly find the closing brace of the block. With the exception of FSM
+commands embedded in code blocks, the entire block is preserved as is for
+identical reproduction in the output code.
+
+\item The pattern \verb|[+-]?[0-9]+| denotes an integer in decimal format.
+Integers used for specifying machines may be negative only if the alphabet type
+is signed. Integers used for specifying priorities may be positive or negative.
+
+\item The pattern \verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal
+format.
+
+\item The keywords are \verb|access|, \verb|action|, \verb|alphtype|,
+\verb|getkey|, \verb|write|, \verb|machine| and \verb|include|.
+
+\item The pattern \verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier.
+
+
+\item Any amount of whitespace may separate tokens.
+
+\end{itemize}
+
+
+\section{Basic Machines}
+\label{basic}
+
+The basic machines are the base operands of regular language expressions. They
+are the smallest unit to which machine construction and manipulation operators
+can be applied.
+
+\begin{itemize}
+
+\item \verb|'hello'| -- Concatenation Literal. Produces a machine that matches
+the sequence of characters in the quoted string. If there are 5 characters
+there will be 6 states chained together with the characters in the string. See
+Section \ref{lexing} for information on valid escape sequences.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmconcat}
+\end{center}
+\graphspace
+
+It is possible
+to make a concatenation literal case-insensitive by appending an \verb|i| to
+the string, for example \verb|'cmd'i|.
+
+\item \verb|"hello"| -- Identical to the single quoted version.
+
+\item \verb|[hello]| -- Or Expression. Produces a union of characters. There
+will be two states with a transition for each unique character between the two states.
+The \verb|[]| delimiters behave like the quotes of a literal string. For example,
+\verb|[ \t]| means tab or space. The \verb|or| expression supports character ranges
+with the \verb|-| symbol as a separator. The meaning of the union can be negated
+using an initial \verb|^| character as in standard regular expressions.
+See Section \ref{lexing} for information on valid escape sequences
+in \verb|or| expressions.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmor}
+\end{center}
+\graphspace
+
+\item \verb|''|, \verb|""|, and \verb|[]| -- Zero Length Machine. Produces a machine
+that matches the zero length string. Zero length machines have one state that is both
+a start state and a final state.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmnull}
+\end{center}
+\graphspace
+
+% FIXME: More on the range of values here.
+\item \verb|42| -- Numerical Literal. Produces a two state machine with one
+transition on the given number. The number may be in decimal or hexadecimal
+format and should be in the range allowed by the alphabet type. The minimum and
+maximum values permitted are defined by the host machine that Ragel is compiled
+on. For example, numbers in a \verb|short| alphabet on an i386 machine should
+be in the range \verb|-32768| to \verb|32767|.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmnum}
+\end{center}
+\graphspace
+
+\item \verb|/simple_regex/| -- Regular Expression. Regular expressions are
+parsed as a series of expressions that are concatenated together. Each
+concatenated expression
+may be a literal character, the ``any'' character specified by the \verb|.|
+symbol, or a union of characters specified by the \verb|[]| delimiters. If the
+first character of a union is \verb|^| then it matches any character not in the
+list. Within a union, a range of characters can be given by separating the first
+and last characters of the range with the \verb|-| symbol. Each
+concatenated machine may have repetition specified by following it with the
+\verb|*| symbol. The standard escape sequences described in Section
+\ref{lexing} are supported everywhere in regular expressions except as the
+operands of a range within in a list. This notation also supports the \verb|i|
+trailing option. Use it to produce case-insensitive machines, as in \verb|/GET/i|.
+
+Ragel does not support very complex regular expressions because the desired
+results can always be achieved using the more general machine construction
+operators listed in Section \ref{machconst}. The following diagram shows the
+result of compiling \verb|/ab*[c-z].*[123]/|. \verb|DEF| represents the default
+transition, which is taken if no other transition can be taken.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmregex}
+\end{center}
+\graphspace
+
+\item \verb|'a' .. 'z'| -- Range. Produces a machine that matches any
+characters in the specified range. Allowable upper and lower bounds of the
+range are concatenation literals of length one and numerical literals. For
+example, \verb|0x10..0x20|, \verb|0..63|, and \verb|'a'..'z'| are valid ranges.
+The bounds should be in the range allowed by the alphabet type.
+
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{bmrange}
+\end{center}
+\graphspace
+
+\item \verb|variable_name| -- Lookup the machine definition assigned to the
+variable name given and use an instance of it. See Section \ref{definition} for
+an important note on what it means to reference a variable name.
+
+\item \verb|builtin_machine| -- There are several built-in machines available
+for use. They are all two state machines for the purpose of matching common
+classes of characters. They are:
+
+\begin{itemize}
+
+\item \verb|any | -- Any character in the alphabet.
+
+\item \verb|ascii | -- Ascii characters. \verb|0..127|
+
+\item \verb|extend| -- Ascii extended characters. This is the range
+\verb|-128..127| for signed alphabets and the range \verb|0..255| for unsigned
+alphabets.
+
+\item \verb|alpha | -- Alphabetic characters. \verb|[A-Za-z]|
+
+\item \verb|digit | -- Digits. \verb|[0-9]|
+
+\item \verb|alnum | -- Alpha numerics. \verb|[0-9A-Za-z]|
+
+\item \verb|lower | -- Lowercase characters. \verb|[a-z]|
+
+\item \verb|upper | -- Uppercase characters. \verb|[A-Z]|
+
+\item \verb|xdigit| -- Hexadecimal digits. \verb|[0-9A-Fa-f]|
+
+\item \verb|cntrl | -- Control characters. \verb|0..31|
+
+\item \verb|graph | -- Graphical characters. \verb|[!-~]|
+
+\item \verb|print | -- Printable characters. \verb|[ -~]|
+
+\item \verb|punct | -- Punctuation. Graphical characters that are not alphanumerics.
+\verb|[!-/:-@[-`{-~]|
+
+\item \verb|space | -- Whitespace. \verb|[\t\v\f\n\r ]|
+
+\item \verb|zlen | -- Zero length string. \verb|""|
+
+\item \verb|empty | -- Empty set. Matches nothing. \verb|^any|
+
+\end{itemize}
+\end{itemize}
+
+\section{Operator Precedence}
+The following table shows operator precedence from lowest to highest. Operators
+in the same precedence group are evaluated from left to right.
+
+\begin{tabular}{|c|c|c|}
+\hline
+1&\verb| , |&Join\\
+\hline
+2&\verb/ | & - --/&Union, Intersection and Subtraction\\
+\hline
+3&\verb| . <: :> :>> |&Concatenation\\
+\hline
+4&\verb| : |&Label\\
+\hline
+5&\verb| -> |&Epsilon Transition\\
+\hline
+6&\verb| > @ $ % |&Transitions Actions and Priorities\\
+\hline
+6&\verb| >/ $/ %/ </ @/ <>/ |&EOF Actions\\
+\hline
+6&\verb| >! $! %! <! @! <>! |&Global Error Actions\\
+\hline
+6&\verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions\\
+\hline
+6&\verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions\\
+\hline
+6&\verb| >* $* %* <* @* <>* |&From-State Action\\
+\hline
+7&\verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition\\
+\hline
+8&\verb| ! ^ |&Negation and Character-Level Negation\\
+\hline
+9&\verb| ( <expr> ) |&Grouping\\
+\hline
+\end{tabular}
+
+\section{Regular Language Operators}
+\label{machconst}
+
+When using Ragel it is helpful to have a sense of how it constructs machines.
+The determinization process can produce results that seem unusual to someone
+not familiar with the NFA to DFA conversion algorithm. In this section we
+describe Ragel's state machine operators. Though the operators are defined
+using epsilon transitions, it should be noted that this is for discussion only.
+The epsilon transitions described in this section do not persist, but are
+immediately removed by the determinization process which is executed at every
+operation. Ragel does not make use of any nondeterministic intermediate state
+machines.
+
+To create an epsilon transition between two states \verb|x| and \verb|y| is to
+copy all of the properties of \verb|y| into \verb|x|. This involves drawing in
+all of \verb|y|'s to-state actions, EOF actions, etc., in addition to its
+transitions. If \verb|x| and \verb|y| both have a transition out on the same
+character, then the transitions must be combined. During transition
+combination a new transition is made that goes to a new state that is the
+combination of both target states. The new combination state is created using
+the same epsilon transition method. The new state has an epsilon transition
+drawn to all the states that compose it. Since the creation of new epsilon
+transitions may be triggered every time an epsilon transition is drawn, the
+process of drawing epsilon transitions is repeated until there are no more
+epsilon transitions to be made.
+
+A very common error that is made when using Ragel is to make machines that do
+too much. That is, to create machines that have unintentional
+nondetermistic properties. This usually results from being unaware of the common strings
+between machines that are combined together using the regular language
+operators. This can involve never leaving a machine, causing its actions to be
+propagated through all the following states. Or it can involve an alternation
+where both branches are unintentionally taken simultaneously.
+
+This problem forces one to think hard about the language that needs to be
+matched. To guard against this kind of problem one must ensure that the machine
+specification is divided up using boundaries that do not allow ambiguities from
+one portion of the machine to the next. See Chapter
+\ref{controlling-nondeterminism} for more on this problem and how to solve it.
+
+The Graphviz tool is an immense help when debugging improperly compiled
+machines or otherwise learning how to use Ragel. Graphviz Dot files can be
+generated from Ragel programs using the \verb|-V| option. See Section
+\ref{visualization} for more information.
+
+
+\subsection{Union}
+
+\verb/expr | expr/
+
+The union operation produces a machine that matches any string in machine one
+or machine two. The operation first creates a new start state. Epsilon
+transitions are drawn from the new start state to the start states of both
+input machines. The resulting machine has a final state set equivalent to the
+union of the final state sets of both input machines. In this operation, there
+is the opportunity for nondeterminism among both branches. If there are
+strings, or prefixes of strings that are matched by both machines then the new
+machine will follow both parts of the alternation at once. The union operation is
+shown below.
+
+\graphspace
+\begin{center}
+\includegraphics[scale=1.0]{opor}
+\end{center}
+\graphspace
+
+The following example demonstrates the union of three machines representing
+common tokens.
+
+% GENERATE: exor
+% OPT: -p
+% %%{
+% machine exor;
+\begin{inline_code}
+\begin{verbatim}
+# Hex digits, decimal digits, or identifiers
+main := '0x' xdigit+ | digit+ | alpha alnum*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exor}
+\end{center}
+\graphspace
+
+\subsection{Intersection}
+
+\verb|expr & expr|
+
+Intersection produces a machine that matches any
+string that is in both machine one and machine two. To achieve intersection, a
+union is performed on the two machines. After the result has been made
+deterministic, any final state that is not a combination of final states from
+both machines has its final state status revoked. To complete the operation,
+paths that do not lead to a final state are pruned from the machine. Therefore,
+if there are any such paths in either of the expressions they will be removed
+by the intersection operator. Intersection can be used to require that two
+independent patterns be simultaneously satisfied as in the following example.
+
+% GENERATE: exinter
+% OPT: -p
+% %%{
+% machine exinter;
+\begin{inline_code}
+\begin{verbatim}
+# Match lines four characters wide that contain
+# words separated by whitespace.
+main :=
+ /[^\n][^\n][^\n][^\n]\n/* &
+ (/[a-z][a-z]*/ | [ \n])**;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exinter}
+\end{center}
+\graphspace
+
+\subsection{Difference}
+
+\verb|expr - expr|
+
+The difference operation produces a machine that matches
+strings that are in machine one but are not in machine two. To achieve subtraction,
+a union is performed on the two machines. After the result has been made
+deterministic, any final state that came from machine two or is a combination
+of states involving a final state from machine two has its final state status
+revoked. As with intersection, the operation is completed by pruning any path
+that does not lead to a final state. The following example demonstrates the
+use of subtraction to exclude specific cases from a set.
+
+% GENERATE: exsubtr
+% OPT: -p
+% %%{
+% machine exsubtr;
+\begin{inline_code}
+\begin{verbatim}
+# Subtract keywords from identifiers.
+main := /[a-z][a-z]*/ - ( 'for' | 'int' );
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exsubtr}
+\end{center}
+\graphspace
+
+\subsection{Strong Difference}
+\label{strong_difference}
+
+\verb|expr -- expr|
+
+Strong difference produces a machine that matches any string of the first
+machine that does not have any string of the second machine as a substring. In
+the following example, strong subtraction is used to excluded \verb|CRLF| from
+a sequence. In the corresponding visualization, the label \verb|DEF| is short
+for default. The default transition is taken if no other transition can be
+taken.
+
+% GENERATE: exstrongsubtr
+% OPT: -p
+% %%{
+% machine exstrongsubtr;
+\begin{inline_code}
+\begin{verbatim}
+crlf = '\r\n';
+main := [a-z]+ ':' ( any* -- crlf ) crlf;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exstrongsubtr}
+\end{center}
+\graphspace
+
+This operator is equivalent to the following.
+
+\begin{verbatim}
+expr - ( any* expr any* )
+\end{verbatim}
+\verbspace
+
+\subsection{Concatenation}
+
+\verb|expr . expr|
+
+Concatenation produces a machine that matches all the strings in machine one followed by all
+the strings in machine two. Concatenation draws epsilon transitions from the
+final states of the first machine to the start state of the second machine. The
+final states of the first machine lose their final state status, unless the
+start state of the second machine is final as well.
+Concatenation is the default operator. Two machines next to each other with no
+operator between them results in concatenation.
+
+\graphspace
+\begin{center}
+\includegraphics[scale=1.0]{opconcat}
+\end{center}
+\graphspace
+
+The opportunity for nondeterministic behaviour results from the possibility of
+the final states of the first machine accepting a string that is also accepted
+by the start state of the second machine.
+The most common scenario in which this happens is the
+concatenation of a machine that repeats some pattern with a machine that gives
+a terminating string, but the repetition machine does not exclude the
+terminating string. The example in Section \ref{strong_difference}
+guards against this. Another example is the expression \verb|("'" any* "'")|.
+When executed the thread of control will
+never leave the \verb|any*| machine. This is a problem especially if actions
+are embedded to process the characters of the \verb|any*| component.
+
+In the following example, the first machine is always active due to the
+nondeterministic nature of concatenation. This particular nondeterminism is intended
+however because we wish to permit EOF strings before the end of the input.
+
+% GENERATE: exconcat
+% OPT: -p
+% %%{
+% machine exconcat;
+\begin{inline_code}
+\begin{verbatim}
+# Require an eof marker on the last line.
+main := /[^\n]*\n/* . 'EOF\n';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exconcat}
+\end{center}
+\graphspace
+
+There is a language
+ambiguity involving concatenation and subtraction. Because concatenation is the
+default operator for two
+adjacent machines there is an ambiguity between subtraction of
+a positive numerical literal and concatenation of a negative numerical literal.
+For example, \verb|(x-7)| could be interpreted as \verb|(x . -7)| or
+\verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence
+over concatenation of a negative literal. We adhere to the rule that the default
+concatenation operator takes effect only when there are no other operators between
+two machines. Beware of writing machines such as \verb|(any -1)| when what is
+desired is a concatenation of \verb|any| and \verb|-1|. Instead write
+\verb|(any . -1)| or \verb|(any (-1))|. If in doubt of the meaning of your program do not
+rely on the default concatenation operator; always use the \verb|.| symbol.
+
+
+\subsection{Kleene Star}
+
+\verb|expr*|
+
+The machine resulting from the Kleene Star operator will match zero or more
+repetitions of the machine it is applied to.
+It creates a new start state and an additional final
+state. Epsilon transitions are drawn between the new start state and the old start
+state, between the new start state and the new final state, and
+between the final states of the machine and the new start state. After the
+machine is made deterministic the effect is of the final states getting all the
+transitions of the start state.
+
+\graphspace
+\begin{center}
+\includegraphics[scale=1.0]{opstar}
+\end{center}
+\graphspace
+
+The possibility for nondeterministic behaviour arises if the final states have
+transitions on any of the same characters as the start state. This is common
+when applying kleene star to an alternation of tokens. Like the other problems
+arising from nondeterministic behavior, this is discussed in more detail in Chapter
+\ref{controlling-nondeterminism}. This particular problem can also be solved
+by using the longest-match construction discussed in Section
+\ref{generating-scanners} on scanners.
+
+In this
+example, there is no nondeterminism introduced by the exterior kleene star due to
+the newline at the end of the regular expression. Without the newline the
+exterior kleene star would be redundant and there would be ambiguity between
+repeating the inner range of the regular expression and the entire regular
+expression. Though it would not cause a problem in this case, unnecessary
+nondeterminism in the kleene star operator often causes undesired results for
+new Ragel users and must be guarded against.
+
+% GENERATE: exstar
+% OPT: -p
+% %%{
+% machine exstar;
+\begin{inline_code}
+\begin{verbatim}
+# Match any number of lines with only lowercase letters.
+main := /[a-z]*\n/*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exstar}
+\end{center}
+\graphspace
+
+\subsection{One Or More Repetition}
+
+\verb|expr+|
+
+This operator produces the concatenation of the machine with the kleene star of
+itself. The result will match one or more repetitions of the machine. The plus
+operator is equivalent to \verb|(expr . expr*)|.
+
+% GENERATE: explus
+% OPT: -p
+% %%{
+% machine explus;
+\begin{inline_code}
+\begin{verbatim}
+# Match alpha-numeric words.
+main := alnum+;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{explus}
+\end{center}
+\graphspace
+
+\subsection{Optional}
+
+\verb|expr?|
+
+The {\em optional} operator produces a machine that accepts the machine
+given or the zero length string. The optional operator is equivalent to
+\verb/(expr | '' )/. In the following example the optional operator is used to
+possibly extend a token.
+
+% GENERATE: exoption
+% OPT: -p
+% %%{
+% machine exoption;
+\begin{inline_code}
+\begin{verbatim}
+# Match integers or floats.
+main := digit+ ('.' digit+)?;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exoption}
+\end{center}
+\graphspace
+
+\subsection{Repetition}
+
+\noindent\hspace*{24pt}\verb|expr {n}| -- Exactly N copies of expr.\\
+\noindent\hspace*{24pt}\verb|expr {,n}| -- Zero to N copies of expr.\\
+\noindent\hspace*{24pt}\verb|expr {n,}| -- N or more copies of expr.\\
+\noindent\hspace*{24pt}\verb|expr {n,m}| -- N to M copies of expr.
+\vspace{12pt}
+
+\subsection{Negation}
+
+\verb|!expr|
+
+Negation produces a machine that matches any string not matched by the given
+machine. Negation is equivalent to \verb|(any* - expr)|.
+
+% GENERATE: exnegate
+% OPT: -p
+% %%{
+% machine exnegate;
+\begin{inline_code}
+\begin{verbatim}
+# Accept anything but a string beginning with a digit.
+main := ! ( digit any* );
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exnegate}
+\end{center}
+\graphspace
+
+\subsection{Character-Level Negation}
+
+\verb|^expr|
+
+Character-level negation produces a machine that matches any single character
+not matched by the given machine. Character-Level Negation is equivalent to
+\verb|(any - expr)|. It must be applied only to machines that match strings of
+length one.
+
+\section{State Machine Minimization}
+
+State machine minimization is the process of finding the minimal equivalent FSM accepting
+the language. Minimization reduces the number of states in machines
+by merging equivalent states. It does not change the behaviour of the machine
+in any way. It will cause some states to be merged into one because they are
+functionally equivalent. State minimization is on by default. It can be turned
+off with the \verb|-n| option.
+
+The algorithm implemented is similar to Hopcroft's state minimization
+algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in
+memory, whereas Ragel supports arbitrary integer alphabets that cannot be
+listed in memory. Though exact analysis is very difficult, Ragel minimization
+runs close to O(n * log(n)) and requires O(n) temporary storage where
+$n$ is the number of states.
+
+\section{Visualization}
+\label{visualization}
+
+%In many cases, practical
+%parsing programs will be too large to completely visualize with Graphviz. The
+%proper approach is to reduce the language to the smallest subset possible that
+%still exhibits the characteristics that one wishes to learn about or to fix.
+%This can be done without modifying the source code using the \verb|-M| and
+%\verb|-S| options. If a machine cannot be easily reduced,
+%embeddings of unique actions can be very useful for tracing a
+%particular component of a larger machine specification, since action names are
+%written out on transition labels.
+
+Ragel is able to emit compiled state machines in Graphviz's Dot file format.
+This is done using the \verb|-V| option.
+Graphviz support allows users to perform
+incremental visualization of their parsers. User actions are displayed on
+transition labels of the graph.
+
+If the final graph is too large to be
+meaningful, or even drawn, the user is able to inspect portions of the parser
+by naming particular regular expression definitions with the \verb|-S| and
+\verb|-M| options to the \verb|ragel| program. Use of Graphviz greatly
+improves the Ragel programming experience. It allows users to learn Ragel by
+experimentation and also to track down bugs caused by unintended
+nondeterminism.
+
+Ragel has another option to help debugging. The \verb|-x| option causes Ragel
+to emit the compiled machine in an XML format.
+
+\chapter{User Actions}
+
+Ragel permits the user to embed actions into the transitions of a regular
+expression's corresponding state machine. These actions are executed when the
+generated code moves over a transition. Like the regular expression operators,
+the action embedding operators are fully compositional. They take a state
+machine and an action as input, embed the action and yield a new state machine
+that can be used in the construction of other machines. Due to the
+compositional nature of embeddings, the user has complete freedom in the
+placement of actions.
+
+A machine's transitions are categorized into four classes. The action embedding
+operators access the transitions defined by these classes. The {\em entering
+transition} operator \verb|>| isolates the start state, then embeds an action
+into all transitions leaving it. The {\em finishing transition} operator
+\verb|@| embeds an action into all transitions going into a final state. The
+{\em all transition} operator \verb|$| embeds an action into all transitions of
+an expression. The {\em leaving transition} operator \verb|%| provides access
+to the yet-unmade transitions moving out of the machine via the final states.
+
+\section{Embedding Actions}
+
+\begin{verbatim}
+action ActionName {
+ /* Code an action here. */
+ count += 1;
+}
+\end{verbatim}
+\verbspace
+
+The action statement defines a block of code that can be embedded into an FSM.
+Action names can be referenced by the action embedding operators in
+expressions. Though actions need not be named in this way (literal blocks
+of code can be embedded directly when building machines), defining reusable
+blocks of code whenever possible is good practice because it potentially increases the
+degree to which the machine can be minimized.
+
+Within an action some Ragel expressions and statements are parsed and
+translated. These allow the user to interact with the machine from action code.
+See Section \ref{vals} for a complete list of statements and values available
+in code blocks.
+
+\subsection{Entering Action}
+
+\verb|expr > action|
+
+The entering action operator embeds an action into all transitions
+that enter into the machine from the start state. If the start state is final,
+then the action is also embedded into the start state as a leaving action. This
+means that if a machine accepts the zero-length string and control passes
+through the start state then the entering action is executed. Note
+that this can happen on both a following character and on the EOF event.
+
+In some machines the start state has transtions coming in from within the
+machine. In these cases the start state is first isolated from the rest of the
+machine ensuring that the entering actions are exected once only.
+
+% GENERATE: exstact
+% OPT: -p
+% %%{
+% machine exstact;
+\begin{inline_code}
+\begin{verbatim}
+# Execute A at the beginning of a string of alpha.
+action A {}
+main := ( lower* >A ) . ' ';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exstact}
+\end{center}
+\graphspace
+
+\subsection{Finishing Action}
+
+\verb|expr @ action|
+
+The finishing action operator embeds an action into any transitions that move
+the machine into a final state. Further input may move the machine out of the
+final state, but keep it in the machine. Therefore finishing actions may be
+executed more than once if a machine has any internal transitions out of a
+final state. In the following example the final state has no transitions out
+and the finishing action is executed only once.
+
+% GENERATE: exdoneact
+% OPT: -p
+% %%{
+% machine exdoneact;
+% action A {}
+\begin{inline_code}
+\begin{verbatim}
+# Execute A when the trailing space is seen.
+main := ( lower* ' ' ) @A;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exdoneact}
+\end{center}
+\graphspace
+
+\subsection{All Transition Action}
+
+\verb|expr $ action|
+
+The all transition operator embeds an action into all transitions of a machine.
+The action is executed whenever a transition of the machine is taken. In the
+following example, A is executed on every character matched.
+
+% GENERATE: exallact
+% OPT: -p
+% %%{
+% machine exallact;
+% action A {}
+\begin{inline_code}
+\begin{verbatim}
+# Execute A on any characters of the machine.
+main := ( 'm1' | 'm2' ) $A;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exallact}
+\end{center}
+\graphspace
+
+\subsection{Leaving Actions}
+\label{out-actions}
+
+\verb|expr % action|
+
+The leaving action operator queues an action for embedding into the transitions
+that go out of a machine via a final state. The action is first stored in
+the machine's final states and is later transferred to any transitions that are
+made going out of the machine by a kleene star or concatenation operation.
+
+If a final state of the machine is still final when compilation is complete
+then the leaving action is also embedded as an EOF action. Therefore, leaving
+the machine is defined as either leaving on a character or as state machine
+acceptance.
+
+This operator allows one to associate an action with the termination of a
+sequence, without being concerned about what particular character terminates
+the sequence. In the following example, A is executed when leaving the alpha
+machine on the newline character.
+
+% GENERATE: exoutact1
+% OPT: -p
+% %%{
+% machine exoutact1;
+% action A {}
+\begin{inline_code}
+\begin{verbatim}
+# Match a word followed by a newline. Execute A when
+# finishing the word.
+main := ( lower+ %A ) . '\n';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exoutact1}
+\end{center}
+\graphspace
+
+In the following example, the \verb|term_word| action could be used to register
+the appearance of a word and to clear the buffer that the \verb|lower| action used
+to store the text of it.
+
+% GENERATE: exoutact2
+% OPT: -p
+% %%{
+% machine exoutact2;
+% action lower {}
+% action space {}
+% action term_word {}
+% action newline {}
+\begin{inline_code}
+\begin{verbatim}
+word = ( [a-z] @lower )+ %term_word;
+main := word ( ' ' @space word )* '\n' @newline;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exoutact2}
+\end{center}
+\graphspace
+
+In this final example of the action embedding operators, A is executed upon entering
+the alpha machine, B is executed on all transitions of the
+alpha machine, C is executed when the alpha machine is exited by moving into the
+newline machine and N is executed when the newline machine moves into a final
+state.
+
+% GENERATE: exaction
+% OPT: -p
+% %%{
+% machine exaction;
+% action A {}
+% action B {}
+% action C {}
+% action N {}
+\begin{inline_code}
+\begin{verbatim}
+# Execute A on starting the alpha machine, B on every transition
+# moving through it and C upon finishing. Execute N on the newline.
+main := ( lower* >A $B %C ) . '\n' @N;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{exaction}
+\end{center}
+\graphspace
+
+
+\section{State Action Embedding Operators}
+
+The state embedding operators allow one to embed actions into states. Like the
+transition embedding operators, there are several different classes of states
+that the operators access. The meanings of the symbols are similar to the
+meanings of the symbols used for the transition embedding operators. The design
+of the state selections was driven by a need to cover the states of an
+expression with exactly one error action.
+
+Unlike the transition embedding operators, the state embedding operators are
+also distinguished by the different kinds of events that embedded actions can
+be associated with. Therefore the state embedding operators have two
+components. The first, which is the first one or two characters, specifies the
+class of states that the action will be embedded into. The second component
+specifies the type of event the action will be executed on. The symbols of the
+second component also have equivalent kewords.
+
+\begin{multicols}{2}
+The different classes of states are:
+
+\noindent\hspace*{24pt}\verb|> | -- the start state\\
+\noindent\hspace*{24pt}\verb|< | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$ | -- all states\\
+\noindent\hspace*{24pt}\verb|% | -- final states\\
+\noindent\hspace*{24pt}\verb|@ | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>| -- any except start and final (middle)
+\vspace{12pt}
+
+\columnbreak
+
+The different kinds of embeddings are:
+
+\noindent\hspace*{24pt}\verb|~| -- to-state actions (\verb|to|)\\
+\noindent\hspace*{24pt}\verb|*| -- from-state actions (\verb|from|)\\
+\noindent\hspace*{24pt}\verb|/| -- EOF actions (\verb|eof|)\\
+\noindent\hspace*{24pt}\verb|!| -- error actions (\verb|err|)\\
+\noindent\hspace*{24pt}\verb|^| -- local error actions (\verb|lerr|)
+\vspace{12pt}
+
+\end{multicols}
+
+\subsection{To-State and From-State Actions}
+
+\subsubsection{To-State Actions}
+
+\noindent\hspace*{24pt}\verb|>~action >to(name) >to{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|<~action <to(name) <to{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$~action $to(name) $to{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%~action %to(name) %to{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@~action @to(name) @to{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+
+To-state actions are executed whenever the state machine moves into the
+specified state, either by a natural movement over a transition or by an
+action-based transfer of control such as \verb|fgoto|. They are executed after the
+in-transition's actions but before the current character is advanced and
+tested against the end of the input block. To-state embeddings stay with the
+state. They are irrespective of the state's current set of transitions and any
+future transitions that may be added in or out of the state.
+
+Note that the setting of the current state variable \verb|cs| outside of the
+execute code is not considered by Ragel as moving into a state and consequently
+the to-state actions of the new current state are not executed. This includes
+the initialization of the current state when the machine begins. This is
+because the entry point into the machine execution code is after the execution
+of to-state actions.
+
+\subsubsection{From-State Actions}
+
+\noindent\hspace*{24pt}\verb|>*action >from(name) >from{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|<*action <from(name) <from{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$*action $from(name) $from{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%*action %from(name) %from{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@*action @from(name) @from{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+From-state actions are executed whenever the state machine takes a transition from a
+state, either to itself or to some other state. These actions are executed
+immediately after the current character is tested against the input block end
+marker and before the transition to take is sought based on the current
+character. From-state actions are therefore executed even if a transition
+cannot be found and the machine moves into the error state. Like to-state
+embeddings, from-state embeddings stay with the state.
+
+\subsection{EOF Actions}
+
+\noindent\hspace*{24pt}\verb|>/action >eof(name) >eof{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|</action <eof(name) <eof{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$/action $eof(name) $eof{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%/action %eof(name) %eof{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@/action @eof(name) @eof{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+The EOF action embedding operators enable the user to embed actions that are
+executed at the end of the input stream. EOF actions are stored in states and
+generated in the \verb|write exec| block. They are run when \verb|p == pe == eof|
+as the execute block is finishing. EOF actions are free to adjust \verb|p| and
+jump to another part of the machine to restart execution.
+
+\subsection{Handling Errors}
+
+In many applications it is useful to be able to react to parsing errors. The
+user may wish to print an error message that depends on the context. It
+may also be desirable to consume input in an attempt to return the input stream
+to some known state and resume parsing. To support error handling and recovery,
+Ragel provides error action embedding operators. There are two kinds of error
+actions: global error actions and local error actions.
+Error actions can be used to simply report errors, or by jumping to a machine
+instantiation that consumes input, can attempt to recover from errors.
+
+\subsubsection{Global Error Actions}
+
+\noindent\hspace*{24pt}\verb|>!action >err(name) >err{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|<!action <err(name) <err{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$!action $err(name) $err{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%!action %err(name) %err{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@!action @err(name) @err{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+Global error actions are stored in the states they are embedded into until
+compilation is complete. They are then transferred to the transitions that move
+into the error state. These transitions are taken on all input characters that
+are not already covered by the state's transitions. If a state with an error
+action is not final when compilation is complete, then the action is also
+embedded as an EOF action.
+
+Error actions can be used to recover from errors by jumping back into the
+machine with \verb|fgoto| and optionally altering \verb|p|.
+
+\subsubsection{Local Error Actions}
+
+\noindent\hspace*{24pt}\verb|>^action >lerr(name) >lerr{...} | -- the start state\\
+\noindent\hspace*{24pt}\verb|<^action <lerr(name) <lerr{...} | -- any state except the start state\\
+\noindent\hspace*{24pt}\verb|$^action $lerr(name) $lerr{...} | -- all states\\
+\noindent\hspace*{24pt}\verb|%^action %lerr(name) %lerr{...} | -- final states\\
+\noindent\hspace*{24pt}\verb|@^action @lerr(name) @lerr{...} | -- any state except final states\\
+\noindent\hspace*{24pt}\verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle)
+\vspace{12pt}
+
+Like global error actions, local error actions are also stored in the states
+they are embedded into until a transfer point. The transfer point is different
+however. Each local error action embedding is associated with a name. When a
+machine definition has been fully constructed, all local error action
+embeddings associated with the same name as the machine definition are
+transferred to the error transitions. At this time they are also embedded as
+EOF actions in the case of non-final states.
+
+Local error actions can be used to specify an action to take when a particular
+section of a larger state machine fails to match. A particular machine
+definition's ``thread'' may die and the local error actions executed, however
+the machine as a whole may continue to match input.
+
+There are two forms of local error action embeddings. In the first form the
+name defaults to the current machine. In the second form the machine name can
+be specified. This is useful when it is more convenient to specify the local
+error action in a sub-definition that is used to construct the machine
+definition that the local error action is associated with. To embed local
+error actions and
+explicitly state the machine definition on which the transfer is to happen use
+\verb|(name, action)| as the action.
+
+\subsubsection{Example}
+
+The following example uses error actions to report an error and jump to a
+machine that consumes the remainder of the line when parsing fails. After
+consuming the line, the error recovery machine returns to the main loop.
+
+% GENERATE: erract
+% %%{
+% machine erract;
+% ws = ' ';
+% address = 'foo AT bar..com';
+% date = 'Monday May 12';
+\begin{inline_code}
+\begin{verbatim}
+action cmd_err {
+ printf( "command error\n" );
+ fhold; fgoto line;
+}
+action from_err {
+ printf( "from error\n" );
+ fhold; fgoto line;
+}
+action to_err {
+ printf( "to error\n" );
+ fhold; fgoto line;
+}
+
+line := [^\n]* '\n' @{ fgoto main; };
+
+main := (
+ (
+ 'from' @err(cmd_err)
+ ( ws+ address ws+ date '\n' ) $err(from_err) |
+ 'to' @err(cmd_err)
+ ( ws+ address '\n' ) $err(to_err)
+ )
+)*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% %% write data;
+% void f()
+% {
+% %% write init;
+% %% write exec;
+% }
+% END GENERATE
+
+
+
+\section{Action Ordering and Duplicates}
+
+When combining expressions that have embedded actions it is often the case that
+a number of actions must be executed on a single input character. For example,
+following a concatenation the leaving action of the left expression and the
+entering action of the right expression will be embedded into one transition.
+This requires a method of ordering actions that is intuitive and
+predictable for the user, and repeatable for the compiler.
+
+We associate with the embedding of each action a unique timestamp that is
+used to order actions that appear together on a single transition in the final
+state machine. To accomplish this we recursively traverse the parse tree of
+regular expressions and assign timestamps to action embeddings. References to
+machine definitions are followed in the traversal. When we visit a
+parse tree node we assign timestamps to all {\em entering} action embeddings,
+recurse on the parse tree, then assign timestamps to the remaining {\em all},
+{\em finishing}, and {\em leaving} embeddings in the order in which they
+appear.
+
+By default Ragel does not permit a single action to appear multiple times in an action
+list. When the final machine has been created, actions that appear more than
+once in a single transition, to-state, from-state or EOF action list have their
+duplicates removed.
+The first appearance of the action is preserved. This is useful in a number of
+scenarios. First, it allows us to union machines with common prefixes without
+worrying about the action embeddings in the prefix being duplicated. Second, it
+prevents leaving actions from being transferred multiple times. This can
+happen when a machine is repeated, then followed with another machine that
+begins with a common character. For example:
+
+\begin{verbatim}
+word = [a-z]+ %act;
+main := word ( '\n' word )* '\n\n';
+\end{verbatim}
+\verbspace
+
+Note that Ragel does not compare action bodies to determine if they have
+identical program text. It simply checks for duplicates using each action
+block's unique location in the program.
+
+The removal of duplicates can be turned off using the \verb|-d| option.
+
+\section{Values and Statements Available in Code Blocks}
+\label{vals}
+
+The following values are available in code blocks:
+
+\begin{itemize}
+\item \verb|fpc| -- A pointer to the current character. This is equivalent to
+accessing the \verb|p| variable.
+
+\item \verb|fc| -- The current character. This is equivalent to the expression \verb|(*p)|.
+
+\item \verb|fcurs| -- An integer value representing the current state. This
+value should only be read from. To move to a different place in the machine
+from action code use the \verb|fgoto|, \verb|fnext| or \verb|fcall| statements.
+Outside of the machine execution code the \verb|cs| variable may be modified.
+
+\item \verb|ftargs| -- An integer value representing the target state. This
+value should only be read from. Again, \verb|fgoto|, \verb|fnext| and
+\verb|fcall| can be used to move to a specific entry point.
+
+\item \verb|fentry(<label>)| -- Retrieve an integer value representing the
+entry point \verb|label|. The integer value returned will be a compile time
+constant. This number is suitable for later use in control flow transfer
+statements that take an expression. This value should not be compared against
+the current state because any given label can have multiple states representing
+it. The value returned by \verb|fentry| can be any one of the multiple states that
+it represents.
+\end{itemize}
+
+The following statements are available in code blocks:
+
+\begin{itemize}
+
+\item \verb|fhold;| -- Do not advance over the current character. If processing
+data in multiple buffer blocks, the \verb|fhold| statement should only be used
+once in the set of actions executed on a character. Multiple calls may result
+in backing up over the beginning of the buffer block. The \verb|fhold|
+statement does not imply any transfer of control. It is equivalent to the
+\verb|p--;| statement.
+
+\item \verb|fexec <expr>;| -- Set the next character to process. This can be
+used to backtrack to previous input or advance ahead.
+Unlike \verb|fhold|, which can be used
+anywhere, \verb|fexec| requires the user to ensure that the target of the
+backtrack is in the current buffer block or is known to be somewhere ahead of
+it. The machine will continue iterating forward until \verb|pe| is arrived at,
+\verb|fbreak| is called or the machine moves into the error state. In actions
+embedded into transitions, the \verb|fexec| statement is equivalent to setting
+\verb|p| to one position ahead of the next character to process. If the user
+also modifies \verb|pe|, it is possible to change the buffer block entirely.
+
+\item \verb|fgoto <label>;| -- Jump to an entry point defined by
+\verb|<label>|. The \verb|fgoto| statement immediately transfers control to
+the destination state.
+
+\item \verb|fgoto *<expr>;| -- Jump to an entry point given by \verb|<expr>|.
+The expression must evaluate to an integer value representing a state.
+
+\item \verb|fnext <label>;| -- Set the next state to be the entry point defined
+by \verb|label|. The \verb|fnext| statement does not immediately jump to the
+specified state. Any action code following the statement is executed.
+
+\item \verb|fnext *<expr>;| -- Set the next state to be the entry point given
+by \verb|<expr>|. The expression must evaluate to an integer value representing
+a state.
+
+\item \verb|fcall <label>;| -- Push the target state and jump to the entry
+point defined by \verb|<label>|. The next \verb|fret| will jump to the target
+of the transition on which the call was made. Use of \verb|fcall| requires
+the declaration of a call stack. An array of integers named \verb|stack| and a
+single integer named \verb|top| must be declared. With the \verb|fcall|
+construct, control is immediately transferred to the destination state.
+See section \ref{modularization} for more information.
+
+\item \verb|fcall *<expr>;| -- Push the current state and jump to the entry
+point given by \verb|<expr>|. The expression must evaluate to an integer value
+representing a state.
+
+\item \verb|fret;| -- Return to the target state of the transition on which the
+last \verb|fcall| was made. Use of \verb|fret| requires the declaration of a
+call stack. Control is immediately transferred to the destination state.
+
+\item \verb|fbreak;| -- Advance \verb|p|, save the target state to \verb|cs|
+and immediately break out of the execute loop. This statement is useful
+in conjunction with the \verb|noend| write option. Rather than process input
+until \verb|pe| is arrived at, the fbreak statement
+can be used to stop processing from an action. After an \verb|fbreak|
+statement the \verb|p| variable will point to the next character in the input. The
+current state will be the target of the current transition. Note that \verb|fbreak|
+causes the target state's to-state actions to be skipped.
+
+\end{itemize}
+
+Once actions with control-flow commands are embedded into a
+machine, the user must exercise caution when using the machine as the operand
+to other machine construction operators. If an action jumps to another state
+then unioning any transition that executes that action with another transition
+that follows some other path will cause that other path to be lost. Using
+commands that manually jump around a machine takes us out of the domain of
+regular languages because transitions that the
+machine construction operators are not aware of are introduced. These
+commands should therefore be used with caution.
+
+
+\chapter{Controlling Nondeterminism}
+\label{controlling-nondeterminism}
+
+Along with the flexibility of arbitrary action embeddings comes a need to
+control nondeterminism in regular expressions. If a regular expression is
+ambiguous, then sub-components of a parser other than the intended parts may become
+active. This means that actions that are irrelevant to the
+current subset of the parser may be executed, causing problems for the
+programmer.
+
+Tools that are based on regular expression engines and that are used for
+recognition tasks will usually function as intended regardless of the presence
+of ambiguities. It is quite common for users of scripting languages to write
+regular expressions that are heavily ambiguous and it generally does not
+matter. As long as one of the potential matches is recognized, there can be any
+number of other matches present. In some parsing systems the run-time engine
+can employ a strategy for resolving ambiguities, for example always pursuing
+the longest possible match and discarding others.
+
+In Ragel, there is no regular expression run-time engine, just a simple state
+machine execution model. When we begin to embed actions and face the
+possibility of spurious action execution, it becomes clear that controlling
+nondeterminism at the machine construction level is very important. Consider
+the following example.
+
+% GENERATE: lines1
+% OPT: -p
+% %%{
+% machine lines1;
+% action first {}
+% action tail {}
+% word = [a-z]+;
+\begin{inline_code}
+\begin{verbatim}
+ws = [\n\t ];
+line = word $first ( ws word $tail )* '\n';
+lines = line*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := lines;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.53]{lines1}
+\end{center}
+\graphspace
+
+Since the \verb|ws| expression includes the newline character, we will
+not finish the \verb|line| expression when a newline character is seen. We will
+simultaneously pursue the possibility of matching further words on the same
+line and the possibility of matching a second line. Evidence of this fact is
+in the state tables. On several transitions both the \verb|first| and
+\verb|tail| actions are executed. The solution here is simple: exclude
+the newline character from the \verb|ws| expression.
+
+% GENERATE: lines2
+% OPT: -p
+% %%{
+% machine lines2;
+% action first {}
+% action tail {}
+% word = [a-z]+;
+\begin{inline_code}
+\begin{verbatim}
+ws = [\t ];
+line = word $first ( ws word $tail )* '\n';
+lines = line*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := lines;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{lines2}
+\end{center}
+\graphspace
+
+Solving this kind of problem is straightforward when the ambiguity is created
+by strings that are a single character long. When the ambiguity is created by
+strings that are multiple characters long we have a more difficult problem.
+The following example is an incorrect attempt at a regular expression for C
+language comments.
+
+% GENERATE: comments1
+% OPT: -p
+% %%{
+% machine comments1;
+% action comm {}
+\begin{inline_code}
+\begin{verbatim}
+comment = '/*' ( any @comm )* '*/';
+main := comment ' ';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{comments1}
+\end{center}
+\graphspace
+
+Using standard concatenation, we will never leave the \verb|any*| expression.
+We will forever entertain the possibility that a \verb|'*/'| string that we see
+is contained in a longer comment and that, simultaneously, the comment has
+ended. The concatenation of the \verb|comment| machine with \verb|SP| is done
+to show this. When we match space, we are also still matching the comment body.
+
+One way to approach the problem is to exclude the terminating string
+from the \verb|any*| expression using set difference. We must be careful to
+exclude not just the terminating string, but any string that contains it as a
+substring. A verbose, but proper specification of a C comment parser is given
+by the following regular expression.
+
+% GENERATE: comments2
+% OPT: -p
+% %%{
+% machine comments2;
+% action comm {}
+\begin{inline_code}
+\begin{verbatim}
+comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := comment;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{comments2}
+\end{center}
+\graphspace
+
+Note that Ragel's strong subtraction operator \verb|--| can also be used here.
+In doing this subtraction we have phrased the problem of controlling non-determinism in
+terms of excluding strings common to two expressions that interact when
+combined.
+We can also phrase the problem in terms of the transitions of the state
+machines that implement these expressions. During the concatenation of
+\verb|any*| and \verb|'*/'| we will be making transitions that are composed of
+both the loop of the first expression and the final character of the second.
+At this time we want the transition on the \verb|'/'| character to take precedence
+over and disallow the transition that originated in the \verb|any*| loop.
+
+In another parsing problem, we wish to implement a lightweight tokenizer that we can
+utilize in the composition of a larger machine. For example, some HTTP headers
+have a token stream as a sub-language. The following example is an attempt
+at a regular expression-based tokenizer that does not function correctly due to
+unintended nondeterminism.
+
+% GENERATE: smallscanner
+% OPT: -p
+% %%{
+% machine smallscanner;
+% action start_str {}
+% action on_char {}
+% action finish_str {}
+\begin{inline_code}
+\begin{verbatim}
+header_contents = (
+ lower+ >start_str $on_char %finish_str |
+ ' '
+)*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := header_contents;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{smallscanner}
+\end{center}
+\graphspace
+
+In this case, the problem with using a standard kleene star operation is that
+there is an ambiguity between extending a token and wrapping around the machine
+to begin a new token. Using the standard operator, we get an undesirable
+nondeterministic behaviour. Evidence of this can be seen on the transition out
+of state one to itself. The transition extends the string, and simultaneously,
+finishes the string only to immediately begin a new one. What is required is
+for the
+transitions that represent an extension of a token to take precedence over the
+transitions that represent the beginning of a new token. For this problem
+there is no simple solution that uses standard regular expression operators.
+
+\section{Priorities}
+
+A priority mechanism was devised and built into the determinization
+process, specifically for the purpose of allowing the user to control
+nondeterminism. Priorities are integer values embedded into transitions. When
+the determinization process is combining transitions that have different
+priorities, the transition with the higher priority is preserved and the
+transition with the lower priority is dropped.
+
+Unfortunately, priorities can have unintended side effects because their
+operation requires that they linger in transitions indefinitely. They must linger
+because the Ragel program cannot know when the user is finished with a priority
+embedding. A solution whereby they are explicitly deleted after use is
+conceivable; however this is not very user-friendly. Priorities were therefore
+made into named entities. Only priorities with the same name are allowed to
+interact. This allows any number of priorities to coexist in one machine for
+the purpose of controlling various different regular expression operations and
+eliminates the need to ever delete them. Such a scheme allows the user to
+choose a unique name, embed two different priority values using that name
+and be confident that the priority embedding will be free of any side effects.
+
+In the first form of priority embedding the name defaults to the name of the machine
+definition that the priority is assigned in. In this sense priorities are by
+default local to the current machine definition or instantiation. Beware of
+using this form in a longest-match machine, since there is only one name for
+the entire set of longest match patterns. In the second form the priority's
+name can be specified, allowing priority interaction across machine definition
+boundaries.
+
+\begin{itemize}
+\item \verb|expr > int| -- Sets starting transitions to have priority int.
+\item \verb|expr @ int| -- Sets transitions that go into a final state to have priority int.
+\item \verb|expr $ int| -- Sets all transitions to have priority int.
+\item \verb|expr % int| -- Sets leaving transitions to
+have priority int. When a transition is made going out of the machine (either
+by concatenation or kleene star) its priority is immediately set to the
+leaving priority.
+\end{itemize}
+
+The second form of priority assignment allows the programmer to specify the name
+to which the priority is assigned.
+
+\begin{itemize}
+\item \verb|expr > (name, int)| -- Starting transitions.
+\item \verb|expr @ (name, int)| -- Finishing transitions (into a final state).
+\item \verb|expr $ (name, int)| -- All transitions.
+\item \verb|expr % (name, int)| -- Leaving transitions.
+\end{itemize}
+
+\section{Guarded Operators that Encapsulate Priorities}
+
+Priority embeddings are a very expressive mechanism. At the same time they
+can be very confusing for the user. They force the user to imagine
+the transitions inside two interacting expressions and work out the precise
+effects of the operations between them. When we consider
+that this problem is worsened by the
+potential for side effects caused by unintended priority name collisions, we
+see that exposing the user to priorities is undesirable.
+
+Fortunately, in practice the use of priorities has been necessary only in a
+small number of scenarios. This allows us to encapsulate their functionality
+into a small set of operators and fully hide them from the user. This is
+advantageous from a language design point of view because it greatly simplifies
+the design.
+
+Going back to the C comment example, we can now properly specify
+it using a guarded concatenation operator which we call {\em finish-guarded
+concatenation}. From the user's point of view, this operator terminates the
+first machine when the second machine moves into a final state. It chooses a
+unique name and uses it to embed a low priority into all
+transitions of the first machine. A higher priority is then embedded into the
+transitions of the second machine that enter into a final state. The following
+example yields a machine identical to the example in Section
+\ref{controlling-nondeterminism}.
+
+\begin{inline_code}
+\begin{verbatim}
+comment = '/*' ( any @comm )* :>> '*/';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{comments2}
+\end{center}
+\graphspace
+
+Another guarded operator is {\em left-guarded concatenation}, given by the
+\verb|<:| compound symbol. This operator places a higher priority on all
+transitions of the first machine. This is useful if one must forcibly separate
+two lists that contain common elements. For example, one may need to tokenize a
+stream, but first consume leading whitespace.
+
+Ragel also includes a {\em longest-match kleene star} operator, given by the
+\verb|**| compound symbol. This
+guarded operator embeds a high
+priority into all transitions of the machine.
+A lower priority is then embedded into the leaving transitions. When the
+kleene star operator makes the epsilon transitions from
+the final states into the new start state, the lower priority will be transferred
+to the epsilon transitions. In cases where following an epsilon transition
+out of a final state conflicts with an existing transition out of a final
+state, the epsilon transition will be dropped.
+
+Other guarded operators are conceivable, such as guards on union that cause one
+alternative to take precedence over another. These may be implemented when it
+is clear they constitute a frequently used operation.
+In the next section we discuss the explicit specification of state machines
+using state charts.
+
+\subsection{Entry-Guarded Concatenation}
+
+\verb|expr :> expr|
+
+This operator concatenates two machines, but first assigns a low
+priority to all transitions
+of the first machine and a high priority to the starting transitions of the
+second machine. This operator is useful if from the final states of the first
+machine it is possible to accept the characters in the entering transitions of
+the second machine. This operator effectively terminates the first machine
+immediately upon starting the second machine, where otherwise they would be
+pursued concurrently. In the following example, entry-guarded concatenation is
+used to move out of a machine that matches everything at the first sign of an
+end-of-input marker.
+
+% GENERATE: entryguard
+% OPT: -p
+% %%{
+% machine entryguard;
+\begin{inline_code}
+\begin{verbatim}
+# Leave the catch-all machine on the first character of FIN.
+main := any* :> 'FIN';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{entryguard}
+\end{center}
+\graphspace
+
+Entry-guarded concatenation is equivalent to the following:
+
+\begin{verbatim}
+expr $(unique_name,0) . expr >(unique_name,1)
+\end{verbatim}
+\verbspace
+
+\subsection{Finish-Guarded Concatenation}
+
+\verb|expr :>> expr|
+
+This operator is
+like the previous operator, except the higher priority is placed on the final
+transitions of the second machine. This is useful if one wishes to entertain
+the possibility of continuing to match the first machine right up until the
+second machine enters a final state. In other words it terminates the first
+machine only when the second accepts. In the following example, finish-guarded
+concatenation causes the move out of the machine that matches everything to be
+delayed until the full end-of-input marker has been matched.
+
+% GENERATE: finguard
+% OPT: -p
+% %%{
+% machine finguard;
+\begin{inline_code}
+\begin{verbatim}
+# Leave the catch-all machine on the last character of FIN.
+main := any* :>> 'FIN';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{finguard}
+\end{center}
+\graphspace
+
+Finish-guarded concatenation is equivalent to the following, with one
+exception. If the right machine's start state is final, the higher priority is
+also embedded into it as a leaving priority. This prevents the left machine
+from persisting via the zero-length string.
+
+\begin{verbatim}
+expr $(unique_name,0) . expr @(unique_name,1)
+\end{verbatim}
+\verbspace
+
+\subsection{Left-Guarded Concatenation}
+
+\verb|expr <: expr|
+
+This operator places
+a higher priority on the left expression. It is useful if you want to prefix a
+sequence with another sequence composed of some of the same characters. For
+example, one can consume leading whitespace before tokenizing a sequence of
+whitespace-separated words as in:
+
+% GENERATE: leftguard
+% OPT: -p
+% %%{
+% machine leftguard;
+% action alpha {}
+% action ws {}
+% action start {}
+% action fin {}
+\begin{inline_code}
+\begin{verbatim}
+main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{leftguard}
+\end{center}
+\graphspace
+
+Left-guarded concatenation is equivalent to the following:
+
+\begin{verbatim}
+expr $(unique_name,1) . expr >(unique_name,0)
+\end{verbatim}
+\verbspace
+
+\subsection{Longest-Match Kleene Star}
+\label{longest_match_kleene_star}
+
+\verb|expr**|
+
+This version of kleene star puts a higher priority on staying in the
+machine versus wrapping around and starting over. The LM kleene star is useful
+when writing simple tokenizers. These machines are built by applying the
+longest-match kleene star to an alternation of token patterns, as in the
+following.
+
+% GENERATE: lmkleene
+% OPT: -p
+% %%{
+% machine exfinpri;
+% action A {}
+% action B {}
+\begin{inline_code}
+\begin{verbatim}
+# Repeat tokens, but make sure to get the longest match.
+main := (
+ lower ( lower | digit )* %A |
+ digit+ %B |
+ ' '
+)**;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{lmkleene}
+\end{center}
+\graphspace
+
+If a regular kleene star were used the machine above would not be able to
+distinguish between extending a word and beginning a new one. This operator is
+equivalent to:
+
+\begin{verbatim}
+( expr $(unique_name,1) %(unique_name,0) )*
+\end{verbatim}
+\verbspace
+
+When the kleene star is applied, transitions that go out of the machine and
+back into it are made. These are assigned a priority of zero by the leaving
+transition mechanism. This is less than the priority of one assigned to the
+transitions leaving the final states but not leaving the machine. When
+these transitions clash on the same character, the
+transition that stays in the machine takes precedence. The transition
+that wraps around is dropped.
+
+Note that this operator does not build a scanner in the traditional sense
+because there is never any backtracking. To build a scanner with backtracking
+use the Longest-Match machine construction described in Section
+\ref{generating-scanners}.
+
+\chapter{Interface to Host Program}
+
+The Ragel code generator is very flexible. The generated code has no
+dependencies and can be inserted in any function, perhaps inside a loop if
+desired. The user is responsible for declaring and initializing a number of
+required variables, including the current state and the pointer to the input
+stream. These can live in any scope. Control of the input processing loop is
+also possible: the user may break out of the processing loop and return to it
+at any time.
+
+In the case of the C, D, and Go host languages, Ragel is able to generate very
+fast-running code that implements state machines as directly executable code.
+Since very large files strain the host language compiler, table-based code
+generation is also supported. In the future we hope to provide a partitioned,
+directly executable format that is able to reduce the burden on the host
+compiler by splitting large machines across multiple functions.
+
+In the case of Java and Ruby, table-based code generation is the only code
+style supported. In the future this may be expanded to include other code
+styles.
+
+Ragel can be used to parse input in one block, or it can be used to parse input
+in a sequence of blocks as it arrives from a file or socket. Parsing the input
+in a sequence of blocks brings with it a few responsibilities. If the parser
+utilizes a scanner, care must be taken to not break the input stream anywhere
+but token boundaries. If pointers to the input stream are taken during
+parsing, care must be taken to not use a pointer that has been invalidated by
+movement to a subsequent block. If the current input data pointer is moved
+backwards it must not be moved past the beginning of the current block.
+
+Figure \ref{basic-example} shows a simple Ragel program that does not have any
+actions. The example tests the first argument of the program against a number
+pattern and then prints the machine's acceptance status.
+
+\begin{figure}
+\small
+\begin{verbatim}
+#include <stdio.h>
+#include <string.h>
+%%{
+ machine foo;
+ write data;
+}%%
+int main( int argc, char **argv )
+{
+ int cs;
+ if ( argc > 1 ) {
+ char *p = argv[1];
+ char *pe = p + strlen( p );
+ %%{
+ main := [0-9]+ ( '.' [0-9]+ )?;
+
+ write init;
+ write exec;
+ }%%
+ }
+ printf("result = %i\n", cs >= foo_first_final );
+ return 0;
+}
+\end{verbatim}
+\verbspace
+\caption{A basic Ragel example without any actions.
+}
+\label{basic-example}
+\end{figure}
+
+\section{Variables Used by Ragel}
+
+There are a number of variables that Ragel expects the user to declare. At a
+very minimum the \verb|cs|, \verb|p| and \verb|pe| variables must be declared.
+In Go, Java and Ruby code the \verb|data| variable must also be declared. If
+EOF actions are used then the \verb|eof| variable is required. If
+stack-based state machine control flow statements are used then the
+\verb|stack| and \verb|top| variables are required. If a scanner is declared
+then the \verb|act|, \verb|ts| and \verb|te| variables must be
+declared.
+
+\begin{itemize}
+
+\item \verb|cs| - Current state. This must be an integer and it should persist
+across invocations of the machine when the data is broken into blocks that are
+processed independently. This variable may be modified from outside the
+execution loop, but not from within.
+
+\item \verb|p| - Data pointer. In C/D code this variable is expected to be a
+pointer to the character data to process. It should be initialized to the
+beginning of the data block on every run of the machine. In Go, Java and Ruby it is
+used as an offset to \verb|data| and must be an integer. In this case it should
+be initialized to zero on every run of the machine.
+
+\item \verb|pe| - Data end pointer. This should be initialized to \verb|p| plus
+the data length on every run of the machine. In Go, Java and Ruby code this should
+be initialized to the data length.
+
+\item \verb|eof| - End of file pointer. This should be set to \verb|pe| when
+the buffer block being processed is the last one, otherwise it should be set to
+null. In Go, Java and Ruby code \verb|-1| must be used instead of null. If the EOF
+event can be known only after the final buffer block has been processed, then
+it is possible to set \verb|p = pe = eof| and run the execute block.
+
+\item \verb|data| - This variable is only required in Go, Java and Ruby code. It
+must be an array containting the data to process.
+
+\item \verb|stack| - This must be an array of integers. It is used to store
+integer values representing states. If the stack must resize dynamically the
+Pre-push and Post-Pop statements can be used to do this (Sections
+\ref{prepush} and \ref{postpop}).
+
+\item \verb|top| - This must be an integer value and will be used as an offset
+to \verb|stack|, giving the next available spot on the top of the stack.
+
+\item \verb|act| - This must be an integer value. It is a variable sometimes
+used by scanner code to keep track of the most recent successful pattern match.
+
+\item \verb|ts| - This must be a pointer to character data. In Go, Java and
+Ruby code this must be an integer. See Section \ref{generating-scanners} for
+more information.
+
+\item \verb|te| - Also a pointer to character data.
+
+\end{itemize}
+
+\section{Alphtype Statement}
+
+\begin{verbatim}
+alphtype unsigned int;
+\end{verbatim}
+\verbspace
+
+The alphtype statement specifies the alphabet data type that the machine
+operates on. During the compilation of the machine, integer literals are
+expected to be in the range of possible values of the alphtype. The default
+is \verb|char| for all languages except Go where the default is \verb|byte|.
+
+\begin{multicols}{2}
+C/C++/Objective-C:
+\begin{verbatim}
+ char unsigned char
+ short unsigned short
+ int unsigned int
+ long unsigned long
+\end{verbatim}
+\verbspace
+
+Go:
+\begin{verbatim}
+ byte
+ int8 uint8
+ int16 uint16
+ int32 uint32
+ int
+\end{verbatim}
+\verbspace
+
+Ruby:
+\begin{verbatim}
+ char
+ int
+\end{verbatim}
+\verbspace
+
+\columnbreak
+
+Java:
+\begin{verbatim}
+ char
+ byte
+ short
+ int
+\end{verbatim}
+\verbspace
+
+D:
+\begin{verbatim}
+ char
+ byte ubyte
+ short ushort
+ wchar
+ int uint
+ dchar
+\end{verbatim}
+\verbspace
+
+\end{multicols}
+
+\section{Getkey Statement}
+
+\begin{verbatim}
+getkey fpc->id;
+\end{verbatim}
+\verbspace
+
+This statement specifies to Ragel how to retrieve the current character from
+from the pointer to the current element (\verb|p|). Any expression that returns
+a value of the alphabet type
+may be used. The getkey statement may be used for looking into element
+structures or for translating the character to process. The getkey expression
+defaults to \verb|(*p)|. In goto-driven machines the getkey expression may be
+evaluated more than once per element processed, therefore it should not incur a
+large cost nor preclude optimization.
+
+\section{Access Statement}
+
+\begin{verbatim}
+access fsm->;
+\end{verbatim}
+\verbspace
+
+The access statement specifies how the generated code should
+access the machine data that is persistent across processing buffer blocks.
+This applies to all variables except \verb|p|, \verb|pe| and \verb|eof|. This includes
+\verb|cs|, \verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|.
+The access statement is useful if a machine is to be encapsulated inside a
+structure in C code. It can be used to give the name of
+a pointer to the structure.
+
+\section{Variable Statement}
+
+\begin{verbatim}
+variable p fsm->p;
+\end{verbatim}
+\verbspace
+
+The variable statement specifies how to access a specific
+variable. All of the variables that are declared by the user and
+used by Ragel can be changed. This includes \verb|p|, \verb|pe|, \verb|eof|, \verb|cs|,
+\verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|.
+In Go, Ruby and Java code generation the \verb|data| variable can also be changed.
+
+\section{Pre-Push Statement}
+\label{prepush}
+
+\begin{verbatim}
+prepush {
+ /* stack growing code */
+}
+\end{verbatim}
+\verbspace
+
+The prepush statement allows the user to supply stack management code that is
+written out during the generation of fcall, immediately before the current
+state is pushed to the stack. This statement can be used to test the number of
+available spaces and dynamically grow the stack if necessary.
+
+\section{Post-Pop Statement}
+\label{postpop}
+
+\begin{verbatim}
+postpop {
+ /* stack shrinking code */
+}
+\end{verbatim}
+\verbspace
+
+The postpop statement allows the user to supply stack management code that is
+written out during the generation of fret, immediately after the next state is
+popped from the stack. This statement can be used to dynamically shrink the
+stack.
+
+\section{Write Statement}
+\label{write-statement}
+
+\begin{verbatim}
+write <component> [options];
+\end{verbatim}
+\verbspace
+
+The write statement is used to generate parts of the machine.
+There are seven
+components that can be generated by a write statement. These components make up the
+state machine's data, initialization code, execution code, and export definitions.
+A write statement may appear before a machine is fully defined.
+This allows one to write out the data first then later define the machine where
+it is used. An example of this is shown in Figure \ref{fbreak-example}.
+
+\subsection{Write Data}
+\begin{verbatim}
+write data [options];
+\end{verbatim}
+\verbspace
+
+The write data statement causes Ragel to emit the constant static data needed
+by the machine. In table-driven output styles (see Section \ref{genout}) this
+is a collection of arrays that represent the states and transitions of the
+machine. In goto-driven machines much less data is emitted. At the very
+minimum a start state \verb|name_start| is generated. All variables written
+out in machine data have both the \verb|static| and \verb|const| properties and
+are prefixed with the name of the machine and an
+underscore. The data can be placed inside a class, inside a function, or it can
+be defined as global data.
+
+Two variables are written that may be used to test the state of the machine
+after a buffer block has been processed. The \verb|name_error| variable gives
+the id of the state that the machine moves into when it cannot find a valid
+transition to take. The machine immediately breaks out of the processing loop when
+it finds itself in the error state. The error variable can be compared to the
+current state to determine if the machine has failed to parse the input. If the
+machine is complete, that is from every state there is a transition to a proper
+state on every possible character of the alphabet, then no error state is required
+and this variable will be set to -1.
+
+The \verb|name_first_final| variable stores the id of the first final state.
+All of the machine's states are sorted by their final state status before
+having their ids assigned. Checking if the machine has accepted its input can
+then be done by checking if the current state is greater-than or equal to the
+first final state.
+
+Data generation has several options:
+
+\noindent\hspace*{24pt}\verb|noerror | - Do not generate the integer variable that gives the id of the error state.\\
+\noindent\hspace*{24pt}\verb|nofinal | - Do not generate the integer variable that gives the id of the first final state.\\
+\noindent\hspace*{24pt}\verb|noprefix | - Do not prefix the variable names with the name of the machine.
+\vspace{12pt}
+
+\begin{figure}
+\small
+\begin{verbatim}
+#include <stdio.h>
+%% machine foo;
+%% write data;
+int main( int argc, char **argv )
+{
+ int cs, res = 0;
+ if ( argc > 1 ) {
+ char *p = argv[1];
+ %%{
+ main :=
+ [a-z]+
+ 0 @{ res = 1; fbreak; };
+ write init;
+ write exec noend;
+ }%%
+ }
+ printf("execute = %i\n", res );
+ return 0;
+}
+\end{verbatim}
+\verbspace
+\caption{Use of {\tt noend} write option and the {\tt fbreak} statement for
+processing a string.
+}
+\label{fbreak-example}
+\end{figure}
+
+\subsection{Write Start, First Final and Error}
+
+\begin{verbatim}
+write start;
+write first_final;
+write error;
+\end{verbatim}
+\verbspace
+
+These three write statements provide an alternative means of accessing the
+\verb|start|, \verb|first_final| and \verb|error| states. If there are many
+different machine specifications in one file it is easy to get the prefix for
+these wrong. This is especially true if the state machine boilerplate is
+frequently made by a copy-paste-edit process. These write statements allow the
+problem to be avoided. They can be used as follows:
+
+\begin{verbatim}
+/* Did parsing succeed? */
+if ( cs < %%{ write first_final; }%% ) {
+ result = ERR_PARSE_ERROR;
+ goto fail;
+}
+\end{verbatim}
+\verbspace
+
+\subsection{Write Init}
+\begin{verbatim}
+write init [options];
+\end{verbatim}
+\verbspace
+
+The write init statement causes Ragel to emit initialization code. This should
+be executed once before the machine is started. At a very minimum this sets the
+current state to the start state. If other variables are needed by the
+generated code, such as call stack variables or scanner management
+variables, they are also initialized here.
+
+The \verb|nocs| option to the write init statement will cause ragel to skip
+intialization of the cs variable. This is useful if the user wishes to use
+custom logic to decide which state the specification should start in.
+
+\subsection{Write Exec}
+\begin{verbatim}
+write exec [options];
+\end{verbatim}
+\verbspace
+
+The write exec statement causes Ragel to emit the state machine's execution code.
+Ragel expects several variables to be available to this code. At a very minimum, the
+generated code needs access to the current character position \verb|p|, the ending
+position \verb|pe| and the current state \verb|cs| (though \verb|pe|
+can be omitted using the \verb|noend| write option).
+The \verb|p| variable is the cursor that the execute code will
+used to traverse the input. The \verb|pe| variable should be set up to point to one
+position past the last valid character in the buffer.
+
+Other variables are needed when certain features are used. For example using
+the \verb|fcall| or \verb|fret| statements requires \verb|stack| and
+\verb|top| variables to be defined. If a longest-match construction is used,
+variables for managing backtracking are required.
+
+The write exec statement has one option. The \verb|noend| option tells Ragel
+to generate code that ignores the end position \verb|pe|. In this
+case the user must explicitly break out of the processing loop using
+\verb|fbreak|, otherwise the machine will continue to process characters until
+it moves into the error state. This option is useful if one wishes to process a
+null terminated string. Rather than traverse the string to discover then length
+before processing the input, the user can break out when the null character is
+seen. The example in Figure \ref{fbreak-example} shows the use of the
+\verb|noend| write option and the \verb|fbreak| statement for processing a string.
+
+\subsection{Write Exports}
+\label{export}
+
+\begin{verbatim}
+write exports;
+\end{verbatim}
+\verbspace
+
+The export feature can be used to export simple machine definitions. Machine definitions
+are marked for export using the \verb|export| keyword.
+
+\begin{verbatim}
+export machine_to_export = 0x44;
+\end{verbatim}
+\verbspace
+
+When the write exports statement is used these machines are
+written out in the generated code. Defines are used for C and constant integers
+are used for D, Java and Ruby. See Section \ref{import} for a description of the
+import statement.
+
+\section{Maintaining Pointers to Input Data}
+
+In the creation of any parser it is not uncommon to require the collection of
+the data being parsed. It is always possible to collect data into a growable
+buffer as the machine moves over it, however the copying of data is a somewhat
+wasteful use of processor cycles. The most efficient way to collect data from
+the parser is to set pointers into the input then later reference them. This
+poses a problem for uses of Ragel where the input data arrives in blocks, such
+as over a socket or from a file. If a pointer is set in one buffer block but
+must be used while parsing a following buffer block, some extra consideration
+to correctness must be made.
+
+The scanner constructions exhibit this problem, requiring the maintenance
+code described in Section \ref{generating-scanners}. If a longest-match
+construction has been used somewhere in the machine then it is possible to
+take advantage of the required prefix maintenance code in the driver program to
+ensure pointers to the input are always valid. If laying down a pointer one can
+set \verb|ts| at the same spot or ahead of it. When data is shifted in
+between loops the user must also shift the pointer. In this way it is possible
+to maintain pointers to the input that will always be consistent.
+
+\begin{figure}
+\small
+\begin{verbatim}
+ int have = 0;
+ while ( 1 ) {
+ char *p, *pe, *data = buf + have;
+ int len, space = BUFSIZE - have;
+
+ if ( space == 0 ) {
+ fprintf(stderr, "BUFFER OUT OF SPACE\n");
+ exit(1);
+ }
+
+ len = fread( data, 1, space, stdin );
+ if ( len == 0 )
+ break;
+
+ /* Find the last newline by searching backwards. */
+ p = buf;
+ pe = data + len - 1;
+ while ( *pe != '\n' && pe >= buf )
+ pe--;
+ pe += 1;
+
+ %% write exec;
+
+ /* How much is still in the buffer? */
+ have = data + len - pe;
+ if ( have > 0 )
+ memmove( buf, pe, have );
+
+ if ( len < space )
+ break;
+ }
+\end{verbatim}
+\verbspace
+\caption{An example of line-oriented processing.
+}
+\label{line-oriented}
+\end{figure}
+
+In general, there are two approaches for guaranteeing the consistency of
+pointers to input data. The first approach is the one just described;
+lay down a marker from an action,
+then later ensure that the data the marker points to is preserved ahead of
+the buffer on the next execute invocation. This approach is good because it
+allows the parser to decide on the pointer-use boundaries, which can be
+arbitrarily complex parsing conditions. A downside is that it requires any
+pointers that are set to be corrected in between execute invocations.
+
+The alternative is to find the pointer-use boundaries before invoking the execute
+routine, then pass in the data using these boundaries. For example, if the
+program must perform line-oriented processing, the user can scan backwards from
+the end of an input block that has just been read in and process only up to the
+first found newline. On the next input read, the new data is placed after the
+partially read line and processing continues from the beginning of the line.
+An example of line-oriented processing is given in Figure \ref{line-oriented}.
+
+\section{Specifying the Host Language}
+
+The \verb|ragel| program has a number of options for specifying the host
+language. The host-language options are:
+
+\begin{itemize}
+\item \verb|-C | for C/C++/Objective-C code (default)
+\item \verb|-D | for D code.
+\item \verb|-Z | for Go code.
+\item \verb|-J | for Java code.
+\item \verb|-R | for Ruby code.
+\item \verb|-A | for C\# code.
+\end{itemize}
+
+\section{Choosing a Generated Code Style}
+\label{genout}
+
+There are three styles of code output to choose from. Code style affects the
+size and speed of the compiled binary. Changing code style does not require any
+change to the Ragel program. There are two table-driven formats and a goto
+driven format.
+
+In addition to choosing a style to emit, there are various levels of action
+code reuse to choose from. The maximum reuse levels (\verb|-T0|, \verb|-F0|
+and \verb|-G0|) ensure that no FSM action code is ever duplicated by encoding
+each transition's action list as static data and iterating
+through the lists on every transition. This will normally result in a smaller
+binary. The less action reuse options (\verb|-T1|, \verb|-F1| and \verb|-G1|)
+will usually produce faster running code by expanding each transition's action
+list into a single block of code, eliminating the need to iterate through the
+lists. This duplicates action code instead of generating the logic necessary
+for reuse. Consequently the binary will be larger. However, this tradeoff applies to
+machines with moderate to dense action lists only. If a machine's transitions
+frequently have less than two actions then the less reuse options will actually
+produce both a smaller and a faster running binary due to less action sharing
+overhead. The best way to choose the appropriate code style for your
+application is to perform your own tests.
+
+The table-driven FSM represents the state machine as constant static data. There are
+tables of states, transitions, indices and actions. The current state is
+stored in a variable. The execution is simply a loop that looks up the current
+state, looks up the transition to take, executes any actions and moves to the
+target state. In general, the table-driven FSM can handle any machine, produces
+a smaller binary and requires a less expensive host language compile, but
+results in slower running code. Since the table-driven format is the most
+flexible it is the default code style.
+
+The flat table-driven machine is a table-based machine that is optimized for
+small alphabets. Where the regular table machine uses the current character as
+the key in a binary search for the transition to take, the flat table machine
+uses the current character as an index into an array of transitions. This is
+faster in general, however is only suitable if the span of possible characters
+is small.
+
+The goto-driven FSM represents the state machine using goto and switch
+statements. The execution is a flat code block where the transition to take is
+computed using switch statements and directly executable binary searches. In
+general, the goto FSM produces faster code but results in a larger binary and a
+more expensive host language compile.
+
+The goto-driven format has an additional action reuse level (\verb|-G2|) that
+writes actions directly into the state transitioning logic rather than putting
+all the actions together into a single switch. Generally this produces faster
+running code because it allows the machine to encode the current state using
+the processor's instruction pointer. Again, sparse machines may actually
+compile to smaller binaries when \verb|-G2| is used due to less state and
+action management overhead. For many parsing applications \verb|-G2| is the
+preferred output format.
+
+\begin{center}
+
+Code Output Style Options
+
+\begin{tabular}{|c|c|c|}
+\hline
+\verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#\\
+\hline
+\verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#\\
+\hline
+\verb|-F0|&flat table-driven&C/D/Ruby/C\#\\
+\hline
+\verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#\\
+\hline
+\verb|-G0|&goto-driven&C/D/C\#\\
+\hline
+\verb|-G1|&goto, expanded actions&C/D/C\#\\
+\hline
+\verb|-G2|&goto, in-place actions&C/D/Go\\
+\hline
+\end{tabular}
+\end{center}
+
+\chapter{Beyond the Basic Model}
+
+\section{Parser Modularization}
+\label{modularization}
+
+It is possible to use Ragel's machine construction and action embedding
+operators to specify an entire parser using a single regular expression. In
+many cases this is the desired way to specify a parser in Ragel. However, in
+some scenarios the language to parse may be so large that it is difficult to
+think about it as a single regular expression. It may also shift between distinct
+parsing strategies, in which case modularization into several coherent blocks
+of the language may be appropriate.
+
+It may also be the case that patterns that compile to a large number of states
+must be used in a number of different contexts and referencing them in each
+context results in a very large state machine. In this case, an ability to reuse
+parsers would reduce code size.
+
+To address this, distinct regular expressions may be instantiated and linked
+together by means of a jumping and calling mechanism. This mechanism is
+analogous to the jumping to and calling of processor instructions. A jump
+command, given in action code, causes control to be immediately passed to
+another portion of the machine by way of setting the current state variable. A
+call command causes the target state of the current transition to be pushed to
+a state stack before control is transferred. Later on, the original location
+may be returned to with a return statement. In the following example, distinct
+state machines are used to handle the parsing of two types of headers.
+
+% GENERATE: call
+% %%{
+% machine call;
+\begin{inline_code}
+\begin{verbatim}
+action return { fret; }
+action call_date { fcall date; }
+action call_name { fcall name; }
+
+# A parser for date strings.
+date := [0-9][0-9] '/'
+ [0-9][0-9] '/'
+ [0-9][0-9][0-9][0-9] '\n' @return;
+
+# A parser for name strings.
+name := ( [a-zA-Z]+ | ' ' )** '\n' @return;
+
+# The main parser.
+headers =
+ ( 'from' | 'to' ) ':' @call_name |
+ ( 'departed' | 'arrived' ) ':' @call_date;
+
+main := headers*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% %% write data;
+% void f()
+% {
+% %% write init;
+% %% write exec;
+% }
+% END GENERATE
+
+Calling and jumping should be used carefully as they are operations that take
+one out of the domain of regular languages. A machine that contains a call or
+jump statement in one of its actions should be used as an argument to a machine
+construction operator only with considerable care. Since DFA transitions may
+actually represent several NFA transitions, a call or jump embedded in one
+machine can inadvertently terminate another machine that it shares prefixes
+with. Despite this danger, theses statements have proven useful for tying
+together sub-parsers of a language into a parser for the full language,
+especially for the purpose of modularizing code and reducing the number of
+states when the machine contains frequently recurring patterns.
+
+Section \ref{vals} describes the jump and call statements that are used to
+transfer control. These statements make use of two variables that must be
+declared by the user, \verb|stack| and \verb|top|. The \verb|stack| variable
+must be an array of integers and \verb|top| must be a single integer, which
+will point to the next available space in \verb|stack|. Sections \ref{prepush}
+and \ref{postpop} describe the Pre-Push and Post-Pop statements which can be
+used to implement a dynamically resizable array.
+
+\section{Referencing Names}
+\label{labels}
+
+This section describes how to reference names in epsilon transitions (Section
+\ref{state-charts}) and
+action-based control-flow statements such as \verb|fgoto|. There is a hierarchy
+of names implied in a Ragel specification. At the top level are the machine
+instantiations. Beneath the instantiations are labels and references to machine
+definitions. Beneath those are more labels and references to definitions, and
+so on.
+
+Any name reference may contain multiple components separated with the \verb|::|
+compound symbol. The search for the first component of a name reference is
+rooted at the join expression that the epsilon transition or action embedding
+is contained in. If the name reference is not contained in a join,
+the search is rooted at the machine definition that the epsilon transition or
+action embedding is contained in. Each component after the first is searched
+for beginning at the location in the name tree that the previous reference
+component refers to.
+
+In the case of action-based references, if the action is embedded more than
+once, the local search is performed for each embedding and the result is the
+union of all the searches. If no result is found for action-based references then
+the search is repeated at the root of the name tree. Any action-based name
+search may be forced into a strictly global search by prefixing the name
+reference with \verb|::|.
+
+The final component of the name reference must resolve to a unique entry point.
+If a name is unique in the entire name tree it can be referenced as is. If it
+is not unique it can be specified by qualifying it with names above it in the
+name tree. However, it can always be renamed.
+
+% FIXME: Should fit this in somewhere.
+% Some kinds of name references are illegal. Cannot call into longest-match
+% machine, can only call its start state. Cannot make a call to anywhere from
+% any part of a longest-match machine except a rule's action. This would result
+% in an eventual return to some point inside a longest-match other than the
+% start state. This is banned for the same reason a call into the LM machine is
+% banned.
+
+
+\section{Scanners}
+\label{generating-scanners}
+
+Scanners are very much intertwined with regular-languages and their
+corresponding processors. For this reason Ragel supports the definition of
+scanners. The generated code will repeatedly attempt to match patterns from a
+list, favouring longer patterns over shorter patterns. In the case of
+equal-length matches, the generated code will favour patterns that appear ahead
+of others. When a scanner makes a match it executes the user code associated
+with the match, consumes the input then resumes scanning.
+
+\begin{verbatim}
+<machine_name> := |*
+ pattern1 => action1;
+ pattern2 => action2;
+ ...
+ *|;
+\end{verbatim}
+\verbspace
+
+On the surface, Ragel scanners are similar to those defined by Lex. Though
+there is a key distinguishing feature: patterns may be arbitrary Ragel
+expressions and can therefore contain embedded code. With a Ragel-based scanner
+the user need not wait until the end of a pattern before user code can be
+executed.
+
+Scanners can be used to process sub-languages, as well as for tokenizing
+programming languages. In the following example a scanner is used to tokenize
+the contents of a header field.
+
+\begin{inline_code}
+\begin{verbatim}
+word = [a-z]+;
+head_name = 'Header';
+
+header := |*
+ word;
+ ' ';
+ '\n' => { fret; };
+*|;
+
+main := ( head_name ':' @{ fcall header; } )*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+
+The scanner construction has a purpose similar to the longest-match kleene star
+operator \verb|**|. The key
+difference is that a scanner is able to backtrack to match a previously matched
+shorter string when the pursuit of a longer string fails. For this reason the
+scanner construction operator is not a pure state machine construction
+operator. It relies on several variables that enable it to backtrack and make
+pointers to the matched input text available to the user. For this reason
+scanners must be immediately instantiated. They cannot be defined inline or
+referenced by another expression. Scanners must be jumped to or called.
+
+Scanners rely on the \verb|ts|, \verb|te| and \verb|act|
+variables to be present so that they can backtrack and make pointers to the
+matched text available to the user. If input is processed using multiple calls
+to the execute code then the user must ensure that when a token is only
+partially matched that the prefix is preserved on the subsequent invocation of
+the execute code.
+
+The \verb|ts| variable must be defined as a pointer to the input data.
+It is used for recording where the current token match begins. This variable
+may be used in action code for retrieving the text of the current match. Ragel
+ensures that in between tokens and outside of the longest-match machines that
+this pointer is set to null. In between calls to the execute code the user must
+check if \verb|ts| is set and if so, ensure that the data it points to is
+preserved ahead of the next buffer block. This is described in more detail
+below.
+
+The \verb|te| variable must also be defined as a pointer to the input data.
+It is used for recording where a match ends and where scanning of the next
+token should begin. This can also be used in action code for retrieving the
+text of the current match.
+
+The \verb|act| variable must be defined as an integer type. It is used for
+recording the identity of the last pattern matched when the scanner must go
+past a matched pattern in an attempt to make a longer match. If the longer
+match fails it may need to consult the \verb|act| variable. In some cases, use
+of the \verb|act|
+variable can be avoided because the value of the current state is enough
+information to determine which token to accept, however in other cases this is
+not enough and so the \verb|act| variable is used.
+
+When the longest-match operator is in use, the user's driver code must take on
+some buffer management functions. The following algorithm gives an overview of
+the steps that should be taken to properly use the longest-match operator.
+
+\begin{itemize}
+\item Read a block of input data.
+\item Run the execute code.
+\item If \verb|ts| is set, the execute code will expect the incomplete
+token to be preserved ahead of the buffer on the next invocation of the execute
+code.
+\begin{itemize}
+\item Shift the data beginning at \verb|ts| and ending at \verb|pe| to the
+beginning of the input buffer.
+\item Reset \verb|ts| to the beginning of the buffer.
+\item Shift \verb|te| by the distance from the old value of \verb|ts|
+to the new value. The \verb|te| variable may or may not be valid. There is
+no way to know if it holds a meaningful value because it is not kept at null
+when it is not in use. It can be shifted regardless.
+\end{itemize}
+\item Read another block of data into the buffer, immediately following any
+preserved data.
+\item Run the scanner on the new data.
+\end{itemize}
+
+Figure \ref{preserve_example} shows the required handling of an input stream in
+which a token is broken by the input block boundaries. After processing up to
+and including the ``t'' of ``characters'', the prefix of the string token must be
+retained and processing should resume at the ``e'' on the next iteration of
+the execute code.
+
+If one uses a large input buffer for collecting input then the number of times
+the shifting must be done will be small. Furthermore, if one takes care not to
+define tokens that are allowed to be very long and instead processes these
+items using pure state machines or sub-scanners, then only a small amount of
+data will ever need to be shifted.
+
+\begin{figure}
+\small
+\begin{verbatim}
+ a) A stream "of characters" to be scanned.
+ | | |
+ p ts pe
+
+ b) "of characters" to be scanned.
+ | | |
+ ts p pe
+\end{verbatim}
+\verbspace
+\caption{Following an invocation of the execute code there may be a partially
+matched token (a). The data of the partially matched token
+must be preserved ahead of the new data on the next invocation (b).
+}
+\label{preserve_example}
+\end{figure}
+
+Since scanners attempt to make the longest possible match of input, patterns
+such as identifiers require one character of lookahead in order to trigger a
+match. In the case of the last token in the input stream the user must ensure
+that the \verb|eof| variable is set so that the final token is flushed out.
+
+An example scanner processing loop is given in Figure \ref{scanner-loop}.
+
+\begin{figure}
+\small
+\begin{verbatim}
+ int have = 0;
+ bool done = false;
+ while ( !done ) {
+ /* How much space is in the buffer? */
+ int space = BUFSIZE - have;
+ if ( space == 0 ) {
+ /* Buffer is full. */
+ cerr << "TOKEN TOO BIG" << endl;
+ exit(1);
+ }
+
+ /* Read in a block after any data we already have. */
+ char *p = inbuf + have;
+ cin.read( p, space );
+ int len = cin.gcount();
+
+ char *pe = p + len;
+ char *eof = 0;
+
+ /* If no data was read indicate EOF. */
+ if ( len == 0 ) {
+ eof = pe;
+ done = true;
+ }
+
+ %% write exec;
+
+ if ( cs == Scanner_error ) {
+ /* Machine failed before finding a token. */
+ cerr << "PARSE ERROR" << endl;
+ exit(1);
+ }
+
+ if ( ts == 0 )
+ have = 0;
+ else {
+ /* There is a prefix to preserve, shift it over. */
+ have = pe - ts;
+ memmove( inbuf, ts, have );
+ te = inbuf + (te-ts);
+ ts = inbuf;
+ }
+ }
+\end{verbatim}
+\verbspace
+\caption{A processing loop for a scanner.
+}
+\label{scanner-loop}
+\end{figure}
+
+\section{State Charts}
+\label{state-charts}
+
+In addition to supporting the construction of state machines using regular
+languages, Ragel provides a way to manually specify state machines using
+state charts. The comma operator combines machines together without any
+implied transitions. The user can then manually link machines by specifying
+epsilon transitions with the \verb|->| operator. Epsilon transitions are drawn
+between the final states of a machine and entry points defined by labels. This
+makes it possible to build machines using the explicit state-chart method while
+making minimal changes to the Ragel language.
+
+An interesting feature of Ragel's state chart construction method is that it
+can be mixed freely with regular expression constructions. A state chart may be
+referenced from within a regular expression, or a regular expression may be
+used in the definition of a state chart transition.
+
+\subsection{Join}
+
+\verb|expr , expr , ...|
+
+Join a list of machines together without
+drawing any transitions, without setting up a start state, and without
+designating any final states. Transitions between the machines may be specified
+using labels and epsilon transitions. The start state must be explicity
+specified with the ``start'' label. Final states may be specified with an
+epsilon transition to the implicitly created ``final'' state. The join
+operation allows one to build machines using a state chart model.
+
+\subsection{Label}
+
+\verb|label: expr|
+
+Attaches a label to an expression. Labels can be
+used as the target of epsilon transitions and explicit control transfer
+statements such as \verb|fgoto| and \verb|fnext| in action
+code.
+
+\subsection{Epsilon}
+
+\verb|expr -> label|
+
+Draws an epsilon transition to the state defined
+by \verb|label|. Epsilon transitions are made deterministic when join
+operators are evaluated. Epsilon transitions that are not in a join operation
+are made deterministic when the machine definition that contains the epsilon is
+complete. See Section \ref{labels} for information on referencing labels.
+
+\subsection{Simplifying State Charts}
+
+There are two benefits to providing state charts in Ragel. The first is that it
+allows us to take a state chart with a full listing of states and transitions
+and simplify it in selective places using regular expressions.
+
+The state chart method of specifying parsers is very common. It is an
+effective programming technique for producing robust code. The key disadvantage
+becomes clear when one attempts to comprehend a large parser specified in this
+way. These programs usually require many lines, causing logic to be spread out
+over large distances in the source file. Remembering the function of a large
+number of states can be difficult and organizing the parser in a sensible way
+requires discipline because branches and repetition present many file layout
+options. This kind of programming takes a specification with inherent
+structure such as looping, alternation and concatenation and expresses it in a
+flat form.
+
+If we could take an isolated component of a manually programmed state chart,
+that is, a subset of states that has only one entry point, and implement it
+using regular language operators then we could eliminate all the explicit
+naming of the states contained in it. By eliminating explicitly named states
+and replacing them with higher-level specifications we simplify a state machine
+specification.
+
+For example, sometimes chains of states are needed, with only a small number of
+possible characters appearing along the chain. These can easily be replaced
+with a concatenation of characters. Sometimes a group of common states
+implement a loop back to another single portion of the machine. Rather than
+manually duplicate all the transitions that loop back, we may be able to
+express the loop using a kleene star operator.
+
+Ragel allows one to take this state map simplification approach. We can build
+state machines using a state map model and implement portions of the state map
+using regular languages. In place of any transition in the state machine,
+entire sub-machines can be given. These can encapsulate functionality
+defined elsewhere. An important aspect of the Ragel approach is that when we
+wrap up a collection of states using a regular expression we do not lose
+access to the states and transitions. We can still execute code on the
+transitions that we have encapsulated.
+
+\subsection{Dropping Down One Level of Abstraction}
+\label{down}
+
+The second benefit of incorporating state charts into Ragel is that it permits
+us to bypass the regular language abstraction if we need to. Ragel's action
+embedding operators are sometimes insufficient for expressing certain parsing
+tasks. In the same way that is useful for C language programmers to drop down
+to assembly language programming using embedded assembler, it is sometimes
+useful for the Ragel programmer to drop down to programming with state charts.
+
+In the following example, we wish to buffer the characters of an XML CDATA
+sequence. The sequence is terminated by the string \verb|]]>|. The challenge
+in our application is that we do not wish the terminating characters to be
+buffered. An expression of the form \verb|any* @buffer :>> ']]>'| will not work
+because the buffer will always contain the characters \verb|]]| on the end.
+Instead, what we need is to delay the buffering of \verb|]|
+characters until a time when we
+abandon the terminating sequence and go back into the main loop. There is no
+easy way to express this using Ragel's regular expression and action embedding
+operators, and so an ability to drop down to the state chart method is useful.
+
+% GENERATE: dropdown
+% OPT: -p
+% %%{
+% machine dropdown;
+\begin{inline_code}
+\begin{verbatim}
+action bchar { buff( fpc ); } # Buffer the current character.
+action bbrack1 { buff( "]" ); }
+action bbrack2 { buff( "]]" ); }
+
+CDATA_body =
+start: (
+ ']' -> one |
+ (any-']') @bchar ->start
+),
+one: (
+ ']' -> two |
+ [^\]] @bbrack1 @bchar ->start
+),
+two: (
+ '>' -> final |
+ ']' @bbrack1 -> two |
+ [^>\]] @bbrack2 @bchar ->start
+);
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := CDATA_body;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{dropdown}
+\end{center}
+\graphspace
+
+
+\section{Semantic Conditions}
+\label{semantic}
+
+Many communication protocols contain variable-length fields, where the length
+of the field is given ahead of the field as a value. This
+problem cannot be expressed using regular languages because of its
+context-dependent nature. The prevalence of variable-length fields in
+communication protocols motivated us to introduce semantic conditions into
+the Ragel language.
+
+A semantic condition is a block of user code that is interpreted as an
+expression and evaluated immediately
+before a transition is taken. If the code returns a value of true, the
+transition may be taken. We can now embed code that extracts the length of a
+field, then proceed to match $n$ data values.
+
+% GENERATE: conds1
+% OPT: -p
+% %%{
+% machine conds1;
+% number = digit+;
+\begin{inline_code}
+\begin{verbatim}
+action rec_num { i = 0; n = getnumber(); }
+action test_len { i++ < n }
+data_fields = (
+ 'd'
+ [0-9]+ %rec_num
+ ':'
+ ( [a-z] when test_len )*
+)**;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% main := data_fields;
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{conds1}
+\end{center}
+\graphspace
+
+The Ragel implementation of semantic conditions does not force us to give up the
+compositional property of Ragel definitions. For example, a machine that tests
+the length of a field using conditions can be unioned with another machine
+that accepts some of the same strings, without the two machines interfering with
+one another. The user need not be concerned about whether or not the result of the
+semantic condition will affect the matching of the second machine.
+
+To see this, first consider that when a user associates a condition with an
+existing transition, the transition's label is translated from the base character
+to its corresponding value in the space that represents ``condition $c$ true''. Should
+the determinization process combine a state that has a conditional transition
+with another state that has a transition on the same input character but
+without a condition, then the condition-less transition first has its label
+translated into two values, one to its corresponding value in the space that
+represents ``condition $c$ true'' and another to its corresponding value in the
+space that represents ``condition $c$ false''. It
+is then safe to combine the two transitions. This is shown in the following
+example. Two intersecting patterns are unioned, one with a condition and one
+without. The condition embedded in the first pattern does not affect the second
+pattern.
+
+% GENERATE: conds2
+% OPT: -p
+% %%{
+% machine conds2;
+% number = digit+;
+\begin{inline_code}
+\begin{verbatim}
+action test_len { i++ < n }
+action one { /* accept pattern one */ }
+action two { /* accept pattern two */ }
+patterns =
+ ( [a-z] when test_len )+ %one |
+ [a-z][a-z0-9]* %two;
+main := patterns '\n';
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% END GENERATE
+
+\graphspace
+\begin{center}
+\includegraphics[scale=0.55]{conds2}
+\end{center}
+\graphspace
+
+There are many more potential uses for semantic conditions. The user is free to
+use arbitrary code and may therefore perform actions such as looking up names
+in dictionaries, validating input using external parsing mechanisms or
+performing checks on the semantic structure of input seen so far. In the next
+section we describe how Ragel accommodates several common parser engineering
+problems.
+
+The semantic condition feature works only with alphabet types that are smaller
+in width than the \verb|long| type. To implement semantic conditions Ragel
+needs to be able to allocate characters from the alphabet space. Ragel uses
+these allocated characters to express "character C with condition P true" or "C
+with P false." Since internally Ragel uses longs to store characters there is
+no room left in the alphabet space unless an alphabet type smaller than long is
+used.
+
+\section{Implementing Lookahead}
+
+There are a few strategies for implementing lookahead in Ragel programs.
+Leaving actions, which are described in Section \ref{out-actions}, can be
+used as a form of lookahead. Ragel also provides the \verb|fhold| directive
+which can be used in actions to prevent the machine from advancing over the
+current character. It is also possible to manually adjust the current character
+position by shifting it backwards using \verb|fexec|, however when this is
+done, care must be taken not to overstep the beginning of the current buffer
+block. In both the use of \verb|fhold| and \verb|fexec| the user must be
+cautious of combining the resulting machine with another in such a way that the
+transition on which the current position is adjusted is not combined with a
+transition from the other machine.
+
+\section{Parsing Recursive Language Structures}
+
+In general Ragel cannot handle recursive structures because the grammar is
+interpreted as a regular language. However, depending on what needs to be
+parsed it is sometimes practical to implement the recursive parts using manual
+coding techniques. This often works in cases where the recursive structures are
+simple and easy to recognize, such as in the balancing of parentheses
+
+One approach to parsing recursive structures is to use actions that increment
+and decrement counters or otherwise recognize the entry to and exit from
+recursive structures and then jump to the appropriate machine defnition using
+\verb|fcall| and \verb|fret|. Alternatively, semantic conditions can be used to
+test counter variables.
+
+A more traditional approach is to call a separate parsing function (expressed
+in the host language) when a recursive structure is entered, then later return
+when the end is recognized.
+
+\end{document}