diff options
Diffstat (limited to 'test')
120 files changed, 24093 insertions, 0 deletions
diff --git a/test/Makefile.am b/test/Makefile.am new file mode 100644 index 0000000..a217e83 --- /dev/null +++ b/test/Makefile.am @@ -0,0 +1,121 @@ +TESTS = runtests.sh + +EXTRA_DIST = \ + runtests.sh \ + accum1.lm \ + accum2.lm \ + accum3.lm \ + accumbt1.lm \ + accumbt2.lm \ + accumbt3.lm \ + argv1.lm \ + argv2.lm \ + backtrack1.lm \ + backtrack2.lm \ + backtrack3.lm \ + binary1.lm \ + binary1.in \ + btscan1.lm \ + btscan2.lm \ + call1.lm \ + commitbt.lm \ + concat1.lm \ + concat2.lm \ + construct1.lm \ + construct2.lm \ + construct3.lm \ + constructex.lm \ + context1.lm \ + context2.lm \ + context3.lm \ + counting1.lm \ + counting2.lm \ + counting3.lm \ + counting4.lm \ + decl1.lm \ + decl2.lm \ + decl3.lm \ + div.lm \ + exit1.lm \ + exit2.lm \ + exit3.lm \ + export1.lm \ + factor1.lm \ + factor2.lm \ + factor3.lm \ + factor4.lm \ + factor5.lm \ + factor6.lm \ + forloop1.lm \ + forloop2.lm \ + forloop3.lm \ + func1.lm \ + func2.lm \ + func3.lm \ + generate1.lm \ + generate2.lm \ + heredoc.lm \ + ifblock1.lm \ + ignore1.lm \ + ignore2.lm \ + ignore3.lm \ + ignore4.lm \ + ignore5.lm \ + include1.lm \ + include1a.lmi \ + inpush1.lm \ + inpush1a.in \ + inpush1b.in \ + inpush1c.in \ + island.lm \ + lhs1.lm \ + liftattrs.lm \ + literal1.lm \ + lookup1.lm \ + mailbox.lm \ + matchex.lm \ + maxlen.lm \ + multiregion1.lm \ + multiregion2.lm \ + mutualrec.lm \ + namespace1.lm \ + nestedcomm.lm \ + order1.lm \ + order2.lm \ + parse1.lm \ + prints.lm \ + pull1.lm \ + pull2.lm \ + ragelambig1.lm \ + ragelambig2.lm \ + ragelambig3.lm \ + ragelambig4.lm \ + rediv.lm \ + reor1.lm \ + reor2.lm \ + reparse.lm \ + repeat1.lm \ + repeat2.lm \ + rhsref1.lm \ + rubyhere.lm \ + scope1.lm \ + sprintf.lm \ + string.lm \ + superid.lm \ + tags1.lm \ + tags2.lm \ + tags3.lm \ + tags4.lm \ + tcontext1.lm \ + til.lm \ + translate1.lm \ + translate2.lm \ + travs1.lm \ + treecmp1.lm \ + typeref1.lm \ + typeref2.lm \ + typeref3.lm \ + undofrag1.lm \ + undofrag2.lm \ + undofrag3.lm \ + while1.lm 
diff --git a/test/Makefile.in b/test/Makefile.in new file mode 100644 index 0000000..f8e9651 --- /dev/null +++ b/test/Makefile.in @@ -0,0 +1,579 @@ +# Makefile.in generated by automake 1.11.3 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software +# Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = test +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.in +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/src/config.h \ + $(top_builddir)/src/defs.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +SOURCES = +DIST_SOURCES = +am__tty_colors = \ +red=; grn=; lgn=; blu=; std= +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ 
+AMTAR = @AMTAR@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PUBDATE = @PUBDATE@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ 
+datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +TESTS = runtests.sh +EXTRA_DIST = \ + runtests.sh \ + accum1.lm \ + accum2.lm \ + accum3.lm \ + accumbt1.lm \ + accumbt2.lm \ + accumbt3.lm \ + argv1.lm \ + argv2.lm \ + backtrack1.lm \ + backtrack2.lm \ + backtrack3.lm \ + binary1.lm \ + binary1.in \ + btscan1.lm \ + btscan2.lm \ + call1.lm \ + commitbt.lm \ + concat1.lm \ + concat2.lm \ + construct1.lm \ + construct2.lm \ + construct3.lm \ + constructex.lm \ + context1.lm \ + context2.lm \ + context3.lm \ + counting1.lm \ + counting2.lm \ + counting3.lm \ + counting4.lm \ + decl1.lm \ + decl2.lm \ + decl3.lm \ + div.lm \ + exit1.lm \ + exit2.lm \ + exit3.lm \ + export1.lm \ + factor1.lm \ + factor2.lm \ + factor3.lm \ + factor4.lm \ + factor5.lm \ + factor6.lm \ + forloop1.lm \ + forloop2.lm \ + forloop3.lm \ + func1.lm \ + func2.lm \ + func3.lm \ + generate1.lm \ + generate2.lm \ + heredoc.lm \ + ifblock1.lm \ + ignore1.lm \ + ignore2.lm \ + ignore3.lm \ + ignore4.lm \ + ignore5.lm \ + include1.lm \ + include1a.lmi \ + inpush1.lm \ + inpush1a.in \ + inpush1b.in \ + inpush1c.in \ + island.lm \ + lhs1.lm \ + liftattrs.lm \ + literal1.lm \ + lookup1.lm \ + mailbox.lm \ + matchex.lm \ + maxlen.lm \ + 
multiregion1.lm \ + multiregion2.lm \ + mutualrec.lm \ + namespace1.lm \ + nestedcomm.lm \ + order1.lm \ + order2.lm \ + parse1.lm \ + prints.lm \ + pull1.lm \ + pull2.lm \ + ragelambig1.lm \ + ragelambig2.lm \ + ragelambig3.lm \ + ragelambig4.lm \ + rediv.lm \ + reor1.lm \ + reor2.lm \ + reparse.lm \ + repeat1.lm \ + repeat2.lm \ + rhsref1.lm \ + rubyhere.lm \ + scope1.lm \ + sprintf.lm \ + string.lm \ + superid.lm \ + tags1.lm \ + tags2.lm \ + tags3.lm \ + tags4.lm \ + tcontext1.lm \ + til.lm \ + translate1.lm \ + translate2.lm \ + travs1.lm \ + treecmp1.lm \ + typeref1.lm \ + typeref2.lm \ + typeref3.lm \ + undofrag1.lm \ + undofrag2.lm \ + undofrag3.lm \ + while1.lm + +all: all-am + +.SUFFIXES: +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign test/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign test/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs +tags: TAGS +TAGS: + +ctags: CTAGS +CTAGS: + + +check-TESTS: $(TESTS) + @failed=0; all=0; xfail=0; xpass=0; skip=0; \ + srcdir=$(srcdir); export srcdir; \ + list=' $(TESTS) '; \ + $(am__tty_colors); \ + if test -n "$$list"; then \ + for tst in $$list; do \ + if test -f ./$$tst; then dir=./; \ + elif test -f $$tst; then dir=; \ + else dir="$(srcdir)/"; fi; \ + if $(TESTS_ENVIRONMENT) $${dir}$$tst; then \ + all=`expr $$all + 1`; \ + case " $(XFAIL_TESTS) " in \ + *[\ \ ]$$tst[\ \ ]*) \ + xpass=`expr $$xpass + 1`; \ + failed=`expr $$failed + 1`; \ + col=$$red; res=XPASS; \ + ;; \ + *) \ + col=$$grn; res=PASS; \ + ;; \ + esac; \ + elif test $$? 
-ne 77; then \ + all=`expr $$all + 1`; \ + case " $(XFAIL_TESTS) " in \ + *[\ \ ]$$tst[\ \ ]*) \ + xfail=`expr $$xfail + 1`; \ + col=$$lgn; res=XFAIL; \ + ;; \ + *) \ + failed=`expr $$failed + 1`; \ + col=$$red; res=FAIL; \ + ;; \ + esac; \ + else \ + skip=`expr $$skip + 1`; \ + col=$$blu; res=SKIP; \ + fi; \ + echo "$${col}$$res$${std}: $$tst"; \ + done; \ + if test "$$all" -eq 1; then \ + tests="test"; \ + All=""; \ + else \ + tests="tests"; \ + All="All "; \ + fi; \ + if test "$$failed" -eq 0; then \ + if test "$$xfail" -eq 0; then \ + banner="$$All$$all $$tests passed"; \ + else \ + if test "$$xfail" -eq 1; then failures=failure; else failures=failures; fi; \ + banner="$$All$$all $$tests behaved as expected ($$xfail expected $$failures)"; \ + fi; \ + else \ + if test "$$xpass" -eq 0; then \ + banner="$$failed of $$all $$tests failed"; \ + else \ + if test "$$xpass" -eq 1; then passes=pass; else passes=passes; fi; \ + banner="$$failed of $$all $$tests did not behave as expected ($$xpass unexpected $$passes)"; \ + fi; \ + fi; \ + dashes="$$banner"; \ + skipped=""; \ + if test "$$skip" -ne 0; then \ + if test "$$skip" -eq 1; then \ + skipped="($$skip test was not run)"; \ + else \ + skipped="($$skip tests were not run)"; \ + fi; \ + test `echo "$$skipped" | wc -c` -le `echo "$$banner" | wc -c` || \ + dashes="$$skipped"; \ + fi; \ + report=""; \ + if test "$$failed" -ne 0 && test -n "$(PACKAGE_BUGREPORT)"; then \ + report="Please report to $(PACKAGE_BUGREPORT)"; \ + test `echo "$$report" | wc -c` -le `echo "$$banner" | wc -c` || \ + dashes="$$report"; \ + fi; \ + dashes=`echo "$$dashes" | sed s/./=/g`; \ + if test "$$failed" -eq 0; then \ + col="$$grn"; \ + else \ + col="$$red"; \ + fi; \ + echo "$${col}$$dashes$${std}"; \ + echo "$${col}$$banner$${std}"; \ + test -z "$$skipped" || echo "$${col}$$skipped$${std}"; \ + test -z "$$report" || echo "$${col}$$report$${std}"; \ + echo "$${col}$$dashes$${std}"; \ + test "$$failed" -eq 0; \ + else :; fi + +distdir: 
$(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am + $(MAKE) $(AM_MAKEFLAGS) check-TESTS +check: check-am +all-am: Makefile +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f 
$(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-generic + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-generic mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: check-am install-am install-strip + +.PHONY: all all-am check check-TESTS check-am clean clean-generic \ + clean-libtool distclean distclean-generic distclean-libtool \ + distdir dvi dvi-am html html-am info info-am install \ + install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + uninstall uninstall-am + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/test/accum1.lm b/test/accum1.lm new file mode 100644 index 0000000..f4aa963 --- /dev/null +++ b/test/accum1.lm @@ -0,0 +1,36 @@ +##### LM ##### +lex + ignore /space+/ + literal `* `( `) + token id /[a-zA-Z_]+/ +end + +def item + [id] +| [`( item* `)] + +def start + [item*] + +parse Input: start[ stdin ] + +cons Output: accum<start> [] + +for Id: id in Input { + send Output + "( [^Id] ) +} + +S: start = Output() + +print( S ) +##### IN ##### +a b c ( chocolate fudge ) d e +##### EXP ##### +( a ) +( b ) +( c ) +( chocolate ) +( fudge ) +( d ) +( e ) diff --git a/test/accum2.lm b/test/accum2.lm new file mode 100644 index 0000000..05e377b --- /dev/null +++ b/test/accum2.lm @@ -0,0 +1,38 @@ +##### LM ##### +context ctx + +i: int +j: int +k: int + +lex + ignore /space+/ + literal `* `( `) + token id /[a-zA-Z_]+/ +end + +def foo [id] + +def item + [id] +| [foo] +| [`( item* `)] + { + i = 0 + } + +def start + [item*] + +end # ctx + + +cons SP: parser<ctx::start> [] +SP.ctx = cons ctx [] +send SP [stdin] +Input: ctx::start = SP() +print( Input ) +##### IN ##### +a b c ( d e f ) +##### EXP ##### +a b c ( d e f ) diff --git a/test/accum3.lm b/test/accum3.lm new file mode 100644 index 0000000..956bb3f --- /dev/null +++ b/test/accum3.lm @@ -0,0 +1,36 @@ +##### LM ##### +lex + literal `- + token file /^('-'|0)(^0)*/ +end + +token word /(^0)+/ +token zero /0/ + +lex + token single /[qvh]/ + token with_opt /[oi]/ +end + +def item + [file zero] +| [`- single* zero] +| [`- with_opt zero? 
word zero] + +def args + [word zero item*] + +cons ArgParser: parser<args> [] + +ArgV: list<str> ArgV = argv +for A: str in ArgV + send ArgParser [A '\0'] + +Args: args = ArgParser() + +print_xml( Args ) +print( '\n' ) +##### ARGS ##### +-qv -h -o output sdf -i eth0 file +##### EXP ##### +<args><word>./working/accum3</word><zero>�</zero><_repeat_item><item><_literal_0001>-</_literal_0001><_repeat_single><single>q</single><single>v</single></_repeat_single><zero>�</zero></item><item><_literal_0001>-</_literal_0001><_repeat_single><single>h</single></_repeat_single><zero>�</zero></item><item><_literal_0001>-</_literal_0001><with_opt>o</with_opt><_opt_zero><zero>�</zero></_opt_zero><word>output</word><zero>�</zero></item><item><file>sdf</file><zero>�</zero></item><item><_literal_0001>-</_literal_0001><with_opt>i</with_opt><_opt_zero><zero>�</zero></_opt_zero><word>eth0</word><zero>�</zero></item><item><file>file</file><zero>�</zero></item></_repeat_item></args> diff --git a/test/accumbt1.lm b/test/accumbt1.lm new file mode 100644 index 0000000..77d56f5 --- /dev/null +++ b/test/accumbt1.lm @@ -0,0 +1,41 @@ +##### LM ##### + +# Token names. 
+lex + token number /[0-9]+/ + token id /[a-z]+/ + token string /'"' [^"]* '"'/ + ignore ws / [ \t\n]+ / +end + +def prefix [id] + +def choice1 + [number number] +| [number] + +def choice2 + [string id] +| [number number] +| [id number] +| [number] + +def start + [prefix choice1 choice2 string id id] + +cons I: parser<start> [] + +send I " id " +send I " 77 " +send I " 88 " +send I " \"hello\" " +send I " dude " +send I " dude\n" + +S: start = I() +S = match S + ~id 77 88 "hello" dude dude +print_xml( S ) +print( '\n' ) +##### EXP ##### +<start><prefix><id>id</id></prefix><choice1><number>77</number></choice1><choice2><number>88</number></choice2><string>"hello"</string><id>dude</id><id>dude</id></start> diff --git a/test/accumbt2.lm b/test/accumbt2.lm new file mode 100644 index 0000000..6d21c30 --- /dev/null +++ b/test/accumbt2.lm @@ -0,0 +1,54 @@ +##### LM ##### +context accum_bt + +lex + ignore /[ \t\n]+/ + token id1 /[a-zA-Z_][a-zA-Z_0-9]*/ + + def one [ id1* ] +end + +OneParser: accum<one> + +lex + ignore /[ \t]+/ + token id2 /[a-zA-Z_][a-zA-Z_0-9]*/ + literal `! `; + token NL /'\n'/ + + def A1 [] + { print( "A1\n" ) } + + def A2 [] + { print( "A2\n" ) } + + def item2 + [id2] + { + send OneParser [' extra '] + send OneParser [$r1] + } + + def two + [A1 item2* `! NL] + | [A2 item2* `; NL] +end + +end # accum_bt + +AccumBt: accum_bt = cons accum_bt[] +AccumBt.OneParser = cons parser<accum_bt::one>[] + +parse Two: accum_bt::two(AccumBt)[ stdin ] + +send AccumBt.OneParser ['\n'] eos + +print( ^Two ) +print( ^( AccumBt.OneParser.tree ) '\n' ) +##### IN ##### +a b c d e ; +##### EXP ##### +A1 +A2 +a b c d e ; +extra a extra b extra c extra d extra e diff --git a/test/accumbt3.lm b/test/accumbt3.lm new file mode 100644 index 0000000..3753282 --- /dev/null +++ b/test/accumbt3.lm @@ -0,0 +1,106 @@ +##### LM ##### +# +# Tokens +# + + +# Any single character can be a literal +lex + # Ignore whitespace. 
+ ignore /[ \t\n\r\v]+/ + + # Open and close id + token id /[a-zA-Z_][a-zA-Z0-9_]*/ + + token open_paren /'('/ + { + parse_stop NC: nested_comment[ input ] + print( 'discarding: ' NC '\n' ) + } +end + +# +# Token translation +# + +lex + literal `( `) + token nc_data /[^()]+/ +end + +def nc_item + [nc_data] +| [nested_comment] + +def nested_comment + [`( nc_item* `)] + +def nested [id*] + +# +# Accumulator. +# +context accum_bt + +NestedParser: accum<nested> + +lex + ignore /[ \t]+/ + token word /[a-zA-Z0-9/*+_\-]+/ + token stuff /[a-zA-Z0-9()/*+_\- ]+/ + literal `! `; + token NL /'\n'/ + +end + +def A1 [] + { print( "A1\n" ) } + +def A2 [] + { print( "A2\n" ) } + +def item + [word] + { + send NestedParser [' '] + send NestedParser [$r1] + send NestedParser [' '] + } +| + [stuff] + { + send NestedParser [' '] + send NestedParser [$r1] + send NestedParser [' '] + } + +def two + [A1 item* `! NL] +| [A2 item* `; NL] + +end # accum_bt + +cons AccumBt: accum_bt[] +AccumBt.NestedParser = cons parser<nested>[] + +parse Two: accum_bt::two(AccumBt)[ stdin ] + +send AccumBt.NestedParser [] eos +Nested: nested = AccumBt.NestedParser.tree + +print( '\n------------\n' ) +print( ^Nested '\n' ) +print( ^Two '\n' ) + +##### IN ##### +hello there ( (this is a nested comment /*sdf asd_++_stuff) ) and this is not ; +##### EXP ##### +A1 +discarding: ( (this is a nested comment /*sdf asd_++_stuff) ) +A2 +discarding: ( (this is a nested comment /*sdf asd_++_stuff) ) + +------------ +hello there and this is not +hello there ( (this is a nested comment /*sdf asd_++_stuff) ) and this is not ; + diff --git a/test/argv1.lm b/test/argv1.lm new file mode 100644 index 0000000..74086e7 --- /dev/null +++ b/test/argv1.lm @@ -0,0 +1,8 @@ +##### LM ##### + +print_xml( argv ) +print( '\n' ) +##### ARGS ##### +a b c 1 2 3 +##### EXP ##### +<__list0><str>./working/argv1</str><str>a</str><str>b</str><str>c</str><str>1</str><str>2</str><str>3</str></__list0> diff --git a/test/argv2.lm b/test/argv2.lm new 
file mode 100644 index 0000000..5c84564 --- /dev/null +++ b/test/argv2.lm @@ -0,0 +1,91 @@ +##### LM ##### +lex + literal `- + token file /^('-'|0)(^0)*/ +end + +lex + token single /[qvh]/ + token with_opt /[oi]/ + token dash /'-'/ + + literal `help `verbose `input `= +end + +def long + [`help] +| [`verbose] + +def long_with_opt + [`input] + +def long_eqals + [`=] +| [zero] + +token word /(^0)+/ +token zero /0/ + +def item + [`- single* zero] +| [`- with_opt zero? word zero] +| [`- dash long zero] +| [`- dash long_with_opt long_eqals word zero] +| [file zero] + +def args + [word zero item*] + +# The argument parser. Using an accumulator so we can send nulls after each +# arg. +cons ArgParser: parser<args>[] + +# Parse the args and extract the result into Args. +ArgV: list<str> = argv +for A: str in ArgV + send ArgParser [A '\0'] +Args: args = ArgParser() + +# Process the args. +for Item: item in Args { + if match Item + [`- SL: single* zero] + { + for S: single in SL + print( "single: [$S]\n" ) + } + elsif match Item + [`- W: with_opt zero? Opt: word zero] + { + print( "with opt: [$W] -> [$Opt]\n" ) + } + elsif match Item + [`- dash L: long zero] + { + print("long: [$L]\n" ) + } + elsif match Item + [`- dash LO: long_with_opt long_eqals LongOpt: word zero] + { + print("long: [$LO] -> [$LongOpt]\n" ) + } + elsif match Item + [F: file zero] + { + print("file: [$F]\n" ) + } +} + +##### ARGS ##### +-qv -h -o output -iinput file --input=foo --input bar --help --verbose +##### EXP ##### +single: q +single: v +single: h +with opt: o -> output +with opt: i -> input +file: file +long: input -> foo +long: input -> bar +long: help +long: verbose diff --git a/test/backtrack1.lm b/test/backtrack1.lm new file mode 100644 index 0000000..76f3705 --- /dev/null +++ b/test/backtrack1.lm @@ -0,0 +1,29 @@ +##### LM ##### +# Token names. 
+ +lex + literal `+ `* + token number /[0-9]+/ + ignore ws / [ \t\n]+ / +end + +def F + [number `+] +| [number] +| [F `* number] + +def E + [E `+ F] +| [F] + +def start + [E] + +parse S: start[ stdin ] +R: start = match S ~ 9 + 9 +print_xml( R ) +print( '\n' ) +##### IN ##### +9 + 9 +##### EXP ##### +<start><E><E><F><number>9</number></F></E><_literal_0001>+</_literal_0001><F><number>9</number></F></E></start> diff --git a/test/backtrack2.lm b/test/backtrack2.lm new file mode 100644 index 0000000..fa3cdfc --- /dev/null +++ b/test/backtrack2.lm @@ -0,0 +1,29 @@ +##### LM ##### + +# Token names. +lex + token id /[a-z]+/ + ignore ws /[ \t\n]+/ +end + +token bang1 /'!'/ +token bang2 /'!'/ + +def one [bang1 id id id] + +def two [bang2 id id id id] + +def prods + [one] +| [two] + +def start + [prods] + +parse S: start[ stdin ] +print_xml( match S "!aa bb cc dd" ) +print( '\n' ) +##### IN ##### +!aa bb cc dd +##### EXP ##### +<start><prods><two><bang2>!</bang2><id>aa</id><id>bb</id><id>cc</id><id>dd</id></two></prods></start> diff --git a/test/backtrack3.lm b/test/backtrack3.lm new file mode 100644 index 0000000..8c6dfc3 --- /dev/null +++ b/test/backtrack3.lm @@ -0,0 +1,34 @@ +##### LM ##### + +# Token names. 
+lex + token number /[0-9]+/ + token id /[a-z]+/ + token string /'"' [^"]* '"'/ + ignore ws / [ \t\n]+ / +end + +def prefix [id] + +def choice1 + [number number] +| [number] + +def choice2 + [string id] +| [number number] +| [id number] +| [number] + +def start + [prefix choice1 choice2 string id id] + { + print_xml( match lhs "id 77 88 \"hello\" dude dude\n" ) + } + +parse start[stdin] +print( '\n' ) +##### IN ##### +id 77 88 "hello" dude dude +##### EXP ##### +<start><prefix><id>id</id></prefix><choice1><number>77</number></choice1><choice2><number>88</number></choice2><string>"hello"</string><id>dude</id><id>dude</id></start> diff --git a/test/binary1.in b/test/binary1.in Binary files differnew file mode 100644 index 0000000..8da7d3d --- /dev/null +++ b/test/binary1.in diff --git a/test/binary1.lm b/test/binary1.lm new file mode 100644 index 0000000..7fc8337 --- /dev/null +++ b/test/binary1.lm @@ -0,0 +1,1902 @@ +##### LM ##### + +context binary + +# Used for most of the grammar. +token octet /any/ + +# Filled in during the parsing of resource records. Determine what RR_UNKNOWN +# translates to. +rr_type_value: int +rr_class_value: int + +# Tokens generated from RR_UNKNOWN. Used to pick the kind +# of resource record to attempt to parse. 
+token RR_A // # 1 a host address +token RR_NS // # 2 an authoritative name server +token RR_MD // # 3 a mail destination (Obsolete - use MX) +token RR_MF // # 4 a mail forwarder (Obsolete - use MX) +token RR_CNAME // # 5 the canonical name for an alias +token RR_SOA // # 6 marks the start of a zone of authority +token RR_MB // # 7 a mailbox domain name (EXPERIMENTAL) +token RR_MG // # 8 a mail group member (EXPERIMENTAL) +token RR_MR // # 9 a mail rename domain name (EXPERIMENTAL) +token RR_NULL // # 10 a null RR (EXPERIMENTAL) +token RR_WKS // # 11 a well known service description +token RR_PTR // # 12 a domain name pointer +token RR_HINFO // # 13 host information +token RR_MINFO // # 14 mailbox or mail list information +token RR_MX // # 15 mail exchange +token RR_TXT // # 16 text strings + +token RR_UNKNOWN + /''/ + { + id: int = typeid<RR_UNKNOWN> + if rr_type_value == 1 + id = typeid<RR_A> + elsif rr_type_value == 2 + id = typeid<RR_NS> + elsif rr_type_value == 5 + id = typeid<RR_CNAME> + elsif rr_type_value == 12 + id = typeid<RR_PTR> + elsif rr_type_value == 15 + id = typeid<RR_MX> + elsif rr_type_value == 16 + id = typeid<RR_TXT> + + input.push( make_token( id '' ) ) + } + +# Convert two octets in network order into an unsigned 16 bit value. 
+int network_uord16( o1: octet o2: octet ) +{ + return o1.data.uord8() * 256 + o2.data.uord8() +} + + +def message + [header questions answers authorities additionals] + +question_count: int +answer_count: int +authority_count: int +additional_count: int + +# Message Header +# +# 1 1 1 1 1 1 +# 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 +# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +# | ID | +# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +# |QR| Opcode |AA|TC|RD|RA| Z | RCODE | +# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +# | QDCOUNT | +# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +# | ANCOUNT | +# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +# | NSCOUNT | +# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +# | ARCOUNT | +# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +def header + [header_id header_fields count count count count] + { + question_count = r3.count + answer_count = r4.count + authority_count = r5.count + additional_count = r6.count + } + +def header_id + [octet octet] + +def header_fields + [octet octet] + +def count + count: int + [octet octet] + { + lhs.count = network_uord16( r1 r2 ) + } + +# +# Counting Primitives +# +# Uses a stack of lengths. Using a stack allows for counted lists to be +# nested. As the list is consumed it brings the count down to zero. To use +# it, push a new count value to the list and include it in a +# right-recursive list like so: +# +# def LIST +# [count_inc ITEM LIST] +# [count_end] +# end +# + +CL: list<int> + +int start_list( count: int ) +{ + CL.push( count ) +} + +def count_inc + [] + { + if CL.top == 0 { + reject + } else { + CL.top = CL.top - 1 + } + } + +def count_end + [] + { + if CL.top != 0 { + reject + } else { + CL.pop() + } + } + +# +# Octet List +# + +# General octet list. Length must be set to use this. 
+# Counted list of raw octets; start_list() must have pushed its length.
+def octet_list
+	[count_inc octet octet_list]
+|	[count_end]
+
+
+#
+# Names
+#
+
+# A domain name: zero or more length-prefixed parts, then a terminator.
+def name
+	[name_part* name_end]
+
+# Name part lists are terminated by a zero length or a pointer.
+def name_end
+	# Zero length ending
+	[octet]
+	{
+		val: int = r1.data.uord8()
+		if val != 0 {
+			reject
+		}
+	}
+
+	# Pointer ending (RFC 1035 compression): two high bits set, so the
+	# first octet is >= 192; values 64..191 would be malformed but are
+	# accepted by this test's < 64 check.
+	# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+	# | 1  1|                OFFSET                   |
+	# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+|	[octet octet]
+	{
+		val: int = r1.data.uord8()
+		if val < 64 {
+			reject
+		}
+	}
+
+#
+# Get some number of bytes.
+#
+
+# How many to get
+nbytes: int
+
+# We use this token to eliminate the lookahead that would be needed to
+# cause a reduce of part_len. This forces whatever comes before nbytes to
+# be reduced before nbytes_data token is fetched from the scanner. We need
+# this because nbytes_data depends on the nbytes in the context and we need
+# to ensure that it is set.
+token nb_empty /''/
+
+# Fetch nbytes bytes: the zero-width token's generation action pulls
+# exactly nbytes octets from the input and pushes them back as one token.
+token nbytes_data
+	/''/
+	{
+		input.push( make_token( typeid<nbytes_data> input.pull(nbytes) ) )
+	}
+
+def nbytes
+	[nb_empty nbytes_data]
+
+# One label of a domain name: length byte, then that many bytes.
+def name_part
+	[part_len nbytes]
+
+
+def part_len
+	[octet]
+	{
+		# A name part list is terminated either by a zero length or a pointer,
+		# which must have the two high bits set.
+		count: int = r1.data.uord8()
+		if count == 0 || count >= 64 {
+			reject
+		} else {
+			# Set the number of bytes to get for the name part.
+			nbytes = count
+		}
+	}
+
+#
+# Resource Records (RFC 1035, section 4.1.3)
+#
+
+#                                 1  1  1  1  1  1
+#   0  1  2  3  4  5  6  7  8  9  0  1  2  3  4  5
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |                                               |
+# /                                               /
+# /                      NAME                     /
+# |                                               |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |                      TYPE                     |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |                     CLASS                     |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |                      TTL                      |
+# |                                               |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |                   RDLENGTH                    |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--|
+# /                     RDATA                     /
+# /                                               /
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+
+def resource_record
+	[name rr_type rr_class ttl rdlength rdata]
+
+# Stores the 16-bit TYPE into rr_type_value — presumably a global declared
+# earlier in this file (before this chunk); verify against full source.
+def rr_type
+	[octet octet]
+	{
+		rr_type_value = network_uord16( r1 r2 )
+	}
+
+# NOTE(review): declares an LHS attribute `value` but writes the global
+# rr_class_value instead — looks intentional for this test, but confirm.
+def rr_class
+	value: int
+	[octet octet]
+	{
+		rr_class_value = network_uord16( r1 r2 )
+	}
+
+# 32-bit TTL (not decoded further).
+def ttl
+	[octet octet octet octet]
+
+# Zero-width token whose generation action pulls rdata_length bytes of
+# RDATA from the input (same pattern as nbytes_data above).
+token rdata_bytes
+	/''/
+	{
+		input.push( make_token( typeid<rdata_bytes> input.pull(rdata_length) ) )
+	}
+
+# Decodes RDLENGTH into the global consumed by rdata_bytes.
+def rdlength
+	[octet octet]
+	{
+		rdata_length = network_uord16( r1 r2 )
+	}
+
+rdata_length: int
+
+# RDATA interpretation, dispatched by record type. RR_A, RR_NS, etc. are
+# presumably tokens/nonterminals defined earlier in the file (before this
+# chunk) keyed off rr_type_value.
+def rdata
+	[RR_UNKNOWN rdata_bytes]
+|	[RR_A address]
+|	[RR_NS name]
+|	[RR_CNAME name]
+|	[RR_PTR name]
+|	[RR_MX octet octet name]
+|	[RR_TXT rdata_bytes]
+
+
+#
+# Address
+#
+# An IPv4 address: four raw octets.
+def address [octet octet octet octet]
+
+#
+# List of Questions
+#
+
+def questions
+	[load_question_count question_list]
+
+# Empty production: seeds the counted-list stack with QDCOUNT.
+def load_question_count
+	[]
+	{
+		start_list( question_count )
+	}
+
+def question_list
+	[count_inc question question_list]
+|	[count_end]
+
+#
+# Question (RFC 1035, section 4.1.2)
+#
+
+#                                 1  1  1  1  1  1
+#   0  1  2  3  4  5  6  7  8  9  0  1  2  3  4  5
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |                                               |
+# /                     QNAME                     /
+# /                                               /
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |                     QTYPE                     |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |                     QCLASS                    |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+
+def question
+	[name qtype qclass]
+
+def qtype
+	[octet octet]
+
+def qclass
+	[octet octet]
+
+#
+# List of Answers
+#
+
+# Answers section: ANCOUNT resource records (same counted-list pattern
+# as questions).
+def answers
+	[load_answer_count answer_list]
+
+# Empty production: seeds the counted-list stack with ANCOUNT.
+def load_answer_count
+	[]
+	{
+		start_list( answer_count )
+	}
+
+def answer_list
+	[count_inc answer answer_list]
+|	[count_end]
+
+#
+# Answer
+#
+
+def answer
+	[resource_record]
+
+#
+# List of Authorities
+#
+
+# Authorities section: NSCOUNT resource records.
+def authorities
+	[load_authority_count authority_list]
+
+# Empty production: seeds the counted-list stack with NSCOUNT.
+def load_authority_count
+	[]
+	{
+		start_list( authority_count )
+	}
+
+def authority_list
+	[count_inc authority authority_list]
+|	[count_end]
+
+#
+# Authority
+#
+
+def authority
+	[resource_record]
+
+#
+# List of Additionals
+#
+
+# Additionals section: ARCOUNT resource records.
+def additionals
+	[load_additional_count additional_list]
+
+# Empty production: seeds the counted-list stack with ARCOUNT.
+def load_additional_count
+	[]
+	{
+		start_list( additional_count )
+	}
+
+def additional_list
+	[count_inc additional additional_list]
+|	[count_end]
+
+#
+# Additional
+#
+
+def additional
+	[resource_record]
+
+
+# Start symbol: the input is a sequence of complete DNS messages.
+def start
+	[message*]
+
+#
+# Grammar End.
+#
+
+# Print a marker for every record whose RDATA matched the unknown-type
+# alternative.
+int print_RR_UNKNOWN( s: start )
+{
+	for I:rdata in s {
+		if match I [u:RR_UNKNOWN rdata_bytes] {
+			print( 'UNKNOWN TYPE\n' )
+		}
+	}
+}
+
+# Print every A record's address in dotted-quad form.
+int print_RR_A( s: start )
+{
+	for I:rdata in s {
+		if match I [RR_A o1:octet o2:octet o3:octet o4:octet] {
+			print( 'RR_A: ' o1.data.uord8() '.' o2.data.uord8() '.'
+				o3.data.uord8() '.' o4.data.uord8() '\n' )
+		}
+	}
+}
+
+# Print a name's labels; m maps message offsets to names so that
+# compression pointers can be followed recursively (body continues below).
+int print_name( n: name m: map<int name> )
+{
+	for P: name_part in n {
+		match P [part_len D:nbytes]
+		print( D '.'
) + } + + for E:name_end in n { + if match E [o1:octet o2:octet] { + val: int = (o1.data.uord8() - 192) * 256 + o2.data.uord8() + print( '[' val ']' ) + nameInMap: name = m.find( val ) + print_name( nameInMap m ) + } + } +} + +int print_all_names( s: start ) +{ + for M: message in s { + construct m: map<int name> [] + + O: octet = octet in M + + for N: name in M { + match N [name_part* E:name_end] + + for NP: name_part* in N { + if match NP [L: octet nbytes name_part*] { + messageOffset: int = L.pos - O.pos + construct n: name [NP E] + m.insert( messageOffset n ) + } + } + } + + for I: name in M { + print_name( I m ) + print( '\n' ) + } + } +} + +end # binary + +cons Binary: binary[] +Binary.CL = cons list<int> [] + +parse S: binary::start(Binary) [ stdin ] +print_all_names( S ) +print( '*** SUCCESS ***\n' ) +##### EXP ##### +www.google.ca. +www.google.ca. +[12]www.google.ca. +www.google.com. +[43]www.google.com. +www.l.[47]google.com. +[71]www.l.[47]google.com. +[75]l.[47]google.com. +a.[75]l.[47]google.com. +[75]l.[47]google.com. +d.[75]l.[47]google.com. +[75]l.[47]google.com. +e.[75]l.[47]google.com. +[75]l.[47]google.com. +f.[75]l.[47]google.com. +[75]l.[47]google.com. +g.[75]l.[47]google.com. +[75]l.[47]google.com. +b.[75]l.[47]google.com. +[107]a.[75]l.[47]google.com. +[123]d.[75]l.[47]google.com. +[139]e.[75]l.[47]google.com. +[155]f.[75]l.[47]google.com. +[171]g.[75]l.[47]google.com. +[187]b.[75]l.[47]google.com. +clients1.google.ca. +clients1.google.ca. +[12]clients1.google.ca. +clients.l.google.com. +[48]clients.l.google.com. +[56]l.google.com. +e.[56]l.google.com. +[56]l.google.com. +f.[56]l.google.com. +[56]l.google.com. +g.[56]l.google.com. +[56]l.google.com. +b.[56]l.google.com. +[56]l.google.com. +a.[56]l.google.com. +[56]l.google.com. +d.[56]l.google.com. +[98]e.[56]l.google.com. +[114]f.[56]l.google.com. +[130]g.[56]l.google.com. +[146]b.[56]l.google.com. +[162]a.[56]l.google.com. +[178]d.[56]l.google.com. +en-us.fxfeeds.mozilla.com. 
+en-us.fxfeeds.mozilla.com. +[12]en-us.fxfeeds.mozilla.com. +fxfeeds.mozilla.org. +[55]fxfeeds.mozilla.org. +[63]mozilla.org. +ns1.[63]mozilla.org. +[63]mozilla.org. +ns2.[63]mozilla.org. +[63]mozilla.org. +ns3.[63]mozilla.org. +[104]ns1.[63]mozilla.org. +[122]ns2.[63]mozilla.org. +[140]ns3.[63]mozilla.org. +fxfeeds.mozilla.com. +fxfeeds.mozilla.com. +[12]fxfeeds.mozilla.com. +fxfeeds.mozilla.org. +[49]fxfeeds.mozilla.org. +[57]mozilla.org. +ns2.[57]mozilla.org. +[57]mozilla.org. +ns3.[57]mozilla.org. +[57]mozilla.org. +ns1.[57]mozilla.org. +[98]ns2.[57]mozilla.org. +[116]ns3.[57]mozilla.org. +[134]ns1.[57]mozilla.org. +newsrss.bbc.co.uk. +newsrss.bbc.co.uk. +[12]newsrss.bbc.co.uk. +newsrss.bbc.net.uk. +[47]newsrss.bbc.net.uk. +[55]bbc.net.uk. +ns0.thdo.bbc.co.[63]uk. +[55]bbc.net.uk. +ns0.rbsov.[104]bbc.co.[63]uk. +[95]ns0.thdo.bbc.co.[63]uk. +[125]ns0.rbsov.[104]bbc.co.[63]uk. +news.google.ca. +news.google.ca. +[12]news.google.ca. +news.google.com. +[44]news.google.com. +news.l.[49]google.com. +[73]news.l.[49]google.com. +[78]l.[49]google.com. +d.[78]l.[49]google.com. +[78]l.[49]google.com. +e.[78]l.[49]google.com. +[78]l.[49]google.com. +f.[78]l.[49]google.com. +[78]l.[49]google.com. +g.[78]l.[49]google.com. +[78]l.[49]google.com. +b.[78]l.[49]google.com. +[78]l.[49]google.com. +a.[78]l.[49]google.com. +[110]d.[78]l.[49]google.com. +[126]e.[78]l.[49]google.com. +[142]f.[78]l.[49]google.com. +[158]g.[78]l.[49]google.com. +[174]b.[78]l.[49]google.com. +[190]a.[78]l.[49]google.com. +nt3.ggpht.com. +nt3.ggpht.com. +[12]nt3.ggpht.com. +news.l.google.com. +[43]news.l.google.com. +[48]l.google.com. +f.[48]l.google.com. +[48]l.google.com. +g.[48]l.google.com. +[48]l.google.com. +b.[48]l.google.com. +[48]l.google.com. +a.[48]l.google.com. +[48]l.google.com. +d.[48]l.google.com. +[48]l.google.com. +e.[48]l.google.com. +[90]f.[48]l.google.com. +[106]g.[48]l.google.com. +[122]b.[48]l.google.com. +[138]a.[48]l.google.com. +[154]d.[48]l.google.com. +[170]e.[48]l.google.com. 
+csi.gstatic.com. +csi.gstatic.com. +[12]csi.gstatic.com. +csi.l.google.com. +[45]csi.l.google.com. +[49]l.google.com. +d.[49]l.google.com. +[49]l.google.com. +e.[49]l.google.com. +[49]l.google.com. +f.[49]l.google.com. +[49]l.google.com. +g.[49]l.google.com. +[49]l.google.com. +b.[49]l.google.com. +[49]l.google.com. +a.[49]l.google.com. +[91]d.[49]l.google.com. +[107]e.[49]l.google.com. +[123]f.[49]l.google.com. +[139]g.[49]l.google.com. +[155]b.[49]l.google.com. +[171]a.[49]l.google.com. +www.nytimes.com. +www.nytimes.com. +[12]www.nytimes.com. +nytimes.com. +nydns1.about.[57]com. +[49]nytimes.com. +ns1t.[49]nytimes.com. +[49]nytimes.com. +nydns2.[79]about.[57]com. +[72]nydns1.about.[57]com. +[99]ns1t.[49]nytimes.com. +[118]nydns2.[79]about.[57]com. +graphics8.nytimes.com. +graphics8.nytimes.com. +[12]graphics8.nytimes.com. +graphics478.nytimes.com.edgesuite.net. +[51]graphics478.nytimes.com.edgesuite.net. +a1116.x.akamai.[85]net. +[102]a1116.x.akamai.[85]net. +[102]a1116.x.akamai.[85]net. +[108]x.akamai.[85]net. +n0x.[110]akamai.[85]net. +[108]x.akamai.[85]net. +n1x.[110]akamai.[85]net. +[108]x.akamai.[85]net. +n6x.[110]akamai.[85]net. +[108]x.akamai.[85]net. +n3x.[110]akamai.[85]net. +[108]x.akamai.[85]net. +n2x.[110]akamai.[85]net. +[108]x.akamai.[85]net. +n7x.[110]akamai.[85]net. +[108]x.akamai.[85]net. +n5x.[110]akamai.[85]net. +[108]x.akamai.[85]net. +n8x.[110]akamai.[85]net. +[108]x.akamai.[85]net. +n4x.[110]akamai.[85]net. +[163]n0x.[110]akamai.[85]net. +[181]n1x.[110]akamai.[85]net. +[199]n6x.[110]akamai.[85]net. +[217]n3x.[110]akamai.[85]net. +[235]n2x.[110]akamai.[85]net. +[253]n7x.[110]akamai.[85]net. +[271]n5x.[110]akamai.[85]net. +[289]n8x.[110]akamai.[85]net. +[307]n4x.[110]akamai.[85]net. +timespeople.nytimes.com. +timespeople.nytimes.com. +[12]timespeople.nytimes.com. +nytimes.com. +nydns2.about.[65]com. +[57]nytimes.com. +nydns1.[87]about.[65]com. +[57]nytimes.com. +ns1t.[57]nytimes.com. +[80]nydns2.about.[65]com. +[107]nydns1.[87]about.[65]com. 
+[128]ns1t.[57]nytimes.com. +googleads.g.doubleclick.net. +googleads.g.doubleclick.net. +[12]googleads.g.doubleclick.net. +pagead.l.doubleclick.net. +[57]pagead.l.doubleclick.net. +[64]l.doubleclick.net. +b.l.google.com. +[64]l.doubleclick.net. +d.[113]l.google.com. +[64]l.doubleclick.net. +e.[113]l.google.com. +[64]l.doubleclick.net. +f.[113]l.google.com. +[64]l.doubleclick.net. +g.[113]l.google.com. +[64]l.doubleclick.net. +a.[113]l.google.com. +[111]b.l.google.com. +[139]d.[113]l.google.com. +[155]e.[113]l.google.com. +[171]f.[113]l.google.com. +[187]g.[113]l.google.com. +[203]a.[113]l.google.com. +up.nytimes.com. +up.nytimes.com. +[12]up.nytimes.com. +up.about.akadns.net. +[44]up.about.akadns.net. +[53]akadns.net. +eur1.[53]akadns.net. +[53]akadns.net. +use3.[53]akadns.net. +[53]akadns.net. +use4.[53]akadns.net. +[53]akadns.net. +usw2.[53]akadns.net. +[53]akadns.net. +za.akadns.org. +[53]akadns.net. +zb.[172]akadns.org. +[53]akadns.net. +zc.[172]akadns.org. +[53]akadns.net. +zd.[172]akadns.org. +[53]akadns.net. +asia9.[53]akadns.net. +[93]eur1.[53]akadns.net. +[112]use3.[53]akadns.net. +[131]use4.[53]akadns.net. +[150]usw2.[53]akadns.net. +[169]za.akadns.org. +[196]zb.[172]akadns.org. +[213]zc.[172]akadns.org. +[230]zd.[172]akadns.org. +[247]asia9.[53]akadns.net. +pix04.revsci.net. +pix04.revsci.net. +[12]pix04.revsci.net. +revsci.net. +ns2.p16.dynect.[57]net. +[50]revsci.net. +ns3.[76]p16.dynect.[57]net. +[50]revsci.net. +ns1.[76]p16.dynect.[57]net. +[50]revsci.net. +ns4.[76]p16.dynect.[57]net. +[72]ns2.p16.dynect.[57]net. +[101]ns3.[76]p16.dynect.[57]net. +[119]ns1.[76]p16.dynect.[57]net. +[137]ns4.[76]p16.dynect.[57]net. +wt.o.nytimes.com. +wt.o.nytimes.com. +[12]wt.o.nytimes.com. +nytimes.webtrends.akadns.net. +[46]nytimes.webtrends.akadns.net. +[64]akadns.net. +use3.[64]akadns.net. +[64]akadns.net. +use4.[64]akadns.net. +[64]akadns.net. +usw2.[64]akadns.net. +[64]akadns.net. +za.akadns.org. +[64]akadns.net. +zb.[164]akadns.org. +[64]akadns.net. 
+zc.[164]akadns.org. +[64]akadns.net. +zd.[164]akadns.org. +[64]akadns.net. +asia9.[64]akadns.net. +[64]akadns.net. +eur1.[64]akadns.net. +[104]use3.[64]akadns.net. +[123]use4.[64]akadns.net. +[142]usw2.[64]akadns.net. +[161]za.akadns.org. +[188]zb.[164]akadns.org. +[205]zc.[164]akadns.org. +[222]zd.[164]akadns.org. +[239]asia9.[64]akadns.net. +[259]eur1.[64]akadns.net. +te.nytimes.com. +ar.voicefive.com. +te.nytimes.com. +[12]te.nytimes.com. +nytd.te.tacoda.net. +[44]nytd.te.tacoda.net. +te.tacoda.akadns.[59]net. +[76]te.tacoda.akadns.[59]net. +[86]akadns.[59]net. +use4.[86]akadns.[59]net. +[86]akadns.[59]net. +usw2.[86]akadns.[59]net. +[86]akadns.[59]net. +za.akadns.org. +[86]akadns.[59]net. +zb.[164]akadns.org. +[86]akadns.[59]net. +zc.[164]akadns.org. +[86]akadns.[59]net. +zd.[164]akadns.org. +[86]akadns.[59]net. +asia9.[86]akadns.[59]net. +[86]akadns.[59]net. +eur1.[86]akadns.[59]net. +[86]akadns.[59]net. +use3.[86]akadns.[59]net. +[123]use4.[86]akadns.[59]net. +[142]usw2.[86]akadns.[59]net. +[161]za.akadns.org. +[188]zb.[164]akadns.org. +[205]zc.[164]akadns.org. +[222]zd.[164]akadns.org. +[239]asia9.[86]akadns.[59]net. +[259]eur1.[86]akadns.[59]net. +[278]use3.[86]akadns.[59]net. +ar.voicefive.com. +[12]ar.voicefive.com. +ar.gta.voicefive.com. +[46]ar.gta.voicefive.com. +[49]gta.voicefive.com. +gta02.ord.[53]voicefive.com. +[49]gta.voicefive.com. +gta01.iad.[53]voicefive.com. +[49]gta.voicefive.com. +gta01.[102]ord.[53]voicefive.com. +[49]gta.voicefive.com. +gta02.[126]iad.[53]voicefive.com. +[96]gta02.ord.[53]voicefive.com. +[120]gta01.iad.[53]voicefive.com. +[144]gta01.[102]ord.[53]voicefive.com. +[164]gta02.[126]iad.[53]voicefive.com. +www.google.com. +www.google.com. +[12]www.google.com. +www.l.google.com. +[44]www.l.google.com. +[48]l.google.com. +d.[48]l.google.com. +[48]l.google.com. +e.[48]l.google.com. +[48]l.google.com. +f.[48]l.google.com. +[48]l.google.com. +g.[48]l.google.com. +[48]l.google.com. +b.[48]l.google.com. +[48]l.google.com. 
+a.[48]l.google.com. +[90]d.[48]l.google.com. +[106]e.[48]l.google.com. +[122]f.[48]l.google.com. +[138]g.[48]l.google.com. +[154]b.[48]l.google.com. +[170]a.[48]l.google.com. +www.google-analytics.com. +www.google-analytics.com. +[12]www.google-analytics.com. +www-google-analytics.l.google.com. +[54]www-google-analytics.l.google.com. +[75]l.google.com. +f.[75]l.google.com. +[75]l.google.com. +g.[75]l.google.com. +[75]l.google.com. +b.[75]l.google.com. +[75]l.google.com. +a.[75]l.google.com. +[75]l.google.com. +d.[75]l.google.com. +[75]l.google.com. +e.[75]l.google.com. +[117]f.[75]l.google.com. +[133]g.[75]l.google.com. +[149]b.[75]l.google.com. +[165]a.[75]l.google.com. +[181]d.[75]l.google.com. +[197]e.[75]l.google.com. +pagead2.googlesyndication.com. +pagead2.googlesyndication.com. +[12]pagead2.googlesyndication.com. +pagead.l.google.com. +[59]pagead.l.google.com. +[66]l.google.com. +g.[66]l.google.com. +[66]l.google.com. +b.[66]l.google.com. +[66]l.google.com. +a.[66]l.google.com. +[66]l.google.com. +d.[66]l.google.com. +[66]l.google.com. +e.[66]l.google.com. +[66]l.google.com. +f.[66]l.google.com. +[108]g.[66]l.google.com. +[124]b.[66]l.google.com. +[140]a.[66]l.google.com. +[156]d.[66]l.google.com. +[172]e.[66]l.google.com. +[188]f.[66]l.google.com. +maps.google.com. +maps.google.com. +[12]maps.google.com. +maps.l.google.com. +[45]maps.l.google.com. +[50]l.google.com. +b.[50]l.google.com. +[50]l.google.com. +a.[50]l.google.com. +[50]l.google.com. +d.[50]l.google.com. +[50]l.google.com. +e.[50]l.google.com. +[50]l.google.com. +f.[50]l.google.com. +[50]l.google.com. +g.[50]l.google.com. +[92]b.[50]l.google.com. +[108]a.[50]l.google.com. +[124]d.[50]l.google.com. +[140]e.[50]l.google.com. +[156]f.[50]l.google.com. +[172]g.[50]l.google.com. +maps.gstatic.com. +maps.gstatic.com. +[12]maps.gstatic.com. +www2.l.google.com. +[46]www2.l.google.com. +[51]l.google.com. +a.[51]l.google.com. +[51]l.google.com. +d.[51]l.google.com. +[51]l.google.com. +e.[51]l.google.com. 
+[51]l.google.com. +f.[51]l.google.com. +[51]l.google.com. +g.[51]l.google.com. +[51]l.google.com. +b.[51]l.google.com. +[93]a.[51]l.google.com. +[109]d.[51]l.google.com. +[125]e.[51]l.google.com. +[141]f.[51]l.google.com. +[157]g.[51]l.google.com. +[173]b.[51]l.google.com. +www.calgaryherald.com. +www.calgaryherald.com. +[12]www.calgaryherald.com. +calgaryherald.com. +ns1.canwest.[69]com. +[55]calgaryherald.com. +ns2.[88]canwest.[69]com. +[84]ns1.canwest.[69]com. +[110]ns2.[88]canwest.[69]com. +a123.g.akamai.net. +a123.g.akamai.net. +[12]a123.g.akamai.net. +[12]a123.g.akamai.net. +members.canada.com. +members.canada.com. +[12]members.canada.com. +canada.com. +ns2.canwest.[59]com. +[52]canada.com. +ns1.[78]canwest.[59]com. +[74]ns2.canwest.[59]com. +[100]ns1.[78]canwest.[59]com. +www.canada.com. +www.canada.com. +[12]www.canada.com. +canada.com. +ns1.canwest.[55]com. +[48]canada.com. +ns2.[74]canwest.[55]com. +[70]ns1.canwest.[55]com. +[96]ns2.[74]canwest.[55]com. +s9.addthis.com. +s9.addthis.com. +[12]s9.addthis.com. +wildcard.addthis.com.edgekey.net. +[44]wildcard.addthis.com.edgekey.net. +e2943.c.akamaiedge.[73]net. +[90]e2943.c.akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n7c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n5c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n8c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n0c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n6c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n1c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n4c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n3c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n2c.[98]akamaiedge.[73]net. +[139]n7c.[98]akamaiedge.[73]net. +[157]n5c.[98]akamaiedge.[73]net. +[175]n8c.[98]akamaiedge.[73]net. +[193]n0c.[98]akamaiedge.[73]net. +[211]n6c.[98]akamaiedge.[73]net. +[229]n1c.[98]akamaiedge.[73]net. +[247]n4c.[98]akamaiedge.[73]net. +[265]n3c.[98]akamaiedge.[73]net. +[283]n2c.[98]akamaiedge.[73]net. 
+communities.canada.com. +communities.canada.com. +[12]communities.canada.com. +canada.com. +ns1.canwest.[63]com. +[56]canada.com. +ns2.[82]canwest.[63]com. +[78]ns1.canwest.[63]com. +[104]ns2.[82]canwest.[63]com. +canwestglobal.112.2o7.net. +beacon.securestudies.com. +beacon.securestudies.com. +[12]beacon.securestudies.com. +beacon.gta.securestudies.com. +[54]beacon.gta.securestudies.com. +[61]gta.securestudies.com. +gta02.iad.[65]securestudies.com. +[61]gta.securestudies.com. +gta02.ord.[65]securestudies.com. +[61]gta.securestudies.com. +gta01.[118]iad.[65]securestudies.com. +[61]gta.securestudies.com. +gta01.[142]ord.[65]securestudies.com. +[112]gta02.iad.[65]securestudies.com. +[136]gta02.ord.[65]securestudies.com. +[160]gta01.[118]iad.[65]securestudies.com. +[180]gta01.[142]ord.[65]securestudies.com. +canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +2o7.net. +ns1.dal.omniture.com. +[155]2o7.net. +ns1.sj1.[182]omniture.com. +[155]2o7.net. +ns1.sj2.[182]omniture.com. +[174]ns1.dal.omniture.com. +[208]ns1.sj1.[182]omniture.com. +[230]ns1.sj2.[182]omniture.com. +s7.addthis.com. +s7.addthis.com. +[12]s7.addthis.com. +wildcard.addthis.com.edgekey.net. +[44]wildcard.addthis.com.edgekey.net. +e2943.c.akamaiedge.[73]net. +[90]e2943.c.akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n5c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n8c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n0c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n6c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n1c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n4c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n3c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n2c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n7c.[98]akamaiedge.[73]net. 
+[139]n5c.[98]akamaiedge.[73]net. +[157]n8c.[98]akamaiedge.[73]net. +[175]n0c.[98]akamaiedge.[73]net. +[193]n6c.[98]akamaiedge.[73]net. +[211]n1c.[98]akamaiedge.[73]net. +[229]n4c.[98]akamaiedge.[73]net. +[247]n3c.[98]akamaiedge.[73]net. +[265]n2c.[98]akamaiedge.[73]net. +[283]n7c.[98]akamaiedge.[73]net. +csi.gstatic.com. +csi.gstatic.com. +[12]csi.gstatic.com. +csi.l.google.com. +[45]csi.l.google.com. +[49]l.google.com. +d.[49]l.google.com. +[49]l.google.com. +e.[49]l.google.com. +[49]l.google.com. +f.[49]l.google.com. +[49]l.google.com. +g.[49]l.google.com. +[49]l.google.com. +b.[49]l.google.com. +[49]l.google.com. +a.[49]l.google.com. +[91]d.[49]l.google.com. +[107]e.[49]l.google.com. +[123]f.[49]l.google.com. +[139]g.[49]l.google.com. +[155]b.[49]l.google.com. +[171]a.[49]l.google.com. +www.thestar.com. +www.thestar.com. +[12]www.thestar.com. +[16]thestar.com. +ns1.[16]thestar.com. +[16]thestar.com. +ns2.[16]thestar.com. +[61]ns1.[16]thestar.com. +[79]ns2.[16]thestar.com. +beacon.scorecardresearch.com. +beacon.scorecardresearch.com. +[12]beacon.scorecardresearch.com. +beacon.gta.scorecardresearch.com. +[58]beacon.gta.scorecardresearch.com. +[65]gta.scorecardresearch.com. +gta01.iad.[69]scorecardresearch.com. +[65]gta.scorecardresearch.com. +gta02.ord.[69]scorecardresearch.com. +[65]gta.scorecardresearch.com. +gta01.[150]ord.[69]scorecardresearch.com. +[65]gta.scorecardresearch.com. +gta02.[126]iad.[69]scorecardresearch.com. +[120]gta01.iad.[69]scorecardresearch.com. +[144]gta02.ord.[69]scorecardresearch.com. +[168]gta01.[150]ord.[69]scorecardresearch.com. +[188]gta02.[126]iad.[69]scorecardresearch.com. +media.thestar.topscms.com. +media.thestar.topscms.com. +[12]media.thestar.topscms.com. +media.thestar.topscms.com.edgesuite.net. +[55]media.thestar.topscms.com.edgesuite.net. +a1520.g.akamai.[91]net. +[108]a1520.g.akamai.[91]net. +[108]a1520.g.akamai.[91]net. +[114]g.akamai.[91]net. +n0g.[116]akamai.[91]net. +[114]g.akamai.[91]net. +n4g.[116]akamai.[91]net. 
+[114]g.akamai.[91]net. +n2g.[116]akamai.[91]net. +[114]g.akamai.[91]net. +n5g.[116]akamai.[91]net. +[114]g.akamai.[91]net. +n7g.[116]akamai.[91]net. +[114]g.akamai.[91]net. +n6g.[116]akamai.[91]net. +[114]g.akamai.[91]net. +n8g.[116]akamai.[91]net. +[114]g.akamai.[91]net. +n1g.[116]akamai.[91]net. +[114]g.akamai.[91]net. +n3g.[116]akamai.[91]net. +[169]n0g.[116]akamai.[91]net. +[187]n4g.[116]akamai.[91]net. +[205]n2g.[116]akamai.[91]net. +[223]n5g.[116]akamai.[91]net. +[241]n7g.[116]akamai.[91]net. +[259]n6g.[116]akamai.[91]net. +[277]n8g.[116]akamai.[91]net. +[295]n1g.[116]akamai.[91]net. +[313]n3g.[116]akamai.[91]net. +www.addthis.com. +www.addthis.com. +[12]www.addthis.com. +vp-www.addthis.com. +[45]vp-www.addthis.com. +[52]addthis.com. +eur2.akam.net. +[52]addthis.com. +usc1.[98]akam.net. +[52]addthis.com. +usc2.[98]akam.net. +[52]addthis.com. +usw1.[98]akam.net. +[52]addthis.com. +usw6.[98]akam.net. +[52]addthis.com. +asia3.[98]akam.net. +[52]addthis.com. +ns1-33.[98]akam.net. +[52]addthis.com. +ns1-43.[98]akam.net. +[93]eur2.akam.net. +[120]usc1.[98]akam.net. +[139]usc2.[98]akam.net. +[158]usw1.[98]akam.net. +[177]usw6.[98]akam.net. +[196]asia3.[98]akam.net. +[216]ns1-33.[98]akam.net. +[237]ns1-43.[98]akam.net. +n.thestar.com. +n.thestar.com. +[12]n.thestar.com. +thestar.com.122.2o7.net. +[43]thestar.com.122.2o7.net. +[43]thestar.com.122.2o7.net. +[43]thestar.com.122.2o7.net. +[43]thestar.com.122.2o7.net. +[43]thestar.com.122.2o7.net. +[43]thestar.com.122.2o7.net. +[59]2o7.net. +ns1.sj1.omniture.com. +[59]2o7.net. +ns1.sj2.[184]omniture.com. +[59]2o7.net. +ns1.dal.[184]omniture.com. +[176]ns1.sj1.omniture.com. +[210]ns1.sj2.[184]omniture.com. +[232]ns1.dal.[184]omniture.com. +news.therecord.com. +news.therecord.com. +[12]news.therecord.com. +therecord.com. +ns1.thestar.[62]com. +[52]therecord.com. +ns2.[81]thestar.[62]com. +[77]ns1.thestar.[62]com. +[103]ns2.[81]thestar.[62]com. +media.therecord.topscms.com. +media.therecord.topscms.com. 
+[12]media.therecord.topscms.com. +media.therecord.topscms.com.edgesuite.net. +[57]media.therecord.topscms.com.edgesuite.net. +a847.g.akamai.[95]net. +[112]a847.g.akamai.[95]net. +[112]a847.g.akamai.[95]net. +[117]g.akamai.[95]net. +n2g.[119]akamai.[95]net. +[117]g.akamai.[95]net. +n5g.[119]akamai.[95]net. +[117]g.akamai.[95]net. +n7g.[119]akamai.[95]net. +[117]g.akamai.[95]net. +n6g.[119]akamai.[95]net. +[117]g.akamai.[95]net. +n8g.[119]akamai.[95]net. +[117]g.akamai.[95]net. +n1g.[119]akamai.[95]net. +[117]g.akamai.[95]net. +n3g.[119]akamai.[95]net. +[117]g.akamai.[95]net. +n0g.[119]akamai.[95]net. +[117]g.akamai.[95]net. +n4g.[119]akamai.[95]net. +[172]n2g.[119]akamai.[95]net. +[190]n5g.[119]akamai.[95]net. +[208]n7g.[119]akamai.[95]net. +[226]n6g.[119]akamai.[95]net. +[244]n8g.[119]akamai.[95]net. +[262]n1g.[119]akamai.[95]net. +[280]n3g.[119]akamai.[95]net. +[298]n0g.[119]akamai.[95]net. +[316]n4g.[119]akamai.[95]net. +media.therecord.com. +www.goldbook.ca. +media.therecord.com. +[12]media.therecord.com. +therecord.com. +ns2.thestar.[63]com. +[53]therecord.com. +ns1.[82]thestar.[63]com. +[78]ns2.thestar.[63]com. +[104]ns1.[82]thestar.[63]com. +www.goldbook.ca. +[12]www.goldbook.ca. +goldbook.ca. +[45]goldbook.ca. +[45]goldbook.ca. +ns4.everydns.net. +[45]goldbook.ca. +ns1.[90]everydns.net. +[45]goldbook.ca. +ns2.[90]everydns.net. +[45]goldbook.ca. +ns3.[90]everydns.net. +[86]ns4.everydns.net. +[116]ns1.[90]everydns.net. +[134]ns2.[90]everydns.net. +[152]ns3.[90]everydns.net. +torstardigital.122.2o7.net. +torstardigital.122.2o7.net. +[12]torstardigital.122.2o7.net. +[12]torstardigital.122.2o7.net. +[12]torstardigital.122.2o7.net. +[12]torstardigital.122.2o7.net. +[12]torstardigital.122.2o7.net. +[12]torstardigital.122.2o7.net. +2o7.net. +ns1.dal.omniture.com. +[140]2o7.net. +ns1.sj1.[167]omniture.com. +[140]2o7.net. +ns1.sj2.[167]omniture.com. +[159]ns1.dal.omniture.com. +[193]ns1.sj1.[167]omniture.com. +[215]ns1.sj2.[167]omniture.com. +news.google.ca. 
+news.google.ca. +[12]news.google.ca. +news.google.com. +[44]news.google.com. +news.l.[49]google.com. +[73]news.l.[49]google.com. +[78]l.[49]google.com. +e.[78]l.[49]google.com. +[78]l.[49]google.com. +b.[78]l.[49]google.com. +[78]l.[49]google.com. +a.[78]l.[49]google.com. +[78]l.[49]google.com. +g.[78]l.[49]google.com. +[78]l.[49]google.com. +d.[78]l.[49]google.com. +[78]l.[49]google.com. +f.[78]l.[49]google.com. +[110]e.[78]l.[49]google.com. +[126]b.[78]l.[49]google.com. +[142]a.[78]l.[49]google.com. +[158]g.[78]l.[49]google.com. +[174]d.[78]l.[49]google.com. +[190]f.[78]l.[49]google.com. +googleads.g.doubleclick.net. +googleads.g.doubleclick.net. +[12]googleads.g.doubleclick.net. +pagead.l.doubleclick.net. +[57]pagead.l.doubleclick.net. +[64]l.doubleclick.net. +g.l.google.com. +[64]l.doubleclick.net. +a.[113]l.google.com. +[64]l.doubleclick.net. +b.[113]l.google.com. +[64]l.doubleclick.net. +d.[113]l.google.com. +[64]l.doubleclick.net. +e.[113]l.google.com. +[64]l.doubleclick.net. +f.[113]l.google.com. +[111]g.l.google.com. +[139]a.[113]l.google.com. +[155]b.[113]l.google.com. +[171]d.[113]l.google.com. +[187]e.[113]l.google.com. +[203]f.[113]l.google.com. +www.montrealgazette.com. +www.montrealgazette.com. +[12]www.montrealgazette.com. +montrealgazette.com. +ns2.canwest.[73]com. +[57]montrealgazette.com. +ns1.[92]canwest.[73]com. +[88]ns2.canwest.[73]com. +[114]ns1.[92]canwest.[73]com. +a123.g.akamai.net. +a123.g.akamai.net. +[12]a123.g.akamai.net. +[12]a123.g.akamai.net. +members.canada.com. +members.canada.com. +[12]members.canada.com. +canada.com. +ns2.canwest.[59]com. +[52]canada.com. +ns1.[78]canwest.[59]com. +[74]ns2.canwest.[59]com. +[100]ns1.[78]canwest.[59]com. +www.cbc.ca. +www.cbc.ca. +[12]www.cbc.ca. +www.cbc.ca.edgesuite.net. +[40]www.cbc.ca.edgesuite.net. +a1849.gc.akamai.[61]net. +[78]a1849.gc.akamai.[61]net. +[78]a1849.gc.akamai.[61]net. +[84]gc.akamai.[61]net. +n6gc.[87]akamai.[61]net. +[84]gc.akamai.[61]net. +n1gc.[87]akamai.[61]net. 
+[84]gc.akamai.[61]net. +n4gc.[87]akamai.[61]net. +[84]gc.akamai.[61]net. +n8gc.[87]akamai.[61]net. +[84]gc.akamai.[61]net. +n2gc.[87]akamai.[61]net. +[84]gc.akamai.[61]net. +n0gc.[87]akamai.[61]net. +[84]gc.akamai.[61]net. +n7gc.[87]akamai.[61]net. +[84]gc.akamai.[61]net. +n5gc.[87]akamai.[61]net. +[84]gc.akamai.[61]net. +n3gc.[87]akamai.[61]net. +[140]n6gc.[87]akamai.[61]net. +[159]n1gc.[87]akamai.[61]net. +[178]n4gc.[87]akamai.[61]net. +[197]n8gc.[87]akamai.[61]net. +[216]n2gc.[87]akamai.[61]net. +[235]n0gc.[87]akamai.[61]net. +[254]n7gc.[87]akamai.[61]net. +[273]n5gc.[87]akamai.[61]net. +[292]n3gc.[87]akamai.[61]net. +a.cbc.ca. +a.cbc.ca. +[12]a.cbc.ca. +ehg-cbc.hitbox.com. +[38]ehg-cbc.hitbox.com. +[46]hitbox.com. +dns06.omniture.[53]com. +[46]hitbox.com. +dns05.[92]omniture.[53]com. +[46]hitbox.com. +dns04.[92]omniture.[53]com. +[46]hitbox.com. +dns03.[92]omniture.[53]com. +[46]hitbox.com. +dns02.[92]omniture.[53]com. +[46]hitbox.com. +dns01.[92]omniture.[53]com. +[86]dns06.omniture.[53]com. +[115]dns05.[92]omniture.[53]com. +[135]dns04.[92]omniture.[53]com. +[155]dns03.[92]omniture.[53]com. +[175]dns02.[92]omniture.[53]com. +[195]dns01.[92]omniture.[53]com. +assets.loomia.com. +assets.loomia.com. +[12]assets.loomia.com. +a.[19]loomia.com. +[47]a.[19]loomia.com. +[19]loomia.com. +ns15.dnsmadeeasy.[26]com. +[19]loomia.com. +ns14.[84]dnsmadeeasy.[26]com. +[19]loomia.com. +ns12.[84]dnsmadeeasy.[26]com. +[19]loomia.com. +ns11.[84]dnsmadeeasy.[26]com. +[19]loomia.com. +ns13.[84]dnsmadeeasy.[26]com. +[19]loomia.com. +ns10.[84]dnsmadeeasy.[26]com. +recs-social.loomia.com. +recs-social.loomia.com. +[12]recs-social.loomia.com. +rec-assets.[24]loomia.com. +[52]rec-assets.[24]loomia.com. +[24]loomia.com. +ns14.dnsmadeeasy.[31]com. +[24]loomia.com. +ns11.[98]dnsmadeeasy.[31]com. +[24]loomia.com. +ns12.[98]dnsmadeeasy.[31]com. +[24]loomia.com. +ns15.[98]dnsmadeeasy.[31]com. +[24]loomia.com. +ns10.[98]dnsmadeeasy.[31]com. +[24]loomia.com. +ns13.[98]dnsmadeeasy.[31]com. 
+e1.clearspring.com. +static-cache.loomia.com. +static-cache.loomia.com. +[12]static-cache.loomia.com. +static-cache.loomia.com.edgesuite.net. +[53]static-cache.loomia.com.edgesuite.net. +a298.g.akamai.[87]net. +[104]a298.g.akamai.[87]net. +[104]a298.g.akamai.[87]net. +[109]g.akamai.[87]net. +n4g.[111]akamai.[87]net. +[109]g.akamai.[87]net. +n2g.[111]akamai.[87]net. +[109]g.akamai.[87]net. +n5g.[111]akamai.[87]net. +[109]g.akamai.[87]net. +n7g.[111]akamai.[87]net. +[109]g.akamai.[87]net. +n6g.[111]akamai.[87]net. +[109]g.akamai.[87]net. +n8g.[111]akamai.[87]net. +[109]g.akamai.[87]net. +n1g.[111]akamai.[87]net. +[109]g.akamai.[87]net. +n3g.[111]akamai.[87]net. +[109]g.akamai.[87]net. +n0g.[111]akamai.[87]net. +[164]n4g.[111]akamai.[87]net. +[182]n2g.[111]akamai.[87]net. +[200]n5g.[111]akamai.[87]net. +[218]n7g.[111]akamai.[87]net. +[236]n6g.[111]akamai.[87]net. +[254]n8g.[111]akamai.[87]net. +[272]n1g.[111]akamai.[87]net. +[290]n3g.[111]akamai.[87]net. +[308]n0g.[111]akamai.[87]net. +e1.clearspring.com. +[12]e1.clearspring.com. +[15]clearspring.com. +usc2.akam.net. +[15]clearspring.com. +usw1.[69]akam.net. +[15]clearspring.com. +usw6.[69]akam.net. +[15]clearspring.com. +asia3.[69]akam.net. +[15]clearspring.com. +ns1-33.[69]akam.net. +[15]clearspring.com. +ns1-43.[69]akam.net. +[15]clearspring.com. +eur2.[69]akam.net. +[15]clearspring.com. +usc1.[69]akam.net. +csi.gstatic.com. +csi.gstatic.com. +[12]csi.gstatic.com. +csi.l.google.com. +[45]csi.l.google.com. +[49]l.google.com. +b.[49]l.google.com. +[49]l.google.com. +e.[49]l.google.com. +[49]l.google.com. +d.[49]l.google.com. +[49]l.google.com. +a.[49]l.google.com. +[49]l.google.com. +f.[49]l.google.com. +[49]l.google.com. +g.[49]l.google.com. +[91]b.[49]l.google.com. +[107]e.[49]l.google.com. +[123]d.[49]l.google.com. +[139]a.[49]l.google.com. +[155]f.[49]l.google.com. +[171]g.[49]l.google.com. +www.gstatic.com. +www.gstatic.com. +[12]www.gstatic.com. +www2.l.google.com. +[45]www2.l.google.com. +[50]l.google.com. 
+d.[50]l.google.com. +[50]l.google.com. +f.[50]l.google.com. +[50]l.google.com. +e.[50]l.google.com. +[50]l.google.com. +b.[50]l.google.com. +[50]l.google.com. +a.[50]l.google.com. +[50]l.google.com. +g.[50]l.google.com. +[92]d.[50]l.google.com. +[108]f.[50]l.google.com. +[124]e.[50]l.google.com. +[140]b.[50]l.google.com. +[156]a.[50]l.google.com. +[172]g.[50]l.google.com. +i.ytimg.com. +i.ytimg.com. +[12]i.ytimg.com. +ytimg.l.google.com. +[41]ytimg.l.google.com. +[47]l.google.com. +b.[47]l.google.com. +[47]l.google.com. +a.[47]l.google.com. +[47]l.google.com. +g.[47]l.google.com. +[47]l.google.com. +d.[47]l.google.com. +[47]l.google.com. +f.[47]l.google.com. +[47]l.google.com. +e.[47]l.google.com. +[89]b.[47]l.google.com. +[105]a.[47]l.google.com. +[121]g.[47]l.google.com. +[137]d.[47]l.google.com. +[153]f.[47]l.google.com. +[169]e.[47]l.google.com. +news.bbc.co.uk. +news.bbc.co.uk. +[12]news.bbc.co.uk. +newswww.bbc.net.uk. +[44]newswww.bbc.net.uk. +[52]bbc.net.uk. +ns0.rbsov.bbc.co.[60]uk. +[52]bbc.net.uk. +ns0.thdo.[102]bbc.co.[60]uk. +[92]ns0.rbsov.bbc.co.[60]uk. +[123]ns0.thdo.[102]bbc.co.[60]uk. +node1.bbcimg.co.uk. +node1.bbcimg.co.uk. +[12]node1.bbcimg.co.uk. +img.bbc.net.uk. +[48]img.bbc.net.uk. +[52]bbc.net.uk. +ns0.rbsov.bbc.co.[60]uk. +[52]bbc.net.uk. +ns0.thdo.[102]bbc.co.[60]uk. +[92]ns0.rbsov.bbc.co.[60]uk. +[123]ns0.thdo.[102]bbc.co.[60]uk. +newsimg.bbc.co.uk. +newsimg.bbc.co.uk. +[12]newsimg.bbc.co.uk. +newsimg.bbc.net.uk. +[47]newsimg.bbc.net.uk. +news.bbc.co.uk.edgesuite.net. +[79]news.bbc.co.uk.edgesuite.net. +a1733.g.akamai.[104]net. +[121]a1733.g.akamai.[104]net. +[121]a1733.g.akamai.[104]net. +[127]g.akamai.[104]net. +n7g.[129]akamai.[104]net. +[127]g.akamai.[104]net. +n6g.[129]akamai.[104]net. +[127]g.akamai.[104]net. +n8g.[129]akamai.[104]net. +[127]g.akamai.[104]net. +n1g.[129]akamai.[104]net. +[127]g.akamai.[104]net. +n3g.[129]akamai.[104]net. +[127]g.akamai.[104]net. +n0g.[129]akamai.[104]net. +[127]g.akamai.[104]net. 
+n4g.[129]akamai.[104]net. +[127]g.akamai.[104]net. +n2g.[129]akamai.[104]net. +[127]g.akamai.[104]net. +n5g.[129]akamai.[104]net. +[182]n7g.[129]akamai.[104]net. +[200]n6g.[129]akamai.[104]net. +[218]n8g.[129]akamai.[104]net. +[236]n1g.[129]akamai.[104]net. +[254]n3g.[129]akamai.[104]net. +[272]n0g.[129]akamai.[104]net. +[290]n4g.[129]akamai.[104]net. +[308]n2g.[129]akamai.[104]net. +[326]n5g.[129]akamai.[104]net. +stats.bbc.co.uk. +stats.bbc.co.uk. +[12]stats.bbc.co.uk. +[12]stats.bbc.co.uk. +bbc.co.uk. +ns1.thls.[65]bbc.co.uk. +[65]bbc.co.uk. +ns1.thdo.[65]bbc.co.uk. +[65]bbc.co.uk. +ns1.rbsov.[65]bbc.co.uk. +[65]bbc.co.uk. +ns1.[65]bbc.co.uk. +[86]ns1.thls.[65]bbc.co.uk. +[109]ns1.thdo.[65]bbc.co.uk. +[132]ns1.rbsov.[65]bbc.co.uk. +[156]ns1.[65]bbc.co.uk. +visualscience.external.bbc.co.uk. +js.revsci.net. +visualscience.external.bbc.co.uk. +[12]visualscience.external.bbc.co.uk. +csvtm.interactionscience.com. +[62]csvtm.interactionscience.com. +[68]interactionscience.com. +ns1.sj1.omniture.[87]com. +[68]interactionscience.com. +ns1.sj2.[128]omniture.[87]com. +[68]interactionscience.com. +ns1.dal.[128]omniture.[87]com. +[120]ns1.sj1.omniture.[87]com. +[151]ns1.sj2.[128]omniture.[87]com. +[173]ns1.dal.[128]omniture.[87]com. +js.revsci.net. +[12]js.revsci.net. +[15]revsci.net. +ns3.p16.dynect.[22]net. +[15]revsci.net. +ns4.[63]p16.dynect.[22]net. +[15]revsci.net. +ns2.[63]p16.dynect.[22]net. +[15]revsci.net. +ns1.[63]p16.dynect.[22]net. +pix04.revsci.net. +pix04.revsci.net. +[12]pix04.revsci.net. +[18]revsci.net. +ns4.p16.dynect.[25]net. +[18]revsci.net. +ns1.[66]p16.dynect.[25]net. +[18]revsci.net. +ns3.[66]p16.dynect.[25]net. +[18]revsci.net. +ns2.[66]p16.dynect.[25]net. +pixel.quantserve.com. +pixel.quantserve.com. +[12]pixel.quantserve.com. +map-pb.quantserve.com.akadns.net. +[50]map-pb.quantserve.com.akadns.net. +ac-na.[57]quantserve.com.akadns.net. +[96]ac-na.[57]quantserve.com.akadns.net. +[96]ac-na.[57]quantserve.com.akadns.net. 
+[96]ac-na.[57]quantserve.com.akadns.net. +[96]ac-na.[57]quantserve.com.akadns.net. +[96]ac-na.[57]quantserve.com.akadns.net. +[96]ac-na.[57]quantserve.com.akadns.net. +[96]ac-na.[57]quantserve.com.akadns.net. +[72]akadns.net. +za.akadns.org. +[72]akadns.net. +zb.[231]akadns.org. +[72]akadns.net. +zc.[231]akadns.org. +[72]akadns.net. +zd.[231]akadns.org. +[72]akadns.net. +asia9.[72]akadns.net. +[72]akadns.net. +eur1.[72]akadns.net. +[72]akadns.net. +use3.[72]akadns.net. +[72]akadns.net. +use4.[72]akadns.net. +[72]akadns.net. +usw2.[72]akadns.net. +[228]za.akadns.org. +[255]zb.[231]akadns.org. +[272]zc.[231]akadns.org. +[289]zd.[231]akadns.org. +[306]asia9.[72]akadns.net. +[326]eur1.[72]akadns.net. +[345]use3.[72]akadns.net. +www.vancouversun.com. +www.vancouversun.com. +[12]www.vancouversun.com. +vancouversun.com. +ns1.canwest.[67]com. +[54]vancouversun.com. +ns2.[86]canwest.[67]com. +[82]ns1.canwest.[67]com. +[108]ns2.[86]canwest.[67]com. +www.scan.nowpublic.com. +www.scan.nowpublic.com. +[12]www.scan.nowpublic.com. +a1.panthercdn.com. +[52]a1.panthercdn.com. +[55]panthercdn.com. +ns1.[55]panthercdn.com. +[55]panthercdn.com. +ns2.[55]panthercdn.com. +[99]ns1.[55]panthercdn.com. +[117]ns2.[55]panthercdn.com. +a123.g.akamai.net. +a123.g.akamai.net. +[12]a123.g.akamai.net. +[12]a123.g.akamai.net. +feeds.theplatform.com. +canwestglobal.112.2o7.net. +beacon.securestudies.com. +canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +[12]canwestglobal.112.2o7.net. +2o7.net. +ns1.sj1.omniture.com. +[155]2o7.net. +ns1.sj2.[182]omniture.com. +[155]2o7.net. +ns1.dal.[182]omniture.com. +[174]ns1.sj1.omniture.com. +[208]ns1.sj2.[182]omniture.com. +[230]ns1.dal.[182]omniture.com. +beacon.securestudies.com. +[12]beacon.securestudies.com. +beacon.gta.securestudies.com. +[54]beacon.gta.securestudies.com. 
+[61]gta.securestudies.com. +gta02.ord.[65]securestudies.com. +[61]gta.securestudies.com. +gta01.iad.[65]securestudies.com. +[61]gta.securestudies.com. +gta01.[118]ord.[65]securestudies.com. +[61]gta.securestudies.com. +gta02.[142]iad.[65]securestudies.com. +[112]gta02.ord.[65]securestudies.com. +[136]gta01.iad.[65]securestudies.com. +[160]gta01.[118]ord.[65]securestudies.com. +[180]gta02.[142]iad.[65]securestudies.com. +feeds.theplatform.com. +[12]feeds.theplatform.com. +[18]theplatform.com. +sea1tpgtm01.[18]theplatform.com. +[18]theplatform.com. +bfi1tpgtm01.[18]theplatform.com. +[67]sea1tpgtm01.[18]theplatform.com. +[93]bfi1tpgtm01.[18]theplatform.com. +s7.addthis.com. +s7.addthis.com. +[12]s7.addthis.com. +wildcard.addthis.com.edgekey.net. +[44]wildcard.addthis.com.edgekey.net. +e2943.c.akamaiedge.[73]net. +[90]e2943.c.akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n5c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n8c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n0c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n6c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n1c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n4c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n3c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n2c.[98]akamaiedge.[73]net. +[96]c.akamaiedge.[73]net. +n7c.[98]akamaiedge.[73]net. +[139]n5c.[98]akamaiedge.[73]net. +[157]n8c.[98]akamaiedge.[73]net. +[175]n0c.[98]akamaiedge.[73]net. +[193]n6c.[98]akamaiedge.[73]net. +[211]n1c.[98]akamaiedge.[73]net. +[229]n4c.[98]akamaiedge.[73]net. +[247]n3c.[98]akamaiedge.[73]net. +[265]n2c.[98]akamaiedge.[73]net. +[283]n7c.[98]akamaiedge.[73]net. +ad.doubleclick.net. +ad.doubleclick.net. +[12]ad.doubleclick.net. +dart-ad.l.doubleclick.net. +[48]dart-ad.l.doubleclick.net. +[48]dart-ad.l.doubleclick.net. +[56]l.doubleclick.net. +g.l.google.com. +[56]l.doubleclick.net. +a.[121]l.google.com. +[56]l.doubleclick.net. +b.[121]l.google.com. +[56]l.doubleclick.net. 
+d.[121]l.google.com. +[56]l.doubleclick.net. +e.[121]l.google.com. +[56]l.doubleclick.net. +f.[121]l.google.com. +[119]g.l.google.com. +[147]a.[121]l.google.com. +[163]b.[121]l.google.com. +[179]d.[121]l.google.com. +[195]e.[121]l.google.com. +[211]f.[121]l.google.com. +*** SUCCESS *** diff --git a/test/btscan1.lm b/test/btscan1.lm new file mode 100644 index 0000000..558b890 --- /dev/null +++ b/test/btscan1.lm @@ -0,0 +1,47 @@ +##### LM ##### +# +# R1 +# +namespace r1 + + lex + literal `! `a `b + ignore /[ \n\t]+/ + end + + def line [ `! `a `b `b `a] + +end # r1 + +# +# R2 +# +namespace r2 + + lex + literal `! + token id /[a-zA-Z_]+/ + ignore /[ \n\t]+/ + end + + def line [ `! id ] + +end # r2 + +def item + [r1::line] +| [r2::line] + +def btscan + [item*] + +parse P: btscan[ stdin ] + +match P ~!abb !abba !aab +print_xml(P) +print( '\n' ) +##### IN ##### +!abb !abba !aab + +##### EXP ##### +<btscan><_repeat_item><item><r2::line><r2::_literal_0009>!</r2::_literal_0009><r2::id>abb</r2::id></r2::line></item><item><r1::line><r1::_literal_0001>!</r1::_literal_0001><r1::_literal_0003>a</r1::_literal_0003><r1::_literal_0005>b</r1::_literal_0005><r1::_literal_0005>b</r1::_literal_0005><r1::_literal_0003>a</r1::_literal_0003></r1::line></item><item><r2::line><r2::_literal_0009>!</r2::_literal_0009><r2::id>aab</r2::id></r2::line></item></_repeat_item></btscan> diff --git a/test/btscan2.lm b/test/btscan2.lm new file mode 100644 index 0000000..bfc77db --- /dev/null +++ b/test/btscan2.lm @@ -0,0 +1,42 @@ +##### LM ##### +namespace r1 + + lex + literal `! `a `b + ignore /[ \n\t]+/ + end + + def line [ `! `a `b `b `a] + +end # r1 + +namespace r2 + + lex + literal `! + token id /[a-zA-Z_]+/ + ignore /[ \n\t]+/ + end + + def line [ `! 
id ] + +end # r2 + +def item + [r1::line] +| [r2::line] + +def btscan + [item*] + +cons Parser: parser<btscan> [] + +send Parser "!ab" +send Parser "b " +send Parser "!ab" +send Parser "ba !aab\n" + +print_xml( Parser() ) +print( '\n' ) +##### EXP ##### +<btscan><_repeat_item><item><r2::line><r2::_literal_0009>!</r2::_literal_0009><r2::id>abb</r2::id></r2::line></item><item><r1::line><r1::_literal_0001>!</r1::_literal_0001><r1::_literal_0003>a</r1::_literal_0003><r1::_literal_0005>b</r1::_literal_0005><r1::_literal_0005>b</r1::_literal_0005><r1::_literal_0003>a</r1::_literal_0003></r1::line></item><item><r2::line><r2::_literal_0009>!</r2::_literal_0009><r2::id>aab</r2::id></r2::line></item></_repeat_item></btscan> diff --git a/test/call1.lm b/test/call1.lm new file mode 100644 index 0000000..82f6c76 --- /dev/null +++ b/test/call1.lm @@ -0,0 +1,17 @@ +##### LM ##### +int f1( i: int j: int ) +{ + return i + j +} + +int main() +{ + print( f1( + f1( f1( 1 1 ) f1( 1 1 ) ) + f1( f1( 1 1 ) f1( 1 1 ) ) + ) '\n' ) +} + +main() +##### EXP ##### +8 diff --git a/test/commitbt.lm b/test/commitbt.lm new file mode 100644 index 0000000..da78807 --- /dev/null +++ b/test/commitbt.lm @@ -0,0 +1,109 @@ +##### LM ##### +# 2010: I'm not sure what the following means. + +# +# Local commit: +# -clears reparse flags underneath +# -must be possible to backtrack after +# Global commit (revertOn) +# -clears all reparse flags +# -must be possible to backtrack after +# Global commit (!revertOn) +# -clears all reparse flags +# -clears all 'parsed' reverse code +# -clears all reverse code +# -clears all alg structures +# + +# This test shows that a global commit with revertOn correctly does not clear +# 'parsed' items because it must entertain the possibility of backtracking. + +lex + ignore /[\t\n ]+/ + literal `^ `| `- `, `: `! `? `. 
+ literal `( `) `{ `} `* `& `+ + + literal `-- `:> `:>> `<: `-> `** + + token word /[a-zA-Z_][a-zA-Z0-9_]*/ + token uint /[0-9]+/ +end + + +def expression [term expression_op*] + +def expression_op + [`| term] +| [`& term] +| [`- term] +| [`-- term] + +def term [factor_rep term_rest] + +# This list is done manually to get shortest match. +def term_rest + [] +| [term_op term_rest] + +def term_op + [factor_rep] +| [`. factor_rep] +| [`:> factor_rep] +| [`:>> factor_rep] +| [`<: factor_rep] + +def factor_rep + [factor_neg factor_rep_op*] + +def factor_rep_op + [`*] +| [`**] +| [`?] +| [`+] +| [`{ factor_rep_num `}] +| [`{ `, factor_rep_num `}] +| [`{ factor_rep_num `, `}] +| [`{ factor_rep_num `, factor_rep_num `}] + +def factor_rep_num [uint] + +def factor_neg + [`! factor_neg] +| [`^ factor_neg] +| [factor] + +def factor + [alphabet_num] +| [word] +| [`( expression `)] + +def alphabet_num + [uint] + +def suint + i: int + [uint] + +def sub + [suint* `*] + +token item + S: sub + /[0-9]+/ + { + M: str = input.pull(match_length) + parse_stop S: sub[input] + input.push( make_token( typeid<item> M S ) ) + } + +def stuff + [item* `!] +| [sub] + +parse S: stuff[ stdin ] +print_xml( S ) +print( '\n' ) +##### IN ##### +1 2 3 * ! 
+##### EXP ##### +<stuff><_repeat_item><item>1</item></_repeat_item><_literal_000d>!</_literal_000d></stuff> diff --git a/test/concat1.lm b/test/concat1.lm new file mode 100644 index 0000000..ee92409 --- /dev/null +++ b/test/concat1.lm @@ -0,0 +1,100 @@ +##### LM ##### + +lex + literal `type `include + token id /[A-Za-z_][A-Za-z_0-9]*/ + ignore /'#' [^\n]* '\n'/ + ignore /[ \t\r\n]+/ +end + +lex + token ifn_part /[a-zA-Z0-9_.\-]+/ + token ifn_slash /'/'/ +end + +def ifn_path_part + [ifn_part] +| [ifn_slash] + +def ifn_path + [ifn_path_part ifn_path] +| [ifn_path_part] + + +literal `%% + +lex + token em_ws /( any - 33..126 )+/ +end + +def em_item + [em_ws] + +def prelude + [em_item* `%%] + +def item + [`include ifn_path] +| [`type id] + +def start + [prelude item*] + +start parseStart( InputFile: stream ) +{ + return parse start[ InputFile ] +} + +start parseTxt( T: str ) +{ + cons a: parser<start>[] + send a [T] eos + return a.tree +} + +item* concatItems( IL1: item* IL2: item* ) +{ + for IL: item* in IL1 { + if match IL [] { + IL = IL2 + break + } + } + return IL1 +} + +item* expandIncludes( ItemList: ref<item*> ) +{ + for IL: item* in ItemList { + if match IL + [`include FN: ifn_path Rest: item*] + { + S: start = parseTxt( + " + "%% + " + ) + + match S [em_item* `%% IncludedItems: item*] + + IL = concatItems( IncludedItems Rest ) + } + } +} + +int main() +{ + S: start = parseStart(stdin) + match S [em_item* `%% ItemList: item*] + expandIncludes( ItemList ) +} + +main() +##### IN ##### + +%% + +type foo + +include smtp.vpt +##### EXP ##### diff --git a/test/concat2.lm b/test/concat2.lm new file mode 100644 index 0000000..781c2f2 --- /dev/null +++ b/test/concat2.lm @@ -0,0 +1,98 @@ +##### LM ##### + +lex + literal `type `include + token id /[A-Za-z_][A-Za-z_0-9]*/ + ignore /'#' [^\n]* '\n'/ + ignore /[ \t\r\n]+/ +end + +lex + token ifn_part /[a-zA-Z0-9_.\-]+/ + token ifn_slash /'/'/ +end + +def ifn_path_part + [ifn_part] +| [ifn_slash] + +def ifn_path + 
[ifn_path_part ifn_path] +| [ifn_path_part] + + +literal `%% + +lex + token em_ws /( any - 33..126 )+/ +end + +def em_item + [em_ws] + +def prelude + [em_item* `%%] + +def item + [`include ifn_path] +| [`type id] + +def start + [prelude item*] + +start parseStart( InputFile: stream ) +{ + return parse start[ InputFile ] +} + +start parseTxt( T: str ) +{ + cons a: accum<start>[] + send a [T] eos + return a.tree +} + +item* concatItems( IL1: item* IL2: item* ) +{ + for IL: item* in IL1 { + if match IL [] { + IL = IL2 + break + } + } + return IL1 +} + +item* expandIncludes( ItemList: ref<item*> ) +{ + for IL: item* in ItemList { + if match IL + [`include FN: ifn_path Rest: item*] + { + S: start = parseTxt( + " + "%% + " + ) + + match S [em_item* `%% IncludedItems: item*] + + IL = concatItems( IncludedItems Rest ) + } + } +} + +int main() +{ + S: start = parseStart(stdin) + match S [em_item* `%% ItemList: item*] + expandIncludes( ItemList ) +} + +main() +##### IN ##### + +%% + +include smtp.vpt +##### EXP ##### diff --git a/test/construct1.lm b/test/construct1.lm new file mode 100644 index 0000000..ee9b36b --- /dev/null +++ b/test/construct1.lm @@ -0,0 +1,19 @@ +##### LM ##### +rl ident_pattern /[a-zA-Z_][a-zA-Z_0-9]*/ +rl number_pattern /[0-9]+/ + +lex + ignore /[ \t\n]+/ + token id /ident_pattern/ + token number /number_pattern/ +end + +def four_ids + [id id id id] + +Constructed: four_ids = construct four_ids "a b c d" +print_xml( Constructed ) +print( '\n' ) + +##### EXP ##### +<four_ids><id>a</id><id>b</id><id>c</id><id>d</id></four_ids> diff --git a/test/construct2.lm b/test/construct2.lm new file mode 100644 index 0000000..fd60e9d --- /dev/null +++ b/test/construct2.lm @@ -0,0 +1,14 @@ +##### LM ##### + +lex + ignore /[ \t\n]+/ + token id /[a-z]+/ + literal `, `. 
`* `( `) +end + +def lang [id*] + +print( construct lang "a b c" '\n' ) + +##### EXP ##### +a b c diff --git a/test/construct3.lm b/test/construct3.lm new file mode 100644 index 0000000..8edb038 --- /dev/null +++ b/test/construct3.lm @@ -0,0 +1,19 @@ +##### LM ##### + +lex + ignore /[ \t\n]+/ + token id /[a-z0-9]+/ + literal `, `. `* `( `) +end + +def bigger [`( item* `)] + +def item [id] | [bigger] + +def lang [item*] + +B: bigger = construct bigger "( b1 b2 )" +print( construct lang "a [B] c" '\n' ) + +##### EXP ##### +a ( b1 b2 ) c diff --git a/test/constructex.lm b/test/constructex.lm new file mode 100644 index 0000000..3596363 --- /dev/null +++ b/test/constructex.lm @@ -0,0 +1,44 @@ +##### LM ##### +lex + token id /[a-zA-Z_][a-zA-Z0-9_]*/ + literal `= `< `> `/ + ignore /[ \t\n\r\v]+/ +end + +def attr + [id `= id] + +def open_tag + [`< id attr* `>] + +def close_tag + [`< `/ id `>] + +def tag + [open_tag item* close_tag] + +def item + [tag] +| [id] + +parse PersonTag: tag[ stdin ] + +match PersonTag + ["<person name=" Val:id attr*">" item* "</person>"] + +NameTag1: tag = construct tag + ["<name type=person>" ^Val "</name>"] + +NameTag2: tag = construct tag + "<name type=person>[^Val]</name>" + +print( NameTag1 '\n' ) +print( NameTag2 '\n' ) + +##### IN ##### +<person name=adrian hometown=kingston> + <t1 foo=bar2 e=f></t2> +</person> +##### EXP ##### +<name type=person>adrian</name> +<name type=person>adrian</name> diff --git a/test/context1.lm b/test/context1.lm new file mode 100644 index 0000000..2832045 --- /dev/null +++ b/test/context1.lm @@ -0,0 +1,39 @@ +##### LM ##### + +context ctx + i: int + j: int + k: int + + lex + ignore /space+/ + literal `* `( `) + token id /[a-zA-Z_]+/ + end + + def foo [id] + + def item + [id] + | [foo] + | [`( item* `)] + { + i = 0 + j = i + 1 + k = j + 1 + print( k '\n' ) + } + + def start + [item*] +end # ctx + +cons CTX: ctx[] +parse Input: ctx::start( CTX ) [ stdin ] +print( Input ) + +##### IN ##### +a b c ( d e f ) +##### 
EXP ##### +2 +a b c ( d e f ) diff --git a/test/context2.lm b/test/context2.lm new file mode 100644 index 0000000..e04354a --- /dev/null +++ b/test/context2.lm @@ -0,0 +1,124 @@ +##### LM ##### +context ruby_here + + rl ident_pattern /[a-zA-Z_][a-zA-Z_0-9]*/ + rl number_pattern /[0-9]+/ + + lex + ignore /[ \t\n]+/ + token id /ident_pattern/ + token number /number_pattern/ + literal `<< `* `, `( `) `! + end + + HereId: str + + token rest_of_line /[^\n]*'\n'/ + + lex + ignore /[ \t\n]+/ + token here_id + HereData: here_data + /ident_pattern/ + { + # Take the text of the here_id from the input stream. + HereId = input.pull( match_length ) + + # Get the data up to the rest of the line. + parse_stop ROL: rest_of_line(ctx)[ input ] + + # Parse the heredoc data. + parse_stop HereData: here_data(ctx)[ input ] + + # Push the rest-of-line data back to the input stream. + input.push( $ROL ) + + # Send the here_id token. Attach the heredoc data as an attribute. + input.push( make_token( typeid<here_id> HereId HereData ) ) + } + end + + lex + token here_close_id + / ident_pattern '\n' / + { + if match_text == HereId + '\n' { + input.push( make_token( + typeid<here_close_id> + input.pull( match_length ) ) ) + } + else + input.push( make_token( typeid<here_line> input.pull(match_length) ) ) + } + + token here_line + / [^\n]* '\n' / + end + + def here_data + [here_line* here_close_id] + + def heredoc + [`<< here_id] + + def primary + [id] + | [number] + | [heredoc] + + def arglist + [primary arglist_more*] + + def arglist_more + [`, primary] + + def call + [id `( arglist? `)] + + def statement + [primary] + | [call] + + token foobar /any+/ + + def item + [statement `!] + | [foobar] + + def start + [item*] + +end # ruby_here + +CTX: ruby_here = cons ruby_here [] + +parse S: ruby_here::start( CTX ) [ stdin ] +print_xml(S) +print('\n') +##### IN ##### +print( <<DATA1, more, <<DATA2, 99 ) +"&^#(@ almost +!arbitrary text! +DATA1 +hello +world +DATA2 +! 
+print( <<DATA1, more, <<DATA2, 99 ) +"&^#(@ almost +!arbitrary text! +DATA1 +hello +world +DATA2 +# error here +##### EXP ##### +<ruby_here::start><ruby_here::_repeat_item><ruby_here::item><ruby_here::statement><ruby_here::call><ruby_here::id>print</ruby_here::id><ruby_here::_literal_000d>(</ruby_here::_literal_000d><ruby_here::_opt_arglist><ruby_here::arglist><ruby_here::primary><ruby_here::heredoc><ruby_here::_literal_0007><<</ruby_here::_literal_0007><ruby_here::here_id>DATA1</ruby_here::here_id></ruby_here::heredoc></ruby_here::primary><ruby_here::_repeat_arglist_more><ruby_here::arglist_more><ruby_here::_literal_000b>,</ruby_here::_literal_000b><ruby_here::primary><ruby_here::id>more</ruby_here::id></ruby_here::primary></ruby_here::arglist_more><ruby_here::arglist_more><ruby_here::_literal_000b>,</ruby_here::_literal_000b><ruby_here::primary><ruby_here::heredoc><ruby_here::_literal_0007><<</ruby_here::_literal_0007><ruby_here::here_id>DATA2</ruby_here::here_id></ruby_here::heredoc></ruby_here::primary></ruby_here::arglist_more><ruby_here::arglist_more><ruby_here::_literal_000b>,</ruby_here::_literal_000b><ruby_here::primary><ruby_here::number>99</ruby_here::number></ruby_here::primary></ruby_here::arglist_more></ruby_here::_repeat_arglist_more></ruby_here::arglist></ruby_here::_opt_arglist><ruby_here::_literal_000f>)</ruby_here::_literal_000f></ruby_here::call></ruby_here::statement><ruby_here::_literal_0011>!</ruby_here::_literal_0011></ruby_here::item><ruby_here::item><ruby_here::foobar>print( <<DATA1, more, <<DATA2, 99 ) +"&^#(@ almost +!arbitrary text! 
+DATA1 +hello +world +DATA2 +# error here +</ruby_here::foobar></ruby_here::item></ruby_here::_repeat_item></ruby_here::start> diff --git a/test/context3.lm b/test/context3.lm new file mode 100644 index 0000000..f990837 --- /dev/null +++ b/test/context3.lm @@ -0,0 +1,47 @@ +##### LM ##### +context ctx + + i: int + j: int + k: int + + lex + ignore /space+/ + literal `* `( `) + token id /[a-zA-Z_]+/ + end + + def foo [id] + + int f() + { + i = i + 1 + } + + def item + [id] + | [foo] + | [`( item* `)] + { + i = 0 + f() + f() + f() + print( i '\n' ) + } + + + def start + [item*] + +end # ctx + +CTX: ctx = cons ctx [] +parse Input: ctx::start( CTX ) [stdin] +print( Input ) + +##### IN ##### +a b c ( d ) e f +##### EXP ##### +3 +a b c ( d ) e f diff --git a/test/counting1.lm b/test/counting1.lm new file mode 100644 index 0000000..83e70d5 --- /dev/null +++ b/test/counting1.lm @@ -0,0 +1,109 @@ +##### LM ##### +context counting + + # + # Regular Definitions + # + rl rl_ws /[ \t\n\r\v]+/ + rl rl_id /[a-zA-Z_][a-zA-Z0-9_]*/ + rl rl_num /[0-9]+/ + + # + # Tokens + # + + lex + # Ignore whitespace. + ignore /rl_ws/ + + # Tokens. + token id /rl_id/ + token number /rl_num/ + end + + # + # Global Data + # + + target: int + + # + # Productions + # + + + def get_target + [number] + { + match lhs [Number:number] + target = Number.data.atoi() + } + + # Arbitrary item. + def item + [number] + | [id] + + # Type definition for the count_items nonterminal. + def count_items + count: int + + # List production one. The condition stops the + # greedy list when it has gone too far. + [count_items item] + { + # Pass up the data + lhs.count = r1.count + 1 + if lhs.count > target { + reject + } + } + + # List production two, the base. + | [] + { + lhs.count = 0 + } + + # Wrapper which prevents short lists from getting through if the parser + # encounters an error and needs to backtrack over counted list. 
+ def counted_list + [get_target count_items] + { + if r2.count < target { + reject + } + } + + def start + [counted_list*] + { + for List:counted_list in lhs { + match List [Count:number Items:count_items] + print( 'num items: ' Count.data.atoi() '\n' ) + + i: int = 1 + for Item:item in Items { + print( ' item ' i ': ' ^Item '\n' ) + i = i + 1 + } + } + } +end # counting + +cons Counting: counting[] +parse counting::start(Counting)[ stdin ] +##### IN ##### +3 1 b c 1 1 0 3 a b c +##### EXP ##### +num items: 3 + item 1: 1 + item 2: b + item 3: c +num items: 1 + item 1: 1 +num items: 0 +num items: 3 + item 1: a + item 2: b + item 3: c diff --git a/test/counting2.lm b/test/counting2.lm new file mode 100644 index 0000000..0ca75be --- /dev/null +++ b/test/counting2.lm @@ -0,0 +1,98 @@ +##### LM ##### + +# +# Regular Definitions +# + +rl rl_ws /[ \t\n\r\v]+/ +rl rl_id /[a-zA-Z_][a-zA-Z0-9_]*/ +rl rl_num /[0-9]+/ + +# +# Tokens +# + +lex + # Ignore whitespace. + ignore /rl_ws/ + + # Tokens. + token id /rl_id/ + token number /rl_num/ +end + +# +# Productions +# + +# Arbitrary item. +def item + [id] +| [number] + +# List production one. The condition stops the +# greedy list when it has gone too far. +def count_items + target: int + count: int + + [count_items item] + { + # Pass up the data + lhs.target = r1.target + lhs.count = r1.count + 1 + + if lhs.count > lhs.target { + reject + } + } + + # List production two, the base. +| [number] + { + match lhs [Number: number] + lhs.target = Number.data.atoi() + lhs.count = 0 + } + + +# Wrapper which prevents short lists from getting through if the parser +# encounters an error and needs to backtrack over counted list. 
+def counted_list + [count_items] + { + if r1.count < r1.target { + reject + } + } + +def start + [counted_list*] + { + for List: counted_list in lhs { + match List [CountItems:count_items] + print( 'num items: ' CountItems.target '\n' ) + + i: int = 1 + for Item:item in CountItems { + print( ' item ' i ': ' ^Item '\n' ) + i = i + 1 + } + } + } + +parse start[ stdin ] +##### IN ##### +3 1 b c 1 1 0 3 a b c +##### EXP ##### +num items: 3 + item 1: 1 + item 2: b + item 3: c +num items: 1 + item 1: 1 +num items: 0 +num items: 3 + item 1: a + item 2: b + item 3: c diff --git a/test/counting3.lm b/test/counting3.lm new file mode 100644 index 0000000..027f456 --- /dev/null +++ b/test/counting3.lm @@ -0,0 +1,130 @@ +##### LM ##### +context counting + + # + # Regular Definitions + # + rl rl_ws /[ \t\n\r\v]+/ + rl rl_id /[a-zA-Z_][a-zA-Z0-9_]*/ + rl rl_num /[0-9]+/ + + # + # Tokens + # + + lex + # Ignore whitespace. + ignore /rl_ws/ + + literal `; + + # Tokens. + token id /rl_id/ + token number /rl_num/ + end + + # + # Global Data + # + + target: int + count: int + + # + # Productions + # + + def get_target + [number] + { + count = 0 + target = r1.data.atoi() + print( 'target: ' target '\n' ) + } + + # Arbitrary item. + def item + [number] + | [id] + + def count_items + [one_item count_items] + | [] + + def one_item + [item] + { + count = count + 1 + if count > target { + reject + } + print( 'ITEM\n' ) + } + + + # Wrapper which prevents short lists from getting through if the parser + # encounters an error and needs to backtrack over counted list. 
+ def counted_list + [get_target count_items] + { + print( 'trying: ' count ' for: ' target '\n' ) + if count < target { + reject + } + } + + + def start + [counted_list*] + { + + for List: counted_list in lhs { + match List [Count: number Items: count_items] + print( 'num items: ' Count.data.atoi() '\n' ) + + i: int = 1 + for Item: item in Items { + print( ' item ' i ': ' ^Item '\n' ) + i = i + 1 + } + } + print( '*** SUCCESS ***\n' ) + } + +end # counting + +cons Counting: counting[] +parse counting::start(Counting)[ stdin ] +##### IN ##### +3 1 b c 1 1 0 3 a b c +##### EXP ##### +target: 3 +ITEM +ITEM +ITEM +ITEM +trying: 3 for: 3 +target: 1 +ITEM +ITEM +trying: 1 for: 1 +target: 0 +ITEM +trying: 0 for: 0 +target: 3 +ITEM +ITEM +ITEM +trying: 3 for: 3 +num items: 3 + item 1: 1 + item 2: b + item 3: c +num items: 1 + item 1: 1 +num items: 0 +num items: 3 + item 1: a + item 2: b + item 3: c +*** SUCCESS *** diff --git a/test/counting4.lm b/test/counting4.lm new file mode 100644 index 0000000..ef9f87f --- /dev/null +++ b/test/counting4.lm @@ -0,0 +1,111 @@ +##### LM ##### +context counting + + # + # Regular Definitions + # + rl rl_ws /[ \t\n\r\v]+/ + rl rl_id /[a-zA-Z_][a-zA-Z0-9_]*/ + rl rl_num /[0-9]+/ + + # + # Tokens + # + + lex + # Ignore whitespace. + ignore /rl_ws/ + + literal `; + + # Tokens. + token id /rl_id/ + token number /rl_num/ + end + + # + # Global Data + # + + target: int + count: int + + # + # Productions + # + + + def get_target + [number] + { + count = 0 + target = r1.data.atoi() + print( 'target: ' target '\n' ) + } + + # Arbitrary item. 
+ def item + [number] + | [id] + + def count_items + [count_inc item count_items] + | [count_end] + + def count_inc + [] + { + if count < target + count = count + 1 + else + reject + } + + def count_end + [] + { + if count < target + reject + } + + def counted_list + [get_target count_items] + + def start + [counted_list*] + { + for List: counted_list in lhs { + match List [Count: number Items: count_items] + print( 'num items: ' Count.data.atoi() '\n' ) + + i: int = 1 + for Item: item in Items { + print( ' item ' i ': ' ^Item '\n' ) + i = i + 1 + } + } + print( '*** SUCCESS ***\n' ) + } +end # counting + +cons Counting: counting[] +parse counting::start(Counting)[stdin] +##### IN ##### +3 1 b c 1 1 0 3 a b c +##### EXP ##### +target: 3 +target: 1 +target: 0 +target: 3 +num items: 3 + item 1: 1 + item 2: b + item 3: c +num items: 1 + item 1: 1 +num items: 0 +num items: 3 + item 1: a + item 2: b + item 3: c +*** SUCCESS *** diff --git a/test/decl1.lm b/test/decl1.lm new file mode 100644 index 0000000..2d3c03b --- /dev/null +++ b/test/decl1.lm @@ -0,0 +1,5 @@ +##### LM ##### +Int: int = 7 +print( Int '\n' ) +##### EXP ##### +7 diff --git a/test/decl2.lm b/test/decl2.lm new file mode 100644 index 0000000..79fdc67 --- /dev/null +++ b/test/decl2.lm @@ -0,0 +1,5 @@ +##### LM ##### +Str: str = '77' +print( Str '\n' ) +##### EXP ##### +77 diff --git a/test/decl3.lm b/test/decl3.lm new file mode 100644 index 0000000..1c9ef23 --- /dev/null +++ b/test/decl3.lm @@ -0,0 +1,3 @@ +##### LM ##### +Int: int +##### EXP ##### diff --git a/test/div.lm b/test/div.lm new file mode 100644 index 0000000..84dd807 --- /dev/null +++ b/test/div.lm @@ -0,0 +1,42 @@ +##### LM ##### + +i: int = 0 +while ( i < 34 ) { + print( (i / 4) '\n' ) + i = i + 1 +} +##### EXP ##### +0 +0 +0 +0 +1 +1 +1 +1 +2 +2 +2 +2 +3 +3 +3 +3 +4 +4 +4 +4 +5 +5 +5 +5 +6 +6 +6 +6 +7 +7 +7 +7 +8 +8 diff --git a/test/exit1.lm b/test/exit1.lm new file mode 100644 index 0000000..1a4f82a --- /dev/null +++ b/test/exit1.lm @@ 
-0,0 +1,7 @@ +##### LM ##### + +print( 'before\n' ) +exit( 0 ) +print( 'after\n' ) +##### EXP ##### +before diff --git a/test/exit2.lm b/test/exit2.lm new file mode 100644 index 0000000..2105f98 --- /dev/null +++ b/test/exit2.lm @@ -0,0 +1,24 @@ +##### LM ##### + +int f3() +{ + I: int = 1 + exit( 0 ) +} + +int f2() +{ + I: int = 1 + f3() +} + +int f1() +{ + I: int = 1 + f2() +} + +I: int = 1 + +f1() +##### EXP ##### diff --git a/test/exit3.lm b/test/exit3.lm new file mode 100644 index 0000000..4cf4686 --- /dev/null +++ b/test/exit3.lm @@ -0,0 +1,24 @@ +##### LM ##### + +int f3() +{ + I: int = 1 + print( "hello\n" ) +} + +int f2() +{ + I: int = 1 + f3() +} + +int f1() +{ + I: int = 1 + f2() +} + +I: int = 1 + +exit( 0 ) +##### EXP ##### diff --git a/test/export1.lm b/test/export1.lm new file mode 100644 index 0000000..253b688 --- /dev/null +++ b/test/export1.lm @@ -0,0 +1,16 @@ +##### LM ##### +lex + token id /[a-z]+/ + ignore /[ \t]+/ +end + +def start + [id*] + +export Start: start +export Error: str + +parse P: start[ stdin ] +Error = error +##### IN ##### +##### EXP ##### diff --git a/test/factor1.lm b/test/factor1.lm new file mode 100644 index 0000000..74c7453 --- /dev/null +++ b/test/factor1.lm @@ -0,0 +1,4 @@ +##### LM ##### +print( 'hello\n') +##### EXP ##### +hello diff --git a/test/factor2.lm b/test/factor2.lm new file mode 100644 index 0000000..a368537 --- /dev/null +++ b/test/factor2.lm @@ -0,0 +1,4 @@ +##### LM ##### +print( 77 '\n' ) +##### EXP ##### +77 diff --git a/test/factor3.lm b/test/factor3.lm new file mode 100644 index 0000000..cc988af --- /dev/null +++ b/test/factor3.lm @@ -0,0 +1,3 @@ +##### LM ##### +open( 'x' ('r') ) +##### EXP ##### diff --git a/test/factor4.lm b/test/factor4.lm new file mode 100644 index 0000000..f0143ea --- /dev/null +++ b/test/factor4.lm @@ -0,0 +1,3 @@ +##### LM ##### +argv +##### EXP ##### diff --git a/test/factor5.lm b/test/factor5.lm new file mode 100644 index 0000000..e2be8c0 --- /dev/null +++ b/test/factor5.lm @@ 
-0,0 +1,6 @@ +##### LM ##### +print( argv.pop() '\n' ) +##### ARGS ##### +a +##### EXP ##### +a diff --git a/test/factor6.lm b/test/factor6.lm new file mode 100644 index 0000000..a9c0c19 --- /dev/null +++ b/test/factor6.lm @@ -0,0 +1,12 @@ +##### LM ##### +if ( 1 ) + print( 'a\n' ) +if ( nil ) + print( 'b\n' ) +if ( true ) + print( 'c\n' ) +if ( false ) + print( 'd\n' ) +##### EXP ##### +a +c diff --git a/test/forloop1.lm b/test/forloop1.lm new file mode 100644 index 0000000..31e37fd --- /dev/null +++ b/test/forloop1.lm @@ -0,0 +1,19 @@ +##### LM ##### +lex + token id / 'a' .. 'z' / + ignore / '\n' | '\t' | ' ' / +end + +def start + [id*] + +parse P: start[stdin] +Start: start = P +for Id: id in Start + print( ^Id '\n' ) +##### IN ##### +a b c +##### EXP ##### +a +b +c diff --git a/test/forloop2.lm b/test/forloop2.lm new file mode 100644 index 0000000..7d54f8a --- /dev/null +++ b/test/forloop2.lm @@ -0,0 +1,19 @@ +##### LM ##### +lex + token id / 'a' .. 'z' / + ignore / '\n' | '\t' | ' ' / +end + +def start + [id*] + +parse P: start[stdin] +Start: start = P +for Id: id in triter(Start) + print( ^Id '\n' ) +##### IN ##### +d e f +##### EXP ##### +d +e +f diff --git a/test/forloop3.lm b/test/forloop3.lm new file mode 100644 index 0000000..3749244 --- /dev/null +++ b/test/forloop3.lm @@ -0,0 +1,21 @@ +##### LM ##### +lex + token id / 'a' .. 
'z' / + ignore / '\n' | '\t' | ' ' / +end + +def start + [id*] + +parse P: start[stdin] +Start: start = P +for Id: id in triter(Start) { + print( ^Id ) + print( '\n' ) +} +##### IN ##### +d e f +##### EXP ##### +d +e +f diff --git a/test/func1.lm b/test/func1.lm new file mode 100644 index 0000000..c973e44 --- /dev/null +++ b/test/func1.lm @@ -0,0 +1,9 @@ +##### LM ##### +int f() +{ + print( 'hello world\n' ) +} + +f() +##### EXP ##### +hello world diff --git a/test/func2.lm b/test/func2.lm new file mode 100644 index 0000000..75de72d --- /dev/null +++ b/test/func2.lm @@ -0,0 +1,9 @@ +##### LM ##### +int f( I: int Str: str ) +{ + print( I ' ' Str '\n' ) +} + +f( 50 'hello world' ) +##### EXP ##### +50 hello world diff --git a/test/func3.lm b/test/func3.lm new file mode 100644 index 0000000..cdfcbc3 --- /dev/null +++ b/test/func3.lm @@ -0,0 +1,40 @@ +##### LM ##### +lex + literal `{ `} + literal `struct `type + token id /[A-Za-z_][A-Za-z_0-9]*/ + ignore /[ \t\r\n]+/ +end + +def attribute + [`type id] + +def struct + [`struct id `{ attribute* `}] + +def program + [struct*] + +int func( P: program ) +{ + print( P ) +} + +int main() +{ + parse P: program[ stdin ] + func( P ) +} + +main() + +##### IN ##### +struct S +{ + type T +} +##### EXP ##### +struct S +{ + type T +} diff --git a/test/generate1.lm b/test/generate1.lm new file mode 100644 index 0000000..ef76f8d --- /dev/null +++ b/test/generate1.lm @@ -0,0 +1,759 @@ +##### LM ##### +context generate + # Regular definitions + rl ident_char /[a-zA-Z_]/ + + # List used as a stack of indentations. + IndentStack: list<int> + + # Has a newline been sent for this '\n' .. whitespace match. + newline_sent: int + + # Tokens. + lex + # Python keywords. 
+ literal `and `del `from `not `while `as `elif `global `or + `with `assert `else `if `pass `yield `break `except + `import `print `class `exec `in `raise `continue + `finally `is `return `def `for `lambda `try + + # Identifiers + rl lowercase /'a'..'z'/ + rl uppercase /'A'..'Z'/ + rl letter /lowercase | uppercase/ + token identifier /(letter|'_') (letter | digit | '_')*/ + + # Literals + rl escapeseq /'\\' any / + rl longstringchar /[^\\]/ + rl shortstringchar_s /[^\\\n']/ + rl shortstringchar_d /[^\\\n"]/ + rl longstringitem /longstringchar | escapeseq/ + rl shortstringitem_s /shortstringchar_s | escapeseq/ + rl shortstringitem_d /shortstringchar_d | escapeseq/ + rl longstring /"'''" longstringitem* :>> "'''" | '"""' longstringitem* :>> '"""'/ + rl shortstring /"'" shortstringitem_s* "'" | '"' shortstringitem_d* '"'/ + rl stringprefix /"r" | "u" | "ur" | "R" | "U" | "UR" | "Ur" | "uR"/ + token stringliteral /stringprefix? (shortstring | longstring)/ + + # Integers + rl hexdigit /digit | 'a'..'f' | 'A'..'F'/ + rl octdigit /'0'..'7'/ + rl nonzerodigit /'1'..'9'/ + rl hexinteger /'0' ('x' | 'X') hexdigit+/ + rl octinteger /'0' octdigit+/ + rl decimalinteger /nonzerodigit digit* | '0'/ + token integer /decimalinteger | octinteger | hexinteger/ + token longinteger /integer ('l' | 'L')/ + + # Floats. + rl exponent /('e' | 'E') ('+' | '-')? digit+/ + rl fraction /'.' digit+/ + rl intpart /digit+/ + rl pointfloat /intpart? fraction | intpart '.'/ + rl exponentfloat /(intpart | pointfloat) exponent/ + token floatnumber /pointfloat | exponentfloat/ + + # Imaginaries. + token imagnumber /(floatnumber | intpart) ("j" | "J")/ + + # Operators. + literal `+ `- `* `** `/ `// `% `<< `>> `& `| `^ + `~ `< `> `<= `>= `== `!= `<> + + # Delimiters + literal `( `) `[ `] `{ `} `@ `, `: `. `` `= `; + `+= `-= `*= `/= `//= `%= `&= `|= `^= `>>= `<<= + `**= + + literal `... + + # In general whitespace is ignored. + ignore WS /' '+/ + + # Find and ignore entire blank lines. 
+ token BLANK_LINE + / '\n' [ \t]* ('#' [^\n]*)? '\n' / + { + # Need to shorten to take off the newline. + # Turn it into ignore. + input.push_ignore( make_token( typeid<WS> input.pull(match_length - 1) ) ) + } + + # Find and ignore comments. + token COMMENT + / '#' [^\n]* '\n' / + { + # Need to shorten to take off the newline. Turn it into ignore. + input.push_ignore( make_token( typeid<WS> input.pull(match_length - 1) ) ) + } + + # These tokens are generated + token INDENT // + token DEDENT // + token NEWLINE // + ignore IND_WS // + + token INDENTATION + /'\n' [ \t]*/ + { + # We have squared up INDENTs and DEDENTs. Ignore the entire match. + input.push_ignore( make_token( typeid<WS> input.pull(match_length) ) ) + + # We have already sent the newline, compute the indentation level. + data_length: int = match_length - 1 + + if data_length > IndentStack.top { + # The indentation level is more than the level on the top + # of the stack. This is an indent event. Send as an INDENT. + input.push( make_token( typeid<INDENT> '' ) ) + + # Push to the stack as per python manual. + IndentStack.push( data_length ) + } else { + while data_length < IndentStack.top { + # The indentation level is less than the level on the top of + # the stack. Pop the level and send one dedent. This flow of + # control will execute until we find the right indentation level + # to match up with. + IndentStack.pop() + + # Send as a DEDENT + input.push( make_token( typeid<DEDENT> '' ) ) + } + } + + # FIXME: if data.length is now > top of stack then error. This + # means the outdent does not match anything. + + # First the newline. + input.push( make_token( typeid<NEWLINE> '' ) ) + } + end + + # Blank lines or comment lines at the beginning of the file. + token LEADER / ( [ \t]* ('#' [^\n]*)? 
'\n' )* / + + def start + [file_input] + + def file_input + [file_input_forms*] + + def file_input_forms + [statement] + | [NEWLINE] + + def statement + [stmt_list NEWLINE] + | [compound_stmt] + + def stmt_list + [simple_stmt another_stmt* opt_semi] + + def another_stmt + [`; simple_stmt] + + def opt_semi + [`;] + | [] + + def suite + [stmt_list NEWLINE] + | [NEWLINE INDENT statement_seq DEDENT] + + def statement_seq + [statement_seq statement] + | [statement] + + def compound_stmt + [if_stmt] + | [while_stmt] + | [for_stmt] + | [funcdef] + + def if_stmt + [`if expression `: suite elif_part* opt_else_part] + + def elif_part + [`elif expression `: suite] + + def opt_else_part + [`else `: suite] + | [] + + def while_stmt + [`while expression `: suite opt_else_part] + + def for_stmt + [`for target_list `in expression_list `: suite opt_else_part] + + def funcdef + [`def funcname `( opt_parameter_list `) `: suite] + + def funcname + [identifier] + + def dotted_name + [dotted_name `. identifier] + | [identifier] + + def opt_parameter_list + [parameter_list] + | [] + + def parameter_list + [defparameter_list defparameter opt_comma] + + def defparameter_list + [defparameter_list defparameter `,] + | [] + + def defparameter + [parameter] + | [parameter `= expression] + + def sublist + [sublist_pl opt_comma] + + def sublist_pl + [sublist_pl `, parameter] + | [parameter] + + def parameter + [identifier] + | [`( sublist `)] + + def classname + [identifier] + + def simple_stmt + [expression_stmt] + | [assignment_stmt] + | [print_stmt] + + def expression_stmt + [expression_list] + + def assignment_stmt + [target_equals_list expression_list] + + def target_equals_list + [target_equals_list target_equals] + | [target_equals] + + def target_equals + [target_list `=] + + def target_list + [target_list_core opt_comma] + + def target_list_core + [target_list_core `, target] + | [target] + + def target + [target_atom target_ext_rep] + + def target_atom + [identifier] + | [`( 
target_list `)] + | [`[ target_list `]] + + def target_ext_rep + [target_ext target_ext_rep] + | [] + + def target_ext + [attributeref] + | [subscription] + | [slicing] + + def print_stmt + [`print opt_expression_list] + + def opt_expression_list + [expression_list] + | [] + + def expression_list + [expression_list_core opt_comma] + + def expression_list_core + [expression_list_core `, expression] + | [expression] + + def opt_comma + [`,] + | [] + + def expression + [or_test `if or_test `else test] + | [or_test] + | [lambda_form] + + def or_test + [or_test `or and_test] + | [and_test] + + def and_test + [and_test `and not_test] + | [not_test] + + def not_test + [comparison] + | [`not not_test] + + def lambda_form + [`lambda opt_parameter_list `: expression] + + def test + [or_test] + | [lambda_form] + + def comparison + [or_expr comparison_part*] + + def comparison_part + [comp_operator or_expr] + + def comp_operator + [`<] | [`>] | [`==] | [`>=] | [`<=] | [`<>] | [`!=] | [`is] | + [`is `not] | [`in] | [`not `in] + + def or_expr + [primary] + + def primary + [atom primary_ext_rep] + + def atom + [identifier] + | [pyliteral] + | [enclosure] + + def primary_ext_rep + [primary_ext primary_ext_rep] + | [] + + def primary_ext + [attributeref] + | [subscription] + | [slicing] + | [call] + + def pyliteral + [stringliteral] + | [integer] + | [longinteger] + | [floatnumber] + | [imagnumber] + + def enclosure + [parenth_form] + | [list_display] + | [generator_expression] + | [dict_display] + | [string_conversion] + + def parenth_form + [`( opt_expression_list `)] + + def list_display + [`[ opt_listmaker `]] + + def opt_listmaker + [listmaker] + | [] + + def listmaker + [expression list_for] + | [expression listmaker_ext* opt_comma] + + def listmaker_ext + [`, expression] + + def opt_list_iter + [list_iter] + | [] + + def list_iter + [list_for] + | [list_if] + + def list_if + [`if test opt_list_iter] + + def list_for + [`for expression_list `in testlist opt_list_iter] + + def 
testlist + [test testlist_ext* opt_comma] + + def testlist_ext + [`, test ] + + def generator_expression + [`( test genexpr_for `)] + + def genexpr_for + [`for expression_list `in test opt_genexpr_iter] + + def opt_genexpr_iter + [genexpr_iter] + | [] + + def genexpr_iter + [genexpr_for] + | [genexpr_if] + + def genexpr_if + [`if test opt_genexpr_iter] + + def dict_display + [`{ opt_key_datum_list `}] + + def opt_key_datum_list + [key_datum_list] + | [] + + def key_datum_list + [key_datum key_datum_list_ext* opt_comma] + + def key_datum_list_ext + [`, key_datum] + + def key_datum + [expression `: expression] + + def string_conversion + [`` expression_list ``] + + def attributeref + [`. identifier] + + def subscription + [`[ expression_list `]] + + # The natural ordered choice does not suffice here. Must force it. + + def slicing + [simple_slicing] + | [extended_slicing] + + def simple_slicing + [`[ short_slice `]] + + def extended_slicing + [`[ slice_list `]] + + def slice_list + [slice_item slice_list_ext* opt_comma] + + def slice_list_ext + [`, slice_item] + + def slice_item + [expression] + | [proper_slice] + | [ellipsis] + + def proper_slice + [short_slice] + | [long_slice] + + def short_slice + [`:] + | [`: upper_bound] + | [lower_bound `:] + | [lower_bound `: upper_bound] + + def long_slice + [short_slice `: stride] + | [short_slice `:] + + def lower_bound + [expression] + + def upper_bound + [expression] + + def stride + [expression] + + def ellipsis + [`...] 
+ + def call + [`( opt_argument_list `)] + + def opt_argument_list + [argument_list opt_comma] + | [] + + def argument_list + [positional_arguments opt_comma_keyword_arguments] + | [keyword_arguments] + + def positional_arguments + [positional_arguments `, expression] + | [expression] + + def opt_comma_keyword_arguments + [`, keyword_arguments] + | [] + + def keyword_arguments + [keyword_arguments `, keyword_item] + | [keyword_item] + + def keyword_item + [identifier `= expression] + +end # generate + +int print_stmts( S: generate::start ) +{ + for Stmt: generate::statement in S + print( 'STMT: ' ^Stmt '\n' ) +} + +int print_target_subscriptions_and_slicings( Start: generate::start ) +{ + for TI: generate::target_ext in Start { + if match TI [generate::subscription] { + print( 'TARGET SUBSCRIPTION: ' ^TI '\n' ) + } + + if match TI [generate::simple_slicing] { + print( 'TARGET SIMPLE SLICING: ' ^TI '\n' ) + } + + if match TI [generate::extended_slicing] { + print( 'TARGET EXTENDED SLICING: ' ^TI '\n' ) + } + } + +} + +int print_primary_subscriptions_and_slicings( Start: generate::start ) +{ + for PI: generate::primary_ext in Start { + if match PI [generate::subscription] { + print( 'PRIMARY SUBSCRIPTION: ' ^PI '\n' ) + } + + if match PI [generate::simple_slicing] { + print( 'PRIMARY SIMPLE SLICING: ' ^PI '\n' ) + } + + if match PI [generate::extended_slicing] { + print( 'PRIMARY EXTENDED SLICING: ' ^PI '\n' ) + } + } +} + +cons Generate: generate[] + +# List used as a stack of indentations. +Generate.IndentStack = cons list<int> [] +Generate.IndentStack.push( 0 ) + +# Has a newline been sent for this '\n' .. whitespace match. 
+Generate.newline_sent = 0 + +parse S: generate::start(Generate)[ stdin ] + +print( '*** SUCCESS ***\n' ) +print( ^S '\n' ) +print( '***\n' ) +print_stmts( S ) +print_target_subscriptions_and_slicings( S ) +print_primary_subscriptions_and_slicings( S ) +print( '*** SUCCESS ***\n' ) +##### IN ##### + +# dude, this is a comment + # some more +hello +def dude(): + yes + awesome; + + # Here we have a comment + def realy_awesome(): # hi there + in_more + + same_level + def one_liner(): first; second # both inside one_liner + + back_down + +last_statement + +# dude, this is a comment + # some more +hello +if 1: + yes + awesome; + + # Here we have a comment + if ('hello'): # hi there + in_more + + same_level + if ['dude', 'dudess'].horsie(): first; second # both inside one_liner + 1 + + back_down + +last_statement + +hello = 1.1(20); + +# subscription +a[1] = b[2]; + +# simple slicing +c[1:1] = d[2:2]; + +# simple slicing +e[1:1, 2:2] = f[3:3, 4:4]; +##### EXP ##### +*** SUCCESS *** + +hello +def dude(): + yes + awesome; + + # Here we have a comment + def realy_awesome(): # hi there + in_more + + same_level + def one_liner(): first; second # both inside one_liner + + back_down + +last_statement + +# dude, this is a comment + # some more +hello +if 1: + yes + awesome; + + # Here we have a comment + if ('hello'): # hi there + in_more + + same_level + if ['dude', 'dudess'].horsie(): first; second # both inside one_liner + 1 + + back_down + +last_statement + +hello = 1.1(20); + +# subscription +a[1] = b[2]; + +# simple slicing +c[1:1] = d[2:2]; + +# simple slicing +e[1:1, 2:2] = f[3:3, 4:4]; +*** +STMT: hello +STMT: def dude(): + yes + awesome; + + # Here we have a comment + def realy_awesome(): # hi there + in_more + + same_level + def one_liner(): first; second # both inside one_liner + + back_down + +STMT: yes +STMT: awesome; + + # Here we have a comment +STMT: def realy_awesome(): # hi there + in_more + + same_level + def one_liner(): first; second # both inside one_liner 
+ +STMT: in_more + +STMT: same_level +STMT: def one_liner(): first; second # both inside one_liner + +STMT: back_down + +STMT: last_statement + +# dude, this is a comment + # some more +STMT: hello +STMT: if 1: + yes + awesome; + + # Here we have a comment + if ('hello'): # hi there + in_more + + same_level + if ['dude', 'dudess'].horsie(): first; second # both inside one_liner + 1 + + back_down + +STMT: yes +STMT: awesome; + + # Here we have a comment +STMT: if ('hello'): # hi there + in_more + + same_level + if ['dude', 'dudess'].horsie(): first; second # both inside one_liner + 1 + +STMT: in_more + +STMT: same_level +STMT: if ['dude', 'dudess'].horsie(): first; second # both inside one_liner +STMT: 1 + +STMT: back_down + +STMT: last_statement + +STMT: hello = 1.1(20); + +# subscription +STMT: a[1] = b[2]; + +# simple slicing +STMT: c[1:1] = d[2:2]; + +# simple slicing +STMT: e[1:1, 2:2] = f[3:3, 4:4]; +TARGET SUBSCRIPTION: [1] +TARGET SIMPLE SLICING: [1:1] +TARGET EXTENDED SLICING: [1:1, 2:2] +PRIMARY SUBSCRIPTION: [2] +PRIMARY SIMPLE SLICING: [2:2] +PRIMARY EXTENDED SLICING: [3:3, 4:4] +*** SUCCESS *** diff --git a/test/generate2.lm b/test/generate2.lm new file mode 100644 index 0000000..946c478 --- /dev/null +++ b/test/generate2.lm @@ -0,0 +1,214 @@ +##### LM ##### +context generate + def open_item + type: str + num: int + [] + + OpenStack: list<open_item> + + lex + token stray_close // + + token ocurly /'{'+/ + { + input.pull( match_length ) + + cons OI: open_item( '{' match_length ) [] + OpenStack.push( OI ) + i: int = 0 + while ( i < match_length ) { + input.push( make_token( typeid<ocurly> '{' ) ) + i = i + 1 + } + } + + token ccurly1 // + token ccurly2 // + token ccurly3 // + token missing_curly // + + token tmp1 /'}'+/ + { + if OpenStack.length > 0 && OpenStack.tail.type == '{' { + length: int = 3 + if ( length > match_length ) + length = match_length + + Tail: open_item = OpenStack.pop() + if ( length > Tail.num ) + length = Tail.num + + if ( length == 
1 ) + input.push( make_token( typeid<ccurly1> input.pull( 1 ) ) ) + else if ( length == 2 ) + input.push( make_token( typeid<ccurly2> input.pull( 2 ) ) ) + else if ( length == 3 ) + input.push( make_token( typeid<ccurly3> input.pull( 3 ) ) ) + + Tail.num = Tail.num - length + + if ( Tail.num > 0 ) + OpenStack.push( Tail ) + } + else { + input.push( make_token( typeid<stray_close> input.pull( match_length ) ) ) + } + } + + token osquare /'['+/ + { + input.pull( match_length ) + OI: open_item = construct open_item( '[' match_length ) [] + OpenStack.push( OI ) + i: int = 0 + while ( i < match_length ) { + input.push( make_token( typeid<osquare> '[' ) ) + i = i + 1 + } + } + + token csquare1 // + token csquare2 // + token missing_square // + + token tmp2 /']'+/ + { + if OpenStack.length > 0 && OpenStack.tail.type == '[' { + length: int = 2 + if ( length > match_length ) + length = match_length + + Tail: open_item = OpenStack.pop() + if ( length > Tail.num ) + length = Tail.num + + if ( length == 1 ) + input.push( make_token( typeid<csquare1> input.pull( 1 ) ) ) + else if ( length == 2 ) + input.push( make_token( typeid<csquare2> input.pull( 2 ) ) ) + + Tail.num = Tail.num - length + + if ( Tail.num > 0 ) + OpenStack.push( Tail ) + } + else { + input.push( make_token( typeid<stray_close> input.pull( match_length ) ) ) + } + } + + literal `| + token char /any/ + + preeof { + while ( OpenStack.length > 0 ) { + Tail: open_item = OpenStack.pop() + i: int + if ( Tail.type == '{' ) { + i = 0 + while ( i < Tail.num ) { + input.push( make_token( typeid<missing_curly> '}' ) ) + i = i + 1 + } + } + else if ( Tail.type == '[' ) { + i = 0 + while ( i < Tail.num ) { + input.push( make_token( typeid<missing_square> ']' ) ) + i = i + 1 + } + } + } + } + end + + # + # Internal Links + # + + + lex + literal `http: + literal `ftp: + literal `mailto: + end + + def el_prefix + [`http:] + | [`ftp:] + | [`mailto:] + + def external_link + [osquare item* csquare1] + + def internal_link + 
[osquare osquare item* csquare2] + + def unclosed_square + [osquare item* missing_square] + + # + # Templates + # + + def sing_template + [ocurly item* ccurly1] + + def template + [ocurly ocurly item* ccurly2] + + def parameter + [ocurly ocurly ocurly item* ccurly3] + + def unclosed_curly + [ocurly item* missing_curly] + + # + # Template Parameters + # + + + def U1 [] + def U2 [] + def U3 [] + + def item + [external_link] + | [internal_link] + | [unclosed_curly] + | [sing_template] + | [template] + | [parameter] + | [unclosed_curly] + | [stray_close] + | [osquare] + | [`|] + | [char] + + def start + [item*] + +end # generate + +cons Generate: generate[] + +Generate.OpenStack = construct list<generate::open_item> [] +Sentinal: generate::open_item = construct generate::open_item( '** SENTINAL **' 1 ) [] +Generate.OpenStack.push( Sentinal ) + +parse S: generate::start(Generate)[stdin] + +if S { + for I: generate::external_link in S + print( 'EXTERNAL LINK: ' I '\n' ) + + for I: generate::internal_link in S + print( 'INTERNAL LINK: ' I '\n' ) +} +##### IN ##### +[external] +[[internal]] +##### EXP ##### +EXTERNAL LINK: [external] +INTERNAL LINK: [[internal]] diff --git a/test/heredoc.lm b/test/heredoc.lm new file mode 100644 index 0000000..05bf300 --- /dev/null +++ b/test/heredoc.lm @@ -0,0 +1,59 @@ +##### LM ##### +context heredoc + rl ident_char /[a-zA-Z_]/ + + lex + # Tokens + token other /(^(ident_char|0|'\n'))+/ + + token here_close // + token id + /ident_char+/ + { + if HereId && HereId == match_text { + input.push( make_token( + typeid<here_close> + input.pull(match_length - 1) ) ) + } + else { + input.push( make_token( typeid<id> input.pull(match_length) ) ) + } + } + + token nl /'\n'/ + end + + def here_name + [id] + { + HereId = $r1 + } + + HereId: str + + def here_data + [here_data_item*] + + def here_data_item + [id] + | [other] + | [nl] + + def start + [here_name here_data here_close id nl] +end # heredoc + +cons HereDoc: heredoc[] + +parse S: 
heredoc::start(HereDoc)[stdin] +print_xml(S) +print( '\n' ) +##### IN ##### +hello +random 9392af j9 stuff +hello +##### EXP ##### +<heredoc::start><heredoc::here_name><heredoc::id>hello</heredoc::id></heredoc::here_name><heredoc::here_data><heredoc::_repeat_here_data_item><heredoc::here_data_item><heredoc::nl> +</heredoc::nl></heredoc::here_data_item><heredoc::here_data_item><heredoc::id>random</heredoc::id></heredoc::here_data_item><heredoc::here_data_item><heredoc::other> 9392</heredoc::other></heredoc::here_data_item><heredoc::here_data_item><heredoc::id>af</heredoc::id></heredoc::here_data_item><heredoc::here_data_item><heredoc::other> </heredoc::other></heredoc::here_data_item><heredoc::here_data_item><heredoc::id>j</heredoc::id></heredoc::here_data_item><heredoc::here_data_item><heredoc::other>9 </heredoc::other></heredoc::here_data_item><heredoc::here_data_item><heredoc::id>stuff</heredoc::id></heredoc::here_data_item><heredoc::here_data_item><heredoc::nl> +</heredoc::nl></heredoc::here_data_item></heredoc::_repeat_here_data_item></heredoc::here_data><heredoc::here_close>hell</heredoc::here_close><heredoc::id>o</heredoc::id><heredoc::nl> +</heredoc::nl></heredoc::start> diff --git a/test/ifblock1.lm b/test/ifblock1.lm new file mode 100644 index 0000000..3797529 --- /dev/null +++ b/test/ifblock1.lm @@ -0,0 +1,46 @@ +##### LM ##### +if 1 + print( '1\n' ) + +if 2 { + print( '2\n' ) +} + +if 3 { + print( '3\n' ) + print( '4\n' ) +} + +if 0 + print( '0\n' ) +elsif 0 + print( '0\n' ) + +if 0 + print( '0\n' ) +elsif 1 + print( '5\n' ) + +if 0 + print( '0\n' ) +elsif 0 + print( '0\n' ) +elsif 1 + print( '6\n' ) + +if 0 + print( '0\n' ) +elsif 0 + print( '0\n' ) +elsif 0 + print( '0\n' ) +else + print( '7\n' ) +##### EXP ##### +1 +2 +3 +4 +5 +6 +7 diff --git a/test/ignore1.lm b/test/ignore1.lm new file mode 100644 index 0000000..47f631c --- /dev/null +++ b/test/ignore1.lm @@ -0,0 +1,59 @@ +##### LM ##### + +# +# Regular Definitions +# +rl rl_ws /[.+ \t\n\r\v]+/ +rl 
rl_id /[a-zA-Z_][a-zA-Z0-9_]*/ + +# +# Tokens +# + +lex + literal `= `< `> `/ + + # Ignore whitespace. + ignore /rl_ws/ + + # Open and close id + token id /rl_id/ +end + +# +# Productions +# + +def attr [id `= id] + +def attr_list + [attr_list attr] +| [] + +def open_tag + [`< id attr_list `>] + +def close_tag + [`< `/ id `>] + +def tag + [open_tag item_list close_tag] + +def item_list + [item_list tag] +| [] + +parse Attrs: attr_list[ stdin ] + +print( Attrs ) + +construct IL: item_list + ["<wrapper .[Attrs]. ></wrapper>\n"] + +print( IL ) +##### IN ##### ++ foo = asdf + +##### EXP ##### ++ foo = asdf + +<wrapper .+ foo = asdf + +. ></wrapper> diff --git a/test/ignore2.lm b/test/ignore2.lm new file mode 100644 index 0000000..f4aa963 --- /dev/null +++ b/test/ignore2.lm @@ -0,0 +1,36 @@ +##### LM ##### +lex + ignore /space+/ + literal `* `( `) + token id /[a-zA-Z_]+/ +end + +def item + [id] +| [`( item* `)] + +def start + [item*] + +parse Input: start[ stdin ] + +cons Output: accum<start> [] + +for Id: id in Input { + send Output + "( [^Id] ) +} + +S: start = Output() + +print( S ) +##### IN ##### +a b c ( chocolate fudge ) d e +##### EXP ##### +( a ) +( b ) +( c ) +( chocolate ) +( fudge ) +( d ) +( e ) diff --git a/test/ignore3.lm b/test/ignore3.lm new file mode 100644 index 0000000..ccf9c13 --- /dev/null +++ b/test/ignore3.lm @@ -0,0 +1,53 @@ +##### LM ##### +lex + ignore /space+/ + literal `* + literal `( -ni + literal ni- `) + literal `! `; + token id /[a-zA-Z_0-9]+/ +end + +lex + ignore /space+/ + token inner_t /[a-zA-Z_0-9]+/ + token empty - +end + +def inner + [inner_t*] +| [empty] + +def item + [id] +| [`( inner `)] + +def start + [item* `;] + +parse Start: start[ stdin ] + +if ( ! Start ) { + print( 'parse error\n' ) + exit( 0 ) +} + +for I: item in Start { + print( 'item: .' I '.\n' ) + if match I [ O: `( Inner: inner C: `) ] + print( 'innr: .' O '.' Inner '.' C '.\n' ) +} + +##### IN ##### +a b c ( d ) e ( ) f g; +##### EXP ##### +item: .a . +item: .b . 
+item: .c . +item: .( d ) . +innr: .(. d .) . +item: .e . +item: .( ) . +innr: .(. .) . +item: .f . +item: .g. diff --git a/test/ignore4.lm b/test/ignore4.lm new file mode 100644 index 0000000..cdd94b3 --- /dev/null +++ b/test/ignore4.lm @@ -0,0 +1,74 @@ +##### LM ##### +namespace hash + + lex + literal `define `include + token NL /'\n'/ -ni + + token id /[a-zA-Z_][a-zA-Z_0-9]*/ + token number /[0-9]+/ + token string /'"' ( [^"\\] | '\\' any )* '"'/ + + ignore /[ \t]+/ + + end + + def hash + [`define id number NL] + | [`include string NL] + +end # hash + +namespace lang + + lex + ignore /space+/ + literal `* `( `) `; `# + token id /[a-zA-Z_][a-zA-Z_0-9]*/ + token number /[0-9]+/ + end + + def item + [id] + | [`( item* `)] + + def statement + [item* `;] + | [`# hash::hash] + + def start + [statement*] + +end # lang + +parse Input: lang::start[ stdin ] + +if ! Input + print( error '\n' ) +else { + #print( Input.tree '\n' ) + for H: lang::statement in Input { + require H [ lang::`# hash::hash ] + print( '--' H '==\n' ) + } +} +##### IN ##### + +hello; + +#include "input1" + +#include "input2" + +#include "input3" + +there; +##### EXP ##### +--#include "input1" +== +-- +#include "input2" +== +-- +#include "input3" +== diff --git a/test/ignore5.lm b/test/ignore5.lm new file mode 100644 index 0000000..28a3392 --- /dev/null +++ b/test/ignore5.lm @@ -0,0 +1,51 @@ +##### LM ##### +lex + ignore /space+/ + literal `* `! `; + literal `( -ni ni- `) + token id /[a-zA-Z_0-9]+/ +end + +lex + ignore /space+/ + token inner_t /[a-zA-Z_0-9]+/ + + token empty - +end + +def inner + [empty inner_t*] + +def item + [id] +| [`( inner `)] + +def start + [item* `;] + +parse Start: start[ stdin ] + +if ( ! Start ) { + print( 'parse error\n' ) + exit( 0 ) +} + +for I: item in Start { + print( 'item: .' I '.\n' ) + if match I [ O: `( Inner: inner C: `) ] + print( 'innr: .' O '.' Inner '.' C '.\n' ) +} + +##### IN ##### +a b c ( d ) e ( ) f g; +##### EXP ##### +item: .a . +item: .b . +item: .c . 
+item: .( d ) . +innr: .(. d .) . +item: .e . +item: .( ) . +innr: .(. .) . +item: .f . +item: .g. diff --git a/test/include1.lm b/test/include1.lm new file mode 100644 index 0000000..efd29ba --- /dev/null +++ b/test/include1.lm @@ -0,0 +1,5 @@ +##### LM ##### +include 'include1a.lmi' +print( ' world\n' ) +##### EXP ##### +hello world diff --git a/test/include1a.lmi b/test/include1a.lmi new file mode 100644 index 0000000..5de6f69 --- /dev/null +++ b/test/include1a.lmi @@ -0,0 +1,2 @@ + +print( 'hello' ) diff --git a/test/inpush1.lm b/test/inpush1.lm new file mode 100644 index 0000000..9f7be15 --- /dev/null +++ b/test/inpush1.lm @@ -0,0 +1,134 @@ +##### LM ##### +namespace string + lex + literal `" + token data /[^"\\]+/ + token escape /'\\' any/ + end + + def string_data + [data] + | [escape] + + def string + [`" string_data* `"] + + str unquote( S: string ) + { + match S [`" DL: string_data* `"] + for E: escape in DL + E.data = 'x' + return $DL + } + +end # string + +namespace hash + + lex + literal `define `include + literal `# + token NL /'\n'/ -ni + + token id /[a-zA-Z_][a-zA-Z_0-9]*/ + token number /[0-9]+/ + + ignore /[ \t]/ + end + + def hash + [`# `define Id: id number NL] + | [`# `include Inc: string::string NL] + +end # hash + +token rest_of_line /[^\n]* '\n'/ + +namespace lang + + lex + ignore /space/ + literal `* `( `) `; + token id /[a-zA-Z_][a-zA-Z_0-9]*/ + token number /[0-9]+/ + + token hash /'#'/ { + parse_stop H: hash::hash[ input ] + if ( H ) { + if ( H.Inc ) { + FN: str = unquote( H.Inc ) + print( 'opening ' FN '\n' ) + IS: stream = open( FN 'r' ) + if ( ! IS ) { + print( 'ERROR: failed to open ' FN '\n' ) + exit(1) + } + input.push( IS ) + } + } + else { + parse_stop L: rest_of_line[ input ] + if ! 
L { + print( "ERROR: stuck: " error ) + exit(1) + } + print( "ERROR: failed to parse # directive: " L ) + } + } + end + + def item + [id] + | [`( item* `)] + + def statement + [item* `;] + + def start + [statement*] + +end # lang + +parse Input: lang::start[ stdin ] + +if ! Input + print( error '\n' ) +else { + print( Input ) +} +##### IN ##### + +hello; + +#include "inpush1a.in" + +there; + +#include "inpush1b.in" + +dude; + +#include "inpush1c.in" + +and dudettes; +##### EXP ##### +opening inpush1a.in +opening inpush1b.in +opening inpush1c.in + +hello; + +a; +b; + +there; + +c; +d; + +dude; + +e; +f; + +and dudettes; diff --git a/test/inpush1a.in b/test/inpush1a.in new file mode 100644 index 0000000..26da0af --- /dev/null +++ b/test/inpush1a.in @@ -0,0 +1,2 @@ +a; +b; diff --git a/test/inpush1b.in b/test/inpush1b.in new file mode 100644 index 0000000..6c57432 --- /dev/null +++ b/test/inpush1b.in @@ -0,0 +1,2 @@ +c; +d; diff --git a/test/inpush1c.in b/test/inpush1c.in new file mode 100644 index 0000000..5373832 --- /dev/null +++ b/test/inpush1c.in @@ -0,0 +1,2 @@ +e; +f; diff --git a/test/island.lm b/test/island.lm new file mode 100644 index 0000000..8515eb4 --- /dev/null +++ b/test/island.lm @@ -0,0 +1,85 @@ +##### LM ##### + +lex + token func_chr /[^{}]+/ + token func_open /'{'/ + token func_close /'}'/ +end + +def func_item + [func_chr] +| [func_open func_body func_close] + +def func_body + [func_item*] + +def func + [ident `( `) `{ func_body func_close ] + +lex + token ident /[a-zA-Z_]+/ + token number /[0-9]+/ + + rl s_string / "'" ([^'\\\n] | '\\' any )* "'" / + rl d_string / '"' ([^"\\\n] | '\\' any )* '"' / + token string /s_string | d_string/ + + literal `+ `* `; `( `) `{ `} + + ignore wp / [ \t\n]+ / +end + +def class_item + [func] +| [class] +| [ident `;] +| [number `;] +| [string `;] + +def class_body + [class_item*] + +def class + [ident `{ class_body `} ] + +def top_item + [func] +| [class] + +def start + [top_item*] + +parse S: start[ stdin ] 
+print_xml( S ) +print( '\n' ) + +#pattern start +# ~class { func() { func() { 1+{2}} } } func() {{a}} +##### IN ##### +class +{ + 1; + "string"; + foo; + func() + { + func() + { + 1+{2} + } + } +} + +func() +{ + "data" + {a} +} +##### EXP ##### +<start><_repeat_top_item><top_item><class><ident>class</ident><_literal_0017>{</_literal_0017><class_body><_repeat_class_item><class_item><number>1</number><_literal_0011>;</_literal_0011></class_item><class_item><string>"string"</string><_literal_0011>;</_literal_0011></class_item><class_item><ident>foo</ident><_literal_0011>;</_literal_0011></class_item><class_item><func><ident>func</ident><_literal_0013>(</_literal_0013><_literal_0015>)</_literal_0015><_literal_0017>{</_literal_0017><func_body><_repeat_func_item><func_item><func_chr>func() + </func_chr></func_item><func_item><func_open>{</func_open><func_body><_repeat_func_item><func_item><func_chr> + 1+</func_chr></func_item><func_item><func_open>{</func_open><func_body><_repeat_func_item><func_item><func_chr>2</func_chr></func_item></_repeat_func_item></func_body><func_close>}</func_close></func_item><func_item><func_chr> + </func_chr></func_item></_repeat_func_item></func_body><func_close>}</func_close></func_item><func_item><func_chr> + </func_chr></func_item></_repeat_func_item></func_body><func_close>}</func_close></func></class_item></_repeat_class_item></class_body><_literal_0019>}</_literal_0019></class></top_item><top_item><func><ident>func</ident><_literal_0013>(</_literal_0013><_literal_0015>)</_literal_0015><_literal_0017>{</_literal_0017><func_body><_repeat_func_item><func_item><func_chr>"data" + </func_chr></func_item><func_item><func_open>{</func_open><func_body><_repeat_func_item><func_item><func_chr>a</func_chr></func_item></_repeat_func_item></func_body><func_close>}</func_close></func_item><func_item><func_chr> +</func_chr></func_item></_repeat_func_item></func_body><func_close>}</func_close></func></top_item></_repeat_top_item></start> diff --git 
a/test/lhs1.lm b/test/lhs1.lm new file mode 100644 index 0000000..f40297f --- /dev/null +++ b/test/lhs1.lm @@ -0,0 +1,42 @@ +##### LM ##### + +lex + ignore /space+/ + literal `* `( `) `! + token SEMI_NL /';\n'/ + token id /[a-zA-Z_0-9]+/ +end + +def item + [id] + { + lhs = cons item ["( " ^r1 " )"] + } +| [`( item* `)] + { + lhs = cons item ["( " ^r2 " )"] + } + +def A + [] { + print( 'A\n' ) + } + +def B + [] { + print( 'B\n' ) + } + +def start + [A item* `!] +| [B item* SEMI_NL] + +parse Start: start[ stdin ] +print( Start "\n" ) +##### IN ##### +a b c ( d1 d2 ) e f g ; +##### EXP ##### +A +B +( a )( b )( c )( ( d1 )( d2 ) )( e )( f )( g ); + diff --git a/test/liftattrs.lm b/test/liftattrs.lm new file mode 100644 index 0000000..574ea35 --- /dev/null +++ b/test/liftattrs.lm @@ -0,0 +1,83 @@ +##### LM ##### + +# +# Regular Definitions +# +rl rl_ws /[ \t\n\r\v]+/ +rl rl_id /[a-zA-Z_][a-zA-Z0-9_]*/ + +# +# Tokens +# + +lex + literal `= `< `> `/ + + # Ignore whitespace. + ignore /rl_ws/ + + # Open and close id + token id /rl_id/ +end + +# +# Productions +# + +def attr [id `= id] + +def attr_list + [attr_list attr] +| [] + +def open_tag + [`< id attr_list `>] + +def close_tag + [`< `/ id `>] + +def tag + [open_tag item_list close_tag] + +def item_list + [item_list tag] +| [] + +parse ILP: item_list[stdin] +IL: item_list = ILP + +# Get the item list +match IL [RootItemList: item_list] + +# List for collecting the attrs we pull out. +CollectedAttrs: attr_list = construct attr_list [] + +# Iterate through all attributes +for AttrListIter:attr_list in RootItemList { + # If the name of the attr is foo, remove it. 
+ if match AttrListIter + [SubAttrList:attr_list "foo=" Val:id] + { + # Remove the attribute + AttrListIter = construct attr_list + [SubAttrList] + + # Add it to the colection + CollectedAttrs = construct attr_list + [CollectedAttrs " foo=" ^Val] + } +} + +# Reconstruct the left hand side with the +IL = construct item_list + ["<wrapper " ^CollectedAttrs ">" ^RootItemList "</wrapper>"] + +print( ^IL '\n' ) +##### IN ##### +<t1 a=b foo=bar1 c=d> + <t2 foo=bar2 e=f></t2> +</t1> +##### EXP ##### +<wrapper foo=bar1 foo=bar2><t1 a=b c=d> + <t2 e=f></t2> +</t1></wrapper> diff --git a/test/literal1.lm b/test/literal1.lm new file mode 100644 index 0000000..b18c2b7 --- /dev/null +++ b/test/literal1.lm @@ -0,0 +1,4 @@ +##### LM ##### +print( '\hello\tworld\n' ) +##### EXP ##### +hello world diff --git a/test/lookup1.lm b/test/lookup1.lm new file mode 100644 index 0000000..7eaf58f --- /dev/null +++ b/test/lookup1.lm @@ -0,0 +1,2416 @@ +##### LM ##### +context lookup + # + # Data types for global data. + # + + # Language objects. + def lang_object + typeId: int + name: str + + # If the object is a typedef, this points to the real object. + typedefOf: ptr<lang_object> + + objectMap: map<str list<ptr<lang_object>>> + inherited: list<ptr<lang_object>> + lookupParent: ptr<lang_object> + specializationOf: ptr<lang_object> + [] + + # This structure is used to keep track of information necessary to make a + # declaration. While parsing a declaration it records the declaration's + # attributes. + def declaration_data + isTypedef: int + isFriend: int + isTemplate: int + + typeObj: ptr<lang_object> + [] + + def declarator_data + qualObj: ptr<lang_object> + pdcScope: ptr<lang_object> + lookupObj: ptr<lang_object> + [] + + # Constants for language object types. + NamespaceType: int + ClassType: int + TemplateClassType: int + EnumType: int + IdType: int + TypedefType: int + TemplateIdType: int + + # + # Global data declarations + # + + # Object stacks. 
+ curNamespace: list<ptr<lang_object>> + declNs: list<ptr<lang_object>> + lookupNs: list<ptr<lang_object>> + qualNs: list<ptr<lang_object>> + templateParamNs: list<ptr<lang_object>> + + # Declaration, declarator data. + declarationData: list<declaration_data> + declaratorData: list<declarator_data> + + # Template declarations + templDecl: list<int> + + # Root namespace object + rootNamespace: ptr<lang_object> + + # + # Identifier lookup. + # + + # Lookup the token in the members of an object. + ptr<lang_object> lookupInObject( obj: ptr<lang_object> name: str ) + { + # LOG print( ' looking in ', obj->name, '\n' ) + + ol: list<ptr<lang_object>> = obj->objectMap.find( name ) + if ol { + # LOG print( ' * found an object: ', ol.head, '\n' ) + return ol.head + } + + return nil + } + + # Lookup in an object and all the objects beneath it in the inheritance + # tree. + ptr<lang_object> lookupWithInheritance( obj: ptr<lang_object> name: str ) + { + found: ptr<lang_object> = lookupInObject( obj name ) + if found + return found + + localObjInherited: list<ptr<lang_object>> = obj->inherited + for II: ptr<lang_object> in localObjInherited { + inh: ptr<lang_object> = II + + # First check if the inherited object is the one we are after. + if inh->name == name && inh->typeId == ClassType { + # LOG print( ' * found a class name\n' ) + return inh + } + + # Otherwise look inside the inherited object. + found = lookupWithInheritance( inh name ) + if found + return found + } + + return nil + } + + ptr<lang_object> unqualifiedLookup( name: str ) + { + found: ptr<lang_object> + + # Start with the objects in the templateParamNs. + localTemplateParamNs: list<ptr<lang_object>> = templateParamNs + for TemplParaObjIter: ptr<lang_object> in rev_child(localTemplateParamNs) { + found = lookupWithInheritance( TemplParaObjIter name ) + if found + break + } + + if !found { + # Iterator over the objects starting at the head of the lookup stack + # and going up through the lookup parents. 
+ lookupIn: ptr<lang_object> = lookupNs.top + while lookupIn { + found = lookupWithInheritance( lookupIn name ) + if found + break + lookupIn = lookupIn->lookupParent + } + } + + return found + } + + # The C++ scanner. + lex + rl fract_const / digit* '.' digit+ | digit+ '.' / + rl exponent / [eE] [+\-]? digit+ / + rl float_suffix / [flFL] / + + # Single and double literals. + token TK_SingleLit /( 'L'? "'" ( [^'\\\n] | '\\' any )* "'" )/ + token TK_DoubleLit /( 'L'? '"' ( [^"\\\n] | '\\' any )* '"' )/ + + literal `extern `namespace `friend `typedef `auto `register + `static `mutable `inline `virtual `explicit `const + `volatile `restrict `class `struct `union `template + `private `protected `public `using `void `char + `wchar_t `bool `int `float `double `short `long + `signed `unsigned `enum `new `delete `operator + `typename `export `throw `try `catch `sizeof + `dynamic_cast `static_cast `reinterpret_cast `const_cast + `typeid `this `true `false `switch `case `default + `if `else `while `do `for `break `continue + `return `goto + + # Extensions + literal `__typeof `__is_pod `__is_empty + + literal `{ `} `; `, `= `( `) `: `& `* `[ `] `~ `+ `- + `/ `< `> `| `^ `% `! `? `. + + literal `:: `== `!= `&& `|| `*= `/= `%= `+= `-= `&= + `^= `|= `++ `-- `-> `->* `.* `... `<<= `>>= + + # Token translation targets. + def unknown_id [lookup_id] + def class_id [lookup_id] + def namespace_id [lookup_id] + def templ_class_id [lookup_id] + def enum_id [lookup_id] + def typedef_id [lookup_id] + def identifier [lookup_id] + def template_id [lookup_id] + + # Identifiers + token lookup_id + obj: ptr<lang_object> + qualObj: ptr<lang_object> + + /( [a-zA-Z_] [a-zA-Z0-9_]* )/ + { + name: str = match_text + found: ptr<lang_object> = nil + qualObj: ptr<lang_object> = nil + if qualNs.top { + # LOG print( 'qualified lookup of ', name, '\n' ) + + # Transfer the qualification to the token and reset it. + qualObj = qualNs.top + qualNs.top = nil + + # Lookup using the qualification. 
+ found = lookupWithInheritance( qualObj name ) + } + else { + # No qualification, full search. + # LOG print( 'unqualified lookup of ', name, '\n' ) + found = unqualifiedLookup( name ) + } + + # If no match, return an Unknown ID + id: int = typeid<unknown_id> + if found + id = found->typeId + + LookupId: any = make_token( typeid<lookup_id> + input.pull(match_length) found qualObj ) + input.push( make_tree( id LookupId ) ) + + } + + # Floats. + token TK_Float /( fract_const exponent? float_suffix? | + digit+ exponent float_suffix? )/ + + # Integer decimal. Leading part buffered by float. + token TK_IntegerDecimal /( ( '0' | [1-9] [0-9]* ) [ulUL]{0,3} )/ + + # Integer octal. Leading part buffered by float. + token TK_IntegerOctal /( '0' [0-9]+ [ulUL]{0,2} )/ + + # Integer hex. Leading 0 buffered by float. + token TK_IntegerHex /( '0x' [0-9a-fA-F]+ [ulUL]{0,2} )/ + + # Preprocessor line. + ignore /'#' [^\n]* '\n'/ + + # Comments and whitespace. + ignore /( '/*' (any | '\n')* :>> '*/' )/ + ignore /( '//' any* :> '\n' )/ + ignore /( any - 33..126 )+/ + end + + # + # Support functions + # + + typeId: int + name: str + + # If the object is a typedef, this points to the real object. + typedefOf: ptr<lang_object> + + objectMap: map<str list<ptr<lang_object>>> + inherited: list<ptr<lang_object>> + lookupParent: ptr<lang_object> + specializationOf: ptr<lang_object> + + ptr<lang_object> createLangObject( typeId: int name: str lookupParent: ptr<lang_object> ) + { + obj: ptr<lang_object> = new construct lang_object( + typeId + name + nil + construct map<str list<ptr<lang_object>>> [] + construct list<ptr<lang_object>> [] + lookupParent ) [] + return obj + } + + # Building the language object tree. 
+ int insertObject( definedIn: ptr<lang_object> name: str obj: ptr<lang_object> ) + { + ol: list<ptr<lang_object>> = definedIn->objectMap.find( name ) + if !ol { + # Element not in the map already + ol = construct list<ptr<lang_object>> [] + } + ol.append( obj ) + definedIn->objectMap.store( name ol ) + } + + ptr<lang_object> findClass( inObj: ptr<lang_object>name: str ) + { + ol: list<ptr<lang_object>> = inObj->objectMap.find( name ) + if ol { + for ObjIter: ptr<lang_object> in ol { + obj: ptr<lang_object> = ObjIter + if obj->typeId == ClassType { + return obj + } + } + } + return nil + } + + ptr<lang_object> findTemplateClass( inObj: ptr<lang_object> name: str ) + { + ol: list<ptr<lang_object>> = inObj->objectMap.find( name ) + if ol { + for ObjIter: ptr<lang_object> in ol { + obj: ptr<lang_object> = ObjIter + if obj->typeId == TemplateClassType + return obj + } + } + return nil + } + + def root_qual_opt + [] + | [`::] + + def nested_name_specifier_opt + [nested_name_specifier_opt qualifying_name `:: designated_qualifying_name `::] + | [nested_name_specifier_opt qualifying_name `::] + | [] + + def nested_name_specifier + [nested_name_specifier designated_qualifying_name `::] + | [nested_name_specifier qualifying_name `::] + | [qualifying_name `::] + + def qualifying_name + [class_name] + { + qualNs.top = r1.lookupId.obj + } + + | [namespace_id] + { + match r1 [Id: lookup_id] + qualNs.top = Id.obj + } + + | [typedef_id] + { + match r1 [Id: lookup_id] + qualNs.top = Id.obj->typedefOf + } + + def designated_qualifying_name + [`template any_id] + { + # FIXME: nulling qualNs is not the right thing to do here. + qualNs.top = nil + } + + | [`template any_id + templ_arg_open template_argument_list_opt templ_arg_close] + { + # FIXME: nulling qualNs is not the right thing to do here. 
+ qualNs.top = nil + } + + # + # Id Expression + # + + def id_expression + lookupId: lookup_id + + [root_qual_opt nested_name_specifier_opt unknown_id] + { + lhs.lookupId = lookup_id in r3 + } + + | [root_qual_opt nested_name_specifier_opt identifier] + { + lhs.lookupId = lookup_id in r3 + } + + | [root_qual_opt nested_name_specifier_opt operator_function_id] + { + # Normally the token translation transfers the qualification. Since + # the operator_function_id does not end in a lookup we must do it ourselves. + qualObj: ptr<lang_object> = qualNs.top + qualNs.top = nil + + lhs.lookupId = construct lookup_id ["x"] + lhs.lookupId.data = '<operator_function_id>' + lhs.lookupId.qualObj = qualObj + } + + | [root_qual_opt nested_name_specifier_opt conversion_function_id] + { + # Normally the token translation transfers the qualification. Since + # the operator_function_id does not } in a lookup we must do it ourselves. + qualObj: ptr<lang_object> = qualNs.top + qualNs.top = nil + + # Do we need qual reset here becauase operator_function_id does not do it? + lhs.lookupId = construct lookup_id ["x"] + lhs.lookupId.data = '<conversion_function_id>' + lhs.lookupId.qualObj = qualObj + } + + | [root_qual_opt nested_name_specifier_opt `~ class_name] + { + lhs.lookupId = r4.lookupId + } + + | [root_qual_opt nested_name_specifier_opt template_name] + { + lhs.lookupId = r3.lookupId + } + + def template_name + lookupId: lookup_id + + [template_id templ_arg_open template_argument_list_opt templ_arg_close] + { + lhs.lookupId = lookup_id in r1 + } + + | [template_id] + { + lhs.lookupId = lookup_id in r1 + } + + + # + # Class Names + # + + def class_name + lookupId: lookup_id + + [class_id] + { + lhs.lookupId = lookup_id in r1 + } + + | [templ_class_id] + { + lhs.lookupId = lookup_id in r1 + } + + | [templ_class_id templ_arg_open template_argument_list_opt templ_arg_close] + { + # TODO: Look for a specialization. 
+ lhs.lookupId = lookup_id in r1 + } + + def templ_arg_open + [`<] + { + qualNs.push( nil ) + } + + def templ_arg_close + [`>] + { + qualNs.pop() + } + + def declaration + [block_declaration] commit + | [function_definition] commit + | [template_declaration] commit + | [explicit_instantiation] commit + | [explicit_specialization] commit + | [linkage_specification] commit + | [namespace_definition] commit + + # + # Declarations + # + + def block_declaration + [simple_declaration] + | [using_declaration] + | [using_directive] + + def simple_declaration + [declaration_start simple_declaration_forms declaration_end `;] + + # Ordering is important for optimization. The form with the optional + # decl_specifier_sing should go second. + def simple_declaration_forms + [decl_specifier_mult_seq_opt decl_specifier_sing + decl_specifier_mult_seq_opt init_declarator_list_opt] + + | [decl_specifier_mult_seq_opt init_declarator_list_opt] + + def declaration_start + [] + { + # LOG print( 'opening new declaration_data with templDecl: ', templDecl.top, '\n' ) + declarationData.push( construct declaration_data ( 0 0 0 ) [] ) + + # Transfer the template flag and reset it. + declarationData.top.isTemplate = templDecl.top + templDecl.push( 0 ) + } + + def declaration_end + [] + { + # LOG print( 'closing declaration_data\n' ) + declarationData.pop() + templDecl.pop() + } + + def decl_specifier_sing + [type_specifier_sing] + { + # Store the object type of the declaration (if any) for use + # by typedefs. 
+ declarationData.top.typeObj = r1.lookupId.obj + } + + def type_specifier_seq + lookupId: lookup_id + + [type_specifier_mult_seq_opt type_specifier_sing type_specifier_mult_seq_opt] + { + lhs.lookupId = r2.lookupId + } + + def type_specifier_sing + lookupId: lookup_id + + [simple_type_specifier] + { + lhs.lookupId = r1.lookupId + } + + | [class_specifier] + { + lhs.lookupId = construct lookup_id ["x"] + lhs.lookupId.data = '<class_specifier>' + } + + | [enum_specifier] + { + lhs.lookupId = construct lookup_id ["x"] + lhs.lookupId.data = '<enum_specifier>' + } + + | [elaborated_type_specifier] + { + lhs.lookupId = construct lookup_id ["x"] + lhs.lookupId.data = '<elaborated_type_specifier>' + } + + # Type specifier sequence without enum specifier or class specifier. + def necs_type_specifier_seq + [type_specifier_mult_seq_opt necs_type_specifier_sing type_specifier_mult_seq_opt] + + # Type specifier singular without enum specifier or class specifier. + def necs_type_specifier_sing + [simple_type_specifier] + | [elaborated_type_specifier] + + def type_specifier_mult_seq_opt + [type_specifier_mult_seq_opt type_specifier_mult] + | [] + + def type_specifier_mult_seq + [type_specifier_mult_seq type_specifier_mult] + | [type_specifier_mult] + + def simple_type_specifier + lookupId: lookup_id + + [simple_type_specifier_name] + { + lhs.lookupId = r1.lookupId + } + + | [simple_type_specifier_kw_seq] + { + lhs.lookupId = construct lookup_id ["x"] + lhs.lookupId.data = '<simple_type_specifier_kw_seq>' + } + + | [`typename root_qual_opt nested_name_specifier type_name] + { + lhs.lookupId = r4.lookupId + } + + | [`typename root_qual_opt nested_name_specifier identifier] + { + lhs.lookupId = lookup_id in r4 + } + + | [`typename root_qual_opt nested_name_specifier unknown_id] + { + lhs.lookupId = lookup_id in r4 + } + + # Extension. 
+ | [`__typeof `( expression `)] + { + lhs.lookupId = construct lookup_id ["x"] + lhs.lookupId.data = '<simple_type_specifier_kw_seq>' + } + + def simple_type_specifier_name + lookupId: lookup_id + + [qual_type_name] + { + lhs.lookupId = r1.lookupId + } + + def simple_type_specifier_kw_seq + [simple_type_specifier_kw_seq simple_type_specifier_kw] + | [simple_type_specifier_kw] + + def simple_type_specifier_kw + [`void] + | [`char] + | [`wchar_t] + | [`bool] + | [`int] + | [`float] + | [`double] + | [`short] + | [`long] + | [`signed] + | [`unsigned] + + def qual_type_name + lookupId: lookup_id + + [root_qual_opt nested_name_specifier_opt type_name] + { + lhs.lookupId = r3.lookupId + } + + def type_name + lookupId: lookup_id + + [class_name] + { + lhs.lookupId = r1.lookupId + } + + | [enum_id] + { + lhs.lookupId = lookup_id in r1 + } + + | [typedef_id] + { + lhs.lookupId = lookup_id in r1 + } + + # NOTE: the typename case is moved to simple type specifier + # to take advantage of its conflict resolution. + def elaborated_type_specifier + [class_key nested_name_specifier_opt class_head_name] + { + Id: lookup_id = lookup_id in r3 + name: str = Id.data + + # Get the ns the class is declared in. + parentObj: ptr<lang_object> = declNs.top + if Id.qualObj + parentObj = Id.qualObj + + # Look for the class in the given scope. + declaredClass: ptr<lang_object> = findClass( parentObj name ) + if !declaredClass + declaredClass = findTemplateClass( parentObj name ) + + if !declaredClass { + # LOG print( 'creating new class: ', name, '\n' ) + + # Class does not exist in the parent scope, create it. + nsType: int = declaredClassType() + + declaredClass = createLangObject( nsType name lookupNs.top ) + + # FIXME: handle friends. Make the class visible only if we are NOT + # in a friend declaration. The new class object is necessary to + # properly process the body of the class. 
+ if declarationData.top.isFriend == 0 + insertObject( parentObj name declaredClass ) + } + } + + # TODO: Lookup type specialization. + | [class_key nested_name_specifier_opt templ_class_id + templ_arg_open template_argument_list_opt templ_arg_close] + + | [`enum nested_name_specifier_opt enum_head_name] + { + # TODO: should look for existing enums of the same name. + Id: lookup_id = lookup_id in r3 + # LOG print( 'creating enumeration ' Id.data '\n' ) + enum: ptr<lang_object> = createLangObject( EnumType Id.data lookupNs.top ) + insertObject( declNs.top Id.data enum ) + } + + def decl_specifier_mult_seq_opt + [decl_specifier_mult_seq_opt decl_specifier_mult] + | [] + + def decl_specifier_mult_seq + [decl_specifier_mult_seq decl_specifier_mult] + | [decl_specifier_mult] + + def decl_specifier_mult + [type_specifier_mult] + | [storage_class_specifier] + | [function_specifier] + + | [`friend] + { + declarationData.top.isFriend = 1 + } + + | [`typedef] + { + declarationData.top.isTypedef = 1 + } + + def storage_class_specifier + [`auto] + | [`register] + | [`static] + | [`extern] + | [`mutable] + + def function_specifier + [`inline] + | [`virtual] + | [`explicit] + + def type_specifier_mult + [cv_qualifier] + + def cv_qualifier + [`const] + | [`volatile] + | [`restrict] + + def cv_qualifier_rep + [cv_qualifier_rep cv_qualifier] + | [] + + def namespace_definition + [named_namespace_definition] + | [unnamed_namespace_definition] + + def named_namespace_definition + [original_namespace_definition] + | [extension_namespace_definition] + + # + # Enumerations + # + + def enum_specifier + [`enum nested_name_specifier_opt + enum_head_name `{ enumerator_list_opt `}] + { + # TODO: should look for existing enums of the same name. 
+ Id: lookup_id = lookup_id in r3 + # LOG print( 'creating enumeration ' Id.data '\n' ) + enum: ptr<lang_object> = createLangObject( EnumType Id.data lookupNs.top ) + insertObject( declNs.top Id.data enum ) + } + + | [`enum `{ enumerator_list_opt `}] + + def enum_head_name + [class_id] + | [templ_class_id] + | [namespace_id] + | [typedef_id] + | [enum_id] + | [identifier] + | [template_id] + | [unknown_id] + + def enumerator_list_opt + [enumerator_list] + | [enumerator_list `,] + | [] + + def enumerator_list + [enumerator_list `, enumerator_definition] + | [enumerator_definition] + + def enumerator_definition + [enumerator_id] + { + Id: lookup_id = lookup_id in r1 + enumId: ptr<lang_object> = createLangObject( IdType Id.data lookupNs.top ) + insertObject( declNs.top Id.data enumId ) + } + + | [enumerator_id `= constant_expression] + { + Id: lookup_id = lookup_id in r1 + enumId: ptr<lang_object> = createLangObject( IdType Id.data lookupNs.top ) + insertObject( declNs.top Id.data enumId ) + } + + def enumerator_id + [namespace_id] + | [typedef_id] + | [enum_id] + | [class_id] + | [templ_class_id] + | [template_id] + | [identifier] + | [unknown_id] + + # + # Declarators + # + + def init_declarator_list_opt + [init_declarator_list] + | [] + + def init_declarator_list + [init_declarator_list `, init_declarator] + | [init_declarator] + + def init_declarator + [declarator initializer_opt] + + def initializer_opt + [`= initializer_clause] + | [`( expression `)] + | [] + + def initializer_clause + [assignment_expression] + | [`{ initializer_list `}] + | [`{ initializer_list `, `}] + | [`{ `}] + + def initializer_list + [initializer_list `, initializer_clause] + | [initializer_clause] + + # + # Expressions + # + + def expression + [expression `, assignment_expression] + | [assignment_expression] + + def expression_opt + [expression] + | [] + + def constant_expression + [conditional_expression] + + def constant_expression_opt + [constant_expression] + | [] + + def 
assignment_expression + [conditional_expression] + | [logical_or_expression assignment_op assignment_expression] + | [throw_expression] + + def assignment_op + [`=] + | [`*=] + | [`/=] + | [`%=] + | [`+=] + | [`-=] + | [`>>=] + | [`<<=] + | [`&=] + | [`^=] + | [`|=] + + def conditional_expression + [logical_or_expression] + | [logical_or_expression `? expression `: assignment_expression] + + def logical_or_expression + [logical_or_expression `|| logical_and_expression] + | [logical_and_expression] + + def logical_and_expression + [logical_and_expression `&& inclusive_or_expression] + | [inclusive_or_expression] + + def inclusive_or_expression + [inclusive_or_expression `| exclusive_or_expression] + | [exclusive_or_expression] + + def exclusive_or_expression + [exclusive_or_expression `^ and_expression] + | [and_expression] + + def and_expression + [and_expression `& equality_expression] + | [equality_expression] + + def equality_expression + [equality_expression `== relational_expression] + | [equality_expression `!= relational_expression] + | [relational_expression] + + def relational_expression + [relational_expression `< shift_expression] + | [relational_expression `> shift_expression] + | [relational_expression lt_eq shift_expression] + | [relational_expression gt_eq shift_expression] + | [shift_expression] + + def shift_expression + [shift_expression shift_left additive_expression] + | [shift_expression shift_right additive_expression] + | [additive_expression] + + def additive_expression + [additive_expression `+ multiplicative_expression] + | [additive_expression `- multiplicative_expression] + | [multiplicative_expression] + + def multiplicative_expression + [multiplicative_expression `* pm_expression] + | [multiplicative_expression `/ pm_expression] + | [multiplicative_expression `% pm_expression] + | [pm_expression] + + def pm_expression + [pm_expression `->* cast_expression] + | [pm_expression `.* cast_expression] + | [cast_expression] + + def 
cast_expression + [unary_expression] + | [`( type_id `) cast_expression] + + def delete_expression + [root_qual_opt `delete cast_expression] + | [root_qual_opt `delete `[ `] cast_expression] + + def new_initializer_opt + [new_initializer] + | [] + + def new_initializer + [`( expression_opt `)] + + def direct_new_declarator + [`[ expression `]] + | [direct_new_declarator `[ constant_expression `]] + + def new_declarator_opt + [new_declarator] + | [] + + def new_declarator + [direct_new_declarator] + | [ptr_operator_seq direct_new_declarator] + | [ptr_operator_seq] + + def new_type_id + [necs_type_specifier_seq new_declarator_opt] + + def new_placement + [`( expression `)] + + def new_expression + [root_qual_opt `new new_type_id new_initializer_opt] + | [root_qual_opt `new new_placement new_type_id new_initializer_opt] + | [root_qual_opt `new `( type_id `) new_initializer_opt] + | [root_qual_opt `new new_placement `( type_id `) new_initializer_opt] + + def unary_operator + [`*] + | [`&] + | [`+] + | [`-] + | [`!] 
+ | [`~] + + def unary_expression + [postfix_expression] + | [`++ cast_expression] + | [`-- cast_expression] + | [unary_operator cast_expression] + | [`sizeof `( type_id `)] + | [`sizeof unary_expression] + | [new_expression] + | [delete_expression] + + def function_style_type_conv + [simple_type_specifier] + + + def postfix_expression + [primary_expression] + | [postfix_expression `[ expression `]] + | [postfix_expression `( expression_opt `)] + | [function_style_type_conv `( expression_opt `)] + | [member_request_expr dot_arrow id_expression] + | [member_request_expr dot_arrow pseudo_destructor_call] + | [postfix_expression `++] + | [postfix_expression `--] + | [`dynamic_cast templ_arg_open type_id templ_arg_close `( expression `)] + | [`static_cast templ_arg_open type_id templ_arg_close `( expression `)] + | [`reinterpret_cast templ_arg_open type_id templ_arg_close `( expression `)] + | [`const_cast templ_arg_open type_id templ_arg_close `( expression `)] + | [`typeid `( expression `)] + | [`typeid `( type_id `)] + + def pseudo_destructor_call + [root_qual_opt nested_name_specifier_opt `~ pdc_type_name] + + def primary_expression + [expr_lit] + | [`this] + | [`( expression `)] + | [id_expression] + # GNU extensions + | [`( `{ statement_rep `} `)] + | [`__is_pod `( type_id `)] + | [`__is_empty `( type_id `)] + + def expr_lit + [TK_IntegerDecimal] + | [TK_IntegerOctal] + | [TK_IntegerHex] + | [TK_SingleLit] + | [TK_Float] + | [double_lit_list] + | [`true] + | [`false] + + def double_lit_list + [TK_DoubleLit double_lit_list] + | [TK_DoubleLit] + + def member_request_expr + [postfix_expression] + # { + # # FIXME: If no proper type is found, we must fail. + # # LOG print( 'setting member request scope\n' ) + # # qualNs.set( $1->type != 0 ? $1->type->getObject() : 0 ); + # } + + def dot_arrow + [`->] + | [`.] 
+ + def pdc_type_name + [enum_id] + | [typedef_id] + + # + # Statements + # + + def statement_rep + [statement_rep statement] + | [] + + def statement + [declaration_statement] + | [labeled_statement] + | [expression_statement] + | [compound_statement] + | [selection_statement] + | [iteration_statement] + | [jump_statement] + | [try_block] + + def labeled_statement + [label_id `: statement] + | [`case constant_expression `: statement] + | [`default `: statement] + + def label_id + [unknown_id] + | [identifier] + | [class_id] + | [templ_class_id] + | [namespace_id] + | [typedef_id] + | [enum_id] + | [template_id] + + def compound_statement + [`{ compound_begin statement_rep compound_end `}] + + def compound_begin + [] + { + newCompound: ptr<lang_object> = createLangObject( 0 '<compound_begin>' lookupNs.top ) + lookupNs.push( newCompound ) + declNs.push( newCompound ) + # LOG print( 'opening <compound>\n' ) + } + + def compound_end + [] + { + lookupNs.pop() + declNs.pop() + # LOG print( 'closing <compound>\n' ) + } + + def selection_statement + [`if `( condition `) statement elseif_clauses else_clause] + | [`switch `( condition `) statement] + + def elseif_clauses + [elseif_clauses `else `if `( condition `) statement] + | [] + + def else_clause + [`else statement] + | [] + + def iteration_statement + [`while `( condition `) statement] + | [`do statement `while `( expression `) `;] + | [`for `( for_init_statement condition_opt `; expression_opt `) statement] + + def jump_statement + [`break `;] + | [`continue `;] + | [`return expression_opt `;] + | [`goto any_id `;] + + def any_id + [unknown_id] + | [class_id] + | [namespace_id] + | [templ_class_id] + | [enum_id] + | [typedef_id] + | [identifier] + | [template_id] + + + def for_init_statement + [expression_statement] + | [stmt_block_declaration_forms `;] + + def condition + [expression] + | [type_specifier_seq declarator `= assignment_expression] + + def condition_opt + [condition] + | [] + + def expression_statement 
+ [expression `;] + | [`;] + + def declaration_statement + [stmt_block_declaration] + + def stmt_block_declaration + [declaration_start stmt_block_declaration_forms declaration_end `;] + | [using_declaration] + | [using_directive] + + def stmt_block_declaration_forms + [decl_specifier_mult_seq_opt decl_specifier_sing decl_specifier_mult_seq_opt + init_declarator_list_opt] + | [decl_specifier_mult_seq init_declarator_list_opt] + + # + # Declarators + # + + def declarator + lookupObj: ptr<lang_object> + + [ptr_operator_seq_opt declarator_id decl_array_or_param_rep declarator_end] + { + lhs.lookupObj = r4.lookupObj + } + + | [ptr_operator_seq_opt `( sub_declarator `) decl_array_or_param_rep declarator_end] + { + lhs.lookupObj = r6.lookupObj + } + + def sub_declarator + [ptr_operator_seq declarator_id decl_array_or_param_rep] + | [ptr_operator_seq `( sub_declarator `) decl_array_or_param_rep] + | [`( sub_declarator `) decl_array_or_param_rep] + | [declarator_id decl_array_or_param_rep] + + def decl_array_or_param_rep + [decl_array_or_param_rep decl_array_or_param] + | [] + + def decl_array_or_param + [`[ constant_expression_opt `]] + | [`( parameter_declaration_clause `) cv_qualifier_rep exception_specification_opt] + + def declarator_id + [declarator_id_forms] + { + name: str = r1.lookupId.data + qualObj: ptr<lang_object> = r1.lookupId.qualObj + + parentObj: ptr<lang_object> = declNs.top + if qualObj { + parentObj = qualObj + } + + # Decide if we are declaring a constructor/destructor. 
+ isConstructor: bool + if parentObj == r1.lookupId.obj { + isConstructor = true + # LOG print( 'making declarator ' name ' a constructor/destructor\n' ) + } + + if parentObj->specializationOf && + parentObj->specializationOf == r1.lookupId.obj + { + isConstructor = true + # LOG print( 'making declarator ' name ' a constructor/destructor\n' ) + } + + obj: ptr<lang_object> = nil + if name && !isConstructor && declarationData.top.isFriend == 0 { + if declarationData.top.isTypedef { + obj = createLangObject( TypedefType name lookupNs.top ) + obj->typedefOf = declarationData.top.typeObj + insertObject( parentObj name obj ) + + # LOG print( 'making declarator ' name ' a typedef\n' ) + } + else { + if !qualObj { + if declarationData.top.isTemplate { + # If in a template declaration and the name is not qualified then + # create the template id. + obj = createLangObject( TemplateIdType name lookupNs.top ) + #object->objType = declarationData.top.type + insertObject( declNs.top name obj ) + + # LOG print( 'making declarator ' name ' a template id\n' ) + } + else { + obj = createLangObject( IdType name lookupNs.top ) + #object->objType = declarationData.top().type; + insertObject( declNs.top name obj ) + + # LOG print( 'making declarator ' name ' an id\n' ) + } + } + } + } + + declaratorData.push( construct declarator_data ( + qualObj nil lookupNs.top ) [] ) + + # If the declarator is qualified, push the qualification to the lookup + # stack. Also save it in the declarator data so it can be passed to a + # function body if needed. + if qualObj { + lookupNs.push( qualObj ) + declaratorData.top.lookupObj = qualObj + } + + # LOG print( 'reduced declarator_id: ' name '\n' ) + } + + # Undoes the setup done by declarator_id and pdc_start. + def declarator_end + lookupObj: ptr<lang_object> + + [] + { + # Get the lookupObject from the scope and pass it up. If we are about to + # parse a function body it will be needed. 
+ lhs.lookupObj = declaratorData.top.lookupObj + + pdcScope: ptr<lang_object> = declaratorData.top.pdcScope + qualObj: ptr<lang_object> = declaratorData.top.qualObj + + declaratorData.pop() + + if pdcScope { + # LOG print( 'closing <pdc_scope>\n' ) + lookupNs.pop() + declNs.pop() + } + + if qualObj { + # LOG print( 'popping lookupNs\n' ) + lookupNs.pop() + } + } + + def declarator_id_forms + lookupId: lookup_id + + [id_expression] + { + lhs.lookupId = r1.lookupId + } + + | [root_qual_opt nested_name_specifier_opt type_name] + { + lhs.lookupId = r3.lookupId + } + + | [root_qual_opt nested_name_specifier_opt `~ class_id] + { + lhs.lookupId = lookup_id in r4 + } + + | [root_qual_opt nested_name_specifier_opt `~ templ_class_id] + { + lhs.lookupId = lookup_id in r4 + } + | [root_qual_opt nested_name_specifier_opt `~ unknown_id] + { + lhs.lookupId = lookup_id in r4 + } + + def type_id + lookupId: lookup_id + + [type_specifier_seq abstract_declarator_opt] + { + lhs.lookupId = r1.lookupId + } + + def abstract_declarator_opt + [abstract_declarator] + | [] + + def abstract_declarator + [ptr_operator_seq abstract_noid abstract_decl_array_or_param_seq_opt declarator_end] + | [ptr_operator_seq `( sub_abstract_declarator `) + abstract_decl_array_or_param_seq_opt declarator_end] + | [abstract_noid abstract_decl_array_or_param_seq declarator_end] + | [`( sub_abstract_declarator `) abstract_decl_array_or_param_seq_opt declarator_end] + + def sub_abstract_declarator + [ptr_operator_seq abstract_noid abstract_decl_array_or_param_seq_opt] + + | [ptr_operator_seq `( sub_abstract_declarator `) + abstract_decl_array_or_param_seq_opt] + + | [`( sub_abstract_declarator `) abstract_decl_array_or_param_seq_opt] + + def abstract_noid + [] + { + # Make scope for declarator. 
+ declaratorData.push( construct declarator_data [] ) + } + + def abstract_decl_array_or_param_seq_opt + [abstract_decl_array_or_param_seq_opt abstract_decl_array_or_param] + | [] + + def abstract_decl_array_or_param_seq + [abstract_decl_array_or_param_seq abstract_decl_array_or_param] + | [abstract_decl_array_or_param] + + def abstract_decl_array_or_param + [`[ constant_expression_opt `]] + | [`( parameter_declaration_clause `) cv_qualifier_rep + exception_specification_opt] + + def parameter_declaration_clause + [pdc_start parameter_declaration_list] + | [pdc_start parameter_declaration_list `...] + | [pdc_start parameter_declaration_list `, `...] + | [pdc_start `...] + | [pdc_start] + + def pdc_start + [] + { + if !declaratorData.top.pdcScope { + # We are going to need a scope for the declarator. + pdcScope: ptr<lang_object> = createLangObject( 0 '<pdc_scope>' lookupNs.top ) + lookupNs.push( pdcScope ) + declNs.push( pdcScope ) + + declaratorData.top.pdcScope = pdcScope + declaratorData.top.lookupObj = pdcScope + # LOG print( 'opening <pdc_scope>\n' ) + } + } + + def parameter_declaration_list + [parameter_declaration_list `, parameter_declaration] + | [parameter_declaration] + + def parameter_declaration + [declaration_start parameter_declaration_forms declaration_end] + + # Ordering the productions such that decl_specifier_sing is tried first is good + # for performance. 
+ def parameter_declaration_forms + [decl_specifier_mult_seq_opt decl_specifier_sing decl_specifier_mult_seq_opt + param_maybe_declarator maybe_parameter_init] + + | [decl_specifier_mult_seq param_maybe_declarator maybe_parameter_init] + + def param_maybe_declarator + [abstract_declarator] + | [declarator] + | [] + + def maybe_parameter_init + [`= constant_expression] + | [] + + def ptr_operator + [`&] + | [root_qual_opt nested_name_specifier_opt `* cv_qualifier_rep] + + def ptr_operator_seq + [ptr_operator_seq ptr_operator] + | [ptr_operator] + + def ptr_operator_seq_opt + [ptr_operator_seq_opt ptr_operator] + | [] + + # + # Functions + # + + def function_definition + [function_def_declaration ctor_initializer_opt function_body function_def_end] + + def function_def_declaration + [declaration_start function_def_declaration_forms declaration_end] + + def function_def_declaration_forms + [decl_specifier_mult_seq_opt decl_specifier_sing + decl_specifier_mult_seq_opt function_def_declarator] + | [decl_specifier_mult_seq function_def_declarator] + | [function_def_declarator] + + def function_def_declarator + [declarator] + { + # The lookupObj from the declarator is the deepest lookup object found + # while parsing the declarator. Make it visible in the function body. + # This could be the args, the qualObj, or the parent to the function. + lookupNs.push( r1.lookupObj ) + } + + def function_def_end + [] + { + # Pop the lookup object. + lookupNs.pop() + } + + def function_body + [function_body_begin `{ statement_rep function_body_end `}] + + def function_body_begin + [] + { + newFunctionBody: ptr<lang_object> = createLangObject( 0 + '<function_body_begin>' lookupNs.top ) + lookupNs.push( newFunctionBody ) + declNs.push( newFunctionBody ) + templDecl.push( 0 ) + # LOG print( 'opening <function_body>\n' ) + } + + def function_body_end + [] + { + # First undoes the function body begin work. Then undoes the setup in + # function_def_declarator. 
+ declNs.pop()
+ lookupNs.pop()
+ templDecl.pop()
+ # LOG print( 'closing <function_body>\n' )
+ }
+
+
+
+ #
+ # Classes
+ #
+
+ int declaredClassType()
+ {
+ if declarationData.top.isTemplate {
+ return TemplateClassType
+ } else {
+ return ClassType
+ }
+ }
+
+ def class_specifier
+ [class_head base_clause_opt `{ class_member_rep class_body_end `}]
+ {
+ # FIXME: reparse not implemented yet
+ # FIXME FIXME: reparse is actually implemented now
+ # # Visit class function bodies, but skip nested classes.
+ # for CFB: class_function_body in lhs {
+ # skipping class_specifier
+ #
+ # # Reparse the text of the class function body as a function body
+ # function_body FB = parse function_body[ $CFB ]
+ #
+ # # Replace the class function body with the parsed function body.
+ # CFB = cons class_function_body [FB.tree]
+ # }
+ }
+
+ def class_head
+ [class_key]
+ {
+ nsType: int = declaredClassType()
+
+ # LOG print( 'creating new anonymous class\n' )
+ newClass: ptr<lang_object> = createLangObject( nsType
+ '<anon_class>' lookupNs.top )
+ lookupNs.push( newClass )
+ declNs.push( newClass )
+ }
+
+ | [class_key nested_name_specifier_opt class_head_name]
+ {
+ Id: lookup_id = lookup_id in r3
+ name: str = Id.data
+
+ # Get the ns the class is declared in.
+ parentObj: ptr<lang_object> = declNs.top
+ if Id.qualObj
+ parentObj = Id.qualObj
+
+ # Look for the class in the given scope.
+ declaredClass: ptr<lang_object> = findClass( parentObj name )
+ if !declaredClass
+ declaredClass = findTemplateClass( parentObj name )
+
+ if !declaredClass {
+ # LOG print( 'creating new class: ' name '\n' )
+
+ # Class does not exist in the parent scope, create it.
+ nsType: int = declaredClassType()
+
+ declaredClass = createLangObject( nsType name lookupNs.top )
+
+ # FIXME: handle friends. Make the class visible only if we are NOT
+ # in a friend declaration. The new class object is necessary to
+ # properly process the body of the class. 
+ if declarationData.top.isFriend == 0
+ insertObject( parentObj name declaredClass )
+ }
+
+ # Push the found/new class.
+ lookupNs.push( declaredClass )
+ declNs.push( declaredClass )
+ }
+
+ | [class_key nested_name_specifier_opt templ_class_id
+ templ_arg_open template_argument_list_opt templ_arg_close]
+ {
+ match r3 [Id: lookup_id]
+ id: str = Id.data
+ classObj: ptr<lang_object> = Id.obj
+
+ # TODO: Try to find the specialization in the template class object.
+ # TypeList typeList;
+ # makeTypeList( typeList $6->last );
+
+ declaredClass: ptr<lang_object>
+ #declaredClass = classObj->findSpecExact( typeList );
+ if !declaredClass {
+ # LOG print( 'making new template specialization\n' )
+ nsType: int = declaredClassType()
+ declaredClass = createLangObject( nsType id lookupNs.top )
+ # LOG print( 'declaredClass: ' declaredClass '\n' )
+ declaredClass->specializationOf = classObj
+ # $$->typeListMapEl = classObj->typeListMap.insert( typeList declaredClass );
+ }
+
+ # Push the found/new class.
+ lookupNs.push( declaredClass )
+ declNs.push( declaredClass )
+ }
+
+ def class_body_end
+ []
+ {
+ # Pop the class ns. 
+ lookupNs.pop() + declNs.pop() + + # LOG print( 'closing off class\n' ) + } + + def class_head_name + [class_id] + | [templ_class_id] + | [namespace_id] + | [typedef_id] + | [enum_id] + | [unknown_id] + | [identifier] + | [template_id] + + def class_key + [`class] + | [`struct] + | [`union] + + def class_member_rep + [class_member_rep class_member] + | [] + + def class_member + [member_declaration] + | [access_specifier `:] + + def member_declaration + [declaration_start member_declaration_forms declaration_end `;] + | [class_function_definition] + | [using_declaration] + | [template_declaration] + + def class_function_definition + [function_def_declaration ctor_initializer_opt class_function_body function_def_end] + + lex + token cfb_open /'{'/ + token cfb_close /'}'/ + token cfb_string / + "'" ( [^'\\\n] | '\\' any )* "'" | + '"' ( [^"\\\n] | '\\' any )* '"'/ + token cfb_comment / + ( '/*' (any | '\n')* :>> '*/' ) | + ( '//' any* :> '\n' )/ + token cfb_data /[^{}'"/]+ | '/'/ + end + + def cfb_item + [cfb_data] + | [cfb_string] + | [cfb_comment] + | [cfb_open cfb_item* cfb_close] + + def cfb_conts + [cfb_item* cfb_close] + + + + def class_function_body + # ['{' cfb_conts] + #| [function_body] + [function_body] + + # Get better performance if the form with decl_specifier_sing comes first. 
+ def member_declaration_forms + [decl_specifier_mult_seq_opt decl_specifier_sing + decl_specifier_mult_seq_opt member_declarator_list_opt] + | [decl_specifier_mult_seq_opt member_declarator_list_opt] + + def member_declarator_list_opt + [member_declarator_list] + | [] + + def member_declarator_list + [member_declarator_list `, member_declarator] + | [member_declarator] + + def member_declarator + [declarator] + | [declarator `= constant_expression] + | [declarator `: constant_expression] + | [`: constant_expression] + + def access_specifier + [`private] + | [`protected] + | [`public] + + def access_specifier_opt + [access_specifier] + | [] + + def using_declaration + [`using id_expression `;] + { + obj: ptr<lang_object> = r2.lookupId.obj + if obj + insertObject( declNs.top obj->name obj ) + } + + | [`using type_id `;] + { + obj: ptr<lang_object> = r2.lookupId.obj + if obj + insertObject( declNs.top obj->name obj ) + } + + def using_directive + [`using `namespace root_qual_opt nested_name_specifier_opt + namespace_id `;] + { + # This uses a simple, incomplete guard against cycles in the graph of + # using namespaces. A more sophisticated and complete guard would look + # for longer cycles as well. Note that even gcc 3.3.5 does not bother. + match r5 [Id: lookup_id] + usingObject: ptr<lang_object> = Id.obj + inObject: ptr<lang_object> = declNs.top + if usingObject != inObject + inObject->inherited.append( usingObject ) + } + + + # + # Derived classes + # + + def base_clause_opt + [base_clause] + | [] + + def base_clause + [`: base_specifier_list] + + def base_specifier_list + [base_specifier_list `, base_specifier] + | [base_specifier] + + int addBaseSpecifier( inObject: ptr<lang_object> inheritedObject: ptr<lang_object> ) + { + # Resolve typedefs. 
+ if inheritedObject->typeId == TypedefType + inheritedObject = inheritedObject->typedefOf + + inObject->inherited.append( inheritedObject ) + } + + def base_specifier + [root_qual_opt nested_name_specifier_opt type_name] + { + addBaseSpecifier( declNs.top r3.lookupId.obj ) + } + + | [`virtual access_specifier_opt root_qual_opt nested_name_specifier_opt type_name] + { + addBaseSpecifier( declNs.top r5.lookupId.obj ) + } + + | [access_specifier virtual_opt root_qual_opt nested_name_specifier_opt type_name] + { + addBaseSpecifier( declNs.top r5.lookupId.obj ) + } + + def virtual_opt + [`virtual] + | [] + + # + # Special member functions + # + + def conversion_function_id + [`operator conversion_type_id] + + def conversion_type_id + [necs_type_specifier_seq ptr_operator_seq_opt] + + def ctor_initializer_opt + [ctor_initializer] + | [] + + def ctor_initializer + [`: mem_initializer_list] + + def mem_initializer_list + [mem_initializer_list `, mem_initializer] + | [mem_initializer] + + def mem_initializer + [mem_initializer_id `( expression_opt `)] + + def mem_initializer_id + [root_qual_opt nested_name_specifier_opt unknown_id] + | [root_qual_opt nested_name_specifier_opt identifier] + | [root_qual_opt nested_name_specifier_opt type_name] + | [root_qual_opt nested_name_specifier_opt template_name] + + + # + # Overloading + # + def operator_function_id + [`operator operator] + + def operator + [`+] | [`-] | [`*] | [`/] | [`=] | [`<] | [`>] | [`&] | [`|] | + [`^] | [`%] | [`~] | [`!] 
| [`( `)] | [`[ `]] | [`new] | + [`delete] | [`->] | [`++] | [`--] | [`*=] | [`/=] | [`%=] | + [`+=] | [`-=] | [`>>=] | [`<<=] | [`&=] | [`^=] | [`|=] | [`==] | + [`!=] | [`&&] | [`||] | [lt_eq] | [gt_eq] | [shift_left] | [shift_right] + + def lt_eq + [`< `=] + # try { + # if ( $2->leader != 0 ) { + # #ifdef LOG_REDUCE + # cerr << "rejecting less-than equals-to" << endl; + # #endif + # reject(); + # } + # }; + + def gt_eq + [`> `=] + # try { + # if ( $2->leader != 0 ) { + # #ifdef LOG_REDUCE + # cerr << "rejecting greater-than equals-to" << endl; + # #endif + # reject(); + # } + # }; + + def shift_left + [`< `<] + # try { + # if ( $2->leader != 0 ) { + # #ifdef LOG_REDUCE + # cerr << "rejecting shift left" << endl; + # #endif + # reject(); + # } + # }; + + def shift_right + [`> `>] + # try { + # if ( $2->leader != 0 ) { + # #ifdef LOG_REDUCE + # cerr << "rejecting shift right" << endl; + # #endif + # reject(); + # } + # }; + + # + # Templates + # + + def template_declaration + [template_declaration_params declaration] + { + templDecl.pop() + templateParamNs.pop() + } + + def template_declaration_params + [`template `< tpl_start template_parameter_list `>] + { + templDecl.push( 1 ) + } + + | [`export `template `< tpl_start template_parameter_list `>] + { + templDecl.push( 1 ) + } + + def tpl_start + [] + { + # Create a new scope for the template parameters. 
+ newTemplateParamScope: ptr<lang_object> =
+ createLangObject( 0 '<tpl_start>' lookupNs.top )
+ templateParamNs.push( newTemplateParamScope )
+ }
+
+ def template_parameter_list
+ [template_parameter_list `, template_parameter]
+ | [template_parameter]
+
+ def template_parameter
+ [type_parameter]
+ | [template_parameter_declaration]
+
+ def template_parameter_declaration
+ [declaration_start template_parameter_declaration_forms declaration_end]
+
+ def template_parameter_declaration_forms
+ [decl_specifier_mult_seq param_maybe_declarator maybe_parameter_init]
+
+ | [temp_param_decl_specifier_sing decl_specifier_mult_seq_opt
+ param_maybe_declarator maybe_parameter_init]
+
+ | [decl_specifier_mult_seq temp_param_decl_specifier_sing
+ decl_specifier_mult_seq_opt param_maybe_declarator maybe_parameter_init]
+
+ def temp_param_decl_specifier_sing
+ [temp_param_type_specifier_sing]
+
+ # Template parameters cannot support elaborated type specifier or class specifier.
+ def temp_param_type_specifier_sing
+ [templ_simple_type_specifier]
+ | [enum_specifier]
+
+ def templ_simple_type_specifier
+ [simple_type_specifier_name]
+ | [simple_type_specifier_kw_seq]
+
+ def type_parameter
+ [`class type_param_id type_param_init_opt]
+ {
+ Id: lookup_id = lookup_id in r2
+ if Id {
+ # The lookup ns should be a template param scope.
+ newClass: ptr<lang_object> =
+ createLangObject( ClassType Id.data lookupNs.top )
+ insertObject( templateParamNs.top Id.data newClass )
+ }
+ }
+
+ | [`typename type_param_id type_param_init_opt]
+ {
+ Id: lookup_id = lookup_id in r2
+ if Id {
+ # The lookup ns should be a template param scope. 
+ newClass: ptr<lang_object> = + createLangObject( ClassType Id.data lookupNs.top ) + insertObject( templateParamNs.top Id.data newClass ) + } + } + + | [`template `< tpl_start template_parameter_list `> + `class type_param_id templ_type_param_init_opt] + { + Id: lookup_id = lookup_id in r7 + if Id { + newClass: ptr<lang_object> = + createLangObject( TemplateClassType Id.data lookupNs.top ) + insertObject( templateParamNs.top Id.data newClass ) + } + } + + def templ_type_param_init_opt + [`= id_expression] + | [] + + def type_param_init_opt + [`= type_id] + | [] + + def type_param_id + [namespace_id] + | [typedef_id] + | [enum_id] + | [class_id] + | [templ_class_id] + | [identifier] + | [template_id] + | [unknown_id] + | [] + + def template_argument_list_opt + [template_argument_list] + | [] + + def template_argument_list + [template_argument_list `, template_argument] + | [template_argument] + + def template_argument + [type_id] + | [assignment_expression] + + def explicit_instantiation + [`template declaration] + | [declaration_start decl_specifier_mult_seq `template declaration declaration_end] + + def explicit_specialization + [`template `< `> declaration] + + ## Not sure what this one is about? + #explicit_specialization: + # declaration_start decl_specifier_mult_seq KW_Template '<' '>' + # declaration declaration_end; + + + # + # Original namespace definition + # + + def original_namespace_definition + [orig_namespace_def_name `{ declaration* namespace_end `}] + + def orig_namespace_def_name [`namespace unknown_id] + { + match r2 [Id: lookup_id] + nspace: ptr<lang_object> = createLangObject( + NamespaceType Id.data lookupNs.top ) + + # Insert the new object into the dictionary of the parent. 
+ insertObject( curNamespace.top Id.data nspace ) + + # Push the namespace + curNamespace.push( nspace ) + declNs.push( nspace ) + lookupNs.push( nspace ) + + # LOG print( 'created original namespace: ' Id.data '\n' ) + } + + def namespace_end [] + { + # Pop the namespace. + curNamespace.pop() + declNs.pop() + lookupNs.pop() + + # LOG print( 'closed namespace\n' ) + } + + # + # Extension namespace definition + # + + def extension_namespace_definition + [ext_namespace_def_name `{ declaration* namespace_end `}] + + def ext_namespace_def_name [`namespace namespace_id] + { + match r2 [Id: lookup_id] + nspace: ptr<lang_object> = Id.obj + + # Push the namespace + curNamespace.push( nspace ) + declNs.push( nspace ) + lookupNs.push( nspace ) + + # LOG print( 'found extended namespace: ' Id.data '\n' ) + } + + # + # Unnamed namespace definition + # + def unnamed_namespace_definition + [unnamed_namespace_def_name `{ declaration* namespace_end `}] + + def unnamed_namespace_def_name [`namespace] + { + nspace: ptr<lang_object> = createLangObject( + NamespaceType '<unnamed_namespace>' + lookupNs.top ) + + # Push the namespace + curNamespace.push( nspace ) + declNs.push( nspace ) + lookupNs.push( nspace ) + + # LOG print( 'parsed unnamed namespace\n' ) + } + + # + # linkage_specification + # + def linkage_specification + [`extern TK_DoubleLit `{ declaration* `}] + | [`extern TK_DoubleLit declaration] + + # + # Exception Handling. + # + + def try_block + [`try compound_statement handler_seq] + + def handler_seq + [handler_seq handler] + | [handler] + + def handler + [`catch `( exception_declaration `) compound_statement] + + def exception_declaration + [type_specifier_seq declarator] + | [type_specifier_seq abstract_declarator] + | [type_specifier_seq] + | [`...] 
+ + def throw_expression + [`throw assignment_expression] + | [`throw] + + def exception_specification_opt + [exception_specification] + | [] + + def exception_specification + [`throw `( type_id_list_opt `)] + + def type_id_list_opt + [type_id_list] + | [] + + def type_id_list + [type_id_list `, type_id] + | [type_id] + + def start + [declaration*] + + # + # Grammar done. + # + + int printObject( indent: str obj: ptr<lang_object> ) + { + print( indent obj->name ) + + if obj->objectMap.length > 0 + print( ' {\n' ) + + ChildNames: map<str list<ptr<lang_object>>> = obj->objectMap + for MapEl: list<ptr<lang_object>> in child( ChildNames ) { + for Obj: ptr<lang_object> in MapEl + printObject( indent + ' ' Obj ) + } + + if obj->objectMap.length > 0 + print( indent '}' ) + + print( '\n' ) + } + +end # lookup + +# +# Global data declarations +# + +cons Lookup: lookup[] + +# Constants for language object types. +Lookup.NamespaceType = typeid<lookup::namespace_id> +Lookup.ClassType = typeid<lookup::class_id> +Lookup.TemplateClassType = typeid<lookup::templ_class_id> +Lookup.EnumType = typeid<lookup::enum_id> +Lookup.IdType = typeid<lookup::identifier> +Lookup.TypedefType = typeid<lookup::typedef_id> +Lookup.TemplateIdType = typeid<lookup::template_id> + + +# Object stacks. +Lookup.curNamespace = construct list<ptr<lookup::lang_object>> [] +Lookup.declNs = construct list<ptr<lookup::lang_object>> [] +Lookup.lookupNs = construct list<ptr<lookup::lang_object>> [] +Lookup.qualNs = construct list<ptr<lookup::lang_object>> [] +Lookup.templateParamNs = construct list<ptr<lookup::lang_object>> [] + +# Declaration, declarator data. 
+Lookup.declarationData = construct list<lookup::declaration_data> [] +Lookup.declaratorData = construct list<lookup::declarator_data> [] + +# Template declarations +Lookup.templDecl = construct list<int> [] + +# Root namespace object +Lookup.rootNamespace = createLangObject( Lookup.NamespaceType '<root_namespace>' nil ) + +# Initialize the namespace and declaration stacks with the root namespace +Lookup.curNamespace.push( Lookup.rootNamespace ) +Lookup.declNs.push( Lookup.rootNamespace ) +Lookup.lookupNs.push( Lookup.rootNamespace ) + +# Start with no qualification (note variables are initialized to zero) +Lookup.qualNs.push( nil ) + +Lookup.templDecl.push( 0 ) +Lookup.declarationData.push( construct lookup::declaration_data( 0 0 0 ) [] ) + +parse S: lookup::start( Lookup )[ stdin ] +if ! S { + print( error ) + exit( 1 ) +} + +print( '***** NAMSPACES *****\n' ) +printObject( '' Lookup.rootNamespace ) +print( '***** UNKNOWN DECLARATORS *****\n' ) +for DI: lookup::declarator_id in S { + if match DI + [lookup::root_qual_opt lookup::nested_name_specifier_opt lookup::`~ UID: lookup::unknown_id] + { + print( UID '\n' ) + } +} +##### IN ##### +namespace ns1 +{ + namespace sub1 { class A {}; } + namespace sub2 { class B {}; } +} + +namespace ns2 +{ + int i = b; + class C + { + }; + + using namespace ns1; +} + +ns2::sub1::A a; + +struct A +{ + struct B {}; +}; + +struct C +{ + struct D : virtual public A {}; +}; + +C::D::A d; + +C c; + +struct C +{ + +}; + +enum E +{ + C, + b +}; + +E e; + +enum E +{ + C, + b +}; + + +int i; +class C +{ + int j; +}; + +class D +{ + int ~D(); +}; + +int C::k; +int C::~C; + +typedef int Int; + +class C {}; +void ~C( ); +void C::operator +( int i ); + +int i; + +//void operator C( void k ); + +class C +{ + +}; + +int C::f( int i, int j( void v ) ); +class C +{ + class D {}; + + typedef C I; + + I::D i; +}; + +C c; + +void function( int i, int j ) +{ + function(); +} + + + +class B { class Find {}; }; + +typedef B T; + +class C : public T +{ + 
Find find; +}; + + +template <class X> struct Y +{ + X t; + void f(); +}; + +template <class X> void Y<X>::f(); +template <class X> struct Y +{ + class Z {}; +}; + +class Y<int> +{ + int i; +}; + +//void f( class C<int> i, int j ); + +int f( int (*) [](), void ); +void f(); +class C +{ + class D {}; + void g(); +}; + +//typename C c; + +class C +{ + class D {}; + int f(); +}; + +int f() +{ +} + +int C::f() +{ + D d; +} +##### EXP ##### +***** NAMSPACES ***** +<root_namespace> { + A { + B + } + B { + Find + } + C { + D + I + f + g + i + j + find + } + C + C + C + D + E + E + T + Y { + Z + f + t + } + a + b + b + c + c + d + e + f + f + f + i + i + Int + ns1 { + sub1 { + A + } + sub2 { + B + } + } + ns2 { + C + i + } + function +} +***** UNKNOWN DECLARATORS ***** +C diff --git a/test/mailbox.lm b/test/mailbox.lm new file mode 100644 index 0000000..ca3b9a9 --- /dev/null +++ b/test/mailbox.lm @@ -0,0 +1,106 @@ +##### LM ##### + +# lines, and fromlines +lex + rl day /[A-Z][a-z][a-z]/ + rl month /[A-Z][a-z][a-z]/ + rl year /[0-9][0-9][0-9][0-9]/ + rl time /[0-9][0-9] ':' [0-9][0-9] ( ':' [0-9][0-9] )? / + rl letterZone /[A-Z][A-Z][A-Z]/ + rl numZone /[+\-][0-9][0-9][0-9][0-9]/ + rl zone / letterZone | numZone/ + rl dayNum /[0-9 ][0-9]/ + + # These are the different formats of the date minus an obscure + # type that has a funny string 'remote from xxx' on the end. Taken + # from c-client in the imap-2000 distribution. + rl date / day ' ' month ' ' dayNum ' ' time ' ' + ( year | year ' ' zone | zone ' ' year ) / + + # From lines separate messages. We will exclude from_line from a message + # body line. This will cause us to stay in message line up until an + # entirely correct from line is matched. 
+ token from_line / 'From ' (any-'\n')* ' ' date '\n' / + token simple_line / [^\n]* '\n' / +end + +rl hchar /print - [ :]/ +token header_name /hchar+/ + +token colon /':' ' '*/ +token header_content / ([^\n] | '\n' [ \t])* '\n'/ +token blank_line / '\n' / + +def header + [header_name colon header_content] + +def message + [from_line header* blank_line simple_line*] + +def start + [message*] + +parse S: start[ stdin ] +print_xml( S ) +print( '\n' ) +##### IN ##### +From thurston Tue Jan 2 21:16:50 2007 +Return-Path: <unknown> +X-Spam-Level: * +Received: from [109.111.71.111] (helo=twfmtr) + by zifreax with smtp (Exim 4.43) + id 1H1vfs-0005LN-HW; Tue, 2 Jan 2007 21:16:16 -0500 +Message-ID: <459B113F.8050903@immoarthabitatge.com> +X-Keywords: +X-UID: 1 + +Content-Type: text/html; charset=ISO-8859-1 +</body> +</html> + +From thurston Wed Jan 3 02:35:48 2007 +Return-Path: <unknown> +X-Spam-Checker-Version: SpamAssassin 3.1.1 (2006-03-10) on mambo.cs.queensu.ca +X-Spam-Level: ** +X-Spam-Status: No, score=2.9 required=5.0 tests=BAYES_20,EXTRA_MPART_TYPE, + HTML_40_50,HTML_IMAGE_ONLY_16,HTML_MESSAGE,RCVD_IN_BL_SPAMCOP_NET + autolearn=no version=3.1.1 +X-Bogosity: Unsure, tests=bogofilter, spamicity=0.971708, version=1.0.2 +Status: RO +X-UID: 2 + +------=_NextPart_000_0010_01C72F11.F137BD60 + charset="windows-1252" +Content-Transfer-Encoding: quoted-printable + +##### EXP ##### +<start><_repeat_message><message><from_line>From thurston Tue Jan 2 21:16:50 2007 +</from_line><_repeat_header><header><header_name>Return-Path</header_name><colon>: </colon><header_content><unknown> +</header_content></header><header><header_name>X-Spam-Level</header_name><colon>: </colon><header_content>* +</header_content></header><header><header_name>Received</header_name><colon>: </colon><header_content>from [109.111.71.111] (helo=twfmtr) + by zifreax with smtp (Exim 4.43) + id 1H1vfs-0005LN-HW; Tue, 2 Jan 2007 21:16:16 -0500 
+</header_content></header><header><header_name>Message-ID</header_name><colon>: </colon><header_content><459B113F.8050903@immoarthabitatge.com> +</header_content></header><header><header_name>X-Keywords</header_name><colon>: </colon><header_content> +</header_content></header><header><header_name>X-UID</header_name><colon>: </colon><header_content>1 +</header_content></header></_repeat_header><blank_line> +</blank_line><_repeat_simple_line><simple_line>Content-Type: text/html; charset=ISO-8859-1 +</simple_line><simple_line></body> +</simple_line><simple_line></html> +</simple_line><simple_line> +</simple_line></_repeat_simple_line></message><message><from_line>From thurston Wed Jan 3 02:35:48 2007 +</from_line><_repeat_header><header><header_name>Return-Path</header_name><colon>: </colon><header_content><unknown> +</header_content></header><header><header_name>X-Spam-Checker-Version</header_name><colon>: </colon><header_content>SpamAssassin 3.1.1 (2006-03-10) on mambo.cs.queensu.ca +</header_content></header><header><header_name>X-Spam-Level</header_name><colon>: </colon><header_content>** +</header_content></header><header><header_name>X-Spam-Status</header_name><colon>: </colon><header_content>No, score=2.9 required=5.0 tests=BAYES_20,EXTRA_MPART_TYPE, + HTML_40_50,HTML_IMAGE_ONLY_16,HTML_MESSAGE,RCVD_IN_BL_SPAMCOP_NET + autolearn=no version=3.1.1 +</header_content></header><header><header_name>X-Bogosity</header_name><colon>: </colon><header_content>Unsure, tests=bogofilter, spamicity=0.971708, version=1.0.2 +</header_content></header><header><header_name>Status</header_name><colon>: </colon><header_content>RO +</header_content></header><header><header_name>X-UID</header_name><colon>: </colon><header_content>2 +</header_content></header></_repeat_header><blank_line> +</blank_line><_repeat_simple_line><simple_line>------=_NextPart_000_0010_01C72F11.F137BD60 +</simple_line><simple_line> charset="windows-1252" +</simple_line><simple_line>Content-Transfer-Encoding: 
quoted-printable +</simple_line><simple_line> +</simple_line></_repeat_simple_line></message></_repeat_message></start> diff --git a/test/matchex.lm b/test/matchex.lm new file mode 100644 index 0000000..9dd24c4 --- /dev/null +++ b/test/matchex.lm @@ -0,0 +1,41 @@ +##### LM ##### +lex + token id /[a-zA-Z_][a-zA-Z0-9_]*/ + literal `= `< `> `/ + ignore /[ \t\n\r\v]+/ +end + +def attr + [id `= id] + +def open_tag + [`< id attr* `>] + +def close_tag + [`< `/ id `>] + +def tag + [open_tag item* close_tag] + +def item + [tag] +| [id] + +parse Tag: tag[ stdin ] + +# Style: List of literal text and types. +match Tag ["<person name=" Val1:id attr*">" item* "</person>"] + +# Style: Literal text with embedded lists of types. +match Tag "<person name=[Val2:id attr*]>[item*]</person>" + +print( ^Val1 '\n' ) +print( ^Val2 '\n' ) + +##### IN ##### +<person name=adrian hometown=kingston> + <t1 foo=bar2 e=f></t2> +</person> +##### EXP ##### +adrian +adrian diff --git a/test/maxlen.lm b/test/maxlen.lm new file mode 100644 index 0000000..2d220d1 --- /dev/null +++ b/test/maxlen.lm @@ -0,0 +1,57 @@ +##### LM ##### + +context maxlen + + # + # Regular Definitions + # + rl rl_ws /[ \t\n\r\v]+/ + rl rl_id /[a-zA-Z_][a-zA-Z0-9_]*/ + + # + # Tokens + # + + lex + ignore /rl_ws/ + token id /rl_id/ + end + + num: int + allow: int + + def item + [id] + { + num = num + 1 + toomuch: int = allow+1 + if num == toomuch { + reject + } + } + + def open + [] + { + num = 0 + } + + def close [] + + def restricted_list + [open item*] + + def start + [restricted_list id*] +end # maxlen + +cons MaxLen: maxlen[] +MaxLen.allow = 3 + +parse S: maxlen::start(MaxLen)[stdin] +print_xml( S ) +print('\n') +##### IN ##### +a b c d e f g +##### EXP ##### 
+<maxlen::start><maxlen::restricted_list><maxlen::open></maxlen::open><maxlen::_repeat_item><maxlen::item><maxlen::id>a</maxlen::id></maxlen::item><maxlen::item><maxlen::id>b</maxlen::id></maxlen::item><maxlen::item><maxlen::id>c</maxlen::id></maxlen::item></maxlen::_repeat_item></maxlen::restricted_list><maxlen::_repeat_id><maxlen::id>d</maxlen::id><maxlen::id>e</maxlen::id><maxlen::id>f</maxlen::id><maxlen::id>g</maxlen::id></maxlen::_repeat_id></maxlen::start> diff --git a/test/multiregion1.lm b/test/multiregion1.lm new file mode 100644 index 0000000..5c8bdea --- /dev/null +++ b/test/multiregion1.lm @@ -0,0 +1,242 @@ +##### LM ##### + + +token newline / '\n' / +token index / 'Index:' [ \t]* / +token consume_line / [^\n]* / + + +def index_stmt [index consume_line newline] + +token separator_line / '='+ '\n' / + +# Whitespace separated word list +lex + token word /[^\t \n]+/ + ignore /[\t ]+/ + + def word_list + [word word_list] + | [] +end + +token old_file_start / '---' [\t ]+ / +token new_file_start / '+++' [\t ]+ / + +def old_file + [old_file_start word_list newline] + +def new_file + [new_file_start word_list newline] + +def file_header + [index_stmt separator_line old_file new_file] + +token hunk_header / '@@' any* :>> '@@' '\n' / +token hunk_line / ( ' ' | '-' | '+' ) [^\n]* '\n' / + +def hunk_body + [hunk_line*] + +def hunk + [hunk_header hunk_body] + +# diff of a single file: header followed by a hunk list. +def file_diff + [file_header hunk*] + +def start + [file_diff*] + +parse S: start[ stdin ] + +for OF: old_file in S { + # Get the first word and check if it is + # the file we are interested in. 
+ if match OF [ + "--- fsmrun.cpp" + Rest: word_list + "\n" + ] + { + OF = construct old_file + ["--- newfilename.cpp " Rest "\n"] + } +} + +print( S ) + +##### IN ##### +Index: fsmrun.cpp +=================================================================== +--- fsmrun.cpp (revision 4555) ++++ fsmrun.cpp (working copy) +@@ -150,7 +150,7 @@ + peof = 0; + if ( parser != 0 ) { + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + } + else { + region = 0; +@@ -189,7 +189,7 @@ + + tokstart = 0; + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + } + + void FsmRun::sendToken( int id ) +@@ -222,7 +222,7 @@ + parser = newParser; + + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + } + else { + #ifdef LOG_ACTIONS +@@ -355,7 +355,7 @@ + + /* Set the current state from the next region. */ + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + } + } + +@@ -452,7 +452,7 @@ + /* First thing check for error. 
*/ + if ( cs == tables->errorState ) { + if ( parser != 0 ) { +- if ( getStateFromNextRegion( 1 ) != 0 ) { ++ if ( parser->getNextRegion( 1 ) != 0 ) { + #ifdef LOG_BACKTRACK + cerr << "scanner failed, trying next region" << endl; + #endif +@@ -462,7 +462,7 @@ + + parser->nextRegionInd += 1; + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + cerr << "new token region: " << + parser->tables->gbl->regionInfo[region].name << endl; + continue; +@@ -495,7 +495,7 @@ + } + else { + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + cerr << "new token region: " << + parser->tables->gbl->regionInfo[region].name << endl; + continue; +Index: junk.cpp +=================================================================== +--- ++++ junk.cpp (working copy) +Index: fsmrun.h +=================================================================== +--- fsmrun.h (revision 4557) ++++ fsmrun.h (working copy) +@@ -197,10 +197,6 @@ + void runOnInputStream( PdaRun *parser, InputStream &in ); + void execute(); + +- /* Offset can be used to look at the next nextRegionInd. 
*/ +- int getStateFromNextRegion( int offset = 0 ) +- { return tables->entryByRegion[parser->getNextRegion(offset)]; } +- + FsmTables *tables; + PdaRun *parser; + InputStream *inputStream; +##### EXP ##### +Index: fsmrun.cpp +=================================================================== +--- newfilename.cpp (revision 4555) ++++ fsmrun.cpp (working copy) +@@ -150,7 +150,7 @@ + peof = 0; + if ( parser != 0 ) { + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + } + else { + region = 0; +@@ -189,7 +189,7 @@ + + tokstart = 0; + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + } + + void FsmRun::sendToken( int id ) +@@ -222,7 +222,7 @@ + parser = newParser; + + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + } + else { + #ifdef LOG_ACTIONS +@@ -355,7 +355,7 @@ + + /* Set the current state from the next region. */ + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + } + } + +@@ -452,7 +452,7 @@ + /* First thing check for error. 
*/ + if ( cs == tables->errorState ) { + if ( parser != 0 ) { +- if ( getStateFromNextRegion( 1 ) != 0 ) { ++ if ( parser->getNextRegion( 1 ) != 0 ) { + #ifdef LOG_BACKTRACK + cerr << "scanner failed, trying next region" << endl; + #endif +@@ -462,7 +462,7 @@ + + parser->nextRegionInd += 1; + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + cerr << "new token region: " << + parser->tables->gbl->regionInfo[region].name << endl; + continue; +@@ -495,7 +495,7 @@ + } + else { + region = parser->getNextRegion(); +- cs = getStateFromNextRegion(); ++ cs = tables->entryByRegion[region]; + cerr << "new token region: " << + parser->tables->gbl->regionInfo[region].name << endl; + continue; +Index: junk.cpp +=================================================================== +--- ++++ junk.cpp (working copy) +Index: fsmrun.h +=================================================================== +--- fsmrun.h (revision 4557) ++++ fsmrun.h (working copy) +@@ -197,10 +197,6 @@ + void runOnInputStream( PdaRun *parser, InputStream &in ); + void execute(); + +- /* Offset can be used to look at the next nextRegionInd. */ +- int getStateFromNextRegion( int offset = 0 ) +- { return tables->entryByRegion[parser->getNextRegion(offset)]; } +- + FsmTables *tables; + PdaRun *parser; + InputStream *inputStream; diff --git a/test/multiregion2.lm b/test/multiregion2.lm new file mode 100644 index 0000000..d69b8d4 --- /dev/null +++ b/test/multiregion2.lm @@ -0,0 +1,124 @@ +##### LM ##### +# +# Character classes +# +rl CTL /0..31 | 127/ +rl CR /13/ +rl LF /10/ +rl SP /32/ +rl HT /9/ +rl CHAR /0..127/ + +rl separators / '(' | ')' | '<' | '>' + | '@' | ',' | ';' | ':' | '\\' + | '"' | '/' | '[' | ']' | '?' 
+ | '=' | '{' | '}' | SP | HT / + +rl token_char /CHAR - CTL - separators/ + +# +# Literal tokens +# + +literal `HTTP/ `: +token SPT /' '/ +token CRLF /CR LF/ + +# +# Request Line +# + +token method /token_char+/ + +token request_uri /(^SP)+/ + +token http_number /digit+ '.' digit+/ + +def http_version + [ `HTTP/ http_number ] + +def request_line + [method SPT request_uri + SPT http_version CRLF] + +# +# Header +# + +token field_name /token_char+/ + +lex + token fv_plain /(^(CR|LF))*/ + token fv_ext /CR LF (SP|HT)/ + token fv_term /CR LF/ +end + +def fv + [fv_plain] +| [fv_ext] + +def field_value + [fv* fv_term] + +def header + [field_name `: field_value] + +# +# Request +# + +def request + [request_line header* CRLF] + +parse R: request*[ stdin ] + +if !R { + print( error ) + exit( 1 ) +} + +for FV: fv in R { + if match FV [fv_ext] + FV = cons fv " " +} + +print( R ) + +##### IN ##### +GET /hi/there/ HTTP/1.1
+
+GET /hithere/ HTTP/1.1
+Host: localhost:3535
+User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080207 Ubuntu/7.10 (gutsy) Firefox/2.0.0.12
+Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5
+Accept-Language: en-us,en;q=0.5
+Accept-Encoding: gzip,deflate
+Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
+Keep-Alive: 300
+Connection: keep-alive
+Cache-Control: max-age=0
+
+GET foo HTTP/1.1
+hello: foo
+hi: there
+ my
+ friend
+
+##### EXP ##### +GET /hi/there/ HTTP/1.1
+
+GET /hithere/ HTTP/1.1
+Host: localhost:3535
+User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080207 Ubuntu/7.10 (gutsy) Firefox/2.0.0.12
+Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5
+Accept-Language: en-us,en;q=0.5
+Accept-Encoding: gzip,deflate
+Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
+Keep-Alive: 300
+Connection: keep-alive
+Cache-Control: max-age=0
+
+GET foo HTTP/1.1
+hello: foo
+hi: there my friend
+
diff --git a/test/mutualrec.lm b/test/mutualrec.lm new file mode 100644 index 0000000..2eafd28 --- /dev/null +++ b/test/mutualrec.lm @@ -0,0 +1,18 @@ +##### LM ##### + +int f1( i: int ) +{ + return f2( i + 1 ) + 1 +} + +int f2( i: int ) +{ + if i < 10 + return f1( i + 1 ) + 1 + else + return i +} + +print( "f1() = [$f1(0)]\n" ) +##### EXP ##### +f1() = 22 diff --git a/test/namespace1.lm b/test/namespace1.lm new file mode 100644 index 0000000..b31b453 --- /dev/null +++ b/test/namespace1.lm @@ -0,0 +1,24 @@ +##### LM ##### +namespace n1 + + lex + token id / 'a' .. 'z' / + ignore / '\n' | '\t' | ' ' / + end + + def start + [id*] +end + +parse P: n1::start[stdin] +print( P ) +##### IN ##### +a + b + c +d +##### EXP ##### +a + b + c +d diff --git a/test/nestedcomm.lm b/test/nestedcomm.lm new file mode 100644 index 0000000..1b1b6a4 --- /dev/null +++ b/test/nestedcomm.lm @@ -0,0 +1,55 @@ +##### LM ##### +# +# Tokens +# + +# Any single character can be a literal +lex + # Ignore whitespace. + ignore /[ \t\n\r\v]+/ + + # Open and close id + token id /[a-zA-Z_][a-zA-Z0-9_]*/ + + token open_paren /'('/ + { + parse_stop NC: nested_comment[ stdin ] + print( %NC '\n' ) + input.push_ignore( NC ) + } +end + +# +# Token translation +# + +lex + literal `( `) + token nc_data /[^()]+/ +end + +def nc_item + [nc_data] +| [nested_comment] + +def nested_comment + [`( nc_item* `)] + +def nested [id*] + +parse P: nested[ stdin ] + +print( ^P '\n' ) +print_xml( ^P ) +print( '\n' ) +print_xml_ac( ^P ) +print( '\n' ) +print( ^P '\n' ) +##### IN ##### +hello there ( (this is a nested comment /*sdf;asd_++_stuff) ) and this is not +##### EXP ##### +( (this is a nested comment /*sdf;asd_++_stuff) ) +hello there ( (this is a nested comment /*sdf;asd_++_stuff) ) and this is not +<nested><_repeat_id><id>hello</id><id>there</id><id>and</id><id>this</id><id>is</id><id>not</id></_repeat_id></nested> +<nested><_repeat_id><id>hello</id><_ignore_0001> </_ignore_0001><id>there</id><_ignore_0001> 
</_ignore_0001><nested_comment><_literal_0007>(</_literal_0007><_repeat_nc_item><nc_item><nc_data> </nc_data></nc_item><nc_item><nested_comment><_literal_0007>(</_literal_0007><_repeat_nc_item><nc_item><nc_data>this is a nested comment /*sdf;asd_++_stuff</nc_data></nc_item></_repeat_nc_item><_literal_0009>)</_literal_0009></nested_comment></nc_item><nc_item><nc_data> </nc_data></nc_item></_repeat_nc_item><_literal_0009>)</_literal_0009></nested_comment><_ignore_0001> </_ignore_0001><id>and</id><_ignore_0001> </_ignore_0001><id>this</id><_ignore_0001> </_ignore_0001><id>is</id><_ignore_0001> </_ignore_0001><id>not</id></_repeat_id></nested> +hello there ( (this is a nested comment /*sdf;asd_++_stuff) ) and this is not diff --git a/test/order1.lm b/test/order1.lm new file mode 100644 index 0000000..e510bbb --- /dev/null +++ b/test/order1.lm @@ -0,0 +1,115 @@ +##### LM ##### + +lex + token c_single_lit /( 'L'? "'" ( [^'\\\n] | '\\' any )* "'" )/ + token c_double_lit /( 'L'? '"' ( [^"\\\n] | '\\' any )* '"' )/ + + token sym / ';' | ',' | '=' | '(' | ')' | ':' | '&' | '*' | + '[' | ']' | '~' | '+' | '-' | '/' | '<' | '>' | '|' | + '^' | '%' | '!' | '?' | '.' | '#'/ + + # Identifiers + token c_id /( [a-zA-Z_] [a-zA-Z0-9_]* )/ + + # Comments and whitespace. + token comm_c /( '/*' (any | '\n')* :>> '*/' )/ + token comm_cxx /( '//' any* :> '\n' )/ + token ws /( any - 33..126 )+/ +end + +def c_token + [c_single_lit] +| [c_double_lit] +| [sym] +| [c_id] +| [comm_c] +| [comm_cxx] +| [ws] + +def c_token_list + [c_token c_token_list] +| [c_token] + +# Can parse this, use ful for single constructs. 
+def c + [c_token*] + +literal `%% + +lex + literal `{ `} + literal `protocol `client `server `port `by `tcp `udp + token id /[A-Za-z_][A-Za-z_0-9]*/ + token number /[0-9]+/ + + ignore /'/*' any* :>> '*/'/ + ignore /[ \t\r\n]+/ +end + +def tcp_by_port + [`tcp `by `port] + +def udp_by_port + [`udp `by `port] + +def attribute + [`client id] +| [`server id] +| [`port number] +| [`udp id] +| [tcp_by_port] +| [udp_by_port] + +def tcp_protocol + [`tcp `protocol id `{ attribute* `}] + +def udp_protocol + [`udp `protocol id `{ attribute* `}] + +def protocol + [tcp_protocol] +| [udp_protocol] + +def program + [c `%% protocol*] + +alias output accum<c> + +def port + Port: int + Protocol: str + [] + +# Parse the input. +parse P: program[ stdin ] + +Output: output Output = construct output [] + +# Take off the leading C from the input file and send it out. +match P [C: c `%% protocol*] + +send Output [$C] +send Output + "#include <assert.h> + " + eos + +print( Output.tree ) +##### IN ##### +#include "some_header.h" + +%% + +tcp protocol FOO +{ + port 99 + + client c + server s +} + +##### EXP ##### +#include "some_header.h" + +#include <assert.h> + diff --git a/test/order2.lm b/test/order2.lm new file mode 100644 index 0000000..d91dd42 --- /dev/null +++ b/test/order2.lm @@ -0,0 +1,116 @@ +##### LM ##### + +lex + token c_single_lit /( 'L'? "'" ( [^'\\\n] | '\\' any )* "'" )/ + token c_double_lit /( 'L'? '"' ( [^"\\\n] | '\\' any )* '"' )/ + + token sym / ';' | ',' | '=' | '(' | ')' | ':' | '&' | '*' | + '[' | ']' | '~' | '+' | '-' | '/' | '<' | '>' | '|' | + '^' | '!' | '?' | '.' | '#'/ + + # Identifiers + token c_id /( [a-zA-Z_] [a-zA-Z0-9_]* )/ + + # Comments and whitespace. 
+ token comm_c /( '/*' (any | '\n')* :>> '*/' )/ + token comm_cxx /( '//' any* :> '\n' )/ + token ws /( any - 33..126 )+/ +end + +def c_token + [c_single_lit] +| [c_double_lit] +| [sym] +| [c_id] +| [comm_c] +| [comm_cxx] +| [ws] + +def c_token_list + [c_token c_token_list] +| [c_token] + +# Can parse this, use ful for single constructs. +#def c +# [c_token*] + +def c + [c_token_list] + +lex + literal `%% + literal `{ `} + literal `protocol `client `server `port `by `tcp `udp + token id /[A-Za-z_][A-Za-z_0-9]*/ + token number /[0-9]+/ + + ignore /'/*' any* :>> '*/'/ + ignore /[ \t\r\n]+/ +end + +def tcp_by_port + [`tcp `by `port] + +def udp_by_port + [`udp `by `port] + +def attribute + [`client id] +| [`server id] +| [`port number] +| [`udp id] +| [tcp_by_port] +| [udp_by_port] + +def tcp_protocol + [`tcp `protocol id `{ attribute* `}] + +def udp_protocol + [`udp `protocol id `{ attribute* `}] + +def protocol + [tcp_protocol] +| [udp_protocol] + +def program + [c `%% protocol*] + +alias output parser<c> + +def port + Port: int + Protocol: str + [] + +# Parse the input. +parse P: program[ stdin ] + +Output: output Output = construct output [] + +# Take off the leading C from the input file and send it out. 
+match P [C: c '%%' protocol*] + +send Output [ + $C + "#include <assert.h> + " + ] eos +print( Output.tree ) +##### IN ##### +#include "some_header.h" + +%% + +tcp protocol FOO +{ + port 99 + + client c + server s +} + +##### EXP ##### +#include "some_header.h" + +#include <assert.h> + diff --git a/test/parse1.lm b/test/parse1.lm new file mode 100644 index 0000000..f8ecab8 --- /dev/null +++ b/test/parse1.lm @@ -0,0 +1,14 @@ +##### LM ##### +lex + token id / [a-z] / + ignore / [\n\t ] / +end + +def start [id*] + +parse S: start[stdin] +print( S ) +##### IN ##### +ab cd ef +##### EXP ##### +ab cd ef diff --git a/test/prints.lm b/test/prints.lm new file mode 100644 index 0000000..ad08220 --- /dev/null +++ b/test/prints.lm @@ -0,0 +1,17 @@ +##### LM ##### +lex + token word /[a-z]+/ + ignore /[\t\n ]+/ +end + +def start + [word*] + +parse Start: start[stdin] + +prints( stderr 'fd stderr: ' ^Start '\n' ) +prints( stdout 'fd stdout: ' ^Start '\n' ) +##### IN ##### +a b c +##### EXP ##### +fd stdout: a b c diff --git a/test/pull1.lm b/test/pull1.lm new file mode 100644 index 0000000..f86bd6c --- /dev/null +++ b/test/pull1.lm @@ -0,0 +1,7 @@ +##### LM ##### +String: str = stdin.pull( 10 ) +print( String '\n' ) +##### IN ##### +this is input for a non-parse pull +##### EXP ##### +this is in diff --git a/test/pull2.lm b/test/pull2.lm new file mode 100644 index 0000000..7b50092 --- /dev/null +++ b/test/pull2.lm @@ -0,0 +1,8 @@ +##### LM ##### +Stream: stream = open( 'working/pull2.in' ('r') ) +String: str = Stream.pull( 10 ) +print( String '\n' ) +##### IN ##### +this is input for a non-parse pull +##### EXP ##### +this is in diff --git a/test/ragelambig1.lm b/test/ragelambig1.lm new file mode 100644 index 0000000..845a07b --- /dev/null +++ b/test/ragelambig1.lm @@ -0,0 +1,72 @@ +##### LM ##### +lex + ignore /[\t\n ]+/ + literal `^ `| `- `, `: `! `? `. 
+ literal `( `) `{ `} `* `& `+ + + literal `-- `:> `:>> `<: `-> `** + + token word /[a-zA-Z_][a-zA-Z0-9_]*/ + token uint /[0-9]+/ +end + + +def start + [expression] + { + print_xml( lhs ) + } + +def expression + [expression `| term] +| [expression `& term] +| [expression `- term] +| [expression `-- term] +| [term] + +def term + [term factor_with_rep] + { + if match lhs [term `- uint] + reject + } +| [term `. factor_with_rep] +| [term `:> factor_with_rep] +| [term `:>> factor_with_rep] +| [term `<: factor_with_rep] +| [factor_with_rep] + +def factor_with_rep + [factor_with_rep `*] +| [factor_with_rep `**] +| [factor_with_rep `?] +| [factor_with_rep `+] +| [factor_with_rep `{ factor_rep_num `}] +| [factor_with_rep `{ `, factor_rep_num `}] +| [factor_with_rep `{ factor_rep_num `, `}] +| [factor_with_rep `{ factor_rep_num `, factor_rep_num `}] +| [factor_with_neg] + +def factor_rep_num [uint] + +def factor_with_neg + [`! factor_with_neg] +| [`^ factor_with_neg] +| [factor] + +def factor + [alphabet_num] +| [word] +| [`( expression `)] + +def alphabet_num + [uint] +| [`- uint] + +parse start[ stdin ] + +print( '\n' ) +##### IN ##### +1 - 1 +##### EXP ##### +<start><expression><expression><term><factor_with_rep><factor_with_neg><factor><alphabet_num><uint>1</uint></alphabet_num></factor></factor_with_neg></factor_with_rep></term></expression><_literal_0007>-</_literal_0007><term><factor_with_rep><factor_with_neg><factor><alphabet_num><uint>1</uint></alphabet_num></factor></factor_with_neg></factor_with_rep></term></expression></start> diff --git a/test/ragelambig2.lm b/test/ragelambig2.lm new file mode 100644 index 0000000..39602bc --- /dev/null +++ b/test/ragelambig2.lm @@ -0,0 +1,72 @@ +##### LM ##### +lex + ignore /[\t\n ]+/ + literal `^ `| `- `, `: `! `? `. 
+ literal `( `) `{ `} `* `& `+ + + literal `-- `:> `:>> `<: `-> `** + + token word /[a-zA-Z_][a-zA-Z0-9_]*/ + token uint /[0-9]+/ +end + + +def start + [expression] + { + print_xml( lhs ) + } + +def expression + [expression `| term] +| [expression `& term] +| [expression `- term] +| [expression `-- term] +| [term] + +def term + [factor_with_rep more_term] + +# Can resolve the ambiguity by making more_term shortest match. +def more_term + [] +| [factor_with_rep more_term] +| [`. factor_with_rep more_term] +| [`:> factor_with_rep more_term] +| [`:>> factor_with_rep more_term] +| [`<: factor_with_rep more_term] + +def factor_with_rep + [factor_with_rep `*] +| [factor_with_rep `**] +| [factor_with_rep `?] +| [factor_with_rep `+] +| [factor_with_rep `{ factor_rep_num `}] +| [factor_with_rep `{ `, factor_rep_num `}] +| [factor_with_rep `{ factor_rep_num `, `}] +| [factor_with_rep `{ factor_rep_num `, factor_rep_num `}] +| [factor_with_neg] + +def factor_rep_num + [uint] + +def factor_with_neg + [`! factor_with_neg] +| [`^ factor_with_neg] +| [factor] + +def factor + [alphabet_num] +| [word] +| [`( expression `)] + +def alphabet_num + [uint] +| [`- uint] + +parse start[ stdin ] +print( '\n' ) +##### IN ##### +1 - 1 +##### EXP ##### +<start><expression><expression><term><factor_with_rep><factor_with_neg><factor><alphabet_num><uint>1</uint></alphabet_num></factor></factor_with_neg></factor_with_rep><more_term></more_term></term></expression><_literal_0007>-</_literal_0007><term><factor_with_rep><factor_with_neg><factor><alphabet_num><uint>1</uint></alphabet_num></factor></factor_with_neg></factor_with_rep><more_term></more_term></term></expression></start> diff --git a/test/ragelambig3.lm b/test/ragelambig3.lm new file mode 100644 index 0000000..74b7254 --- /dev/null +++ b/test/ragelambig3.lm @@ -0,0 +1,72 @@ +##### LM ##### +lex + ignore /[\t\n ]+/ + literal `^ `| `- `, `: `! `? `. 
+ literal `( `) `{ `} `* `& `+ + + literal `-- `:> `:>> `<: `-> `** + + token word /[a-zA-Z_][a-zA-Z0-9_]*/ + token uint /[0-9]+/ +end + + +def start + [expression] + { + print_xml( lhs ) + } + +def expression + [expression `| term_short] +| [expression `& term_short] +| [expression `- term_short] +| [expression `-- term_short] +| [term_short] + +# Works, but is confusing. +def term_short + reducefirst + [term] + +def term + [term factor_with_rep] +| [term `. factor_with_rep] +| [term `:> factor_with_rep] +| [term `:>> factor_with_rep] +| [term `<: factor_with_rep] +| [factor_with_rep] + +def factor_with_rep + [factor_with_rep `*] +| [factor_with_rep `**] +| [factor_with_rep `?] +| [factor_with_rep `+] +| [factor_with_rep `{ factor_rep_num `}] +| [factor_with_rep `{ `, factor_rep_num `}] +| [factor_with_rep `{ factor_rep_num `, `}] +| [factor_with_rep `{ factor_rep_num `, factor_rep_num `}] +| [factor_with_neg] + +def factor_rep_num [uint] + +def factor_with_neg + [`! factor_with_neg] +| [`^ factor_with_neg] +| [factor] + +def factor + [alphabet_num] +| [word] +| [`( expression `)] + +def alphabet_num + [uint] +| [`- uint] + +parse start[ stdin ] +print( '\n' ) +##### IN ##### +1 - 1 +##### EXP ##### +<start><expression><expression><term_short><term><factor_with_rep><factor_with_neg><factor><alphabet_num><uint>1</uint></alphabet_num></factor></factor_with_neg></factor_with_rep></term></term_short></expression><_literal_0007>-</_literal_0007><term_short><term><factor_with_rep><factor_with_neg><factor><alphabet_num><uint>1</uint></alphabet_num></factor></factor_with_neg></factor_with_rep></term></term_short></expression></start> diff --git a/test/ragelambig4.lm b/test/ragelambig4.lm new file mode 100644 index 0000000..e841b80 --- /dev/null +++ b/test/ragelambig4.lm @@ -0,0 +1,76 @@ +##### LM ##### +lex + ignore /[\t\n ]+/ + literal `^ `| `- `, `: `! `? `. 
+ literal `( `) `{ `} `* `& `+ + + literal `-- `:> `:>> `<: `-> `** + + token word /[a-zA-Z_][a-zA-Z0-9_]*/ + token uint /[0-9]+/ +end + + +def start + [expression] + { + print_xml( lhs ) + } + +def expression [term expression_op*] + +def expression_op + [`| term] +| [`& term] +| [`- term] +| [`-- term] + +def term [factor_rep term_op_list_short] + +# This list is done manually to get shortest match. +def term_op_list_short + [] +| [term_op term_op_list_short] + +def term_op + [factor_rep] +| [`. factor_rep] +| [`:> factor_rep] +| [`:>> factor_rep] +| [`<: factor_rep] + +def factor_rep + [factor_neg factor_rep_op*] + +def factor_rep_op + [`*] +| [`**] +| [`?] +| [`+] +| [`{ factor_rep_num `}] +| [`{ `, factor_rep_num `}] +| [`{ factor_rep_num `, `}] +| [`{ factor_rep_num `, factor_rep_num `}] + +def factor_rep_num [uint] + +def factor_neg + [`! factor_neg] +| [`^ factor_neg] +| [factor] + +def factor + [alphabet_num] +| [word] +| [`( expression `)] + +def alphabet_num + [uint] +| [`- uint] + +parse start[ stdin ] +print( '\n' ) +##### IN ##### +1 - 1 +##### EXP ##### +<start><expression><term><factor_rep><factor_neg><factor><alphabet_num><uint>1</uint></alphabet_num></factor></factor_neg><_repeat_factor_rep_op></_repeat_factor_rep_op></factor_rep><term_op_list_short></term_op_list_short></term><_repeat_expression_op><expression_op><_literal_0007>-</_literal_0007><term><factor_rep><factor_neg><factor><alphabet_num><uint>1</uint></alphabet_num></factor></factor_neg><_repeat_factor_rep_op></_repeat_factor_rep_op></factor_rep><term_op_list_short></term_op_list_short></term></expression_op></_repeat_expression_op></expression></start> diff --git a/test/rediv.lm b/test/rediv.lm new file mode 100644 index 0000000..c5ac955 --- /dev/null +++ b/test/rediv.lm @@ -0,0 +1,99 @@ +##### LM ##### +# Or-literal scanner +lex + token orlit_dash /'-' / + token orlit_close /']'/ + + rl orlit_specials /[\-\]]/ + token orlit_chr /^orlit_specials | '\\' any/ +end + +def orlit_item + 
[orlit_chr] +| [orlit_chr orlit_dash orlit_chr] + +def orlit + [orlit_item*] + +# Regex scanner +lex + token orlit_open /'['/ + token orlit_neg_open /'[^'/ + token regex_dot /'.'/ + token regex_star /'*'/ + token regex_close /'/'/ + + rl regex_specials /[\[\.\*\/\\]/ + token regex_chr /(^regex_specials)+ | '\\' any/ +end + +def regex_rep + [regex_star] +| [] + +def regex_base + [regex_chr] +| [regex_dot] +| [orlit_open orlit orlit_close] +| [orlit_neg_open orlit orlit_close] + +def regex_item + [regex_base regex_rep] + +def regex_body + [regex_item*] + +rl s_string /"'" ([^'\\\n] | '\\' any )* "'"/ +rl d_string /'"' ([^"\\\n] | '\\' any )* '"'/ + +# Root scanner +lex + token ident /[a-zA-Z_]+/ + token number /[0-9]+/ + token string /s_string | d_string/ + + literal `+ `- `* `; `/ + token slash /'/'/ + token semi /';'/ + + ignore wp /[ \t\n]+/ +end + +def factor + [ident] +| [number] +| [string] +| [`/ regex_body regex_close] + +def term + [term `* factor] +| [term `/ factor] +| [factor] + +def expr + [expr `+ term] +| [expr `- term] +| [term] + +def statement + [expr `;] + +def start + [statement*] + +parse S: start[ stdin ] + +for I:orlit_item in S { + if match I [orlit_chr] { + print( I '\n' ) + } +} +print_xml( S ) +print( '\n' ) +##### IN ##### +2 / /[^gu-zy].*o[\d-xa]*/; +##### EXP ##### +g +y +a 
+<start><_repeat_statement><statement><expr><term><term><factor><number>2</number></factor></term><_literal_0021>/</_literal_0021><factor><_literal_0021>/</_literal_0021><regex_body><_repeat_regex_item><regex_item><regex_base><orlit_neg_open>[^</orlit_neg_open><orlit><_repeat_orlit_item><orlit_item><orlit_chr>g</orlit_chr></orlit_item><orlit_item><orlit_chr>u</orlit_chr><orlit_dash>-</orlit_dash><orlit_chr>z</orlit_chr></orlit_item><orlit_item><orlit_chr>y</orlit_chr></orlit_item></_repeat_orlit_item></orlit><orlit_close>]</orlit_close></regex_base><regex_rep></regex_rep></regex_item><regex_item><regex_base><regex_dot>.</regex_dot></regex_base><regex_rep><regex_star>*</regex_star></regex_rep></regex_item><regex_item><regex_base><regex_chr>o</regex_chr></regex_base><regex_rep></regex_rep></regex_item><regex_item><regex_base><orlit_open>[</orlit_open><orlit><_repeat_orlit_item><orlit_item><orlit_chr>\d</orlit_chr><orlit_dash>-</orlit_dash><orlit_chr>x</orlit_chr></orlit_item><orlit_item><orlit_chr>a</orlit_chr></orlit_item></_repeat_orlit_item></orlit><orlit_close>]</orlit_close></regex_base><regex_rep><regex_star>*</regex_star></regex_rep></regex_item></_repeat_regex_item></regex_body><regex_close>/</regex_close></factor></term></expr><_literal_001f>;</_literal_001f></statement></_repeat_statement></start> diff --git a/test/reor1.lm b/test/reor1.lm new file mode 100644 index 0000000..816b2f1 --- /dev/null +++ b/test/reor1.lm @@ -0,0 +1,27 @@ +##### LM ##### +lex + token id / [abcdef] / + token number / [0-9] / + ignore / [\n\t ] / +end + +def item [id] | [number] + +def start [item*] + +parse P: start[stdin] +print( P ) +##### IN ##### +ab cd ef +##### EXP ##### +ab cd ef +##### IN ##### +ag +##### EXP ##### +NIL--noeol +##### IN ##### +93 +ab 22 +##### EXP ##### +93 +ab 22 diff --git a/test/reor2.lm b/test/reor2.lm new file mode 100644 index 0000000..51f0dd3 --- /dev/null +++ b/test/reor2.lm @@ -0,0 +1,24 @@ +##### LM ##### +context undo + + lex + ignore /[ ]+/ + 
literal `; + token NL /'\n'/ + token id /[a-zA-Z_]+/ + end + + def item + [id] + + def start + [item* `; NL] +end + +cons Undo: undo[] +parse Input: undo::start( Undo )[ stdin ] +print( Input ) +##### IN ##### +a b; +##### EXP ##### +a b; diff --git a/test/reparse.lm b/test/reparse.lm new file mode 100644 index 0000000..907ca6a --- /dev/null +++ b/test/reparse.lm @@ -0,0 +1,26 @@ +##### LM ##### +lex + ignore /space+/ + literal `* `( `) + token id /[a-zA-Z_]+/ +end + +def item + [id] +| [`( item* `)] + +def start + [item*] + +parse Input: item*[ stdin ] + +S: start = cons start[ Input ] + +parse Again: start[ %Input ] + +print( Again ) + +##### IN ##### +a b c ( chocolate fudge ) d e +##### EXP ##### +a b c ( chocolate fudge ) d e diff --git a/test/repeat1.lm b/test/repeat1.lm new file mode 100644 index 0000000..315a63e --- /dev/null +++ b/test/repeat1.lm @@ -0,0 +1,42 @@ +##### LM ##### +lex + ignore /space+/ + literal `* `( `) + token id /[a-zA-Z_]+/ +end + +def item + [id] +| [`( item* `)] + +def start + [item*] + +parse Input: start[ stdin ] + +match Input [ItemList: item*] + +for I: item* in repeat( ItemList ) + print( ^I '\n' ) + +for I: item* in rev_repeat( ItemList ) + print( ^I '\n' ) +##### IN ##### +a b ( c d ) e ( f g ) h i +##### EXP ##### +a b ( c d ) e ( f g ) h i +b ( c d ) e ( f g ) h i +( c d ) e ( f g ) h i +e ( f g ) h i +( f g ) h i +h i +i + + +i +h i +( f g ) h i +e ( f g ) h i +( c d ) e ( f g ) h i +b ( c d ) e ( f g ) h i +a b ( c d ) e ( f g ) h i diff --git a/test/repeat2.lm b/test/repeat2.lm new file mode 100644 index 0000000..e001d8f --- /dev/null +++ b/test/repeat2.lm @@ -0,0 +1,7408 @@ +##### LM ##### +# +# Copyright 2012 Adrian Thurston <thurston@complang.org> +# + +# This file is part of Ragel. 
+# +# Ragel is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# Ragel is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Ragel; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +lex + token word /( [^. \t\n]+ | '.' )/ + token lws /[ \t]+/ + token nl / '\n'/ + + token cmd_verb1 /'.verb|'/ + token cmd_verb2 /'.verb/'/ + token cmd_label /'.label{'/ + token cmd_ref /'.ref{'/ + token cmd_em /'.em{'/ + token cmd_tt /'.tt{'/ + + token cmd_title /'.title' lws/ + token cmd_sub_title /'.subtitle' lws/ + token cmd_author /'.author' lws/ + + token cmd_chapter /'.chapter' lws/ + token cmd_section /'.section' lws/ + token cmd_sub_section /'.subsection' lws/ + token cmd_sub_sub_section /'.subsubsection' lws/ + + token cmd_graphic /'.graphic' lws/ + token cmd_comment /'.comment' lws? '\n'/ + token cmd_verbatim /'.verbatim' lws? '\n'/ + token cmd_code /'.code' lws? '\n'/ + + token cmd_itemize /'.itemize' lws? '\n'/ + token end_itemize /'.end' lws 'itemize' lws? '\n'/ + token cmd_item /'.item' lws/ + + token cmd_center /'.center' lws? '\n'/ + token end_center /'.end' lws 'center' lws? '\n'/ + + token cmd_tabular /'.tabular' lws? '\n'/ + token cmd_row /'.row' lws/ + token end_tabular /'.end' lws 'tabular' lws? '\n'/ + + token cmd_multicols /'.multicols' lws? '\n'/ + token cmd_columnbreak /'.columnbreak' lws? '\n'/ + token end_multicols /'.end' lws 'multicols' lws? 
'\n'/ + + token cmd_figure / '.figure' lws?/ + token cmd_caption / '.caption' lws/ + token end_figure / '.end' lws 'figure' lws? '\n'/ + + token cmd_list /'.list' lws? '\n'/ + token end_list /'.end' lws 'list' lws? '\n'/ + token cmd_li /'.li' lws/ + + token cmd_license /'.license' lws? '\n'/ +end + +lex + token bar_data /[^|]*/ + token end_bar /'|'/ +end + +lex + token slash_data /[^/]*/ + token end_slash /'/'/ +end + +lex + token curly_data /[^}]*/ + token end_curly /'}'/ +end + +def cmd_il + [cmd_verb1 bar_data end_bar] +| [cmd_verb2 slash_data end_slash] +| [cmd_label curly_data end_curly] +| [cmd_ref curly_data end_curly] +| [cmd_em curly_data end_curly] +| [cmd_tt curly_data end_curly] + +def text + [word] +| [lws] +| [cmd_il] + +lex + token end_verbatim /lws? '.' lws? 'end' lws 'verbatim' lws? '\n'/ + token verbatim_line /[^\n]* '\n'/ +end + +def verbatim + [cmd_verbatim verbatim_line* end_verbatim] + +lex + token end_code /lws? '.' lws? 'end' lws 'code' lws? '\n'/ + token code_line /[^\n]* '\n'/ +end + +def code + [cmd_code code_line* end_code] + +lex + token end_comment /lws? '.' lws? 'end' lws 'comment' lws? '\n'/ + token comment_line /[^\n]* '\n'/ +end + +def comment + [cmd_comment comment_line* end_comment] + +def figure + [cmd_figure text nl line* caption? end_figure] + +def li + [cmd_li text* nl] + +def _list + [cmd_list li* end_list] + +def scale + [lws word word*] + +def graphic + [cmd_graphic word scale? 
nl] + +def itemize + [cmd_itemize line* item* end_itemize] + +def center + [cmd_center line* end_center] + +def row + [cmd_row text* nl] + +def tabular + [cmd_tabular row* end_tabular] + +def multicols_line + [cmd_columnbreak] +| [line] + +def multicols + [cmd_multicols multicols_line* end_multicols] + +def item + [cmd_item line*] + +def caption + [cmd_caption line*] + +def line + [text] +| [nl] +| [comment] +| [verbatim] +| [code] +| [graphic] +| [itemize] +| [center] +| [tabular] +| [multicols] +| [figure] +| [_list] + +def sub_sub_section + [cmd_sub_sub_section text* nl line*] + +def sub_section + [cmd_sub_section text* nl line* sub_sub_section*] + +def section + [cmd_section text* nl line* sub_section*] + +def chapter + [cmd_chapter text* nl line* section*] + +def title + [cmd_title text* nl] + +def subtitle + [cmd_sub_title text* nl] + +def author + [cmd_author text* nl] + +# +# Paragraphs. +# + +def pline + [text text* nl] + +def paragraph + [pline pline*] + +def pextra + [nl paragraph] + +def block + [paragraph pextra*] + +def license + [cmd_license nl* block nl*] + +# +# Preamble. +# + +def preamble_item + [text] +| [nl] +| [title] +| [subtitle] +| [author] + +def preamble + [preamble_item* license] + +def start + [preamble chapter*] + +parse Start: start[ stdin ] +if ( ! 
Start ) { + print( error '\n' ) + exit( 1 ) +} + +int printPlData( Pld: cmd_il ) +{ + if match Pld [ cmd_verb1 V: bar_data end_bar] { + print( '\\verb|' ) + print( V ) + print( '|' ) + } + else if match Pld [cmd_verb2 V: slash_data end_slash] { + print( '\\verb/' ) + print( V ) + print( '/' ) + } + else if match Pld [cmd_label L: curly_data end_curly] { + print( '\\label{' ) + print( L ) + print( '}' ) + } + else if match Pld [cmd_ref L: curly_data end_curly] { + print( '\\ref{' ) + print( L ) + print( '}' ) + } + else if match Pld [cmd_em L: curly_data end_curly] { + print( '{\\em ' ) + print( L ) + print( '}' ) + } + else if match Pld [cmd_tt L: curly_data end_curly] { + print( '{\\tt ' ) + print( L ) + print( '}' ) + } + else { + print( Pld ) + } +} + +int printText( Lines: text* ) +{ + for L: text in repeat(Lines) { + if match L [PlData: cmd_il] { + printPlData( PlData ) + } + else { + print( L ) + } + } +} + +int printLines( Lines: line* ) +{ + for L: line in repeat(Lines) { + if match L [word] { + print( L ) + } + if match L [lws] { + print( L ) + } + if match L [nl] { + print( L ) + } + if match L [PlData: cmd_il] { + printPlData( PlData ) + } + if match L [cmd_verbatim Lines: verbatim_line* end_verbatim] { + print( '\\begin{verbatim}\n' ) + print( Lines ) + print( '\\end{verbatim}\n' ) + print( '\\verbspace\n' ) + } + if match L [cmd_code Lines: code_line* end_code] { + print( '\\begin{inline_code}\n' ) + print( '\\begin{verbatim}\n' ) + print( Lines ) + print( '\\end{verbatim}\n' ) + print( '\\end{inline_code}\n' ) + print( '\\verbspace\n' ) + } + if match L [cmd_graphic Name: word Scale: scale? 
nl] { + print( '\\graphspace\n' ) + print( '\\begin{center}\n' ) + print( '\\includegraphics' ) + if match Scale [lws Spd: word Spd2: word*] + print( '[scale=' Spd Spd2 ']' ) + else + print( '[scale=0.55]' ) + print( '{' Name '}\n' ) + print( '\\end{center}\n' ) + print( '\\graphspace\n' ) + } + if match L [cmd_itemize Lines: line* Items: item* end_itemize] { + print( '\\begin{itemize}\n' ) + printLines( Lines ) + for Item: item in repeat(Items) { + match Item [cmd_item Lines: line*] + print( '\\item ' ) + printLines( Lines ) + } + print( '\\end{itemize}\n' ) + } + if match L [cmd_figure DirData: text nl Lines: line* Caption: caption? end_figure] { + print( '\\begin{figure}\n' ) + print( '\\small\n' ) + printLines( Lines ) + if match Caption [cmd_caption CL: line*] { + print( '\\caption{' ) + printLines( CL ) + print( '}\n' ) + } + print( '\\label{' DirData '}\n' ) + print( '\\end{figure}\n' ) + } + if match L [cmd_list LiList: li* end_list] { + for Li: li* in LiList { + if match Li [cmd_li Lines: text* nl Rest: li*] { + print( '\\noindent\\\hspace*{24pt}' ) + printText( Lines ) + if match Rest [ li li* ] + print( '\\\\' ) + print( '\n' ) + } + } + print( '\\vspace{12pt}\n' ) + } + if match L [cmd_center Lines: line* end_center] { + print( '\\begin{center}\n' ) + printLines( Lines ) + print( '\\end{center}\n' ) + } + if match L [cmd_tabular Rows: row* end_tabular] { + print( '\\begin{tabular}{|c|c|c|}\n' ) + print( '\\hline\n' ) + for Row: row in repeat(Rows) { + if match Row [cmd_row Lines: text* nl ] { + printText( Lines ) + print( '\\\\' '\n' ) + print( '\\hline\n' ) + } + } + print( '\\end{tabular}\n' ) + } + if match L [cmd_multicols Lines: multicols_line* end_multicols] { + print( '\\begin{multicols}{2}\n' ) + for McLine: multicols_line in repeat( Lines ) { + if match McLine [Line: line] + printLines( cons line* [Line] ) + else if match McLine [cmd_columnbreak] { + print( '\\columnbreak\n' ) + } + } + print( '\\end{multicols}\n' ) + } + } +} + +match Start + 
[Preamble: preamble Chapters: chapter*] + +Title: title = title in Preamble +match Title [cmd_title TitleData: text* nl] + +SubTitle: subtitle = subtitle in Preamble +match SubTitle [cmd_sub_title SubTitleData: text* nl] + +Author: author = author in Preamble +match Author [cmd_author AuthorData: text* nl] + +License: license = license in Preamble + +print( + ~\documentclass[letterpaper,11pt,oneside]{book} + ~\usepackage{graphicx} + ~\usepackage{comment} + ~\usepackage{multicol} + ~\usepackage[ + ~ colorlinks=true, + ~ linkcolor=black, + ~ citecolor=green, + ~ filecolor=black, + ~ urlcolor=black]{hyperref} + ~ + ~\topmargin -0.20in + ~\oddsidemargin 0in + ~\textwidth 6.5in + ~\textheight 9in + ~ + ~\setlength{\parskip}{0pt} + ~\setlength{\topsep}{0pt} + ~\setlength{\partopsep}{0pt} + ~\setlength{\itemsep}{0pt} + ~ + ~\input{version} + ~ + ~\newcommand{\verbspace}{\vspace{10pt}} + ~\newcommand{\graphspace}{\vspace{10pt}} + ~ + ~\renewcommand\floatpagefraction{.99} + ~\renewcommand\topfraction{.99} + ~\renewcommand\bottomfraction{.99} + ~\renewcommand\textfraction{.01} + ~\setcounter{totalnumber}{50} + ~\setcounter{topnumber}{50} + ~\setcounter{bottomnumber}{50} + ~ + ~\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{} + ~ + ~\begin{document} + ~ + ~\thispagestyle{empty} + ~\begin{center} + ~\vspace*{3in} +) + +print( '{\\huge ' TitleData '}\\\\\n' ) + +print( '\\vspace*{12pt}\n' ) + +print( '{\\Large ' SubTitleData '}\\\\\n' ) + +print( + ~\vspace{1in} + ~by\\ + ~\vspace{12pt} +) + +print( '{\\large ' AuthorData '}\\\\\n' ) + +print( + ~\end{center} + ~\clearpage + ~ + ~\pagenumbering{roman} + ~ + ~\chapter*{License} +) + +print( + ~Ragel version \version, \pubdate\\ + ~Copyright \copyright\ 2003-2012 Adrian D. 
Thurston + ~\vspace{6mm} + ~ +) + +i: int = 0 +for P: paragraph in License { + if ( i != 0 ) { + print( + ~ + ~\vspace{5pt} + ~ + ) + } + print( "{\\bf\\it\\noindent " ) + print( P ) + print( "}\n" ) + i = i + 1 +} + +print( + ~ + ~\clearpage + ~\tableofcontents + ~\clearpage + ~ + ~\pagenumbering{arabic} +) + + +for Chapter: chapter in repeat(Chapters) { + match Chapter + [cmd_chapter DirData: text* nl Lines: line* SectionList: section*] + + print( '\\chapter{' DirData '}\n' ) + printLines( Lines ) + + for Section: section in repeat(SectionList) { + match Section + [cmd_section DirData: text* nl Lines: line* SubSectionList: sub_section*] + + print( '\\section{' DirData '}\n' ) + printLines( Lines ) + for SubSection: sub_section in repeat(SubSectionList) { + match SubSection + [cmd_sub_section DirData: text* nl Lines: line* + SubSubSectionList: sub_sub_section*] + + print( '\\subsection{' DirData '}\n' ) + printLines( Lines ) + + for SubSubSection: sub_sub_section in repeat(SubSubSectionList) { + match SubSubSection + [cmd_sub_sub_section DirData: text* nl Lines: line*] + + print( '\\subsubsection{' DirData '}\n' ) + printLines( Lines ) + } + } + } +} + +print( + ~ + ~\end{document} +) +##### IN ##### +.title Ragel State Machine Compiler + +.subtitle User Guide + +.author Adrian Thurston + +.license + +This document is part of Ragel, and as such, this document is +released under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2 of the License, or (at your option) +any later version. + +Ragel is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. 
+ +You should have received a copy of the GNU General Public +License along with Ragel; if not, write to the Free Software Foundation, Inc., +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +.chapter Introduction + +.section Abstract + +Regular expressions are used heavily in practice for the purpose of specifying +parsers. They are normally used as black boxes linked together with program +logic. User actions are executed in between invocations of the regular +expression engine. Adding actions before a pattern terminates requires patterns +to be broken and pasted back together with program logic. The more user actions +are needed, the less the advantages of regular expressions are seen. + +Ragel is a software development tool that allows user actions to be +embedded into the transitions of a regular expression's corresponding state +machine, eliminating the need to switch from the regular expression engine and +user code execution environment and back again. As a result, expressions can be +maximally continuous. One is free to specify an entire parser using a single +regular expression. The single-expression model affords concise and elegant +descriptions of languages and the generation of very simple, fast and robust +code. Ragel compiles executable finite state machines from a high level regular language +notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby. + +In addition to building state machines from regular expressions, Ragel allows +the programmer to directly specify state machines with state charts. These two +notations may be freely combined. There are also facilities for controlling +nondeterminism in the resulting machines and building scanners using patterns +that themselves have embedded actions. Ragel can produce code that is small and +runs very fast. Ragel can handle integer-sized alphabets and can compile very +large state machines. 
+ +.section Motivation + +When a programmer is faced with the task of producing a parser for a +context-free language there are many tools to choose from. It is quite common +to generate useful and efficient parsers for programming languages from a +formal grammar. It is also quite common for programmers to avoid such tools +when making parsers for simple computer languages, such as file formats and +communication protocols. Such languages are often regular and tools for +processing the context-free languages are viewed as too heavyweight for the +purpose of parsing regular languages. The extra run-time effort required for +supporting the recursive nature of context-free languages is wasted. + +When we turn to the regular expression-based parsing tools, such as Lex, Re2C, +and scripting languages such as Sed, Awk and Perl we find that they are split +into two levels: a regular expression matching engine and some kind of program +logic for linking patterns together. For example, a Lex program is composed of +sets of regular expressions. The implied program logic repeatedly attempts to +match a pattern in the current set. When a match is found the associated user +code executed. It requires the user to consider a language as a sequence of +independent tokens. Scripting languages and regular expression libraries allow +one to link patterns together using arbitrary program code. This is very +flexible and powerful, however we can be more concise and clear if we avoid +gluing together regular expressions with if statements and while loops. + +This model of execution, where the runtime alternates between regular +expression matching and user code exectution places restrictions on when +action code may be executed. Since action code can only be associated with +complete patterns, any action code that must be executed before an entire +pattern is matched requires that the pattern be broken into smaller units. 
+Instead of being forced to disrupt the regular expression syntax and write +smaller expressions, it is desirable to retain a single expression and embed +code for performing actions directly into the transitions that move over the +characters. After all, capable programmers are astutely aware of the machinery +underlying their programs, so why not provide them with access to that +machinery? To achieve this we require an action execution model for associating +code with the sub-expressions of a regular expression in a way that does not +disrupt its syntax. + +The primary goal of Ragel is to provide developers with an ability to embed +actions into the transitions and states of a regular expression's state machine +in support of the definition of entire parsers or large sections of parsers +using a single regular expression. From the regular expression we gain a clear +and concise statement of our language. From the state machine we obtain a very +fast and robust executable that lends itself to many kinds of analysis and +visualization. + +.section Overview + +Ragel is a language for specifying state machines. The Ragel program is a +compiler that assembles a state machine definition to executable code. Ragel +is based on the principle that any regular language can be converted to a +deterministic finite state automaton. Since every regular language has a state +machine representation and vice versa, the terms regular language and state +machine (or just machine) will be used interchangeably in this document. + +Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is +designed to be generic and is not bound to any particular input or processing +method. A Ragel machine expects to have data passed to it in buffer blocks. +When there is no more input, the machine can be queried for acceptance. In +this way, a Ragel machine can be used to simply recognize a regular language +like a regular expression library. 
By embedding code into the regular language, +a Ragel machine can also be used to parse input. + +The Ragel language has many operators for constructing and manipulating +machines. Machines are built up from smaller machines, to bigger ones, to the +final machine representing the language that needs to be recognized or parsed. + +The core state machine construction operators are those found in most theory +of computation textbooks. They date back to the 1950s and are widely studied. +They are based on set operations and permit one to think of languages as a set +of strings. They are Union, Intersection, Difference, Concatenation and Kleene +Star. Put together, these operators make up what most people know as regular +expressions. Ragel also provides a scanner construction operator +and provides operators for explicitly constructing machines +using a state chart method. In the state chart method, one joins machines +together without any implied transitions and then explicitly specifies where +epsilon transitions should be drawn. + +The state machine manipulation operators are specific to Ragel. They allow the +programmer to access the states and transitions of regular language's +corresponding machine. There are two uses of the manipulation operators. The +first and primary use is to embed code into transitions and states, allowing +the programmer to specify the actions of the state machine. + +Ragel attempts to make the action embedding facility as intuitive as possible. +To do so, a number of issues need to be addressed. For example, when making a +nondeterministic specification into a DFA using machines that have embedded +actions, new transitions are often made that have the combined actions of +several source transitions. Ragel ensures that multiple actions associated with +a single transition are ordered consistently with respect to the order of +reference and the natural ordering implied by the construction operators. 
+ +The second use of the manipulation operators is to assign priorities to +transitions. Priorities provide a convenient way of controlling any +nondeterminism introduced by the construction operators. Suppose two +transitions leave from the same state and go to distinct target states on the +same character. If these transitions are assigned conflicting priorities, then +during the determinization process the transition with the higher priority will +take precedence over the transition with the lower priority. The lower priority +transition gets abandoned. The transitions would otherwise be combined into a new +transition that goes to a new state that is a combination of the original +target states. Priorities are often required for segmenting machines. The most +common uses of priorities have been encoded into a set of simple operators +that should be used instead of priority embeddings whenever possible. + +For the purposes of embedding, Ragel divides transitions and states into +different classes. There are four operators for embedding actions and +priorities into the transitions of a state machine. It is possible to embed +into entering transitions, finishing transitions, all transitions and leaving +transitions. The embedding into leaving transitions is a special case. +These transition embeddings get stored in the final states of a machine. They +are transferred to any transitions that are made going out of the machine by +future concatenation or kleene star operations. + +There are several more operators for embedding actions into states. Like the +transition embeddings, there are various different classes of states that the +embedding operators access. For example, one can access start states, final +states or all states, among others. Unlike the transition embeddings, there are +several different types of state action embeddings. These are executed at +various different times during the processing of input. 
It is possible to embed +actions that are exectued on transitions into a state, on transitions out of a +state, on transitions taken on the error event, or on transitions taken on the +EOF event. + +Within actions, it is possible to influence the behaviour of the state machine. +The user can write action code that jumps or calls to another portion of the +machine, changes the current character being processed, or breaks out of the +processing loop. With the state machine calling feature Ragel can be used to +parse languages that are not regular. For example, one can parse balanced +parentheses by calling into a parser when an open parenthesis character is seen +and returning to the state on the top of the stack when the corresponding +closing parenthesis character is seen. More complicated context-free languages +such as expressions in C are out of the scope of Ragel. + +Ragel also provides a scanner construction operator that can be used to build +scanners much the same way that Lex is used. The Ragel generated code, which +relies on user-defined variables for backtracking, repeatedly tries to match +patterns to the input, favouring longer patterns over shorter ones and patterns +that appear ahead of others when the lengths of the possible matches are +identical. When a pattern is matched the associated action is executed. + +The key distinguishing feature between scanners in Ragel and scanners in Lex is +that Ragel patterns may be arbitrary Ragel expressions and can therefore +contain embedded code. With a Ragel-based scanner the user need not wait until +the end of a pattern before user code can be executed. + +Scanners do take Ragel out of the domain of pure state machines and require the +user to maintain the backtracking related variables. However, scanners +integrate well with regular state machine instantiations. 
They can be called to +or jumped to only when needed, or they can be called out of or jumped out of +when a simpler, pure state machine model is appropriate. + +Two types of output code style are available. Ragel can produce a table-driven +machine or a directly executable machine. The directly executable machine is +much faster than the table-driven. On the other hand, the table-driven machine +is more compact and less demanding on the host language compiler. It is better +suited to compiling large state machines. + +.section Related Work + +Lex is perhaps the best-known tool for constructing parsers from regular +expressions. In the Lex processing model, generated code attempts to match one +of the user's regular expression patterns, favouring longer matches over +shorter ones. Once a match is made it then executes the code associated with +the pattern and consumes the matching string. This process is repeated until +the input is fully consumed. + +Through the use of start conditions, related sets of patterns may be defined. +The active set may be changed at any time. This allows the user to define +different lexical regions. It also allows the user to link patterns together by +requiring that some patterns come before others. This is quite like a +concatenation operation. However, use of Lex for languages that require a +considerable amount of pattern concatenation is inappropriate. In such cases a +Lex program deteriorates into a manually specified state machine, where start +conditions define the states and pattern actions define the transitions. Lex +is therefore best suited to parsing tasks where the language to be parsed can +be described in terms of regions of tokens. + +Lex is useful in many scenarios and has undoubtedly stood the test of time. +There are, however, several drawbacks to using Lex. Lex can impose too much +overhead for parsing applications where buffering is not required because all +the characters are available in a single string. 
In these cases there is +structure to the language to be parsed and a parser specification tool can +help, but employing a heavyweight processing loop that imposes a stream +``pull'' model and dynamic input buffer allocation is inappropriate. An +example of this kind of scenario is the conversion of floating point numbers +contained in a string to their corresponding numerical values. + +Another drawback is the very issue that Ragel attempts to solve. +It is not possible to execute a user action while +matching a character contained inside a pattern. For example, if scanning a +programming language and string literals can contain newlines which must be +counted, a Lex user must break up a string literal pattern so as to associate +an action with newlines. This forces the definition of a new start condition. +Alternatively the user can reprocess the text of the matched string literal to +count newlines. + +.comment + +How ragel is different from Lex. + +Like Re2c, Ragel provides a simple execution model that does not make any +assumptions as to how the input is collected. Also, Ragel does not do any +buffering in the generated code. Consequently there are no dependencies on +external functions such as .verb|malloc|. + +If buffering is required it can be manually implemented by embedding actions +that copy the current character to a buffer, or data can be passed to the +parser using known block boundaries. If the longest-match operator is used, +Ragel requires the user to ensure that the ending portion of the input buffer +is preserved when the buffer is exhaused before a token is fully matched. The +user should move the token prefix to a new memory location, such as back to the +beginning of the input buffer, then place the subsequently read input +immediately after the prefix. 
+ +These properties of Ragel make it more work to write a program that requires +the longest-match operator or buffering of input, however they make Ragel a +more flexible tool that can produce very simple and fast-running programs under +a variety of input acquisition arrangements. + +In Ragel, it is not necessary +to introduce start conditions to concatenate tokens and retain action +execution. Ragel allows one to structure a parser as a series of tokens, but +does not require it. + +Like Lex and Re2C, Ragel is able to process input using a longest-match +execution model, however the core of the Ragel language specifies parsers at a +much lower level. This core is built around a pure state machine model. When +building basic machines there is no implied algorithm for processing input +other than to move from state to state on the transitions of the machine. This +core of pure state machine operations makes Ragel well suited to handling +parsing problems not based on token scanning. Should one need to use a +longest-match model, the functionality is available and the lower level state +machine construction facilities can be used to specify the patterns of a +longest-match machine. + +This is not possible in Ragel. One can only program +a longest-match instantiation with a fixed set of rules. One can jump to +another longest-match machine that employs the same machine definitions in the +construction of its rules, however no states will be shared. + +In Ragel, input may be re-parsed using a +different machine, but since the action to be executed is associated with +transitions of the compiled state machine, the longest-match construction does +not permit a single rule to be excluded from the active set. It cannot be done +ahead of time nor in the excluded rule's action. + +.end comment + +The Re2C program defines an input processing model similar to that of Lex. 
+Re2C focuses on making generated state machines run very fast and +integrate easily into any program, free of dependencies. Re2C generates +directly executable code and is able to claim that generated parsers run nearly +as fast as their hand-coded equivalents. This is very important for user +adoption, as programmers are reluctant to use a tool when a faster alternative +exists. A consideration to ease of use is also important because developers +need the freedom to integrate the generated code as they see fit. + +Many scripting languages provide ways of composing parsers by linking regular +expressions using program logic. For example, Sed and Awk are two established +Unix scripting tools that allow the programmer to exploit regular expressions +for the purpose of locating and extracting text of interest. High-level +programming languages such as Perl, Python, PHP and Ruby all provide regular +expression libraries that allow the user to combine regular expressions with +arbitrary code. + +In addition to supporting the linking of regular expressions with arbitrary +program logic, the Perl programming language permits the embedding of code into +regular expressions. Perl embeddings do not translate into the embedding of +code into deterministic state machines. Perl regular expressions are in fact +not fully compiled to deterministic machines when embedded code is involved. +They are instead interpreted and involve backtracking. This is shown by the +following Perl program. When it is fed the input .verb|abcd| the interpretor +attempts to match the first alternative, printing .verb|a1 b1|. When this +possibility fails it backtracks and tries the second possibility, printing +.verb|a2 b2|, at which point it succeeds. + +.code +print "YES\n" if ( <STDIN> =~ + /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) | + ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x ) +.end code + +In Ragel there is no regular expression interpretor. 
Aside from the scanner +operator, all Ragel expressions are made into deterministic machines and the +run time simply moves from state to state as it consumes input. An equivalent +parser expressed in Ragel would attempt both of the alternatives concurrently, +printing .verb|a1 a2 b1 b2|. + +.section Development Status + +Ragel is a relatively new tool and is under continuous development. As a rough +release guide, minor revision number changes are for implementation +improvements and feature additions. Major revision number changes are for +implementation and language changes that do not preserve backwards +compatibility. Though in the past this has not always held true: changes that +break code have crept into minor version number changes. Typically, the +documentation lags behind the development in the interest of documenting only +the lasting features. The latest changes are always documented in the ChangeLog +file. + +.chapter Constructing State Machines + +.section Ragel State Machine Specifications + +A Ragel input file consists of a program in the host language that contains embedded machine +specifications. Ragel normally passes input straight to output. When it sees +a machine specification it stops to read the Ragel statements and possibly generate +code in place of the specification. +Afterwards it continues to pass input through. There +can be any number of FSM specifications in an input file. A multi-line FSM spec +starts with .verb|%%{| and ends with .verb|}%%|. A single-line FSM spec starts +with .verb|%%| and ends at the first newline. + +While Ragel is looking for FSM specifications it does basic lexical analysis on +the surrounding input. It interprets literal strings and comments so a +.verb|%%| sequence in either of those will not trigger the parsing of an FSM +specification. 
Ragel does not pass the input through any preprocessor nor does it +interpret preprocessor directives itself so includes, defines and ifdef logic +cannot be used to alter the parse of a Ragel input file. It is therefore not +possible to use an .verb|#if 0| directive to comment out a machine as is +commonly done in C code. As an alternative, a machine can be prevented from +causing any generated output by commenting out write statements. + +In Figure .ref{cmd-line-parsing}, a multi-line specification is used to define the +machine and single line specifications are used to trigger the writing of the machine +data and execution code. + +.figure cmd-line-parsing +.multicols +.verbatim +#include <string.h> +#include <stdio.h> + +%%{ + machine foo; + main := + ( 'foo' | 'bar' ) + 0 @{ res = 1; }; +}%% + +%% write data; +.end verbatim +.columnbreak +.verbatim +int main( int argc, char **argv ) +{ + int cs, res = 0; + if ( argc > 1 ) { + char *p = argv[1]; + char *pe = p + strlen(p) + 1; + %% write init; + %% write exec; + } + printf("result = %i\n", res ); + return 0; +} +.end verbatim +.end multicols +.caption Parsing a command line argument. +.end figure + +.subsection Naming Ragel Blocks + +.verbatim +machine fsm_name; +.end verbatim + +The .verb|machine| statement gives the name of the FSM. If present in a +specification, this statement must appear first. If a machine specification +does not have a name then Ragel uses the previous specification name. If no +previous specification name exists then this is an error. Because FSM +specifications persist in memory, a machine's statements can be spread across +multiple machine specifications. This allows one to break up a machine across +several files or draw in statements that are common to multiple machines using +the .verb|include| statement. 
+ +.subsection Machine Definition +.label{definition} + +.verbatim +<name> = <expression>; +.end verbatim + +The machine definition statement associates an FSM expression with a name. Machine +expressions assigned to names can later be referenced in other expressions. A +definition statement on its own does not cause any states to be generated. It is simply a +description of a machine to be used later. States are generated only when a definition is +instantiated, which happens when a definition is referenced in an instantiated +expression. + +.subsection Machine Instantiation +.label{instantiation} + +.verbatim +<name> := <expression>; +.end verbatim + +The machine instantiation statement generates a set of states representing an +expression. Each instantiation generates a distinct set of states. The starting +state of the instantiation is written in the data section of the generated code +using the instantiation name. If a machine named +.verb|main| is instantiated, its start state is used as the +specification's start state and is assigned to the .verb|cs| variable by the +.verb|write init| command. If no .verb|main| machine is given, the start state +of the last machine instantiation to appear is used as the specification's +start state. + +From outside the execution loop, control may be passed to any machine by +assigning the entry point to the .verb|cs| variable. From inside the execution +loop, control may be passed to any machine instantiation using .verb|fcall|, +.verb|fgoto| or .verb|fnext| statements. + +.subsection Including Ragel Code + +.verbatim +include FsmName "inputfile.rl"; +.end verbatim + +The .verb|include| statement can be used to draw in the statements of another FSM +specification. Both the name and input file are optional, however at least one +must be given. Without an FSM name, the given input file is searched for an FSM +of the same name as the current specification. 
Without an input file the +current file is searched for a machine of the given name. If both are present, +the given input file is searched for a machine of the given name. + +Ragel searches for included files from the location of the current file. +Additional directories can be added to the search path using the .verb|-I| +option. + +.subsection Importing Definitions +.label{import} + +.verbatim +import "inputfile.h"; +.end verbatim + +The .verb|import| statement scrapes a file for sequences of tokens that match +the following forms. Ragel treats these forms as state machine definitions. + +.list +.li .verb|name '=' number| +.li .verb|name '=' lit_string| +.li .verb|'define' name number| +.li .verb|'define' name lit_string| +.end list + +If the input file is a Ragel program then tokens inside any Ragel +specifications are ignored. See Section .ref{export} for a description of +exporting machine definitions. + +Ragel searches for imported files from the location of the current file. +Additional directories can be added to the search path using the .verb|-I| +option. + +.section Lexical Analysis of a Ragel Block +.label{lexing} + +Within a machine specification the following lexical rules apply to the input. + +.itemize + +.item The .verb|#| symbol begins a comment that terminates at the next newline. + +.item The symbols .verb|""|, .verb|''|, .verb|//|, .verb|[]| behave as the +delimiters of literal strings. Within them, the following escape sequences +are interpreted: + +.verb| \0 \a \b \t \n \v \f \r| + +A backslash at the end of a line joins the following line onto the current. A +backslash preceding any other character removes special meaning. This applies +to terminating characters and to special characters in regular expression +literals. As an exception, regular expression literals do not support escape +sequences as the operands of a range within a list. See the bullet on regular +expressions in Section .ref{basic}. 
+ +.item The symbols .verb|{}| delimit a block of host language code that will be +embedded into the machine as an action. Within the block of host language +code, basic lexical analysis of comments and strings is done in order to +correctly find the closing brace of the block. With the exception of FSM +commands embedded in code blocks, the entire block is preserved as is for +identical reproduction in the output code. + +.item The pattern .verb|[+-]?[0-9]+| denotes an integer in decimal format. +Integers used for specifying machines may be negative only if the alphabet type +is signed. Integers used for specifying priorities may be positive or negative. + +.item The pattern .verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal +format. + +.item The keywords are .verb|access|, .verb|action|, .verb|alphtype|, +.verb|getkey|, .verb|write|, .verb|machine| and .verb|include|. + +.item The pattern .verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier. + +.comment +.item The allowable symbols are: + +.verb/ ( ) ! ^ * ? + : -> - | & . , := = ; > @ $ % /\\ +.verb| >/ $/ %/ </ @/ <>/ >! $! %! <! @! <>!|\\ +.verb| >^ $^ %^ <^ @^ <>^ >~ $~ %~ <~ @~ <>~|\\ +.verb| >* $* %* <* @* <>*| +.end comment + +.item Any amount of whitespace may separate tokens. + +.end itemize + +.comment +.section Parse of an FSM Specification + +The following statements are possible within an FSM specification. The +requirements for trailing semicolons loosely follow that of C. +A block +specifying code does not require a trailing semicolon. An expression +statement does require a trailing semicolon. +.end comment + +.section Basic Machines +.label{basic} + +The basic machines are the base operands of regular language expressions. They +are the smallest unit to which machine construction and manipulation operators +can be applied. + +.itemize + +.item .verb|'hello'| -- Concatenation Literal. Produces a machine that matches +the sequence of characters in the quoted string. 
If there are 5 characters +there will be 6 states chained together with the characters in the string. See +Section .ref{lexing} for information on valid escape sequences. + +.comment +% GENERATE: bmconcat +% OPT: -p +% %%{ +% machine bmconcat; +.verbatim +main := 'hello'; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmconcat + +It is possible +to make a concatenation literal case-insensitive by appending an .verb|i| to +the string, for example .verb|'cmd'i|. + +.item .verb|"hello"| -- Identical to the single quoted version. + +.item .verb|[hello]| -- Or Expression. Produces a union of characters. There +will be two states with a transition for each unique character between the two states. +The .verb|[]| delimiters behave like the quotes of a literal string. For example, +.verb|[ \t]| means tab or space. The .verb|or| expression supports character ranges +with the .verb|-| symbol as a separator. The meaning of the union can be negated +using an initial .verb|^| character as in standard regular expressions. +See Section .ref{lexing} for information on valid escape sequences +in .verb|or| expressions. + +.comment +% GENERATE: bmor +% OPT: -p +% %%{ +% machine bmor; +.verbatim +main := [hello]; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmor + +.item .verb|''|, .verb|""|, and .verb|[]| -- Zero Length Machine. Produces a machine +that matches the zero length string. Zero length machines have one state that is both +a start state and a final state. + +.comment +% GENERATE: bmnull +% OPT: -p +% %%{ +% machine bmnull; +.verbatim +main := ''; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmnull + +% FIXME: More on the range of values here. +.item .verb|42| -- Numerical Literal. Produces a two state machine with one +transition on the given number. The number may be in decimal or hexadecimal +format and should be in the range allowed by the alphabet type. 
The minimum and +maximum values permitted are defined by the host machine that Ragel is compiled +on. For example, numbers in a .verb|short| alphabet on an i386 machine should +be in the range .verb|-32768| to .verb|32767|. + +.comment +% GENERATE: bmnum +% %%{ +% machine bmnum; +.verbatim +main := 42; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmnum + +.item .verb|/simple_regex/| -- Regular Expression. Regular expressions are +parsed as a series of expressions that are concatenated together. Each +concatenated expression +may be a literal character, the ``any'' character specified by the .verb|.| +symbol, or a union of characters specified by the .verb|[]| delimiters. If the +first character of a union is .verb|^| then it matches any character not in the +list. Within a union, a range of characters can be given by separating the first +and last characters of the range with the .verb|-| symbol. Each +concatenated machine may have repetition specified by following it with the +.verb|*| symbol. The standard escape sequences described in Section +.ref{lexing} are supported everywhere in regular expressions except as the +operands of a range within in a list. This notation also supports the .verb|i| +trailing option. Use it to produce case-insensitive machines, as in .verb|/GET/i|. + +Ragel does not support very complex regular expressions because the desired +results can always be achieved using the more general machine construction +operators listed in Section .ref{machconst}. The following diagram shows the +result of compiling .verb|/ab*[c-z].*[123]/|. .verb|DEF| represents the default +transition, which is taken if no other transition can be taken. + +.comment +% GENERATE: bmregex +% OPT: -p +% %%{ +% machine bmregex; +.verbatim +main := /ab*[c-z].*[123]/; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmregex + +.item .verb|'a' .. 'z'| -- Range. Produces a machine that matches any +characters in the specified range. 
Allowable upper and lower bounds of the +range are concatenation literals of length one and numerical literals. For +example, .verb|0x10..0x20|, .verb|0..63|, and .verb|'a'..'z'| are valid ranges. +The bounds should be in the range allowed by the alphabet type. + +.comment +% GENERATE: bmrange +% OPT: -p +% %%{ +% machine bmrange; +.verbatim +main := 'a' .. 'z'; +.end verbatim +% }%% +% END GENERATE +.end comment + +.graphic bmrange + +.item .verb|variable_name| -- Lookup the machine definition assigned to the +variable name given and use an instance of it. See Section .ref{definition} for +an important note on what it means to reference a variable name. + +.item .verb|builtin_machine| -- There are several built-in machines available +for use. They are all two state machines for the purpose of matching common +classes of characters. They are: + +.itemize + +.item .verb|any | -- Any character in the alphabet. + +.item .verb|ascii | -- Ascii characters. .verb|0..127| + +.item .verb|extend| -- Ascii extended characters. This is the range +.verb|-128..127| for signed alphabets and the range .verb|0..255| for unsigned +alphabets. + +.item .verb|alpha | -- Alphabetic characters. .verb|[A-Za-z]| + +.item .verb|digit | -- Digits. .verb|[0-9]| + +.item .verb|alnum | -- Alpha numerics. .verb|[0-9A-Za-z]| + +.item .verb|lower | -- Lowercase characters. .verb|[a-z]| + +.item .verb|upper | -- Uppercase characters. .verb|[A-Z]| + +.item .verb|xdigit| -- Hexadecimal digits. .verb|[0-9A-Fa-f]| + +.item .verb|cntrl | -- Control characters. .verb|0..31| + +.item .verb|graph | -- Graphical characters. .verb|[!-~]| + +.item .verb|print | -- Printable characters. .verb|[ -~]| + +.item .verb|punct | -- Punctuation. Graphical characters that are not alphanumerics. +.verb|[!-/:-@[-`{-~]| + +.item .verb|space | -- Whitespace. .verb|[\t\v\f\n\r ]| + +.item .verb|zlen | -- Zero length string. .verb|""| + +.item .verb|empty | -- Empty set. Matches nothing. 
.verb|^any| + +.end itemize +.end itemize + +.section Operator Precedence +The following table shows operator precedence from lowest to highest. Operators +in the same precedence group are evaluated from left to right. + +.tabular +.row 1&.verb| , |&Join +.row 2&.verb/ | & - --/&Union, Intersection and Subtraction +.row 3&.verb| . <: :> :>> |&Concatenation +.row 4&.verb| : |&Label +.row 5&.verb| -> |&Epsilon Transition +.row 6&.verb| > @ $ % |&Transitions Actions and Priorities +.row 6&.verb| >/ $/ %/ </ @/ <>/ |&EOF Actions +.row 6&.verb| >! $! %! <! @! <>! |&Global Error Actions +.row 6&.verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions +.row 6&.verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions +.row 6&.verb| >* $* %* <* @* <>* |&From-State Action +.row 7&.verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition +.row 8&.verb| ! ^ |&Negation and Character-Level Negation +.row 9&.verb| ( <expr> ) |&Grouping +.end tabular + +.section Regular Language Operators +.label{machconst} + +When using Ragel it is helpful to have a sense of how it constructs machines. +The determinization process can produce results that seem unusual to someone +not familiar with the NFA to DFA conversion algorithm. In this section we +describe Ragel's state machine operators. Though the operators are defined +using epsilon transitions, it should be noted that this is for discussion only. +The epsilon transitions described in this section do not persist, but are +immediately removed by the determinization process which is executed at every +operation. Ragel does not make use of any nondeterministic intermediate state +machines. + +To create an epsilon transition between two states .verb|x| and .verb|y| is to +copy all of the properties of .verb|y| into .verb|x|. This involves drawing in +all of .verb|y|'s to-state actions, EOF actions, etc., in addition to its +transitions. If .verb|x| and .verb|y| both have a transition out on the same +character, then the transitions must be combined. 
During transition +combination a new transition is made that goes to a new state that is the +combination of both target states. The new combination state is created using +the same epsilon transition method. The new state has an epsilon transition +drawn to all the states that compose it. Since the creation of new epsilon +transitions may be triggered every time an epsilon transition is drawn, the +process of drawing epsilon transitions is repeated until there are no more +epsilon transitions to be made. + +A very common error that is made when using Ragel is to make machines that do +too much. That is, to create machines that have unintentional +nondetermistic properties. This usually results from being unaware of the common strings +between machines that are combined together using the regular language +operators. This can involve never leaving a machine, causing its actions to be +propagated through all the following states. Or it can involve an alternation +where both branches are unintentionally taken simultaneously. + +This problem forces one to think hard about the language that needs to be +matched. To guard against this kind of problem one must ensure that the machine +specification is divided up using boundaries that do not allow ambiguities from +one portion of the machine to the next. See Chapter +.ref{controlling-nondeterminism} for more on this problem and how to solve it. + +The Graphviz tool is an immense help when debugging improperly compiled +machines or otherwise learning how to use Ragel. Graphviz Dot files can be +generated from Ragel programs using the .verb|-V| option. See Section +.ref{visualization} for more information. + + +.subsection Union + +.verb/expr | expr/ + +The union operation produces a machine that matches any string in machine one +or machine two. The operation first creates a new start state. Epsilon +transitions are drawn from the new start state to the start states of both +input machines. 
The resulting machine has a final state set equivalent to the +union of the final state sets of both input machines. In this operation, there +is the opportunity for nondeterminism among both branches. If there are +strings, or prefixes of strings that are matched by both machines then the new +machine will follow both parts of the alternation at once. The union operation is +shown below. + +.graphic opor 1.0 + +The following example demonstrates the union of three machines representing +common tokens. + +% GENERATE: exor +% OPT: -p +% %%{ +% machine exor; +.code +# Hex digits, decimal digits, or identifiers +main := '0x' xdigit+ | digit+ | alpha alnum*; +.end code +% }%% +% END GENERATE + +.graphic exor + +.subsection Intersection + +.verb|expr & expr| + +Intersection produces a machine that matches any +string that is in both machine one and machine two. To achieve intersection, a +union is performed on the two machines. After the result has been made +deterministic, any final state that is not a combination of final states from +both machines has its final state status revoked. To complete the operation, +paths that do not lead to a final state are pruned from the machine. Therefore, +if there are any such paths in either of the expressions they will be removed +by the intersection operator. Intersection can be used to require that two +independent patterns be simultaneously satisfied as in the following example. + +% GENERATE: exinter +% OPT: -p +% %%{ +% machine exinter; +.code +# Match lines four characters wide that contain +# words separated by whitespace. +main := + /[^\n][^\n][^\n][^\n]\n/* & + (/[a-z][a-z]*/ | [ \n])**; +.end code +% }%% +% END GENERATE + +.graphic exinter + +.subsection Difference + +.verb|expr - expr| + +The difference operation produces a machine that matches +strings that are in machine one but are not in machine two. To achieve subtraction, +a union is performed on the two machines. 
After the result has been made +deterministic, any final state that came from machine two or is a combination +of states involving a final state from machine two has its final state status +revoked. As with intersection, the operation is completed by pruning any path +that does not lead to a final state. The following example demonstrates the +use of subtraction to exclude specific cases from a set. + +% GENERATE: exsubtr +% OPT: -p +% %%{ +% machine exsubtr; +.code +# Subtract keywords from identifiers. +main := /[a-z][a-z]*/ - ( 'for' | 'int' ); +.end code +% }%% +% END GENERATE + +.graphic exsubtr + +.subsection Strong Difference +.label{strong_difference} + +.verb|expr -- expr| + +Strong difference produces a machine that matches any string of the first +machine that does not have any string of the second machine as a substring. In +the following example, strong subtraction is used to excluded .verb|CRLF| from +a sequence. In the corresponding visualization, the label .verb|DEF| is short +for default. The default transition is taken if no other transition can be +taken. + +% GENERATE: exstrongsubtr +% OPT: -p +% %%{ +% machine exstrongsubtr; +.code +crlf = '\r\n'; +main := [a-z]+ ':' ( any* -- crlf ) crlf; +.end code +% }%% +% END GENERATE + +.graphic exstrongsubtr + +This operator is equivalent to the following. + +.verbatim +expr - ( any* expr any* ) +.end verbatim + +.subsection Concatenation + +.verb|expr . expr| + +Concatenation produces a machine that matches all the strings in machine one followed by all +the strings in machine two. Concatenation draws epsilon transitions from the +final states of the first machine to the start state of the second machine. The +final states of the first machine lose their final state status, unless the +start state of the second machine is final as well. +Concatenation is the default operator. Two machines next to each other with no +operator between them results in concatenation. 
+ +.graphic opconcat 1.0 + +The opportunity for nondeterministic behaviour results from the possibility of +the final states of the first machine accepting a string that is also accepted +by the start state of the second machine. +The most common scenario in which this happens is the +concatenation of a machine that repeats some pattern with a machine that gives +a terminating string, but the repetition machine does not exclude the +terminating string. The example in Section .ref{strong_difference} +guards against this. Another example is the expression .verb|("'" any* "'")|. +When executed the thread of control will +never leave the .verb|any*| machine. This is a problem especially if actions +are embedded to process the characters of the .verb|any*| component. + +In the following example, the first machine is always active due to the +nondeterministic nature of concatenation. This particular nondeterminism is intended +however because we wish to permit EOF strings before the end of the input. + +% GENERATE: exconcat +% OPT: -p +% %%{ +% machine exconcat; +.code +# Require an eof marker on the last line. +main := /[^\n]*\n/* . 'EOF\n'; +.end code +% }%% +% END GENERATE + +.graphic exconcat + +There is a language +ambiguity involving concatenation and subtraction. Because concatenation is the +default operator for two +adjacent machines there is an ambiguity between subtraction of +a positive numerical literal and concatenation of a negative numerical literal. +For example, .verb|(x-7)| could be interpreted as .verb|(x . -7)| or +.verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence +over concatenation of a negative literal. We adhere to the rule that the default +concatenation operator takes effect only when there are no other operators between +two machines. Beware of writing machines such as .verb|(any -1)| when what is +desired is a concatenation of .verb|any| and .verb|-1|. Instead write +.verb|(any . 
-1)| or .verb|(any (-1))|. If in doubt of the meaning of your program do not +rely on the default concatenation operator; always use the .verb|.| symbol. + + +.subsection Kleene Star + +.verb|expr*| + +The machine resulting from the Kleene Star operator will match zero or more +repetitions of the machine it is applied to. +It creates a new start state and an additional final +state. Epsilon transitions are drawn between the new start state and the old start +state, between the new start state and the new final state, and +between the final states of the machine and the new start state. After the +machine is made deterministic the effect is of the final states getting all the +transitions of the start state. + +.graphic opstar 1.0 + +The possibility for nondeterministic behaviour arises if the final states have +transitions on any of the same characters as the start state. This is common +when applying kleene star to an alternation of tokens. Like the other problems +arising from nondeterministic behavior, this is discussed in more detail in Chapter +.ref{controlling-nondeterminism}. This particular problem can also be solved +by using the longest-match construction discussed in Section +.ref{generating-scanners} on scanners. + +In this +example, there is no nondeterminism introduced by the exterior kleene star due to +the newline at the end of the regular expression. Without the newline the +exterior kleene star would be redundant and there would be ambiguity between +repeating the inner range of the regular expression and the entire regular +expression. Though it would not cause a problem in this case, unnecessary +nondeterminism in the kleene star operator often causes undesired results for +new Ragel users and must be guarded against. + +% GENERATE: exstar +% OPT: -p +% %%{ +% machine exstar; +.code +# Match any number of lines with only lowercase letters. 
+main := /[a-z]*\n/*; +.end code +% }%% +% END GENERATE + +.graphic exstar + +.subsection One Or More Repetition + +.verb|expr+| + +This operator produces the concatenation of the machine with the kleene star of +itself. The result will match one or more repetitions of the machine. The plus +operator is equivalent to .verb|(expr . expr*)|. + +% GENERATE: explus +% OPT: -p +% %%{ +% machine explus; +.code +# Match alpha-numeric words. +main := alnum+; +.end code +% }%% +% END GENERATE + +.graphic explus + +.subsection Optional + +.verb|expr?| + +The .em{optional} operator produces a machine that accepts the machine +given or the zero length string. The optional operator is equivalent to +.verb/(expr | '' )/. In the following example the optional operator is used to +possibly extend a token. + +% GENERATE: exoption +% OPT: -p +% %%{ +% machine exoption; +.code +# Match integers or floats. +main := digit+ ('.' digit+)?; +.end code +% }%% +% END GENERATE + +.graphic exoption + +.subsection Repetition + +.list +.li .verb|expr {n}| -- Exactly N copies of expr. +.li .verb|expr {,n}| -- Zero to N copies of expr. +.li .verb|expr {n,}| -- N or more copies of expr. +.li .verb|expr {n,m}| -- N to M copies of expr. +.end list + +.subsection Negation + +.verb|!expr| + +Negation produces a machine that matches any string not matched by the given +machine. Negation is equivalent to .verb|(any* - expr)|. + +% GENERATE: exnegate +% OPT: -p +% %%{ +% machine exnegate; +.code +# Accept anything but a string beginning with a digit. +main := ! ( digit any* ); +.end code +% }%% +% END GENERATE + +.graphic exnegate + +.subsection Character-Level Negation + +.verb|^expr| + +Character-level negation produces a machine that matches any single character +not matched by the given machine. Character-Level Negation is equivalent to +.verb|(any - expr)|. It must be applied only to machines that match strings of +length one. 
+ +.section State Machine Minimization + +State machine minimization is the process of finding the minimal equivalent FSM accepting +the language. Minimization reduces the number of states in machines +by merging equivalent states. It does not change the behaviour of the machine +in any way. It will cause some states to be merged into one because they are +functionally equivalent. State minimization is on by default. It can be turned +off with the .verb|-n| option. + +The algorithm implemented is similar to Hopcroft's state minimization +algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in +memory, whereas Ragel supports arbitrary integer alphabets that cannot be +listed in memory. Though exact analysis is very difficult, Ragel minimization +runs close to O(n * log(n)) and requires O(n) temporary storage where +$n$ is the number of states. + +.section Visualization +.label{visualization} + +%In many cases, practical +%parsing programs will be too large to completely visualize with Graphviz. The +%proper approach is to reduce the language to the smallest subset possible that +%still exhibits the characteristics that one wishes to learn about or to fix. +%This can be done without modifying the source code using the .verb|-M| and +%.verb|-S| options. If a machine cannot be easily reduced, +%embeddings of unique actions can be very useful for tracing a +%particular component of a larger machine specification, since action names are +%written out on transition labels. + +Ragel is able to emit compiled state machines in Graphviz's Dot file format. +This is done using the .verb|-V| option. +Graphviz support allows users to perform +incremental visualization of their parsers. User actions are displayed on +transition labels of the graph. 
+ +If the final graph is too large to be +meaningful, or even drawn, the user is able to inspect portions of the parser +by naming particular regular expression definitions with the .verb|-S| and +.verb|-M| options to the .verb|ragel| program. Use of Graphviz greatly +improves the Ragel programming experience. It allows users to learn Ragel by +experimentation and also to track down bugs caused by unintended +nondeterminism. + +Ragel has another option to help debugging. The .verb|-x| option causes Ragel +to emit the compiled machine in an XML format. + +.chapter User Actions + +Ragel permits the user to embed actions into the transitions of a regular +expression's corresponding state machine. These actions are executed when the +generated code moves over a transition. Like the regular expression operators, +the action embedding operators are fully compositional. They take a state +machine and an action as input, embed the action and yield a new state machine +that can be used in the construction of other machines. Due to the +compositional nature of embeddings, the user has complete freedom in the +placement of actions. + +A machine's transitions are categorized into four classes. The action embedding +operators access the transitions defined by these classes. The .em{entering +transition} operator .verb|>| isolates the start state, then embeds an action +into all transitions leaving it. The .em{finishing transition} operator +.verb|@| embeds an action into all transitions going into a final state. The +.em{all transition} operator .verb|$| embeds an action into all transitions of +an expression. The .em{leaving transition} operator .verb|%| provides access +to the yet-unmade transitions moving out of the machine via the final states. + +.section Embedding Actions + +.verbatim +action ActionName { + /* Code an action here. */ + count += 1; +} +.end verbatim + +The action statement defines a block of code that can be embedded into an FSM. 
+Action names can be referenced by the action embedding operators in +expressions. Though actions need not be named in this way (literal blocks +of code can be embedded directly when building machines), defining reusable +blocks of code whenever possible is good practice because it potentially increases the +degree to which the machine can be minimized. + +Within an action some Ragel expressions and statements are parsed and +translated. These allow the user to interact with the machine from action code. +See Section .ref{vals} for a complete list of statements and values available +in code blocks. + +.subsection Entering Action + +.verb|expr > action| + +The entering action operator embeds an action into all transitions +that enter into the machine from the start state. If the start state is final, +then the action is also embedded into the start state as a leaving action. This +means that if a machine accepts the zero-length string and control passes +through the start state then the entering action is executed. Note +that this can happen on both a following character and on the EOF event. + +In some machines the start state has transtions coming in from within the +machine. In these cases the start state is first isolated from the rest of the +machine ensuring that the entering actions are exected once only. + +% GENERATE: exstact +% OPT: -p +% %%{ +% machine exstact; +.code +# Execute A at the beginning of a string of alpha. +action A {} +main := ( lower* >A ) . ' '; +.end code +% }%% +% END GENERATE + +.graphic exstact + +.subsection Finishing Action + +.verb|expr @ action| + +The finishing action operator embeds an action into any transitions that move +the machine into a final state. Further input may move the machine out of the +final state, but keep it in the machine. Therefore finishing actions may be +executed more than once if a machine has any internal transitions out of a +final state. 
In the following example the final state has no transitions out +and the finishing action is executed only once. + +% GENERATE: exdoneact +% OPT: -p +% %%{ +% machine exdoneact; +% action A {} +.code +# Execute A when the trailing space is seen. +main := ( lower* ' ' ) @A; +.end code +% }%% +% END GENERATE + +.graphic exdoneact + +.subsection All Transition Action + +.verb|expr $ action| + +The all transition operator embeds an action into all transitions of a machine. +The action is executed whenever a transition of the machine is taken. In the +following example, A is executed on every character matched. + +% GENERATE: exallact +% OPT: -p +% %%{ +% machine exallact; +% action A {} +.code +# Execute A on any characters of the machine. +main := ( 'm1' | 'm2' ) $A; +.end code +% }%% +% END GENERATE + +.graphic exallact + +.subsection Leaving Actions +.label{out-actions} + +.verb|expr % action| + +The leaving action operator queues an action for embedding into the transitions +that go out of a machine via a final state. The action is first stored in +the machine's final states and is later transferred to any transitions that are +made going out of the machine by a kleene star or concatenation operation. + +If a final state of the machine is still final when compilation is complete +then the leaving action is also embedded as an EOF action. Therefore, leaving +the machine is defined as either leaving on a character or as state machine +acceptance. + +This operator allows one to associate an action with the termination of a +sequence, without being concerned about what particular character terminates +the sequence. In the following example, A is executed when leaving the alpha +machine on the newline character. + +% GENERATE: exoutact1 +% OPT: -p +% %%{ +% machine exoutact1; +% action A {} +.code +# Match a word followed by a newline. Execute A when +# finishing the word. +main := ( lower+ %A ) . 
'\n'; +.end code +% }%% +% END GENERATE + +.graphic exoutact1 + +In the following example, the .verb|term_word| action could be used to register +the appearance of a word and to clear the buffer that the .verb|lower| action used +to store the text of it. + +% GENERATE: exoutact2 +% OPT: -p +% %%{ +% machine exoutact2; +% action lower {} +% action space {} +% action term_word {} +% action newline {} +.code +word = ( [a-z] @lower )+ %term_word; +main := word ( ' ' @space word )* '\n' @newline; +.end code +% }%% +% END GENERATE + +.graphic exoutact2 + +In this final example of the action embedding operators, A is executed upon entering +the alpha machine, B is executed on all transitions of the +alpha machine, C is executed when the alpha machine is exited by moving into the +newline machine and N is executed when the newline machine moves into a final +state. + +% GENERATE: exaction +% OPT: -p +% %%{ +% machine exaction; +% action A {} +% action B {} +% action C {} +% action N {} +.code +# Execute A on starting the alpha machine, B on every transition +# moving through it and C upon finishing. Execute N on the newline. +main := ( lower* >A $B %C ) . '\n' @N; +.end code +% }%% +% END GENERATE + +.graphic exaction + + +.section State Action Embedding Operators + +The state embedding operators allow one to embed actions into states. Like the +transition embedding operators, there are several different classes of states +that the operators access. The meanings of the symbols are similar to the +meanings of the symbols used for the transition embedding operators. The design +of the state selections was driven by a need to cover the states of an +expression with exactly one error action. + +Unlike the transition embedding operators, the state embedding operators are +also distinguished by the different kinds of events that embedded actions can +be associated with. Therefore the state embedding operators have two +components. 
The first, which is the first one or two characters, specifies the
+class of states that the action will be embedded into. The second component
+specifies the type of event the action will be executed on. The symbols of the
+second component also have equivalent keywords.
+
+.multicols
+The different classes of states are:
+
+.list
+.li .verb|> | -- the start state
+.li .verb|< | -- any state except the start state
+.li .verb|$ | -- all states
+.li .verb|% | -- final states
+.li .verb|@ | -- any state except final states
+.li .verb|<>| -- any except start and final (middle)
+.end list
+
+.columnbreak
+
+The different kinds of embeddings are:
+
+.list
+.li .verb|~| -- to-state actions (.verb|to|)
+.li .verb|*| -- from-state actions (.verb|from|)
+.li .verb|/| -- EOF actions (.verb|eof|)
+.li .verb|!| -- error actions (.verb|err|)
+.li .verb|^| -- local error actions (.verb|lerr|)
+.end list
+
+.end multicols
+
+.subsection To-State and From-State Actions
+
+.subsubsection To-State Actions
+
+.list
+.li .verb|>~action >to(name) >to{...} | -- the start state
+.li .verb|<~action <to(name) <to{...} | -- any state except the start state
+.li .verb|$~action $to(name) $to{...} | -- all states
+.li .verb|%~action %to(name) %to{...} | -- final states
+.li .verb|@~action @to(name) @to{...} | -- any state except final states
+.li .verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle)
+.end list
+
+
+To-state actions are executed whenever the state machine moves into the
+specified state, either by a natural movement over a transition or by an
+action-based transfer of control such as .verb|fgoto|. They are executed after the
+in-transition's actions but before the current character is advanced and
+tested against the end of the input block. To-state embeddings stay with the
+state. They are irrespective of the state's current set of transitions and any
+future transitions that may be added in or out of the state. 
+ +Note that the setting of the current state variable .verb|cs| outside of the +execute code is not considered by Ragel as moving into a state and consequently +the to-state actions of the new current state are not executed. This includes +the initialization of the current state when the machine begins. This is +because the entry point into the machine execution code is after the execution +of to-state actions. + +.subsubsection From-State Actions + +.list +.li .verb|>*action >from(name) >from{...} | -- the start state +.li .verb|<*action <from(name) <from{...} | -- any state except the start state +.li .verb|$*action $from(name) $from{...} | -- all states +.li .verb|%*action %from(name) %from{...} | -- final states +.li .verb|@*action @from(name) @from{...} | -- any state except final states +.li .verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle) +.end list + +From-state actions are executed whenever the state machine takes a transition from a +state, either to itself or to some other state. These actions are executed +immediately after the current character is tested against the input block end +marker and before the transition to take is sought based on the current +character. From-state actions are therefore executed even if a transition +cannot be found and the machine moves into the error state. Like to-state +embeddings, from-state embeddings stay with the state. 
+ +.subsection EOF Actions + +.list +.li .verb|>/action >eof(name) >eof{...} | -- the start state +.li .verb|</action <eof(name) <eof{...} | -- any state except the start state +.li .verb|$/action $eof(name) $eof{...} | -- all states +.li .verb|%/action %eof(name) %eof{...} | -- final states +.li .verb|@/action @eof(name) @eof{...} | -- any state except final states +.li .verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle) +.end list + +The EOF action embedding operators enable the user to embed actions that are +executed at the end of the input stream. EOF actions are stored in states and +generated in the .verb|write exec| block. They are run when .verb|p == pe == eof| +as the execute block is finishing. EOF actions are free to adjust .verb|p| and +jump to another part of the machine to restart execution. + +.subsection Handling Errors + +In many applications it is useful to be able to react to parsing errors. The +user may wish to print an error message that depends on the context. It +may also be desirable to consume input in an attempt to return the input stream +to some known state and resume parsing. To support error handling and recovery, +Ragel provides error action embedding operators. There are two kinds of error +actions: global error actions and local error actions. +Error actions can be used to simply report errors, or by jumping to a machine +instantiation that consumes input, can attempt to recover from errors. 
+ +.subsubsection Global Error Actions + +.list +.li .verb|>!action >err(name) >err{...} | -- the start state +.li .verb|<!action <err(name) <err{...} | -- any state except the start state +.li .verb|$!action $err(name) $err{...} | -- all states +.li .verb|%!action %err(name) %err{...} | -- final states +.li .verb|@!action @err(name) @err{...} | -- any state except final states +.li .verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle) +.end list + +Global error actions are stored in the states they are embedded into until +compilation is complete. They are then transferred to the transitions that move +into the error state. These transitions are taken on all input characters that +are not already covered by the state's transitions. If a state with an error +action is not final when compilation is complete, then the action is also +embedded as an EOF action. + +Error actions can be used to recover from errors by jumping back into the +machine with .verb|fgoto| and optionally altering .verb|p|. + +.subsubsection Local Error Actions + +.list +.li .verb|>^action >lerr(name) >lerr{...} | -- the start state +.li .verb|<^action <lerr(name) <lerr{...} | -- any state except the start state +.li .verb|$^action $lerr(name) $lerr{...} | -- all states +.li .verb|%^action %lerr(name) %lerr{...} | -- final states +.li .verb|@^action @lerr(name) @lerr{...} | -- any state except final states +.li .verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle) +.end list + +Like global error actions, local error actions are also stored in the states +they are embedded into until a transfer point. The transfer point is different +however. Each local error action embedding is associated with a name. When a +machine definition has been fully constructed, all local error action +embeddings associated with the same name as the machine definition are +transferred to the error transitions. 
At this time they are also embedded as +EOF actions in the case of non-final states. + +Local error actions can be used to specify an action to take when a particular +section of a larger state machine fails to match. A particular machine +definition's ``thread'' may die and the local error actions executed, however +the machine as a whole may continue to match input. + +There are two forms of local error action embeddings. In the first form the +name defaults to the current machine. In the second form the machine name can +be specified. This is useful when it is more convenient to specify the local +error action in a sub-definition that is used to construct the machine +definition that the local error action is associated with. To embed local +error actions and +explicitly state the machine definition on which the transfer is to happen use +.verb|(name, action)| as the action. + +.subsubsection Example + +The following example uses error actions to report an error and jump to a +machine that consumes the remainder of the line when parsing fails. After +consuming the line, the error recovery machine returns to the main loop. + +% GENERATE: erract +% %%{ +% machine erract; +% ws = ' '; +% address = 'foo AT bar..com'; +% date = 'Monday May 12'; +.code +action cmd_err { + printf( "command error\n" ); + fhold; fgoto line; +} +action from_err { + printf( "from error\n" ); + fhold; fgoto line; +} +action to_err { + printf( "to error\n" ); + fhold; fgoto line; +} + +line := [^\n]* '\n' @{ fgoto main; }; + +main := ( + ( + 'from' @err(cmd_err) + ( ws+ address ws+ date '\n' ) $err(from_err) | + 'to' @err(cmd_err) + ( ws+ address '\n' ) $err(to_err) + ) +)*; +.end code +% }%% +% %% write data; +% void f() +% { +% %% write init; +% %% write exec; +% } +% END GENERATE + + + +.section Action Ordering and Duplicates + +When combining expressions that have embedded actions it is often the case that +a number of actions must be executed on a single input character. 
For example, +following a concatenation the leaving action of the left expression and the +entering action of the right expression will be embedded into one transition. +This requires a method of ordering actions that is intuitive and +predictable for the user, and repeatable for the compiler. + +We associate with the embedding of each action a unique timestamp that is +used to order actions that appear together on a single transition in the final +state machine. To accomplish this we recursively traverse the parse tree of +regular expressions and assign timestamps to action embeddings. References to +machine definitions are followed in the traversal. When we visit a +parse tree node we assign timestamps to all .em{entering} action embeddings, +recurse on the parse tree, then assign timestamps to the remaining .em{all}, +.em{finishing}, and .em{leaving} embeddings in the order in which they +appear. + +By default Ragel does not permit a single action to appear multiple times in an action +list. When the final machine has been created, actions that appear more than +once in a single transition, to-state, from-state or EOF action list have their +duplicates removed. +The first appearance of the action is preserved. This is useful in a number of +scenarios. First, it allows us to union machines with common prefixes without +worrying about the action embeddings in the prefix being duplicated. Second, it +prevents leaving actions from being transferred multiple times. This can +happen when a machine is repeated, then followed with another machine that +begins with a common character. For example: + +.verbatim +word = [a-z]+ %act; +main := word ( '\n' word )* '\n\n'; +.end verbatim + +Note that Ragel does not compare action bodies to determine if they have +identical program text. It simply checks for duplicates using each action +block's unique location in the program. + +The removal of duplicates can be turned off using the .verb|-d| option. 
+ +.section Values and Statements Available in Code Blocks +.label{vals} + +The following values are available in code blocks: + +.itemize +.item .verb|fpc| -- A pointer to the current character. This is equivalent to +accessing the .verb|p| variable. + +.item .verb|fc| -- The current character. This is equivalent to the expression .verb|(*p)|. + +.item .verb|fcurs| -- An integer value representing the current state. This +value should only be read from. To move to a different place in the machine +from action code use the .verb|fgoto|, .verb|fnext| or .verb|fcall| statements. +Outside of the machine execution code the .verb|cs| variable may be modified. + +.item .verb|ftargs| -- An integer value representing the target state. This +value should only be read from. Again, .verb|fgoto|, .verb|fnext| and +.verb|fcall| can be used to move to a specific entry point. + +.item .verb|fentry(<label>)| -- Retrieve an integer value representing the +entry point .verb|label|. The integer value returned will be a compile time +constant. This number is suitable for later use in control flow transfer +statements that take an expression. This value should not be compared against +the current state because any given label can have multiple states representing +it. The value returned by .verb|fentry| can be any one of the multiple states that +it represents. +.end itemize + +The following statements are available in code blocks: + +.itemize + +.item .verb|fhold;| -- Do not advance over the current character. If processing +data in multiple buffer blocks, the .verb|fhold| statement should only be used +once in the set of actions executed on a character. Multiple calls may result +in backing up over the beginning of the buffer block. The .verb|fhold| +statement does not imply any transfer of control. It is equivalent to the +.verb|p--;| statement. + +.item .verb|fexec <expr>;| -- Set the next character to process. This can be +used to backtrack to previous input or advance ahead. 
+Unlike .verb|fhold|, which can be used +anywhere, .verb|fexec| requires the user to ensure that the target of the +backtrack is in the current buffer block or is known to be somewhere ahead of +it. The machine will continue iterating forward until .verb|pe| is arrived at, +.verb|fbreak| is called or the machine moves into the error state. In actions +embedded into transitions, the .verb|fexec| statement is equivalent to setting +.verb|p| to one position ahead of the next character to process. If the user +also modifies .verb|pe|, it is possible to change the buffer block entirely. + +.item .verb|fgoto <label>;| -- Jump to an entry point defined by +.verb|<label>|. The .verb|fgoto| statement immediately transfers control to +the destination state. + +.item .verb|fgoto *<expr>;| -- Jump to an entry point given by .verb|<expr>|. +The expression must evaluate to an integer value representing a state. + +.item .verb|fnext <label>;| -- Set the next state to be the entry point defined +by .verb|label|. The .verb|fnext| statement does not immediately jump to the +specified state. Any action code following the statement is executed. + +.item .verb|fnext *<expr>;| -- Set the next state to be the entry point given +by .verb|<expr>|. The expression must evaluate to an integer value representing +a state. + +.item .verb|fcall <label>;| -- Push the target state and jump to the entry +point defined by .verb|<label>|. The next .verb|fret| will jump to the target +of the transition on which the call was made. Use of .verb|fcall| requires +the declaration of a call stack. An array of integers named .verb|stack| and a +single integer named .verb|top| must be declared. With the .verb|fcall| +construct, control is immediately transferred to the destination state. +See section .ref{modularization} for more information. + +.item .verb|fcall *<expr>;| -- Push the current state and jump to the entry +point given by .verb|<expr>|. 
The expression must evaluate to an integer value +representing a state. + +.item .verb|fret;| -- Return to the target state of the transition on which the +last .verb|fcall| was made. Use of .verb|fret| requires the declaration of a +call stack. Control is immediately transferred to the destination state. + +.item .verb|fbreak;| -- Advance .verb|p|, save the target state to .verb|cs| +and immediately break out of the execute loop. This statement is useful +in conjunction with the .verb|noend| write option. Rather than process input +until .verb|pe| is arrived at, the fbreak statement +can be used to stop processing from an action. After an .verb|fbreak| +statement the .verb|p| variable will point to the next character in the input. The +current state will be the target of the current transition. Note that .verb|fbreak| +causes the target state's to-state actions to be skipped. + +.end itemize + +Once actions with control-flow commands are embedded into a +machine, the user must exercise caution when using the machine as the operand +to other machine construction operators. If an action jumps to another state +then unioning any transition that executes that action with another transition +that follows some other path will cause that other path to be lost. Using +commands that manually jump around a machine takes us out of the domain of +regular languages because transitions that the +machine construction operators are not aware of are introduced. These +commands should therefore be used with caution. + + +.chapter Controlling Nondeterminism +.label{controlling-nondeterminism} + +Along with the flexibility of arbitrary action embeddings comes a need to +control nondeterminism in regular expressions. If a regular expression is +ambiguous, then sub-components of a parser other than the intended parts may become +active. This means that actions that are irrelevant to the +current subset of the parser may be executed, causing problems for the +programmer. 
+ +Tools that are based on regular expression engines and that are used for +recognition tasks will usually function as intended regardless of the presence +of ambiguities. It is quite common for users of scripting languages to write +regular expressions that are heavily ambiguous and it generally does not +matter. As long as one of the potential matches is recognized, there can be any +number of other matches present. In some parsing systems the run-time engine +can employ a strategy for resolving ambiguities, for example always pursuing +the longest possible match and discarding others. + +In Ragel, there is no regular expression run-time engine, just a simple state +machine execution model. When we begin to embed actions and face the +possibility of spurious action execution, it becomes clear that controlling +nondeterminism at the machine construction level is very important. Consider +the following example. + +% GENERATE: lines1 +% OPT: -p +% %%{ +% machine lines1; +% action first {} +% action tail {} +% word = [a-z]+; +.code +ws = [\n\t ]; +line = word $first ( ws word $tail )* '\n'; +lines = line*; +.end code +% main := lines; +% }%% +% END GENERATE + +.graphic lines1 0.53 + +Since the .verb|ws| expression includes the newline character, we will +not finish the .verb|line| expression when a newline character is seen. We will +simultaneously pursue the possibility of matching further words on the same +line and the possibility of matching a second line. Evidence of this fact is +in the state tables. On several transitions both the .verb|first| and +.verb|tail| actions are executed. The solution here is simple: exclude +the newline character from the .verb|ws| expression. 
+ +% GENERATE: lines2 +% OPT: -p +% %%{ +% machine lines2; +% action first {} +% action tail {} +% word = [a-z]+; +.code +ws = [\t ]; +line = word $first ( ws word $tail )* '\n'; +lines = line*; +.end code +% main := lines; +% }%% +% END GENERATE + +.graphic lines2 + +Solving this kind of problem is straightforward when the ambiguity is created +by strings that are a single character long. When the ambiguity is created by +strings that are multiple characters long we have a more difficult problem. +The following example is an incorrect attempt at a regular expression for C +language comments. + +% GENERATE: comments1 +% OPT: -p +% %%{ +% machine comments1; +% action comm {} +.code +comment = '/*' ( any @comm )* '*/'; +main := comment ' '; +.end code +% }%% +% END GENERATE + +.graphic comments1 + +Using standard concatenation, we will never leave the .verb|any*| expression. +We will forever entertain the possibility that a .verb|'*/'| string that we see +is contained in a longer comment and that, simultaneously, the comment has +ended. The concatenation of the .verb|comment| machine with .verb|SP| is done +to show this. When we match space, we are also still matching the comment body. + +One way to approach the problem is to exclude the terminating string +from the .verb|any*| expression using set difference. We must be careful to +exclude not just the terminating string, but any string that contains it as a +substring. A verbose, but proper specification of a C comment parser is given +by the following regular expression. + +% GENERATE: comments2 +% OPT: -p +% %%{ +% machine comments2; +% action comm {} +.code +comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/'; +.end code +% main := comment; +% }%% +% END GENERATE + +.graphic comments2 + +Note that Ragel's strong subtraction operator .verb|--| can also be used here. 
+In doing this subtraction we have phrased the problem of controlling non-determinism in +terms of excluding strings common to two expressions that interact when +combined. +We can also phrase the problem in terms of the transitions of the state +machines that implement these expressions. During the concatenation of +.verb|any*| and .verb|'*/'| we will be making transitions that are composed of +both the loop of the first expression and the final character of the second. +At this time we want the transition on the .verb|'/'| character to take precedence +over and disallow the transition that originated in the .verb|any*| loop. + +In another parsing problem, we wish to implement a lightweight tokenizer that we can +utilize in the composition of a larger machine. For example, some HTTP headers +have a token stream as a sub-language. The following example is an attempt +at a regular expression-based tokenizer that does not function correctly due to +unintended nondeterminism. + +% GENERATE: smallscanner +% OPT: -p +% %%{ +% machine smallscanner; +% action start_str {} +% action on_char {} +% action finish_str {} +.code +header_contents = ( + lower+ >start_str $on_char %finish_str | + ' ' +)*; +.end code +% main := header_contents; +% }%% +% END GENERATE + +.graphic smallscanner + +In this case, the problem with using a standard kleene star operation is that +there is an ambiguity between extending a token and wrapping around the machine +to begin a new token. Using the standard operator, we get an undesirable +nondeterministic behaviour. Evidence of this can be seen on the transition out +of state one to itself. The transition extends the string, and simultaneously, +finishes the string only to immediately begin a new one. What is required is +for the +transitions that represent an extension of a token to take precedence over the +transitions that represent the beginning of a new token. 
For this problem +there is no simple solution that uses standard regular expression operators. + +.section Priorities + +A priority mechanism was devised and built into the determinization +process, specifically for the purpose of allowing the user to control +nondeterminism. Priorities are integer values embedded into transitions. When +the determinization process is combining transitions that have different +priorities, the transition with the higher priority is preserved and the +transition with the lower priority is dropped. + +Unfortunately, priorities can have unintended side effects because their +operation requires that they linger in transitions indefinitely. They must linger +because the Ragel program cannot know when the user is finished with a priority +embedding. A solution whereby they are explicitly deleted after use is +conceivable; however this is not very user-friendly. Priorities were therefore +made into named entities. Only priorities with the same name are allowed to +interact. This allows any number of priorities to coexist in one machine for +the purpose of controlling various different regular expression operations and +eliminates the need to ever delete them. Such a scheme allows the user to +choose a unique name, embed two different priority values using that name +and be confident that the priority embedding will be free of any side effects. + +In the first form of priority embedding the name defaults to the name of the machine +definition that the priority is assigned in. In this sense priorities are by +default local to the current machine definition or instantiation. Beware of +using this form in a longest-match machine, since there is only one name for +the entire set of longest match patterns. In the second form the priority's +name can be specified, allowing priority interaction across machine definition +boundaries. + +.itemize +.item .verb|expr > int| -- Sets starting transitions to have priority int. 
+.item .verb|expr @ int| -- Sets transitions that go into a final state to have priority int. +.item .verb|expr $ int| -- Sets all transitions to have priority int. +.item .verb|expr % int| -- Sets leaving transitions to +have priority int. When a transition is made going out of the machine (either +by concatenation or kleene star) its priority is immediately set to the +leaving priority. +.end itemize + +The second form of priority assignment allows the programmer to specify the name +to which the priority is assigned. + +.itemize +.item .verb|expr > (name, int)| -- Starting transitions. +.item .verb|expr @ (name, int)| -- Finishing transitions (into a final state). +.item .verb|expr $ (name, int)| -- All transitions. +.item .verb|expr % (name, int)| -- Leaving transitions. +.end itemize + +.section Guarded Operators that Encapsulate Priorities + +Priority embeddings are a very expressive mechanism. At the same time they +can be very confusing for the user. They force the user to imagine +the transitions inside two interacting expressions and work out the precise +effects of the operations between them. When we consider +that this problem is worsened by the +potential for side effects caused by unintended priority name collisions, we +see that exposing the user to priorities is undesirable. + +Fortunately, in practice the use of priorities has been necessary only in a +small number of scenarios. This allows us to encapsulate their functionality +into a small set of operators and fully hide them from the user. This is +advantageous from a language design point of view because it greatly simplifies +the design. + +Going back to the C comment example, we can now properly specify +it using a guarded concatenation operator which we call .em{finish-guarded +concatenation}. From the user's point of view, this operator terminates the +first machine when the second machine moves into a final state. 
It chooses a +unique name and uses it to embed a low priority into all +transitions of the first machine. A higher priority is then embedded into the +transitions of the second machine that enter into a final state. The following +example yields a machine identical to the example in Section +.ref{controlling-nondeterminism}. + +.code +comment = '/*' ( any @comm )* :>> '*/'; +.end code + +.graphic comments2 + +Another guarded operator is .em{left-guarded concatenation}, given by the +.verb|<:| compound symbol. This operator places a higher priority on all +transitions of the first machine. This is useful if one must forcibly separate +two lists that contain common elements. For example, one may need to tokenize a +stream, but first consume leading whitespace. + +Ragel also includes a .em{longest-match kleene star} operator, given by the +.verb|**| compound symbol. This +guarded operator embeds a high +priority into all transitions of the machine. +A lower priority is then embedded into the leaving transitions. When the +kleene star operator makes the epsilon transitions from +the final states into the new start state, the lower priority will be transferred +to the epsilon transitions. In cases where following an epsilon transition +out of a final state conflicts with an existing transition out of a final +state, the epsilon transition will be dropped. + +Other guarded operators are conceivable, such as guards on union that cause one +alternative to take precedence over another. These may be implemented when it +is clear they constitute a frequently used operation. +In the next section we discuss the explicit specification of state machines +using state charts. + +.subsection Entry-Guarded Concatenation + +.verb|expr :> expr| + +This operator concatenates two machines, but first assigns a low +priority to all transitions +of the first machine and a high priority to the starting transitions of the +second machine. 
This operator is useful if from the final states of the first +machine it is possible to accept the characters in the entering transitions of +the second machine. This operator effectively terminates the first machine +immediately upon starting the second machine, where otherwise they would be +pursued concurrently. In the following example, entry-guarded concatenation is +used to move out of a machine that matches everything at the first sign of an +end-of-input marker. + +% GENERATE: entryguard +% OPT: -p +% %%{ +% machine entryguard; +.code +# Leave the catch-all machine on the first character of FIN. +main := any* :> 'FIN'; +.end code +% }%% +% END GENERATE + +.graphic entryguard + +Entry-guarded concatenation is equivalent to the following: + +.verbatim +expr $(unique_name,0) . expr >(unique_name,1) +.end verbatim + +.subsection Finish-Guarded Concatenation + +.verb|expr :>> expr| + +This operator is +like the previous operator, except the higher priority is placed on the final +transitions of the second machine. This is useful if one wishes to entertain +the possibility of continuing to match the first machine right up until the +second machine enters a final state. In other words it terminates the first +machine only when the second accepts. In the following example, finish-guarded +concatenation causes the move out of the machine that matches everything to be +delayed until the full end-of-input marker has been matched. + +% GENERATE: finguard +% OPT: -p +% %%{ +% machine finguard; +.code +# Leave the catch-all machine on the last character of FIN. +main := any* :>> 'FIN'; +.end code +% }%% +% END GENERATE + +.graphic finguard + +Finish-guarded concatenation is equivalent to the following, with one +exception. If the right machine's start state is final, the higher priority is +also embedded into it as a leaving priority. This prevents the left machine +from persisting via the zero-length string. + +.verbatim +expr $(unique_name,0) . 
expr @(unique_name,1) +.end verbatim + +.subsection Left-Guarded Concatenation + +.verb|expr <: expr| + +This operator places +a higher priority on the left expression. It is useful if you want to prefix a +sequence with another sequence composed of some of the same characters. For +example, one can consume leading whitespace before tokenizing a sequence of +whitespace-separated words as in: + +% GENERATE: leftguard +% OPT: -p +% %%{ +% machine leftguard; +% action alpha {} +% action ws {} +% action start {} +% action fin {} +.code +main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*; +.end code +% }%% +% END GENERATE + +.graphic leftguard + +Left-guarded concatenation is equivalent to the following: + +.verbatim +expr $(unique_name,1) . expr >(unique_name,0) +.end verbatim + +.subsection Longest-Match Kleene Star +.label{longest_match_kleene_star} + +.verb|expr**| + +This version of kleene star puts a higher priority on staying in the +machine versus wrapping around and starting over. The LM kleene star is useful +when writing simple tokenizers. These machines are built by applying the +longest-match kleene star to an alternation of token patterns, as in the +following. + +% GENERATE: lmkleene +% OPT: -p +% %%{ +% machine exfinpri; +% action A {} +% action B {} +.code +# Repeat tokens, but make sure to get the longest match. +main := ( + lower ( lower | digit )* %A | + digit+ %B | + ' ' +)**; +.end code +% }%% +% END GENERATE + +.graphic lmkleene + +If a regular kleene star were used the machine above would not be able to +distinguish between extending a word and beginning a new one. This operator is +equivalent to: + +.verbatim +( expr $(unique_name,1) %(unique_name,0) )* +.end verbatim + +When the kleene star is applied, transitions that go out of the machine and +back into it are made. These are assigned a priority of zero by the leaving +transition mechanism. 
This is less than the priority of one assigned to the +transitions leaving the final states but not leaving the machine. When +these transitions clash on the same character, the +transition that stays in the machine takes precedence. The transition +that wraps around is dropped. + +Note that this operator does not build a scanner in the traditional sense +because there is never any backtracking. To build a scanner with backtracking +use the Longest-Match machine construction described in Section +.ref{generating-scanners}. + +.chapter Interface to Host Program + +The Ragel code generator is very flexible. The generated code has no +dependencies and can be inserted in any function, perhaps inside a loop if +desired. The user is responsible for declaring and initializing a number of +required variables, including the current state and the pointer to the input +stream. These can live in any scope. Control of the input processing loop is +also possible: the user may break out of the processing loop and return to it +at any time. + +In the case of the C, D, and Go host languages, Ragel is able to generate very +fast-running code that implements state machines as directly executable code. +Since very large files strain the host language compiler, table-based code +generation is also supported. In the future we hope to provide a partitioned, +directly executable format that is able to reduce the burden on the host +compiler by splitting large machines across multiple functions. + +In the case of Java and Ruby, table-based code generation is the only code +style supported. In the future this may be expanded to include other code +styles. + +Ragel can be used to parse input in one block, or it can be used to parse input +in a sequence of blocks as it arrives from a file or socket. Parsing the input +in a sequence of blocks brings with it a few responsibilities. 
If the parser +utilizes a scanner, care must be taken to not break the input stream anywhere +but token boundaries. If pointers to the input stream are taken during +parsing, care must be taken to not use a pointer that has been invalidated by +movement to a subsequent block. If the current input data pointer is moved +backwards it must not be moved past the beginning of the current block. + +Figure .ref{basic-example} shows a simple Ragel program that does not have any +actions. The example tests the first argument of the program against a number +pattern and then prints the machine's acceptance status. + +.figure basic-example +.verbatim +#include <stdio.h> +#include <string.h> +%%{ + machine foo; + write data; +}%% +int main( int argc, char **argv ) +{ + int cs; + if ( argc > 1 ) { + char *p = argv[1]; + char *pe = p + strlen( p ); + %%{ + main := [0-9]+ ( '.' [0-9]+ )?; + + write init; + write exec; + }%% + } + printf("result = %i\n", cs >= foo_first_final ); + return 0; +} +.end verbatim +.caption A basic Ragel example without any actions. +.end figure + +.section Variables Used by Ragel + +There are a number of variables that Ragel expects the user to declare. At a +very minimum the .verb|cs|, .verb|p| and .verb|pe| variables must be declared. +In Go, Java and Ruby code the .verb|data| variable must also be declared. If +EOF actions are used then the .verb|eof| variable is required. If +stack-based state machine control flow statements are used then the +.verb|stack| and .verb|top| variables are required. If a scanner is declared +then the .verb|act|, .verb|ts| and .verb|te| variables must be +declared. + +.itemize + +.item .verb|cs| - Current state. This must be an integer and it should persist +across invocations of the machine when the data is broken into blocks that are +processed independently. This variable may be modified from outside the +execution loop, but not from within. + +.item .verb|p| - Data pointer. 
In C/D code this variable is expected to be a
+pointer to the character data to process. It should be initialized to the
+beginning of the data block on every run of the machine. In Go, Java and Ruby it is
+used as an offset to .verb|data| and must be an integer. In this case it should
+be initialized to zero on every run of the machine.
+
+.item .verb|pe| - Data end pointer. This should be initialized to .verb|p| plus
+the data length on every run of the machine. In Go, Java and Ruby code this should
+be initialized to the data length.
+
+.item .verb|eof| - End of file pointer. This should be set to .verb|pe| when
+the buffer block being processed is the last one, otherwise it should be set to
+null. In Go, Java and Ruby code .verb|-1| must be used instead of null. If the EOF
+event can be known only after the final buffer block has been processed, then
+it is possible to set .verb|p = pe = eof| and run the execute block.
+
+.item .verb|data| - This variable is only required in Go, Java and Ruby code. It
+must be an array containing the data to process.
+
+.item .verb|stack| - This must be an array of integers. It is used to store
+integer values representing states. If the stack must resize dynamically the
+Pre-Push and Post-Pop statements can be used to do this (Sections
+.ref{prepush} and .ref{postpop}).
+
+.item .verb|top| - This must be an integer value and will be used as an offset
+to .verb|stack|, giving the next available spot on the top of the stack.
+
+.item .verb|act| - This must be an integer value. It is a variable sometimes
+used by scanner code to keep track of the most recent successful pattern match.
+
+.item .verb|ts| - This must be a pointer to character data. In Go, Java and
+Ruby code this must be an integer. See Section .ref{generating-scanners} for
+more information.
+
+.item .verb|te| - Also a pointer to character data.
+ +.end itemize + +.section Alphtype Statement + +.verbatim +alphtype unsigned int; +.end verbatim + +The alphtype statement specifies the alphabet data type that the machine +operates on. During the compilation of the machine, integer literals are +expected to be in the range of possible values of the alphtype. The default +is .verb|char| for all languages except Go where the default is .verb|byte|. + +.multicols +C/C++/Objective-C: +.verbatim + char unsigned char + short unsigned short + int unsigned int + long unsigned long +.end verbatim + +Go: +.verbatim + byte + int8 uint8 + int16 uint16 + int32 uint32 + int +.end verbatim + +Ruby: +.verbatim + char + int +.end verbatim + +.columnbreak + +Java: +.verbatim + char + byte + short + int +.end verbatim + +D: +.verbatim + char + byte ubyte + short ushort + wchar + int uint + dchar +.end verbatim + +.end multicols + +.section Getkey Statement + +.verbatim +getkey fpc->id; +.end verbatim + +This statement specifies to Ragel how to retrieve the current character from +from the pointer to the current element (.verb|p|). Any expression that returns +a value of the alphabet type +may be used. The getkey statement may be used for looking into element +structures or for translating the character to process. The getkey expression +defaults to .verb|(*p)|. In goto-driven machines the getkey expression may be +evaluated more than once per element processed, therefore it should not incur a +large cost nor preclude optimization. + +.section Access Statement + +.verbatim +access fsm->; +.end verbatim + +The access statement specifies how the generated code should +access the machine data that is persistent across processing buffer blocks. +This applies to all variables except .verb|p|, .verb|pe| and .verb|eof|. This includes +.verb|cs|, .verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|. +The access statement is useful if a machine is to be encapsulated inside a +structure in C code. 
It can be used to give the name of +a pointer to the structure. + +.section Variable Statement + +.verbatim +variable p fsm->p; +.end verbatim + +The variable statement specifies how to access a specific +variable. All of the variables that are declared by the user and +used by Ragel can be changed. This includes .verb|p|, .verb|pe|, .verb|eof|, .verb|cs|, +.verb|top|, .verb|stack|, .verb|ts|, .verb|te| and .verb|act|. +In Go, Ruby and Java code generation the .verb|data| variable can also be changed. + +.section Pre-Push Statement +.label{prepush} + +.verbatim +prepush { + /* stack growing code */ +} +.end verbatim + +The prepush statement allows the user to supply stack management code that is +written out during the generation of fcall, immediately before the current +state is pushed to the stack. This statement can be used to test the number of +available spaces and dynamically grow the stack if necessary. + +.section Post-Pop Statement +.label{postpop} + +.verbatim +postpop { + /* stack shrinking code */ +} +.end verbatim + +The postpop statement allows the user to supply stack management code that is +written out during the generation of fret, immediately after the next state is +popped from the stack. This statement can be used to dynamically shrink the +stack. + +.section Write Statement +.label{write-statement} + +.verbatim +write <component> [options]; +.end verbatim + +The write statement is used to generate parts of the machine. +There are seven +components that can be generated by a write statement. These components make up the +state machine's data, initialization code, execution code, and export definitions. +A write statement may appear before a machine is fully defined. +This allows one to write out the data first then later define the machine where +it is used. An example of this is shown in Figure .ref{fbreak-example}. 
+ +.subsection Write Data +.verbatim +write data [options]; +.end verbatim + +The write data statement causes Ragel to emit the constant static data needed +by the machine. In table-driven output styles (see Section .ref{genout}) this +is a collection of arrays that represent the states and transitions of the +machine. In goto-driven machines much less data is emitted. At the very +minimum a start state .verb|name_start| is generated. All variables written +out in machine data have both the .verb|static| and .verb|const| properties and +are prefixed with the name of the machine and an +underscore. The data can be placed inside a class, inside a function, or it can +be defined as global data. + +Two variables are written that may be used to test the state of the machine +after a buffer block has been processed. The .verb|name_error| variable gives +the id of the state that the machine moves into when it cannot find a valid +transition to take. The machine immediately breaks out of the processing loop when +it finds itself in the error state. The error variable can be compared to the +current state to determine if the machine has failed to parse the input. If the +machine is complete, that is from every state there is a transition to a proper +state on every possible character of the alphabet, then no error state is required +and this variable will be set to -1. + +The .verb|name_first_final| variable stores the id of the first final state. +All of the machine's states are sorted by their final state status before +having their ids assigned. Checking if the machine has accepted its input can +then be done by checking if the current state is greater-than or equal to the +first final state. + +Data generation has several options: + +.list +.li .verb|noerror | - Do not generate the integer variable that gives the id of the error state. +.li .verb|nofinal | - Do not generate the integer variable that gives the id of the first final state. 
+
+.li .verb|noprefix | - Do not prefix the variable names with the name of the machine.
+.end list
+
+.figure fbreak-example
+.verbatim
+#include <stdio.h>
+%% machine foo;
+%% write data;
+int main( int argc, char **argv )
+{
+    int cs, res = 0;
+    if ( argc > 1 ) {
+        char *p = argv[1];
+        %%{
+            main :=
+                [a-z]+
+                0 @{ res = 1; fbreak; };
+            write init;
+            write exec noend;
+        }%%
+    }
+    printf("execute = %i\n", res );
+    return 0;
+}
+.end verbatim
+.caption Use of .tt{noend} write option and the .tt{fbreak} statement for
+processing a string.
+.end figure
+
+.subsection Write Start, First Final and Error
+
+.verbatim
+write start;
+write first_final;
+write error;
+.end verbatim
+
+These three write statements provide an alternative means of accessing the
+.verb|start|, .verb|first_final| and .verb|error| states. If there are many
+different machine specifications in one file it is easy to get the prefix for
+these wrong. This is especially true if the state machine boilerplate is
+frequently made by a copy-paste-edit process. These write statements allow the
+problem to be avoided. They can be used as follows:
+
+.verbatim
+/* Did parsing succeed? */
+if ( cs < %%{ write first_final; }%% ) {
+    result = ERR_PARSE_ERROR;
+    goto fail;
+}
+.end verbatim
+
+.subsection Write Init
+.verbatim
+write init [options];
+.end verbatim
+
+The write init statement causes Ragel to emit initialization code. This should
+be executed once before the machine is started. At a very minimum this sets the
+current state to the start state. If other variables are needed by the
+generated code, such as call stack variables or scanner management
+variables, they are also initialized here.
+
+The .verb|nocs| option to the write init statement will cause ragel to skip
+initialization of the cs variable. This is useful if the user wishes to use
+custom logic to decide which state the specification should start in.
+
+.subsection Write Exec
+.verbatim
+write exec [options];
+.end verbatim
+
+The write exec statement causes Ragel to emit the state machine's execution code.
+Ragel expects several variables to be available to this code. At a very minimum, the
+generated code needs access to the current character position .verb|p|, the ending
+position .verb|pe| and the current state .verb|cs| (though .verb|pe|
+can be omitted using the .verb|noend| write option).
+The .verb|p| variable is the cursor that the execute code will
+use to traverse the input. The .verb|pe| variable should be set up to point to one
+position past the last valid character in the buffer.
+
+Other variables are needed when certain features are used. For example using
+the .verb|fcall| or .verb|fret| statements requires .verb|stack| and
+.verb|top| variables to be defined. If a longest-match construction is used,
+variables for managing backtracking are required.
+
+The write exec statement has one option. The .verb|noend| option tells Ragel
+to generate code that ignores the end position .verb|pe|. In this
+case the user must explicitly break out of the processing loop using
+.verb|fbreak|, otherwise the machine will continue to process characters until
+it moves into the error state. This option is useful if one wishes to process a
+null terminated string. Rather than traverse the string to discover the length
+before processing the input, the user can break out when the null character is
+seen. The example in Figure .ref{fbreak-example} shows the use of the
+.verb|noend| write option and the .verb|fbreak| statement for processing a string.
+
+.subsection Write Exports
+.label{export}
+
+.verbatim
+write exports;
+.end verbatim
+
+The export feature can be used to export simple machine definitions. Machine definitions
+are marked for export using the .verb|export| keyword.
+ +.verbatim +export machine_to_export = 0x44; +.end verbatim + +When the write exports statement is used these machines are +written out in the generated code. Defines are used for C and constant integers +are used for D, Java and Ruby. See Section .ref{import} for a description of the +import statement. + +.section Maintaining Pointers to Input Data + +In the creation of any parser it is not uncommon to require the collection of +the data being parsed. It is always possible to collect data into a growable +buffer as the machine moves over it, however the copying of data is a somewhat +wasteful use of processor cycles. The most efficient way to collect data from +the parser is to set pointers into the input then later reference them. This +poses a problem for uses of Ragel where the input data arrives in blocks, such +as over a socket or from a file. If a pointer is set in one buffer block but +must be used while parsing a following buffer block, some extra consideration +to correctness must be made. + +The scanner constructions exhibit this problem, requiring the maintenance +code described in Section .ref{generating-scanners}. If a longest-match +construction has been used somewhere in the machine then it is possible to +take advantage of the required prefix maintenance code in the driver program to +ensure pointers to the input are always valid. If laying down a pointer one can +set .verb|ts| at the same spot or ahead of it. When data is shifted in +between loops the user must also shift the pointer. In this way it is possible +to maintain pointers to the input that will always be consistent. + +.figure line-oriented +.verbatim + int have = 0; + while ( 1 ) { + char *p, *pe, *data = buf + have; + int len, space = BUFSIZE - have; + + if ( space == 0 ) { + fprintf(stderr, "BUFFER OUT OF SPACE\n"); + exit(1); + } + + len = fread( data, 1, space, stdin ); + if ( len == 0 ) + break; + + /* Find the last newline by searching backwards. 
*/ + p = buf; + pe = data + len - 1; + while ( *pe != '\n' && pe >= buf ) + pe--; + pe += 1; + + %% write exec; + + /* How much is still in the buffer? */ + have = data + len - pe; + if ( have > 0 ) + memmove( buf, pe, have ); + + if ( len < space ) + break; + } +.end verbatim +.caption An example of line-oriented processing. +.end figure + +In general, there are two approaches for guaranteeing the consistency of +pointers to input data. The first approach is the one just described; +lay down a marker from an action, +then later ensure that the data the marker points to is preserved ahead of +the buffer on the next execute invocation. This approach is good because it +allows the parser to decide on the pointer-use boundaries, which can be +arbitrarily complex parsing conditions. A downside is that it requires any +pointers that are set to be corrected in between execute invocations. + +The alternative is to find the pointer-use boundaries before invoking the execute +routine, then pass in the data using these boundaries. For example, if the +program must perform line-oriented processing, the user can scan backwards from +the end of an input block that has just been read in and process only up to the +first found newline. On the next input read, the new data is placed after the +partially read line and processing continues from the beginning of the line. +An example of line-oriented processing is given in Figure .ref{line-oriented}. + +.section Specifying the Host Language + +The .verb|ragel| program has a number of options for specifying the host +language. The host-language options are: + +.itemize +.item .verb|-C | for C/C++/Objective-C code (default) +.item .verb|-D | for D code. +.item .verb|-Z | for Go code. +.item .verb|-J | for Java code. +.item .verb|-R | for Ruby code. +.item .verb|-A | for C\# code. +.end itemize + +.section Choosing a Generated Code Style +.label{genout} + +There are three styles of code output to choose from. 
Code style affects the +size and speed of the compiled binary. Changing code style does not require any +change to the Ragel program. There are two table-driven formats and a goto +driven format. + +In addition to choosing a style to emit, there are various levels of action +code reuse to choose from. The maximum reuse levels (.verb|-T0|, .verb|-F0| +and .verb|-G0|) ensure that no FSM action code is ever duplicated by encoding +each transition's action list as static data and iterating +through the lists on every transition. This will normally result in a smaller +binary. The less action reuse options (.verb|-T1|, .verb|-F1| and .verb|-G1|) +will usually produce faster running code by expanding each transition's action +list into a single block of code, eliminating the need to iterate through the +lists. This duplicates action code instead of generating the logic necessary +for reuse. Consequently the binary will be larger. However, this tradeoff applies to +machines with moderate to dense action lists only. If a machine's transitions +frequently have less than two actions then the less reuse options will actually +produce both a smaller and a faster running binary due to less action sharing +overhead. The best way to choose the appropriate code style for your +application is to perform your own tests. + +The table-driven FSM represents the state machine as constant static data. There are +tables of states, transitions, indices and actions. The current state is +stored in a variable. The execution is simply a loop that looks up the current +state, looks up the transition to take, executes any actions and moves to the +target state. In general, the table-driven FSM can handle any machine, produces +a smaller binary and requires a less expensive host language compile, but +results in slower running code. Since the table-driven format is the most +flexible it is the default code style. 
+ +The flat table-driven machine is a table-based machine that is optimized for +small alphabets. Where the regular table machine uses the current character as +the key in a binary search for the transition to take, the flat table machine +uses the current character as an index into an array of transitions. This is +faster in general, however is only suitable if the span of possible characters +is small. + +The goto-driven FSM represents the state machine using goto and switch +statements. The execution is a flat code block where the transition to take is +computed using switch statements and directly executable binary searches. In +general, the goto FSM produces faster code but results in a larger binary and a +more expensive host language compile. + +The goto-driven format has an additional action reuse level (.verb|-G2|) that +writes actions directly into the state transitioning logic rather than putting +all the actions together into a single switch. Generally this produces faster +running code because it allows the machine to encode the current state using +the processor's instruction pointer. Again, sparse machines may actually +compile to smaller binaries when .verb|-G2| is used due to less state and +action management overhead. For many parsing applications .verb|-G2| is the +preferred output format. 
+ +.center + +Code Output Style Options + +.tabular +.row .verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\# +.row .verb|-T1|&binary search, expanded actions&C/D/Ruby/C\# +.row .verb|-F0|&flat table-driven&C/D/Ruby/C\# +.row .verb|-F1|&flat table, expanded actions&C/D/Ruby/C\# +.row .verb|-G0|&goto-driven&C/D/C\# +.row .verb|-G1|&goto, expanded actions&C/D/C\# +.row .verb|-G2|&goto, in-place actions&C/D/Go +.end tabular +.end center + +.chapter Beyond the Basic Model + +.section Parser Modularization +.label{modularization} + +It is possible to use Ragel's machine construction and action embedding +operators to specify an entire parser using a single regular expression. In +many cases this is the desired way to specify a parser in Ragel. However, in +some scenarios the language to parse may be so large that it is difficult to +think about it as a single regular expression. It may also shift between distinct +parsing strategies, in which case modularization into several coherent blocks +of the language may be appropriate. + +It may also be the case that patterns that compile to a large number of states +must be used in a number of different contexts and referencing them in each +context results in a very large state machine. In this case, an ability to reuse +parsers would reduce code size. + +To address this, distinct regular expressions may be instantiated and linked +together by means of a jumping and calling mechanism. This mechanism is +analogous to the jumping to and calling of processor instructions. A jump +command, given in action code, causes control to be immediately passed to +another portion of the machine by way of setting the current state variable. A +call command causes the target state of the current transition to be pushed to +a state stack before control is transferred. Later on, the original location +may be returned to with a return statement. 
In the following example, distinct
+state machines are used to handle the parsing of two types of headers.
+
+% GENERATE: call
+% %%{
+% machine call;
+.code
+action return { fret; }
+action call_date { fcall date; }
+action call_name { fcall name; }
+
+# A parser for date strings.
+date := [0-9][0-9] '/'
+    [0-9][0-9] '/'
+    [0-9][0-9][0-9][0-9] '\n' @return;
+
+# A parser for name strings.
+name := ( [a-zA-Z]+ | ' ' )** '\n' @return;
+
+# The main parser.
+headers =
+    ( 'from' | 'to' ) ':' @call_name |
+    ( 'departed' | 'arrived' ) ':' @call_date;
+
+main := headers*;
+.end code
+% }%%
+% %% write data;
+% void f()
+% {
+% %% write init;
+% %% write exec;
+% }
+% END GENERATE
+
+Calling and jumping should be used carefully as they are operations that take
+one out of the domain of regular languages. A machine that contains a call or
+jump statement in one of its actions should be used as an argument to a machine
+construction operator only with considerable care. Since DFA transitions may
+actually represent several NFA transitions, a call or jump embedded in one
+machine can inadvertently terminate another machine that it shares prefixes
+with. Despite this danger, these statements have proven useful for tying
+together sub-parsers of a language into a parser for the full language,
+especially for the purpose of modularizing code and reducing the number of
+states when the machine contains frequently recurring patterns.
+
+Section .ref{vals} describes the jump and call statements that are used to
+transfer control. These statements make use of two variables that must be
+declared by the user, .verb|stack| and .verb|top|. The .verb|stack| variable
+must be an array of integers and .verb|top| must be a single integer, which
+will point to the next available space in .verb|stack|. Sections .ref{prepush}
+and .ref{postpop} describe the Pre-Push and Post-Pop statements which can be
+used to implement a dynamically resizable array.
+ +.section Referencing Names +.label{labels} + +This section describes how to reference names in epsilon transitions (Section +.ref{state-charts}) and +action-based control-flow statements such as .verb|fgoto|. There is a hierarchy +of names implied in a Ragel specification. At the top level are the machine +instantiations. Beneath the instantiations are labels and references to machine +definitions. Beneath those are more labels and references to definitions, and +so on. + +Any name reference may contain multiple components separated with the .verb|::| +compound symbol. The search for the first component of a name reference is +rooted at the join expression that the epsilon transition or action embedding +is contained in. If the name reference is not contained in a join, +the search is rooted at the machine definition that the epsilon transition or +action embedding is contained in. Each component after the first is searched +for beginning at the location in the name tree that the previous reference +component refers to. + +In the case of action-based references, if the action is embedded more than +once, the local search is performed for each embedding and the result is the +union of all the searches. If no result is found for action-based references then +the search is repeated at the root of the name tree. Any action-based name +search may be forced into a strictly global search by prefixing the name +reference with .verb|::|. + +The final component of the name reference must resolve to a unique entry point. +If a name is unique in the entire name tree it can be referenced as is. If it +is not unique it can be specified by qualifying it with names above it in the +name tree. However, it can always be renamed. + +% FIXME: Should fit this in somewhere. +% Some kinds of name references are illegal. Cannot call into longest-match +% machine, can only call its start state. 
Cannot make a call to anywhere from +% any part of a longest-match machine except a rule's action. This would result +% in an eventual return to some point inside a longest-match other than the +% start state. This is banned for the same reason a call into the LM machine is +% banned. + + +.section Scanners +.label{generating-scanners} + +Scanners are very much intertwined with regular-languages and their +corresponding processors. For this reason Ragel supports the definition of +scanners. The generated code will repeatedly attempt to match patterns from a +list, favouring longer patterns over shorter patterns. In the case of +equal-length matches, the generated code will favour patterns that appear ahead +of others. When a scanner makes a match it executes the user code associated +with the match, consumes the input then resumes scanning. + +.verbatim +<machine_name> := |* + pattern1 => action1; + pattern2 => action2; + ... + *|; +.end verbatim + +On the surface, Ragel scanners are similar to those defined by Lex. Though +there is a key distinguishing feature: patterns may be arbitrary Ragel +expressions and can therefore contain embedded code. With a Ragel-based scanner +the user need not wait until the end of a pattern before user code can be +executed. + +Scanners can be used to process sub-languages, as well as for tokenizing +programming languages. In the following example a scanner is used to tokenize +the contents of a header field. + +.code +word = [a-z]+; +head_name = 'Header'; + +header := |* + word; + ' '; + '\n' => { fret; }; +*|; + +main := ( head_name ':' @{ fcall header; } )*; +.end code + +The scanner construction has a purpose similar to the longest-match kleene star +operator .verb|**|. The key +difference is that a scanner is able to backtrack to match a previously matched +shorter string when the pursuit of a longer string fails. For this reason the +scanner construction operator is not a pure state machine construction +operator. 
It relies on several variables that enable it to backtrack and make +pointers to the matched input text available to the user. For this reason +scanners must be immediately instantiated. They cannot be defined inline or +referenced by another expression. Scanners must be jumped to or called. + +Scanners rely on the .verb|ts|, .verb|te| and .verb|act| +variables to be present so that they can backtrack and make pointers to the +matched text available to the user. If input is processed using multiple calls +to the execute code then the user must ensure that when a token is only +partially matched that the prefix is preserved on the subsequent invocation of +the execute code. + +The .verb|ts| variable must be defined as a pointer to the input data. +It is used for recording where the current token match begins. This variable +may be used in action code for retrieving the text of the current match. Ragel +ensures that in between tokens and outside of the longest-match machines that +this pointer is set to null. In between calls to the execute code the user must +check if .verb|ts| is set and if so, ensure that the data it points to is +preserved ahead of the next buffer block. This is described in more detail +below. + +The .verb|te| variable must also be defined as a pointer to the input data. +It is used for recording where a match ends and where scanning of the next +token should begin. This can also be used in action code for retrieving the +text of the current match. + +The .verb|act| variable must be defined as an integer type. It is used for +recording the identity of the last pattern matched when the scanner must go +past a matched pattern in an attempt to make a longer match. If the longer +match fails it may need to consult the .verb|act| variable. 
In some cases, use +of the .verb|act| +variable can be avoided because the value of the current state is enough +information to determine which token to accept, however in other cases this is +not enough and so the .verb|act| variable is used. + +When the longest-match operator is in use, the user's driver code must take on +some buffer management functions. The following algorithm gives an overview of +the steps that should be taken to properly use the longest-match operator. + +.itemize +.item Read a block of input data. +.item Run the execute code. +.item If .verb|ts| is set, the execute code will expect the incomplete +token to be preserved ahead of the buffer on the next invocation of the execute +code. +.itemize +.item Shift the data beginning at .verb|ts| and ending at .verb|pe| to the +beginning of the input buffer. +.item Reset .verb|ts| to the beginning of the buffer. +.item Shift .verb|te| by the distance from the old value of .verb|ts| +to the new value. The .verb|te| variable may or may not be valid. There is +no way to know if it holds a meaningful value because it is not kept at null +when it is not in use. It can be shifted regardless. +.end itemize +.item Read another block of data into the buffer, immediately following any +preserved data. +.item Run the scanner on the new data. +.end itemize + +Figure .ref{preserve_example} shows the required handling of an input stream in +which a token is broken by the input block boundaries. After processing up to +and including the ``t'' of ``characters'', the prefix of the string token must be +retained and processing should resume at the ``e'' on the next iteration of +the execute code. + +If one uses a large input buffer for collecting input then the number of times +the shifting must be done will be small. 
Furthermore, if one takes care not to +define tokens that are allowed to be very long and instead processes these +items using pure state machines or sub-scanners, then only a small amount of +data will ever need to be shifted. + +.figure preserve_example +.verbatim + a) A stream "of characters" to be scanned. + | | | + p ts pe + + b) "of characters" to be scanned. + | | | + ts p pe +.end verbatim +.caption Following an invocation of the execute code there may be a partially +matched token (a). The data of the partially matched token +must be preserved ahead of the new data on the next invocation (b). +.end figure + +Since scanners attempt to make the longest possible match of input, patterns +such as identifiers require one character of lookahead in order to trigger a +match. In the case of the last token in the input stream the user must ensure +that the .verb|eof| variable is set so that the final token is flushed out. + +An example scanner processing loop is given in Figure .ref{scanner-loop}. + +.figure scanner-loop +.verbatim + int have = 0; + bool done = false; + while ( !done ) { + /* How much space is in the buffer? */ + int space = BUFSIZE - have; + if ( space == 0 ) { + /* Buffer is full. */ + cerr << "TOKEN TOO BIG" << endl; + exit(1); + } + + /* Read in a block after any data we already have. */ + char *p = inbuf + have; + cin.read( p, space ); + int len = cin.gcount(); + + char *pe = p + len; + char *eof = 0; + + /* If no data was read indicate EOF. */ + if ( len == 0 ) { + eof = pe; + done = true; + } + + %% write exec; + + if ( cs == Scanner_error ) { + /* Machine failed before finding a token. */ + cerr << "PARSE ERROR" << endl; + exit(1); + } + + if ( ts == 0 ) + have = 0; + else { + /* There is a prefix to preserve, shift it over. */ + have = pe - ts; + memmove( inbuf, ts, have ); + te = inbuf + (te-ts); + ts = inbuf; + } + } +.end verbatim +.caption A processing loop for a scanner. 
+
+.end figure
+
+.section State Charts
+.label{state-charts}
+
+In addition to supporting the construction of state machines using regular
+languages, Ragel provides a way to manually specify state machines using
+state charts. The comma operator combines machines together without any
+implied transitions. The user can then manually link machines by specifying
+epsilon transitions with the .verb|->| operator. Epsilon transitions are drawn
+between the final states of a machine and entry points defined by labels. This
+makes it possible to build machines using the explicit state-chart method while
+making minimal changes to the Ragel language.
+
+An interesting feature of Ragel's state chart construction method is that it
+can be mixed freely with regular expression constructions. A state chart may be
+referenced from within a regular expression, or a regular expression may be
+used in the definition of a state chart transition.
+
+.subsection Join
+
+.verb|expr , expr , ...|
+
+Join a list of machines together without
+drawing any transitions, without setting up a start state, and without
+designating any final states. Transitions between the machines may be specified
+using labels and epsilon transitions. The start state must be explicitly
+specified with the ``start'' label. Final states may be specified with an
+epsilon transition to the implicitly created ``final'' state. The join
+operation allows one to build machines using a state chart model.
+
+.subsection Label
+
+.verb|label: expr|
+
+Attaches a label to an expression. Labels can be
+used as the target of epsilon transitions and explicit control transfer
+statements such as .verb|fgoto| and .verb|fnext| in action
+code.
+
+.subsection Epsilon
+
+.verb|expr -> label|
+
+Draws an epsilon transition to the state defined
+by .verb|label|. Epsilon transitions are made deterministic when join
+operators are evaluated. 
Epsilon transitions that are not in a join operation +are made deterministic when the machine definition that contains the epsilon is +complete. See Section .ref{labels} for information on referencing labels. + +.subsection Simplifying State Charts + +There are two benefits to providing state charts in Ragel. The first is that it +allows us to take a state chart with a full listing of states and transitions +and simplify it in selective places using regular expressions. + +The state chart method of specifying parsers is very common. It is an +effective programming technique for producing robust code. The key disadvantage +becomes clear when one attempts to comprehend a large parser specified in this +way. These programs usually require many lines, causing logic to be spread out +over large distances in the source file. Remembering the function of a large +number of states can be difficult and organizing the parser in a sensible way +requires discipline because branches and repetition present many file layout +options. This kind of programming takes a specification with inherent +structure such as looping, alternation and concatenation and expresses it in a +flat form. + +If we could take an isolated component of a manually programmed state chart, +that is, a subset of states that has only one entry point, and implement it +using regular language operators then we could eliminate all the explicit +naming of the states contained in it. By eliminating explicitly named states +and replacing them with higher-level specifications we simplify a state machine +specification. + +For example, sometimes chains of states are needed, with only a small number of +possible characters appearing along the chain. These can easily be replaced +with a concatenation of characters. Sometimes a group of common states +implement a loop back to another single portion of the machine. 
Rather than
+manually duplicate all the transitions that loop back, we may be able to
+express the loop using a kleene star operator.
+
+Ragel allows one to take this state map simplification approach. We can build
+state machines using a state map model and implement portions of the state map
+using regular languages. In place of any transition in the state machine,
+entire sub-machines can be given. These can encapsulate functionality
+defined elsewhere. An important aspect of the Ragel approach is that when we
+wrap up a collection of states using a regular expression we do not lose
+access to the states and transitions. We can still execute code on the
+transitions that we have encapsulated.
+
+.subsection Dropping Down One Level of Abstraction
+.label{down}
+
+The second benefit of incorporating state charts into Ragel is that it permits
+us to bypass the regular language abstraction if we need to. Ragel's action
+embedding operators are sometimes insufficient for expressing certain parsing
+tasks. In the same way that it is useful for C language programmers to drop down
+to assembly language programming using embedded assembler, it is sometimes
+useful for the Ragel programmer to drop down to programming with state charts.
+
+In the following example, we wish to buffer the characters of an XML CDATA
+sequence. The sequence is terminated by the string .verb|]]>|. The challenge
+in our application is that we do not wish the terminating characters to be
+buffered. An expression of the form .verb|any* @buffer :>> ']]>'| will not work
+because the buffer will always contain the characters .verb|]]| on the end.
+Instead, what we need is to delay the buffering of .verb|]|
+characters until a time when we
+abandon the terminating sequence and go back into the main loop. There is no
+easy way to express this using Ragel's regular expression and action embedding
+operators, and so an ability to drop down to the state chart method is useful. 
+ +% GENERATE: dropdown +% OPT: -p +% %%{ +% machine dropdown; +.code +action bchar { buff( fpc ); } # Buffer the current character. +action bbrack1 { buff( "]" ); } +action bbrack2 { buff( "]]" ); } + +CDATA_body = +start: ( + ']' -> one | + (any-']') @bchar ->start +), +one: ( + ']' -> two | + [^\]] @bbrack1 @bchar ->start +), +two: ( + '>' -> final | + ']' @bbrack1 -> two | + [^>\]] @bbrack2 @bchar ->start +); +.end code +% main := CDATA_body; +% }%% +% END GENERATE + +.graphic dropdown + + +.section Semantic Conditions +.label{semantic} + +Many communication protocols contain variable-length fields, where the length +of the field is given ahead of the field as a value. This +problem cannot be expressed using regular languages because of its +context-dependent nature. The prevalence of variable-length fields in +communication protocols motivated us to introduce semantic conditions into +the Ragel language. + +A semantic condition is a block of user code that is interpreted as an +expression and evaluated immediately +before a transition is taken. If the code returns a value of true, the +transition may be taken. We can now embed code that extracts the length of a +field, then proceed to match $n$ data values. + +% GENERATE: conds1 +% OPT: -p +% %%{ +% machine conds1; +% number = digit+; +.code +action rec_num { i = 0; n = getnumber(); } +action test_len { i++ < n } +data_fields = ( + 'd' + [0-9]+ %rec_num + ':' + ( [a-z] when test_len )* +)**; +.end code +% main := data_fields; +% }%% +% END GENERATE + +.graphic conds1 + +The Ragel implementation of semantic conditions does not force us to give up the +compositional property of Ragel definitions. For example, a machine that tests +the length of a field using conditions can be unioned with another machine +that accepts some of the same strings, without the two machines interfering with +one another. 
The user need not be concerned about whether or not the result of the +semantic condition will affect the matching of the second machine. + +To see this, first consider that when a user associates a condition with an +existing transition, the transition's label is translated from the base character +to its corresponding value in the space that represents ``condition $c$ true''. Should +the determinization process combine a state that has a conditional transition +with another state that has a transition on the same input character but +without a condition, then the condition-less transition first has its label +translated into two values, one to its corresponding value in the space that +represents ``condition $c$ true'' and another to its corresponding value in the +space that represents ``condition $c$ false''. It +is then safe to combine the two transitions. This is shown in the following +example. Two intersecting patterns are unioned, one with a condition and one +without. The condition embedded in the first pattern does not affect the second +pattern. + +% GENERATE: conds2 +% OPT: -p +% %%{ +% machine conds2; +% number = digit+; +.code +action test_len { i++ < n } +action one { /* accept pattern one */ } +action two { /* accept pattern two */ } +patterns = + ( [a-z] when test_len )+ %one | + [a-z][a-z0-9]* %two; +main := patterns '\n'; +.end code +% }%% +% END GENERATE + +.graphic conds2 + +There are many more potential uses for semantic conditions. The user is free to +use arbitrary code and may therefore perform actions such as looking up names +in dictionaries, validating input using external parsing mechanisms or +performing checks on the semantic structure of input seen so far. In the next +section we describe how Ragel accommodates several common parser engineering +problems. + +The semantic condition feature works only with alphabet types that are smaller +in width than the .verb|long| type. 
To implement semantic conditions Ragel
+needs to be able to allocate characters from the alphabet space. Ragel uses
+these allocated characters to express "character C with condition P true" or "C
+with P false." Since internally Ragel uses longs to store characters there is
+no room left in the alphabet space unless an alphabet type smaller than long is
+used.
+
+.section Implementing Lookahead
+
+There are a few strategies for implementing lookahead in Ragel programs.
+Leaving actions, which are described in Section .ref{out-actions}, can be
+used as a form of lookahead. Ragel also provides the .verb|fhold| directive
+which can be used in actions to prevent the machine from advancing over the
+current character. It is also possible to manually adjust the current character
+position by shifting it backwards using .verb|fexec|, however when this is
+done, care must be taken not to overstep the beginning of the current buffer
+block. In both the use of .verb|fhold| and .verb|fexec| the user must be
+cautious of combining the resulting machine with another in such a way that the
+transition on which the current position is adjusted is not combined with a
+transition from the other machine.
+
+.section Parsing Recursive Language Structures
+
+In general Ragel cannot handle recursive structures because the grammar is
+interpreted as a regular language. However, depending on what needs to be
+parsed it is sometimes practical to implement the recursive parts using manual
+coding techniques. This often works in cases where the recursive structures are
+simple and easy to recognize, such as in the balancing of parentheses.
+
+One approach to parsing recursive structures is to use actions that increment
+and decrement counters or otherwise recognize the entry to and exit from
+recursive structures and then jump to the appropriate machine definition using
+.verb|fcall| and .verb|fret|. Alternatively, semantic conditions can be used to
+test counter variables. 
+ +A more traditional approach is to call a separate parsing function (expressed +in the host language) when a recursive structure is entered, then later return +when the end is recognized. +##### EXP ##### +\documentclass[letterpaper,11pt,oneside]{book} +\usepackage{graphicx} +\usepackage{comment} +\usepackage{multicol} +\usepackage[ + colorlinks=true, + linkcolor=black, + citecolor=green, + filecolor=black, + urlcolor=black]{hyperref} + +\topmargin -0.20in +\oddsidemargin 0in +\textwidth 6.5in +\textheight 9in + +\setlength{\parskip}{0pt} +\setlength{\topsep}{0pt} +\setlength{\partopsep}{0pt} +\setlength{\itemsep}{0pt} + +\input{version} + +\newcommand{\verbspace}{\vspace{10pt}} +\newcommand{\graphspace}{\vspace{10pt}} + +\renewcommand\floatpagefraction{.99} +\renewcommand\topfraction{.99} +\renewcommand\bottomfraction{.99} +\renewcommand\textfraction{.01} +\setcounter{totalnumber}{50} +\setcounter{topnumber}{50} +\setcounter{bottomnumber}{50} + +\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{} + +\begin{document} + +\thispagestyle{empty} +\begin{center} +\vspace*{3in} +{\huge Ragel State Machine Compiler}\\ +\vspace*{12pt} +{\Large User Guide}\\ +\vspace{1in} +by\\ +\vspace{12pt} +{\large Adrian Thurston}\\ +\end{center} +\clearpage + +\pagenumbering{roman} + +\chapter*{License} +Ragel version \version, \pubdate\\ +Copyright \copyright\ 2003-2012 Adrian D. Thurston +\vspace{6mm} + +{\bf\it\noindent This document is part of Ragel, and as such, this document is +released under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2 of the License, or (at your option) +any later version. +} + +\vspace{5pt} + +{\bf\it\noindent Ragel is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. 
+} + +\vspace{5pt} + +{\bf\it\noindent You should have received a copy of the GNU General Public +License along with Ragel; if not, write to the Free Software Foundation, Inc., +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +} + +\clearpage +\tableofcontents +\clearpage + +\pagenumbering{arabic} +\chapter{Introduction} + +\section{Abstract} + +Regular expressions are used heavily in practice for the purpose of specifying +parsers. They are normally used as black boxes linked together with program +logic. User actions are executed in between invocations of the regular +expression engine. Adding actions before a pattern terminates requires patterns +to be broken and pasted back together with program logic. The more user actions +are needed, the less the advantages of regular expressions are seen. + +Ragel is a software development tool that allows user actions to be +embedded into the transitions of a regular expression's corresponding state +machine, eliminating the need to switch from the regular expression engine and +user code execution environment and back again. As a result, expressions can be +maximally continuous. One is free to specify an entire parser using a single +regular expression. The single-expression model affords concise and elegant +descriptions of languages and the generation of very simple, fast and robust +code. Ragel compiles executable finite state machines from a high level regular language +notation. Ragel targets C, C++, Objective-C, D, Go, Java and Ruby. + +In addition to building state machines from regular expressions, Ragel allows +the programmer to directly specify state machines with state charts. These two +notations may be freely combined. There are also facilities for controlling +nondeterminism in the resulting machines and building scanners using patterns +that themselves have embedded actions. Ragel can produce code that is small and +runs very fast. 
Ragel can handle integer-sized alphabets and can compile very
+large state machines.
+
+\section{Motivation}
+
+When a programmer is faced with the task of producing a parser for a
+context-free language there are many tools to choose from. It is quite common
+to generate useful and efficient parsers for programming languages from a
+formal grammar. It is also quite common for programmers to avoid such tools
+when making parsers for simple computer languages, such as file formats and
+communication protocols. Such languages are often regular and tools for
+processing the context-free languages are viewed as too heavyweight for the
+purpose of parsing regular languages. The extra run-time effort required for
+supporting the recursive nature of context-free languages is wasted.
+
+When we turn to the regular expression-based parsing tools, such as Lex, Re2C,
+and scripting languages such as Sed, Awk and Perl we find that they are split
+into two levels: a regular expression matching engine and some kind of program
+logic for linking patterns together. For example, a Lex program is composed of
+sets of regular expressions. The implied program logic repeatedly attempts to
+match a pattern in the current set. When a match is found the associated user
+code is executed. It requires the user to consider a language as a sequence of
+independent tokens. Scripting languages and regular expression libraries allow
+one to link patterns together using arbitrary program code. This is very
+flexible and powerful, however we can be more concise and clear if we avoid
+gluing together regular expressions with if statements and while loops.
+
+This model of execution, where the runtime alternates between regular
+expression matching and user code execution, places restrictions on when
+action code may be executed. 
Since action code can only be associated with +complete patterns, any action code that must be executed before an entire +pattern is matched requires that the pattern be broken into smaller units. +Instead of being forced to disrupt the regular expression syntax and write +smaller expressions, it is desirable to retain a single expression and embed +code for performing actions directly into the transitions that move over the +characters. After all, capable programmers are astutely aware of the machinery +underlying their programs, so why not provide them with access to that +machinery? To achieve this we require an action execution model for associating +code with the sub-expressions of a regular expression in a way that does not +disrupt its syntax. + +The primary goal of Ragel is to provide developers with an ability to embed +actions into the transitions and states of a regular expression's state machine +in support of the definition of entire parsers or large sections of parsers +using a single regular expression. From the regular expression we gain a clear +and concise statement of our language. From the state machine we obtain a very +fast and robust executable that lends itself to many kinds of analysis and +visualization. + +\section{Overview} + +Ragel is a language for specifying state machines. The Ragel program is a +compiler that assembles a state machine definition to executable code. Ragel +is based on the principle that any regular language can be converted to a +deterministic finite state automaton. Since every regular language has a state +machine representation and vice versa, the terms regular language and state +machine (or just machine) will be used interchangeably in this document. + +Ragel outputs machines to C, C++, Objective-C, D, Go, Java or Ruby code. The output is +designed to be generic and is not bound to any particular input or processing +method. A Ragel machine expects to have data passed to it in buffer blocks. 
+When there is no more input, the machine can be queried for acceptance. In +this way, a Ragel machine can be used to simply recognize a regular language +like a regular expression library. By embedding code into the regular language, +a Ragel machine can also be used to parse input. + +The Ragel language has many operators for constructing and manipulating +machines. Machines are built up from smaller machines, to bigger ones, to the +final machine representing the language that needs to be recognized or parsed. + +The core state machine construction operators are those found in most theory +of computation textbooks. They date back to the 1950s and are widely studied. +They are based on set operations and permit one to think of languages as a set +of strings. They are Union, Intersection, Difference, Concatenation and Kleene +Star. Put together, these operators make up what most people know as regular +expressions. Ragel also provides a scanner construction operator +and provides operators for explicitly constructing machines +using a state chart method. In the state chart method, one joins machines +together without any implied transitions and then explicitly specifies where +epsilon transitions should be drawn. + +The state machine manipulation operators are specific to Ragel. They allow the +programmer to access the states and transitions of regular language's +corresponding machine. There are two uses of the manipulation operators. The +first and primary use is to embed code into transitions and states, allowing +the programmer to specify the actions of the state machine. + +Ragel attempts to make the action embedding facility as intuitive as possible. +To do so, a number of issues need to be addressed. For example, when making a +nondeterministic specification into a DFA using machines that have embedded +actions, new transitions are often made that have the combined actions of +several source transitions. 
Ragel ensures that multiple actions associated with +a single transition are ordered consistently with respect to the order of +reference and the natural ordering implied by the construction operators. + +The second use of the manipulation operators is to assign priorities to +transitions. Priorities provide a convenient way of controlling any +nondeterminism introduced by the construction operators. Suppose two +transitions leave from the same state and go to distinct target states on the +same character. If these transitions are assigned conflicting priorities, then +during the determinization process the transition with the higher priority will +take precedence over the transition with the lower priority. The lower priority +transition gets abandoned. The transitions would otherwise be combined into a new +transition that goes to a new state that is a combination of the original +target states. Priorities are often required for segmenting machines. The most +common uses of priorities have been encoded into a set of simple operators +that should be used instead of priority embeddings whenever possible. + +For the purposes of embedding, Ragel divides transitions and states into +different classes. There are four operators for embedding actions and +priorities into the transitions of a state machine. It is possible to embed +into entering transitions, finishing transitions, all transitions and leaving +transitions. The embedding into leaving transitions is a special case. +These transition embeddings get stored in the final states of a machine. They +are transferred to any transitions that are made going out of the machine by +future concatenation or kleene star operations. + +There are several more operators for embedding actions into states. Like the +transition embeddings, there are various different classes of states that the +embedding operators access. For example, one can access start states, final +states or all states, among others. 
Unlike the transition embeddings, there are
+several different types of state action embeddings. These are executed at
+various different times during the processing of input. It is possible to embed
+actions that are executed on transitions into a state, on transitions out of a
+state, on transitions taken on the error event, or on transitions taken on the
+EOF event.
+
+Within actions, it is possible to influence the behaviour of the state machine.
+The user can write action code that jumps or calls to another portion of the
+machine, changes the current character being processed, or breaks out of the
+processing loop. With the state machine calling feature Ragel can be used to
+parse languages that are not regular. For example, one can parse balanced
+parentheses by calling into a parser when an open parenthesis character is seen
+and returning to the state on the top of the stack when the corresponding
+closing parenthesis character is seen. More complicated context-free languages
+such as expressions in C are out of the scope of Ragel.
+
+Ragel also provides a scanner construction operator that can be used to build
+scanners much the same way that Lex is used. The Ragel generated code, which
+relies on user-defined variables for backtracking, repeatedly tries to match
+patterns to the input, favouring longer patterns over shorter ones and patterns
+that appear ahead of others when the lengths of the possible matches are
+identical. When a pattern is matched the associated action is executed.
+
+The key distinguishing feature between scanners in Ragel and scanners in Lex is
+that Ragel patterns may be arbitrary Ragel expressions and can therefore
+contain embedded code. With a Ragel-based scanner the user need not wait until
+the end of a pattern before user code can be executed.
+
+Scanners do take Ragel out of the domain of pure state machines and require the
+user to maintain the backtracking related variables. 
However, scanners +integrate well with regular state machine instantiations. They can be called to +or jumped to only when needed, or they can be called out of or jumped out of +when a simpler, pure state machine model is appropriate. + +Two types of output code style are available. Ragel can produce a table-driven +machine or a directly executable machine. The directly executable machine is +much faster than the table-driven. On the other hand, the table-driven machine +is more compact and less demanding on the host language compiler. It is better +suited to compiling large state machines. + +\section{Related Work} + +Lex is perhaps the best-known tool for constructing parsers from regular +expressions. In the Lex processing model, generated code attempts to match one +of the user's regular expression patterns, favouring longer matches over +shorter ones. Once a match is made it then executes the code associated with +the pattern and consumes the matching string. This process is repeated until +the input is fully consumed. + +Through the use of start conditions, related sets of patterns may be defined. +The active set may be changed at any time. This allows the user to define +different lexical regions. It also allows the user to link patterns together by +requiring that some patterns come before others. This is quite like a +concatenation operation. However, use of Lex for languages that require a +considerable amount of pattern concatenation is inappropriate. In such cases a +Lex program deteriorates into a manually specified state machine, where start +conditions define the states and pattern actions define the transitions. Lex +is therefore best suited to parsing tasks where the language to be parsed can +be described in terms of regions of tokens. + +Lex is useful in many scenarios and has undoubtedly stood the test of time. +There are, however, several drawbacks to using Lex. 
Lex can impose too much +overhead for parsing applications where buffering is not required because all +the characters are available in a single string. In these cases there is +structure to the language to be parsed and a parser specification tool can +help, but employing a heavyweight processing loop that imposes a stream +``pull'' model and dynamic input buffer allocation is inappropriate. An +example of this kind of scenario is the conversion of floating point numbers +contained in a string to their corresponding numerical values. + +Another drawback is the very issue that Ragel attempts to solve. +It is not possible to execute a user action while +matching a character contained inside a pattern. For example, if scanning a +programming language and string literals can contain newlines which must be +counted, a Lex user must break up a string literal pattern so as to associate +an action with newlines. This forces the definition of a new start condition. +Alternatively the user can reprocess the text of the matched string literal to +count newlines. + + +The Re2C program defines an input processing model similar to that of Lex. +Re2C focuses on making generated state machines run very fast and +integrate easily into any program, free of dependencies. Re2C generates +directly executable code and is able to claim that generated parsers run nearly +as fast as their hand-coded equivalents. This is very important for user +adoption, as programmers are reluctant to use a tool when a faster alternative +exists. A consideration to ease of use is also important because developers +need the freedom to integrate the generated code as they see fit. + +Many scripting languages provide ways of composing parsers by linking regular +expressions using program logic. For example, Sed and Awk are two established +Unix scripting tools that allow the programmer to exploit regular expressions +for the purpose of locating and extracting text of interest. 
High-level
+programming languages such as Perl, Python, PHP and Ruby all provide regular
+expression libraries that allow the user to combine regular expressions with
+arbitrary code.
+
+In addition to supporting the linking of regular expressions with arbitrary
+program logic, the Perl programming language permits the embedding of code into
+regular expressions. Perl embeddings do not translate into the embedding of
+code into deterministic state machines. Perl regular expressions are in fact
+not fully compiled to deterministic machines when embedded code is involved.
+They are instead interpreted and involve backtracking. This is shown by the
+following Perl program. When it is fed the input \verb|abcd| the interpreter
+attempts to match the first alternative, printing \verb|a1 b1|. When this
+possibility fails it backtracks and tries the second possibility, printing
+\verb|a2 b2|, at which point it succeeds.
+
+\begin{inline_code}
+\begin{verbatim}
+print "YES\n" if ( <STDIN> =~
+  /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) |
+   ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x )
+\end{verbatim}
+\end{inline_code}
+\verbspace
+
+In Ragel there is no regular expression interpreter. Aside from the scanner
+operator, all Ragel expressions are made into deterministic machines and the
+run time simply moves from state to state as it consumes input. An equivalent
+parser expressed in Ragel would attempt both of the alternatives concurrently,
+printing \verb|a1 a2 b1 b2|.
+
+\section{Development Status}
+
+Ragel is a relatively new tool and is under continuous development. As a rough
+release guide, minor revision number changes are for implementation
+improvements and feature additions. Major revision number changes are for
+implementation and language changes that do not preserve backwards
+compatibility. Though in the past this has not always held true: changes that
+break code have crept into minor version number changes. 
Typically, the +documentation lags behind the development in the interest of documenting only +the lasting features. The latest changes are always documented in the ChangeLog +file. + +\chapter{Constructing State Machines} + +\section{Ragel State Machine Specifications} + +A Ragel input file consists of a program in the host language that contains embedded machine +specifications. Ragel normally passes input straight to output. When it sees +a machine specification it stops to read the Ragel statements and possibly generate +code in place of the specification. +Afterwards it continues to pass input through. There +can be any number of FSM specifications in an input file. A multi-line FSM spec +starts with \verb|%%{| and ends with \verb|}%%|. A single-line FSM spec starts +with \verb|%%| and ends at the first newline. + +While Ragel is looking for FSM specifications it does basic lexical analysis on +the surrounding input. It interprets literal strings and comments so a +\verb|%%| sequence in either of those will not trigger the parsing of an FSM +specification. Ragel does not pass the input through any preprocessor nor does it +interpret preprocessor directives itself so includes, defines and ifdef logic +cannot be used to alter the parse of a Ragel input file. It is therefore not +possible to use an \verb|#if 0| directive to comment out a machine as is +commonly done in C code. As an alternative, a machine can be prevented from +causing any generated output by commenting out write statements. + +In Figure \ref{cmd-line-parsing}, a multi-line specification is used to define the +machine and single line specifications are used to trigger the writing of the machine +data and execution code. 
+ +\begin{figure} +\small +\begin{multicols}{2} +\begin{verbatim} +#include <string.h> +#include <stdio.h> + +%%{ + machine foo; + main := + ( 'foo' | 'bar' ) + 0 @{ res = 1; }; +}%% + +%% write data; +\end{verbatim} +\verbspace +\columnbreak +\begin{verbatim} +int main( int argc, char **argv ) +{ + int cs, res = 0; + if ( argc > 1 ) { + char *p = argv[1]; + char *pe = p + strlen(p) + 1; + %% write init; + %% write exec; + } + printf("result = %i\n", res ); + return 0; +} +\end{verbatim} +\verbspace +\end{multicols} +\caption{Parsing a command line argument. +} +\label{cmd-line-parsing} +\end{figure} + +\subsection{Naming Ragel Blocks} + +\begin{verbatim} +machine fsm_name; +\end{verbatim} +\verbspace + +The \verb|machine| statement gives the name of the FSM. If present in a +specification, this statement must appear first. If a machine specification +does not have a name then Ragel uses the previous specification name. If no +previous specification name exists then this is an error. Because FSM +specifications persist in memory, a machine's statements can be spread across +multiple machine specifications. This allows one to break up a machine across +several files or draw in statements that are common to multiple machines using +the \verb|include| statement. + +\subsection{Machine Definition} +\label{definition} + +\begin{verbatim} +<name> = <expression>; +\end{verbatim} +\verbspace + +The machine definition statement associates an FSM expression with a name. Machine +expressions assigned to names can later be referenced in other expressions. A +definition statement on its own does not cause any states to be generated. It is simply a +description of a machine to be used later. States are generated only when a definition is +instantiated, which happens when a definition is referenced in an instantiated +expression. 
+ +\subsection{Machine Instantiation} +\label{instantiation} + +\begin{verbatim} +<name> := <expression>; +\end{verbatim} +\verbspace + +The machine instantiation statement generates a set of states representing an +expression. Each instantiation generates a distinct set of states. The starting +state of the instantiation is written in the data section of the generated code +using the instantiation name. If a machine named +\verb|main| is instantiated, its start state is used as the +specification's start state and is assigned to the \verb|cs| variable by the +\verb|write init| command. If no \verb|main| machine is given, the start state +of the last machine instantiation to appear is used as the specification's +start state. + +From outside the execution loop, control may be passed to any machine by +assigning the entry point to the \verb|cs| variable. From inside the execution +loop, control may be passed to any machine instantiation using \verb|fcall|, +\verb|fgoto| or \verb|fnext| statements. + +\subsection{Including Ragel Code} + +\begin{verbatim} +include FsmName "inputfile.rl"; +\end{verbatim} +\verbspace + +The \verb|include| statement can be used to draw in the statements of another FSM +specification. Both the name and input file are optional, however at least one +must be given. Without an FSM name, the given input file is searched for an FSM +of the same name as the current specification. Without an input file the +current file is searched for a machine of the given name. If both are present, +the given input file is searched for a machine of the given name. + +Ragel searches for included files from the location of the current file. +Additional directories can be added to the search path using the \verb|-I| +option. + +\subsection{Importing Definitions} +\label{import} + +\begin{verbatim} +import "inputfile.h"; +\end{verbatim} +\verbspace + +The \verb|import| statement scrapes a file for sequences of tokens that match +the following forms. 
Ragel treats these forms as state machine definitions. + +\noindent\hspace*{24pt}\verb|name '=' number|\\ +\noindent\hspace*{24pt}\verb|name '=' lit_string|\\ +\noindent\hspace*{24pt}\verb|'define' name number|\\ +\noindent\hspace*{24pt}\verb|'define' name lit_string| +\vspace{12pt} + +If the input file is a Ragel program then tokens inside any Ragel +specifications are ignored. See Section \ref{export} for a description of +exporting machine definitions. + +Ragel searches for imported files from the location of the current file. +Additional directories can be added to the search path using the \verb|-I| +option. + +\section{Lexical Analysis of a Ragel Block} +\label{lexing} + +Within a machine specification the following lexical rules apply to the input. + +\begin{itemize} + +\item The \verb|#| symbol begins a comment that terminates at the next newline. + +\item The symbols \verb|""|, \verb|''|, \verb|//|, \verb|[]| behave as the +delimiters of literal strings. Within them, the following escape sequences +are interpreted: + +\verb| \0 \a \b \t \n \v \f \r| + +A backslash at the end of a line joins the following line onto the current. A +backslash preceding any other character removes special meaning. This applies +to terminating characters and to special characters in regular expression +literals. As an exception, regular expression literals do not support escape +sequences as the operands of a range within a list. See the bullet on regular +expressions in Section \ref{basic}. + +\item The symbols \verb|{}| delimit a block of host language code that will be +embedded into the machine as an action. Within the block of host language +code, basic lexical analysis of comments and strings is done in order to +correctly find the closing brace of the block. With the exception of FSM +commands embedded in code blocks, the entire block is preserved as is for +identical reproduction in the output code. 
+ +\item The pattern \verb|[+-]?[0-9]+| denotes an integer in decimal format. +Integers used for specifying machines may be negative only if the alphabet type +is signed. Integers used for specifying priorities may be positive or negative. + +\item The pattern \verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal +format. + +\item The keywords are \verb|access|, \verb|action|, \verb|alphtype|, +\verb|getkey|, \verb|write|, \verb|machine| and \verb|include|. + +\item The pattern \verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier. + + +\item Any amount of whitespace may separate tokens. + +\end{itemize} + + +\section{Basic Machines} +\label{basic} + +The basic machines are the base operands of regular language expressions. They +are the smallest unit to which machine construction and manipulation operators +can be applied. + +\begin{itemize} + +\item \verb|'hello'| -- Concatenation Literal. Produces a machine that matches +the sequence of characters in the quoted string. If there are 5 characters +there will be 6 states chained together with the characters in the string. See +Section \ref{lexing} for information on valid escape sequences. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmconcat} +\end{center} +\graphspace + +It is possible +to make a concatenation literal case-insensitive by appending an \verb|i| to +the string, for example \verb|'cmd'i|. + +\item \verb|"hello"| -- Identical to the single quoted version. + +\item \verb|[hello]| -- Or Expression. Produces a union of characters. There +will be two states with a transition for each unique character between the two states. +The \verb|[]| delimiters behave like the quotes of a literal string. For example, +\verb|[ \t]| means tab or space. The \verb|or| expression supports character ranges +with the \verb|-| symbol as a separator. The meaning of the union can be negated +using an initial \verb|^| character as in standard regular expressions. 
+See Section \ref{lexing} for information on valid escape sequences +in \verb|or| expressions. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmor} +\end{center} +\graphspace + +\item \verb|''|, \verb|""|, and \verb|[]| -- Zero Length Machine. Produces a machine +that matches the zero length string. Zero length machines have one state that is both +a start state and a final state. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmnull} +\end{center} +\graphspace + +% FIXME: More on the range of values here. +\item \verb|42| -- Numerical Literal. Produces a two state machine with one +transition on the given number. The number may be in decimal or hexadecimal +format and should be in the range allowed by the alphabet type. The minimum and +maximum values permitted are defined by the host machine that Ragel is compiled +on. For example, numbers in a \verb|short| alphabet on an i386 machine should +be in the range \verb|-32768| to \verb|32767|. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmnum} +\end{center} +\graphspace + +\item \verb|/simple_regex/| -- Regular Expression. Regular expressions are +parsed as a series of expressions that are concatenated together. Each +concatenated expression +may be a literal character, the ``any'' character specified by the \verb|.| +symbol, or a union of characters specified by the \verb|[]| delimiters. If the +first character of a union is \verb|^| then it matches any character not in the +list. Within a union, a range of characters can be given by separating the first +and last characters of the range with the \verb|-| symbol. Each +concatenated machine may have repetition specified by following it with the +\verb|*| symbol. The standard escape sequences described in Section +\ref{lexing} are supported everywhere in regular expressions except as the +operands of a range within a list. This notation also supports the \verb|i| +trailing option. 
Use it to produce case-insensitive machines, as in \verb|/GET/i|. + +Ragel does not support very complex regular expressions because the desired +results can always be achieved using the more general machine construction +operators listed in Section \ref{machconst}. The following diagram shows the +result of compiling \verb|/ab*[c-z].*[123]/|. \verb|DEF| represents the default +transition, which is taken if no other transition can be taken. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmregex} +\end{center} +\graphspace + +\item \verb|'a' .. 'z'| -- Range. Produces a machine that matches any +characters in the specified range. Allowable upper and lower bounds of the +range are concatenation literals of length one and numerical literals. For +example, \verb|0x10..0x20|, \verb|0..63|, and \verb|'a'..'z'| are valid ranges. +The bounds should be in the range allowed by the alphabet type. + + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{bmrange} +\end{center} +\graphspace + +\item \verb|variable_name| -- Lookup the machine definition assigned to the +variable name given and use an instance of it. See Section \ref{definition} for +an important note on what it means to reference a variable name. + +\item \verb|builtin_machine| -- There are several built-in machines available +for use. They are all two state machines for the purpose of matching common +classes of characters. They are: + +\begin{itemize} + +\item \verb|any | -- Any character in the alphabet. + +\item \verb|ascii | -- Ascii characters. \verb|0..127| + +\item \verb|extend| -- Ascii extended characters. This is the range +\verb|-128..127| for signed alphabets and the range \verb|0..255| for unsigned +alphabets. + +\item \verb|alpha | -- Alphabetic characters. \verb|[A-Za-z]| + +\item \verb|digit | -- Digits. \verb|[0-9]| + +\item \verb|alnum | -- Alpha numerics. \verb|[0-9A-Za-z]| + +\item \verb|lower | -- Lowercase characters. 
\verb|[a-z]| + +\item \verb|upper | -- Uppercase characters. \verb|[A-Z]| + +\item \verb|xdigit| -- Hexadecimal digits. \verb|[0-9A-Fa-f]| + +\item \verb|cntrl | -- Control characters. \verb|0..31| + +\item \verb|graph | -- Graphical characters. \verb|[!-~]| + +\item \verb|print | -- Printable characters. \verb|[ -~]| + +\item \verb|punct | -- Punctuation. Graphical characters that are not alphanumerics. +\verb|[!-/:-@[-`{-~]| + +\item \verb|space | -- Whitespace. \verb|[\t\v\f\n\r ]| + +\item \verb|zlen | -- Zero length string. \verb|""| + +\item \verb|empty | -- Empty set. Matches nothing. \verb|^any| + +\end{itemize} +\end{itemize} + +\section{Operator Precedence} +The following table shows operator precedence from lowest to highest. Operators +in the same precedence group are evaluated from left to right. + +\begin{tabular}{|c|c|c|} +\hline +1&\verb| , |&Join\\ +\hline +2&\verb/ | & - --/&Union, Intersection and Subtraction\\ +\hline +3&\verb| . <: :> :>> |&Concatenation\\ +\hline +4&\verb| : |&Label\\ +\hline +5&\verb| -> |&Epsilon Transition\\ +\hline +6&\verb| > @ $ % |&Transitions Actions and Priorities\\ +\hline +6&\verb| >/ $/ %/ </ @/ <>/ |&EOF Actions\\ +\hline +6&\verb| >! $! %! <! @! <>! |&Global Error Actions\\ +\hline +6&\verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions\\ +\hline +6&\verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions\\ +\hline +6&\verb| >* $* %* <* @* <>* |&From-State Action\\ +\hline +7&\verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition\\ +\hline +8&\verb| ! ^ |&Negation and Character-Level Negation\\ +\hline +9&\verb| ( <expr> ) |&Grouping\\ +\hline +\end{tabular} + +\section{Regular Language Operators} +\label{machconst} + +When using Ragel it is helpful to have a sense of how it constructs machines. +The determinization process can produce results that seem unusual to someone +not familiar with the NFA to DFA conversion algorithm. In this section we +describe Ragel's state machine operators. 
Though the operators are defined +using epsilon transitions, it should be noted that this is for discussion only. +The epsilon transitions described in this section do not persist, but are +immediately removed by the determinization process which is executed at every +operation. Ragel does not make use of any nondeterministic intermediate state +machines. + +To create an epsilon transition between two states \verb|x| and \verb|y| is to +copy all of the properties of \verb|y| into \verb|x|. This involves drawing in +all of \verb|y|'s to-state actions, EOF actions, etc., in addition to its +transitions. If \verb|x| and \verb|y| both have a transition out on the same +character, then the transitions must be combined. During transition +combination a new transition is made that goes to a new state that is the +combination of both target states. The new combination state is created using +the same epsilon transition method. The new state has an epsilon transition +drawn to all the states that compose it. Since the creation of new epsilon +transitions may be triggered every time an epsilon transition is drawn, the +process of drawing epsilon transitions is repeated until there are no more +epsilon transitions to be made. + +A very common error that is made when using Ragel is to make machines that do +too much. That is, to create machines that have unintentional +nondeterministic properties. This usually results from being unaware of the common strings +between machines that are combined together using the regular language +operators. This can involve never leaving a machine, causing its actions to be +propagated through all the following states. Or it can involve an alternation +where both branches are unintentionally taken simultaneously. + +This problem forces one to think hard about the language that needs to be +matched. 
To guard against this kind of problem one must ensure that the machine +specification is divided up using boundaries that do not allow ambiguities from +one portion of the machine to the next. See Chapter +\ref{controlling-nondeterminism} for more on this problem and how to solve it. + +The Graphviz tool is an immense help when debugging improperly compiled +machines or otherwise learning how to use Ragel. Graphviz Dot files can be +generated from Ragel programs using the \verb|-V| option. See Section +\ref{visualization} for more information. + + +\subsection{Union} + +\verb/expr | expr/ + +The union operation produces a machine that matches any string in machine one +or machine two. The operation first creates a new start state. Epsilon +transitions are drawn from the new start state to the start states of both +input machines. The resulting machine has a final state set equivalent to the +union of the final state sets of both input machines. In this operation, there +is the opportunity for nondeterminism among both branches. If there are +strings, or prefixes of strings that are matched by both machines then the new +machine will follow both parts of the alternation at once. The union operation is +shown below. + +\graphspace +\begin{center} +\includegraphics[scale=1.0]{opor} +\end{center} +\graphspace + +The following example demonstrates the union of three machines representing +common tokens. + +% GENERATE: exor +% OPT: -p +% %%{ +% machine exor; +\begin{inline_code} +\begin{verbatim} +# Hex digits, decimal digits, or identifiers +main := '0x' xdigit+ | digit+ | alpha alnum*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exor} +\end{center} +\graphspace + +\subsection{Intersection} + +\verb|expr & expr| + +Intersection produces a machine that matches any +string that is in both machine one and machine two. To achieve intersection, a +union is performed on the two machines. 
After the result has been made +deterministic, any final state that is not a combination of final states from +both machines has its final state status revoked. To complete the operation, +paths that do not lead to a final state are pruned from the machine. Therefore, +if there are any such paths in either of the expressions they will be removed +by the intersection operator. Intersection can be used to require that two +independent patterns be simultaneously satisfied as in the following example. + +% GENERATE: exinter +% OPT: -p +% %%{ +% machine exinter; +\begin{inline_code} +\begin{verbatim} +# Match lines four characters wide that contain +# words separated by whitespace. +main := + /[^\n][^\n][^\n][^\n]\n/* & + (/[a-z][a-z]*/ | [ \n])**; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exinter} +\end{center} +\graphspace + +\subsection{Difference} + +\verb|expr - expr| + +The difference operation produces a machine that matches +strings that are in machine one but are not in machine two. To achieve subtraction, +a union is performed on the two machines. After the result has been made +deterministic, any final state that came from machine two or is a combination +of states involving a final state from machine two has its final state status +revoked. As with intersection, the operation is completed by pruning any path +that does not lead to a final state. The following example demonstrates the +use of subtraction to exclude specific cases from a set. + +% GENERATE: exsubtr +% OPT: -p +% %%{ +% machine exsubtr; +\begin{inline_code} +\begin{verbatim} +# Subtract keywords from identifiers. 
+main := /[a-z][a-z]*/ - ( 'for' | 'int' ); +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exsubtr} +\end{center} +\graphspace + +\subsection{Strong Difference} +\label{strong_difference} + +\verb|expr -- expr| + +Strong difference produces a machine that matches any string of the first +machine that does not have any string of the second machine as a substring. In +the following example, strong subtraction is used to excluded \verb|CRLF| from +a sequence. In the corresponding visualization, the label \verb|DEF| is short +for default. The default transition is taken if no other transition can be +taken. + +% GENERATE: exstrongsubtr +% OPT: -p +% %%{ +% machine exstrongsubtr; +\begin{inline_code} +\begin{verbatim} +crlf = '\r\n'; +main := [a-z]+ ':' ( any* -- crlf ) crlf; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exstrongsubtr} +\end{center} +\graphspace + +This operator is equivalent to the following. + +\begin{verbatim} +expr - ( any* expr any* ) +\end{verbatim} +\verbspace + +\subsection{Concatenation} + +\verb|expr . expr| + +Concatenation produces a machine that matches all the strings in machine one followed by all +the strings in machine two. Concatenation draws epsilon transitions from the +final states of the first machine to the start state of the second machine. The +final states of the first machine lose their final state status, unless the +start state of the second machine is final as well. +Concatenation is the default operator. Two machines next to each other with no +operator between them results in concatenation. 
+ +\graphspace +\begin{center} +\includegraphics[scale=1.0]{opconcat} +\end{center} +\graphspace + +The opportunity for nondeterministic behaviour results from the possibility of +the final states of the first machine accepting a string that is also accepted +by the start state of the second machine. +The most common scenario in which this happens is the +concatenation of a machine that repeats some pattern with a machine that gives +a terminating string, but the repetition machine does not exclude the +terminating string. The example in Section \ref{strong_difference} +guards against this. Another example is the expression \verb|("'" any* "'")|. +When executed the thread of control will +never leave the \verb|any*| machine. This is a problem especially if actions +are embedded to process the characters of the \verb|any*| component. + +In the following example, the first machine is always active due to the +nondeterministic nature of concatenation. This particular nondeterminism is intended +however because we wish to permit EOF strings before the end of the input. + +% GENERATE: exconcat +% OPT: -p +% %%{ +% machine exconcat; +\begin{inline_code} +\begin{verbatim} +# Require an eof marker on the last line. +main := /[^\n]*\n/* . 'EOF\n'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exconcat} +\end{center} +\graphspace + +There is a language +ambiguity involving concatenation and subtraction. Because concatenation is the +default operator for two +adjacent machines there is an ambiguity between subtraction of +a positive numerical literal and concatenation of a negative numerical literal. +For example, \verb|(x-7)| could be interpreted as \verb|(x . -7)| or +\verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence +over concatenation of a negative literal. 
We adhere to the rule that the default +concatenation operator takes effect only when there are no other operators between +two machines. Beware of writing machines such as \verb|(any -1)| when what is +desired is a concatenation of \verb|any| and \verb|-1|. Instead write +\verb|(any . -1)| or \verb|(any (-1))|. If in doubt of the meaning of your program do not +rely on the default concatenation operator; always use the \verb|.| symbol. + + +\subsection{Kleene Star} + +\verb|expr*| + +The machine resulting from the Kleene Star operator will match zero or more +repetitions of the machine it is applied to. +It creates a new start state and an additional final +state. Epsilon transitions are drawn between the new start state and the old start +state, between the new start state and the new final state, and +between the final states of the machine and the new start state. After the +machine is made deterministic the effect is of the final states getting all the +transitions of the start state. + +\graphspace +\begin{center} +\includegraphics[scale=1.0]{opstar} +\end{center} +\graphspace + +The possibility for nondeterministic behaviour arises if the final states have +transitions on any of the same characters as the start state. This is common +when applying kleene star to an alternation of tokens. Like the other problems +arising from nondeterministic behavior, this is discussed in more detail in Chapter +\ref{controlling-nondeterminism}. This particular problem can also be solved +by using the longest-match construction discussed in Section +\ref{generating-scanners} on scanners. + +In this +example, there is no nondeterminism introduced by the exterior kleene star due to +the newline at the end of the regular expression. Without the newline the +exterior kleene star would be redundant and there would be ambiguity between +repeating the inner range of the regular expression and the entire regular +expression. 
Though it would not cause a problem in this case, unnecessary +nondeterminism in the kleene star operator often causes undesired results for +new Ragel users and must be guarded against. + +% GENERATE: exstar +% OPT: -p +% %%{ +% machine exstar; +\begin{inline_code} +\begin{verbatim} +# Match any number of lines with only lowercase letters. +main := /[a-z]*\n/*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exstar} +\end{center} +\graphspace + +\subsection{One Or More Repetition} + +\verb|expr+| + +This operator produces the concatenation of the machine with the kleene star of +itself. The result will match one or more repetitions of the machine. The plus +operator is equivalent to \verb|(expr . expr*)|. + +% GENERATE: explus +% OPT: -p +% %%{ +% machine explus; +\begin{inline_code} +\begin{verbatim} +# Match alpha-numeric words. +main := alnum+; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{explus} +\end{center} +\graphspace + +\subsection{Optional} + +\verb|expr?| + +The {\em optional} operator produces a machine that accepts the machine +given or the zero length string. The optional operator is equivalent to +\verb/(expr | '' )/. In the following example the optional operator is used to +possibly extend a token. + +% GENERATE: exoption +% OPT: -p +% %%{ +% machine exoption; +\begin{inline_code} +\begin{verbatim} +# Match integers or floats. +main := digit+ ('.' 
digit+)?; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exoption} +\end{center} +\graphspace + +\subsection{Repetition} + +\noindent\hspace*{24pt}\verb|expr {n}| -- Exactly N copies of expr.\\ +\noindent\hspace*{24pt}\verb|expr {,n}| -- Zero to N copies of expr.\\ +\noindent\hspace*{24pt}\verb|expr {n,}| -- N or more copies of expr.\\ +\noindent\hspace*{24pt}\verb|expr {n,m}| -- N to M copies of expr. +\vspace{12pt} + +\subsection{Negation} + +\verb|!expr| + +Negation produces a machine that matches any string not matched by the given +machine. Negation is equivalent to \verb|(any* - expr)|. + +% GENERATE: exnegate +% OPT: -p +% %%{ +% machine exnegate; +\begin{inline_code} +\begin{verbatim} +# Accept anything but a string beginning with a digit. +main := ! ( digit any* ); +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exnegate} +\end{center} +\graphspace + +\subsection{Character-Level Negation} + +\verb|^expr| + +Character-level negation produces a machine that matches any single character +not matched by the given machine. Character-Level Negation is equivalent to +\verb|(any - expr)|. It must be applied only to machines that match strings of +length one. + +\section{State Machine Minimization} + +State machine minimization is the process of finding the minimal equivalent FSM accepting +the language. Minimization reduces the number of states in machines +by merging equivalent states. It does not change the behaviour of the machine +in any way. It will cause some states to be merged into one because they are +functionally equivalent. State minimization is on by default. It can be turned +off with the \verb|-n| option. + +The algorithm implemented is similar to Hopcroft's state minimization +algorithm. 
Hopcroft's algorithm assumes a finite alphabet that can be listed in +memory, whereas Ragel supports arbitrary integer alphabets that cannot be +listed in memory. Though exact analysis is very difficult, Ragel minimization +runs close to O(n * log(n)) and requires O(n) temporary storage where +$n$ is the number of states. + +\section{Visualization} +\label{visualization} + +%In many cases, practical +%parsing programs will be too large to completely visualize with Graphviz. The +%proper approach is to reduce the language to the smallest subset possible that +%still exhibits the characteristics that one wishes to learn about or to fix. +%This can be done without modifying the source code using the \verb|-M| and +%\verb|-S| options. If a machine cannot be easily reduced, +%embeddings of unique actions can be very useful for tracing a +%particular component of a larger machine specification, since action names are +%written out on transition labels. + +Ragel is able to emit compiled state machines in Graphviz's Dot file format. +This is done using the \verb|-V| option. +Graphviz support allows users to perform +incremental visualization of their parsers. User actions are displayed on +transition labels of the graph. + +If the final graph is too large to be +meaningful, or even drawn, the user is able to inspect portions of the parser +by naming particular regular expression definitions with the \verb|-S| and +\verb|-M| options to the \verb|ragel| program. Use of Graphviz greatly +improves the Ragel programming experience. It allows users to learn Ragel by +experimentation and also to track down bugs caused by unintended +nondeterminism. + +Ragel has another option to help debugging. The \verb|-x| option causes Ragel +to emit the compiled machine in an XML format. + +\chapter{User Actions} + +Ragel permits the user to embed actions into the transitions of a regular +expression's corresponding state machine. 
These actions are executed when the +generated code moves over a transition. Like the regular expression operators, +the action embedding operators are fully compositional. They take a state +machine and an action as input, embed the action and yield a new state machine +that can be used in the construction of other machines. Due to the +compositional nature of embeddings, the user has complete freedom in the +placement of actions. + +A machine's transitions are categorized into four classes. The action embedding +operators access the transitions defined by these classes. The {\em entering +transition} operator \verb|>| isolates the start state, then embeds an action +into all transitions leaving it. The {\em finishing transition} operator +\verb|@| embeds an action into all transitions going into a final state. The +{\em all transition} operator \verb|$| embeds an action into all transitions of +an expression. The {\em leaving transition} operator \verb|%| provides access +to the yet-unmade transitions moving out of the machine via the final states. + +\section{Embedding Actions} + +\begin{verbatim} +action ActionName { + /* Code an action here. */ + count += 1; +} +\end{verbatim} +\verbspace + +The action statement defines a block of code that can be embedded into an FSM. +Action names can be referenced by the action embedding operators in +expressions. Though actions need not be named in this way (literal blocks +of code can be embedded directly when building machines), defining reusable +blocks of code whenever possible is good practice because it potentially increases the +degree to which the machine can be minimized. + +Within an action some Ragel expressions and statements are parsed and +translated. These allow the user to interact with the machine from action code. +See Section \ref{vals} for a complete list of statements and values available +in code blocks. 
+ +\subsection{Entering Action} + +\verb|expr > action| + +The entering action operator embeds an action into all transitions +that enter into the machine from the start state. If the start state is final, +then the action is also embedded into the start state as a leaving action. This +means that if a machine accepts the zero-length string and control passes +through the start state then the entering action is executed. Note +that this can happen on both a following character and on the EOF event. + +In some machines the start state has transitions coming in from within the +machine. In these cases the start state is first isolated from the rest of the +machine ensuring that the entering actions are executed once only. + +% GENERATE: exstact +% OPT: -p +% %%{ +% machine exstact; +\begin{inline_code} +\begin{verbatim} +# Execute A at the beginning of a string of alpha. +action A {} +main := ( lower* >A ) . ' '; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exstact} +\end{center} +\graphspace + +\subsection{Finishing Action} + +\verb|expr @ action| + +The finishing action operator embeds an action into any transitions that move +the machine into a final state. Further input may move the machine out of the +final state, but keep it in the machine. Therefore finishing actions may be +executed more than once if a machine has any internal transitions out of a +final state. In the following example the final state has no transitions out +and the finishing action is executed only once. + +% GENERATE: exdoneact +% OPT: -p +% %%{ +% machine exdoneact; +% action A {} +\begin{inline_code} +\begin{verbatim} +# Execute A when the trailing space is seen. 
+main := ( lower* ' ' ) @A; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exdoneact} +\end{center} +\graphspace + +\subsection{All Transition Action} + +\verb|expr $ action| + +The all transition operator embeds an action into all transitions of a machine. +The action is executed whenever a transition of the machine is taken. In the +following example, A is executed on every character matched. + +% GENERATE: exallact +% OPT: -p +% %%{ +% machine exallact; +% action A {} +\begin{inline_code} +\begin{verbatim} +# Execute A on any characters of the machine. +main := ( 'm1' | 'm2' ) $A; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exallact} +\end{center} +\graphspace + +\subsection{Leaving Actions} +\label{out-actions} + +\verb|expr % action| + +The leaving action operator queues an action for embedding into the transitions +that go out of a machine via a final state. The action is first stored in +the machine's final states and is later transferred to any transitions that are +made going out of the machine by a kleene star or concatenation operation. + +If a final state of the machine is still final when compilation is complete +then the leaving action is also embedded as an EOF action. Therefore, leaving +the machine is defined as either leaving on a character or as state machine +acceptance. + +This operator allows one to associate an action with the termination of a +sequence, without being concerned about what particular character terminates +the sequence. In the following example, A is executed when leaving the alpha +machine on the newline character. + +% GENERATE: exoutact1 +% OPT: -p +% %%{ +% machine exoutact1; +% action A {} +\begin{inline_code} +\begin{verbatim} +# Match a word followed by a newline. Execute A when +# finishing the word. +main := ( lower+ %A ) . 
'\n'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exoutact1} +\end{center} +\graphspace + +In the following example, the \verb|term_word| action could be used to register +the appearance of a word and to clear the buffer that the \verb|lower| action used +to store the text of it. + +% GENERATE: exoutact2 +% OPT: -p +% %%{ +% machine exoutact2; +% action lower {} +% action space {} +% action term_word {} +% action newline {} +\begin{inline_code} +\begin{verbatim} +word = ( [a-z] @lower )+ %term_word; +main := word ( ' ' @space word )* '\n' @newline; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exoutact2} +\end{center} +\graphspace + +In this final example of the action embedding operators, A is executed upon entering +the alpha machine, B is executed on all transitions of the +alpha machine, C is executed when the alpha machine is exited by moving into the +newline machine and N is executed when the newline machine moves into a final +state. + +% GENERATE: exaction +% OPT: -p +% %%{ +% machine exaction; +% action A {} +% action B {} +% action C {} +% action N {} +\begin{inline_code} +\begin{verbatim} +# Execute A on starting the alpha machine, B on every transition +# moving through it and C upon finishing. Execute N on the newline. +main := ( lower* >A $B %C ) . '\n' @N; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{exaction} +\end{center} +\graphspace + + +\section{State Action Embedding Operators} + +The state embedding operators allow one to embed actions into states. Like the +transition embedding operators, there are several different classes of states +that the operators access. The meanings of the symbols are similar to the +meanings of the symbols used for the transition embedding operators. 
The design +of the state selections was driven by a need to cover the states of an +expression with exactly one error action. + +Unlike the transition embedding operators, the state embedding operators are +also distinguished by the different kinds of events that embedded actions can +be associated with. Therefore the state embedding operators have two +components. The first, which is the first one or two characters, specifies the +class of states that the action will be embedded into. The second component +specifies the type of event the action will be executed on. The symbols of the +second component also have equivalent keywords. + +\begin{multicols}{2} +The different classes of states are: + +\noindent\hspace*{24pt}\verb|> | -- the start state\\ +\noindent\hspace*{24pt}\verb|< | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$ | -- all states\\ +\noindent\hspace*{24pt}\verb|% | -- final states\\ +\noindent\hspace*{24pt}\verb|@ | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>| -- any except start and final (middle) +\vspace{12pt} + +\columnbreak + +The different kinds of embeddings are: + +\noindent\hspace*{24pt}\verb|~| -- to-state actions (\verb|to|)\\ +\noindent\hspace*{24pt}\verb|*| -- from-state actions (\verb|from|)\\ +\noindent\hspace*{24pt}\verb|/| -- EOF actions (\verb|eof|)\\ +\noindent\hspace*{24pt}\verb|!| -- error actions (\verb|err|)\\ +\noindent\hspace*{24pt}\verb|^| -- local error actions (\verb|lerr|) +\vspace{12pt} + +\end{multicols} + +\subsection{To-State and From-State Actions} + +\subsubsection{To-State Actions} + +\noindent\hspace*{24pt}\verb|>~action >to(name) >to{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|<~action <to(name) <to{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$~action $to(name) $to{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%~action %to(name) %to{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@~action @to(name) @to{...} | 
-- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle) +\vspace{12pt} + + +To-state actions are executed whenever the state machine moves into the +specified state, either by a natural movement over a transition or by an +action-based transfer of control such as \verb|fgoto|. They are executed after the +in-transition's actions but before the current character is advanced and +tested against the end of the input block. To-state embeddings stay with the +state. They are irrespective of the state's current set of transitions and any +future transitions that may be added in or out of the state. + +Note that the setting of the current state variable \verb|cs| outside of the +execute code is not considered by Ragel as moving into a state and consequently +the to-state actions of the new current state are not executed. This includes +the initialization of the current state when the machine begins. This is +because the entry point into the machine execution code is after the execution +of to-state actions. + +\subsubsection{From-State Actions} + +\noindent\hspace*{24pt}\verb|>*action >from(name) >from{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|<*action <from(name) <from{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$*action $from(name) $from{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%*action %from(name) %from{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@*action @from(name) @from{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle) +\vspace{12pt} + +From-state actions are executed whenever the state machine takes a transition from a +state, either to itself or to some other state. 
These actions are executed +immediately after the current character is tested against the input block end +marker and before the transition to take is sought based on the current +character. From-state actions are therefore executed even if a transition +cannot be found and the machine moves into the error state. Like to-state +embeddings, from-state embeddings stay with the state. + +\subsection{EOF Actions} + +\noindent\hspace*{24pt}\verb|>/action >eof(name) >eof{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|</action <eof(name) <eof{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$/action $eof(name) $eof{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%/action %eof(name) %eof{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@/action @eof(name) @eof{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle) +\vspace{12pt} + +The EOF action embedding operators enable the user to embed actions that are +executed at the end of the input stream. EOF actions are stored in states and +generated in the \verb|write exec| block. They are run when \verb|p == pe == eof| +as the execute block is finishing. EOF actions are free to adjust \verb|p| and +jump to another part of the machine to restart execution. + +\subsection{Handling Errors} + +In many applications it is useful to be able to react to parsing errors. The +user may wish to print an error message that depends on the context. It +may also be desirable to consume input in an attempt to return the input stream +to some known state and resume parsing. To support error handling and recovery, +Ragel provides error action embedding operators. There are two kinds of error +actions: global error actions and local error actions. +Error actions can be used to simply report errors, or by jumping to a machine +instantiation that consumes input, can attempt to recover from errors. 
+ +\subsubsection{Global Error Actions} + +\noindent\hspace*{24pt}\verb|>!action >err(name) >err{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|<!action <err(name) <err{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$!action $err(name) $err{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%!action %err(name) %err{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@!action @err(name) @err{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle) +\vspace{12pt} + +Global error actions are stored in the states they are embedded into until +compilation is complete. They are then transferred to the transitions that move +into the error state. These transitions are taken on all input characters that +are not already covered by the state's transitions. If a state with an error +action is not final when compilation is complete, then the action is also +embedded as an EOF action. + +Error actions can be used to recover from errors by jumping back into the +machine with \verb|fgoto| and optionally altering \verb|p|. + +\subsubsection{Local Error Actions} + +\noindent\hspace*{24pt}\verb|>^action >lerr(name) >lerr{...} | -- the start state\\ +\noindent\hspace*{24pt}\verb|<^action <lerr(name) <lerr{...} | -- any state except the start state\\ +\noindent\hspace*{24pt}\verb|$^action $lerr(name) $lerr{...} | -- all states\\ +\noindent\hspace*{24pt}\verb|%^action %lerr(name) %lerr{...} | -- final states\\ +\noindent\hspace*{24pt}\verb|@^action @lerr(name) @lerr{...} | -- any state except final states\\ +\noindent\hspace*{24pt}\verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle) +\vspace{12pt} + +Like global error actions, local error actions are also stored in the states +they are embedded into until a transfer point. The transfer point is different +however. Each local error action embedding is associated with a name. 
When a +machine definition has been fully constructed, all local error action +embeddings associated with the same name as the machine definition are +transferred to the error transitions. At this time they are also embedded as +EOF actions in the case of non-final states. + +Local error actions can be used to specify an action to take when a particular +section of a larger state machine fails to match. A particular machine +definition's ``thread'' may die and the local error actions executed, however +the machine as a whole may continue to match input. + +There are two forms of local error action embeddings. In the first form the +name defaults to the current machine. In the second form the machine name can +be specified. This is useful when it is more convenient to specify the local +error action in a sub-definition that is used to construct the machine +definition that the local error action is associated with. To embed local +error actions and +explicitly state the machine definition on which the transfer is to happen use +\verb|(name, action)| as the action. + +\subsubsection{Example} + +The following example uses error actions to report an error and jump to a +machine that consumes the remainder of the line when parsing fails. After +consuming the line, the error recovery machine returns to the main loop. 
+ +% GENERATE: erract +% %%{ +% machine erract; +% ws = ' '; +% address = 'foo AT bar..com'; +% date = 'Monday May 12'; +\begin{inline_code} +\begin{verbatim} +action cmd_err { + printf( "command error\n" ); + fhold; fgoto line; +} +action from_err { + printf( "from error\n" ); + fhold; fgoto line; +} +action to_err { + printf( "to error\n" ); + fhold; fgoto line; +} + +line := [^\n]* '\n' @{ fgoto main; }; + +main := ( + ( + 'from' @err(cmd_err) + ( ws+ address ws+ date '\n' ) $err(from_err) | + 'to' @err(cmd_err) + ( ws+ address '\n' ) $err(to_err) + ) +)*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% %% write data; +% void f() +% { +% %% write init; +% %% write exec; +% } +% END GENERATE + + + +\section{Action Ordering and Duplicates} + +When combining expressions that have embedded actions it is often the case that +a number of actions must be executed on a single input character. For example, +following a concatenation the leaving action of the left expression and the +entering action of the right expression will be embedded into one transition. +This requires a method of ordering actions that is intuitive and +predictable for the user, and repeatable for the compiler. + +We associate with the embedding of each action a unique timestamp that is +used to order actions that appear together on a single transition in the final +state machine. To accomplish this we recursively traverse the parse tree of +regular expressions and assign timestamps to action embeddings. References to +machine definitions are followed in the traversal. When we visit a +parse tree node we assign timestamps to all {\em entering} action embeddings, +recurse on the parse tree, then assign timestamps to the remaining {\em all}, +{\em finishing}, and {\em leaving} embeddings in the order in which they +appear. + +By default Ragel does not permit a single action to appear multiple times in an action +list. 
When the final machine has been created, actions that appear more than +once in a single transition, to-state, from-state or EOF action list have their +duplicates removed. +The first appearance of the action is preserved. This is useful in a number of +scenarios. First, it allows us to union machines with common prefixes without +worrying about the action embeddings in the prefix being duplicated. Second, it +prevents leaving actions from being transferred multiple times. This can +happen when a machine is repeated, then followed with another machine that +begins with a common character. For example: + +\begin{verbatim} +word = [a-z]+ %act; +main := word ( '\n' word )* '\n\n'; +\end{verbatim} +\verbspace + +Note that Ragel does not compare action bodies to determine if they have +identical program text. It simply checks for duplicates using each action +block's unique location in the program. + +The removal of duplicates can be turned off using the \verb|-d| option. + +\section{Values and Statements Available in Code Blocks} +\label{vals} + +The following values are available in code blocks: + +\begin{itemize} +\item \verb|fpc| -- A pointer to the current character. This is equivalent to +accessing the \verb|p| variable. + +\item \verb|fc| -- The current character. This is equivalent to the expression \verb|(*p)|. + +\item \verb|fcurs| -- An integer value representing the current state. This +value should only be read from. To move to a different place in the machine +from action code use the \verb|fgoto|, \verb|fnext| or \verb|fcall| statements. +Outside of the machine execution code the \verb|cs| variable may be modified. + +\item \verb|ftargs| -- An integer value representing the target state. This +value should only be read from. Again, \verb|fgoto|, \verb|fnext| and +\verb|fcall| can be used to move to a specific entry point. + +\item \verb|fentry(<label>)| -- Retrieve an integer value representing the +entry point \verb|label|. 
The integer value returned will be a compile time +constant. This number is suitable for later use in control flow transfer +statements that take an expression. This value should not be compared against +the current state because any given label can have multiple states representing +it. The value returned by \verb|fentry| can be any one of the multiple states that +it represents. +\end{itemize} + +The following statements are available in code blocks: + +\begin{itemize} + +\item \verb|fhold;| -- Do not advance over the current character. If processing +data in multiple buffer blocks, the \verb|fhold| statement should only be used +once in the set of actions executed on a character. Multiple calls may result +in backing up over the beginning of the buffer block. The \verb|fhold| +statement does not imply any transfer of control. It is equivalent to the +\verb|p--;| statement. + +\item \verb|fexec <expr>;| -- Set the next character to process. This can be +used to backtrack to previous input or advance ahead. +Unlike \verb|fhold|, which can be used +anywhere, \verb|fexec| requires the user to ensure that the target of the +backtrack is in the current buffer block or is known to be somewhere ahead of +it. The machine will continue iterating forward until \verb|pe| is arrived at, +\verb|fbreak| is called or the machine moves into the error state. In actions +embedded into transitions, the \verb|fexec| statement is equivalent to setting +\verb|p| to one position ahead of the next character to process. If the user +also modifies \verb|pe|, it is possible to change the buffer block entirely. + +\item \verb|fgoto <label>;| -- Jump to an entry point defined by +\verb|<label>|. The \verb|fgoto| statement immediately transfers control to +the destination state. + +\item \verb|fgoto *<expr>;| -- Jump to an entry point given by \verb|<expr>|. +The expression must evaluate to an integer value representing a state. 
+ +\item \verb|fnext <label>;| -- Set the next state to be the entry point defined +by \verb|label|. The \verb|fnext| statement does not immediately jump to the +specified state. Any action code following the statement is executed. + +\item \verb|fnext *<expr>;| -- Set the next state to be the entry point given +by \verb|<expr>|. The expression must evaluate to an integer value representing +a state. + +\item \verb|fcall <label>;| -- Push the target state and jump to the entry +point defined by \verb|<label>|. The next \verb|fret| will jump to the target +of the transition on which the call was made. Use of \verb|fcall| requires +the declaration of a call stack. An array of integers named \verb|stack| and a +single integer named \verb|top| must be declared. With the \verb|fcall| +construct, control is immediately transferred to the destination state. +See section \ref{modularization} for more information. + +\item \verb|fcall *<expr>;| -- Push the current state and jump to the entry +point given by \verb|<expr>|. The expression must evaluate to an integer value +representing a state. + +\item \verb|fret;| -- Return to the target state of the transition on which the +last \verb|fcall| was made. Use of \verb|fret| requires the declaration of a +call stack. Control is immediately transferred to the destination state. + +\item \verb|fbreak;| -- Advance \verb|p|, save the target state to \verb|cs| +and immediately break out of the execute loop. This statement is useful +in conjunction with the \verb|noend| write option. Rather than process input +until \verb|pe| is arrived at, the fbreak statement +can be used to stop processing from an action. After an \verb|fbreak| +statement the \verb|p| variable will point to the next character in the input. The +current state will be the target of the current transition. Note that \verb|fbreak| +causes the target state's to-state actions to be skipped. 
+ +\end{itemize} + +Once actions with control-flow commands are embedded into a +machine, the user must exercise caution when using the machine as the operand +to other machine construction operators. If an action jumps to another state +then unioning any transition that executes that action with another transition +that follows some other path will cause that other path to be lost. Using +commands that manually jump around a machine takes us out of the domain of +regular languages because transitions that the +machine construction operators are not aware of are introduced. These +commands should therefore be used with caution. + + +\chapter{Controlling Nondeterminism} +\label{controlling-nondeterminism} + +Along with the flexibility of arbitrary action embeddings comes a need to +control nondeterminism in regular expressions. If a regular expression is +ambiguous, then sub-components of a parser other than the intended parts may become +active. This means that actions that are irrelevant to the +current subset of the parser may be executed, causing problems for the +programmer. + +Tools that are based on regular expression engines and that are used for +recognition tasks will usually function as intended regardless of the presence +of ambiguities. It is quite common for users of scripting languages to write +regular expressions that are heavily ambiguous and it generally does not +matter. As long as one of the potential matches is recognized, there can be any +number of other matches present. In some parsing systems the run-time engine +can employ a strategy for resolving ambiguities, for example always pursuing +the longest possible match and discarding others. + +In Ragel, there is no regular expression run-time engine, just a simple state +machine execution model. When we begin to embed actions and face the +possibility of spurious action execution, it becomes clear that controlling +nondeterminism at the machine construction level is very important. 
Consider +the following example. + +% GENERATE: lines1 +% OPT: -p +% %%{ +% machine lines1; +% action first {} +% action tail {} +% word = [a-z]+; +\begin{inline_code} +\begin{verbatim} +ws = [\n\t ]; +line = word $first ( ws word $tail )* '\n'; +lines = line*; +\end{verbatim} +\end{inline_code} +\verbspace +% main := lines; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.53]{lines1} +\end{center} +\graphspace + +Since the \verb|ws| expression includes the newline character, we will +not finish the \verb|line| expression when a newline character is seen. We will +simultaneously pursue the possibility of matching further words on the same +line and the possibility of matching a second line. Evidence of this fact is +in the state tables. On several transitions both the \verb|first| and +\verb|tail| actions are executed. The solution here is simple: exclude +the newline character from the \verb|ws| expression. + +% GENERATE: lines2 +% OPT: -p +% %%{ +% machine lines2; +% action first {} +% action tail {} +% word = [a-z]+; +\begin{inline_code} +\begin{verbatim} +ws = [\t ]; +line = word $first ( ws word $tail )* '\n'; +lines = line*; +\end{verbatim} +\end{inline_code} +\verbspace +% main := lines; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{lines2} +\end{center} +\graphspace + +Solving this kind of problem is straightforward when the ambiguity is created +by strings that are a single character long. When the ambiguity is created by +strings that are multiple characters long we have a more difficult problem. +The following example is an incorrect attempt at a regular expression for C +language comments. 
+ +% GENERATE: comments1 +% OPT: -p +% %%{ +% machine comments1; +% action comm {} +\begin{inline_code} +\begin{verbatim} +comment = '/*' ( any @comm )* '*/'; +main := comment ' '; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{comments1} +\end{center} +\graphspace + +Using standard concatenation, we will never leave the \verb|any*| expression. +We will forever entertain the possibility that a \verb|'*/'| string that we see +is contained in a longer comment and that, simultaneously, the comment has +ended. The concatenation of the \verb|comment| machine with \verb|SP| is done +to show this. When we match space, we are also still matching the comment body. + +One way to approach the problem is to exclude the terminating string +from the \verb|any*| expression using set difference. We must be careful to +exclude not just the terminating string, but any string that contains it as a +substring. A verbose, but proper specification of a C comment parser is given +by the following regular expression. + +% GENERATE: comments2 +% OPT: -p +% %%{ +% machine comments2; +% action comm {} +\begin{inline_code} +\begin{verbatim} +comment = '/*' ( ( any @comm )* - ( any* '*/' any* ) ) '*/'; +\end{verbatim} +\end{inline_code} +\verbspace +% main := comment; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{comments2} +\end{center} +\graphspace + +Note that Ragel's strong subtraction operator \verb|--| can also be used here. +In doing this subtraction we have phrased the problem of controlling non-determinism in +terms of excluding strings common to two expressions that interact when +combined. +We can also phrase the problem in terms of the transitions of the state +machines that implement these expressions. 
During the concatenation of +\verb|any*| and \verb|'*/'| we will be making transitions that are composed of +both the loop of the first expression and the final character of the second. +At this time we want the transition on the \verb|'/'| character to take precedence +over and disallow the transition that originated in the \verb|any*| loop. + +In another parsing problem, we wish to implement a lightweight tokenizer that we can +utilize in the composition of a larger machine. For example, some HTTP headers +have a token stream as a sub-language. The following example is an attempt +at a regular expression-based tokenizer that does not function correctly due to +unintended nondeterminism. + +% GENERATE: smallscanner +% OPT: -p +% %%{ +% machine smallscanner; +% action start_str {} +% action on_char {} +% action finish_str {} +\begin{inline_code} +\begin{verbatim} +header_contents = ( + lower+ >start_str $on_char %finish_str | + ' ' +)*; +\end{verbatim} +\end{inline_code} +\verbspace +% main := header_contents; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{smallscanner} +\end{center} +\graphspace + +In this case, the problem with using a standard kleene star operation is that +there is an ambiguity between extending a token and wrapping around the machine +to begin a new token. Using the standard operator, we get an undesirable +nondeterministic behaviour. Evidence of this can be seen on the transition out +of state one to itself. The transition extends the string, and simultaneously, +finishes the string only to immediately begin a new one. What is required is +for the +transitions that represent an extension of a token to take precedence over the +transitions that represent the beginning of a new token. For this problem +there is no simple solution that uses standard regular expression operators. 
+ +\section{Priorities} + +A priority mechanism was devised and built into the determinization +process, specifically for the purpose of allowing the user to control +nondeterminism. Priorities are integer values embedded into transitions. When +the determinization process is combining transitions that have different +priorities, the transition with the higher priority is preserved and the +transition with the lower priority is dropped. + +Unfortunately, priorities can have unintended side effects because their +operation requires that they linger in transitions indefinitely. They must linger +because the Ragel program cannot know when the user is finished with a priority +embedding. A solution whereby they are explicitly deleted after use is +conceivable; however this is not very user-friendly. Priorities were therefore +made into named entities. Only priorities with the same name are allowed to +interact. This allows any number of priorities to coexist in one machine for +the purpose of controlling various different regular expression operations and +eliminates the need to ever delete them. Such a scheme allows the user to +choose a unique name, embed two different priority values using that name +and be confident that the priority embedding will be free of any side effects. + +In the first form of priority embedding the name defaults to the name of the machine +definition that the priority is assigned in. In this sense priorities are by +default local to the current machine definition or instantiation. Beware of +using this form in a longest-match machine, since there is only one name for +the entire set of longest match patterns. In the second form the priority's +name can be specified, allowing priority interaction across machine definition +boundaries. + +\begin{itemize} +\item \verb|expr > int| -- Sets starting transitions to have priority int. +\item \verb|expr @ int| -- Sets transitions that go into a final state to have priority int. 
+\item \verb|expr $ int| -- Sets all transitions to have priority int. +\item \verb|expr % int| -- Sets leaving transitions to +have priority int. When a transition is made going out of the machine (either +by concatenation or kleene star) its priority is immediately set to the +leaving priority. +\end{itemize} + +The second form of priority assignment allows the programmer to specify the name +to which the priority is assigned. + +\begin{itemize} +\item \verb|expr > (name, int)| -- Starting transitions. +\item \verb|expr @ (name, int)| -- Finishing transitions (into a final state). +\item \verb|expr $ (name, int)| -- All transitions. +\item \verb|expr % (name, int)| -- Leaving transitions. +\end{itemize} + +\section{Guarded Operators that Encapsulate Priorities} + +Priority embeddings are a very expressive mechanism. At the same time they +can be very confusing for the user. They force the user to imagine +the transitions inside two interacting expressions and work out the precise +effects of the operations between them. When we consider +that this problem is worsened by the +potential for side effects caused by unintended priority name collisions, we +see that exposing the user to priorities is undesirable. + +Fortunately, in practice the use of priorities has been necessary only in a +small number of scenarios. This allows us to encapsulate their functionality +into a small set of operators and fully hide them from the user. This is +advantageous from a language design point of view because it greatly simplifies +the design. + +Going back to the C comment example, we can now properly specify +it using a guarded concatenation operator which we call {\em finish-guarded +concatenation}. From the user's point of view, this operator terminates the +first machine when the second machine moves into a final state. It chooses a +unique name and uses it to embed a low priority into all +transitions of the first machine. 
A higher priority is then embedded into the +transitions of the second machine that enter into a final state. The following +example yields a machine identical to the example in Section +\ref{controlling-nondeterminism}. + +\begin{inline_code} +\begin{verbatim} +comment = '/*' ( any @comm )* :>> '*/'; +\end{verbatim} +\end{inline_code} +\verbspace + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{comments2} +\end{center} +\graphspace + +Another guarded operator is {\em left-guarded concatenation}, given by the +\verb|<:| compound symbol. This operator places a higher priority on all +transitions of the first machine. This is useful if one must forcibly separate +two lists that contain common elements. For example, one may need to tokenize a +stream, but first consume leading whitespace. + +Ragel also includes a {\em longest-match kleene star} operator, given by the +\verb|**| compound symbol. This +guarded operator embeds a high +priority into all transitions of the machine. +A lower priority is then embedded into the leaving transitions. When the +kleene star operator makes the epsilon transitions from +the final states into the new start state, the lower priority will be transferred +to the epsilon transitions. In cases where following an epsilon transition +out of a final state conflicts with an existing transition out of a final +state, the epsilon transition will be dropped. + +Other guarded operators are conceivable, such as guards on union that cause one +alternative to take precedence over another. These may be implemented when it +is clear they constitute a frequently used operation. +In the next section we discuss the explicit specification of state machines +using state charts. + +\subsection{Entry-Guarded Concatenation} + +\verb|expr :> expr| + +This operator concatenates two machines, but first assigns a low +priority to all transitions +of the first machine and a high priority to the starting transitions of the +second machine. 
This operator is useful if from the final states of the first +machine it is possible to accept the characters in the entering transitions of +the second machine. This operator effectively terminates the first machine +immediately upon starting the second machine, where otherwise they would be +pursued concurrently. In the following example, entry-guarded concatenation is +used to move out of a machine that matches everything at the first sign of an +end-of-input marker. + +% GENERATE: entryguard +% OPT: -p +% %%{ +% machine entryguard; +\begin{inline_code} +\begin{verbatim} +# Leave the catch-all machine on the first character of FIN. +main := any* :> 'FIN'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{entryguard} +\end{center} +\graphspace + +Entry-guarded concatenation is equivalent to the following: + +\begin{verbatim} +expr $(unique_name,0) . expr >(unique_name,1) +\end{verbatim} +\verbspace + +\subsection{Finish-Guarded Concatenation} + +\verb|expr :>> expr| + +This operator is +like the previous operator, except the higher priority is placed on the final +transitions of the second machine. This is useful if one wishes to entertain +the possibility of continuing to match the first machine right up until the +second machine enters a final state. In other words it terminates the first +machine only when the second accepts. In the following example, finish-guarded +concatenation causes the move out of the machine that matches everything to be +delayed until the full end-of-input marker has been matched. + +% GENERATE: finguard +% OPT: -p +% %%{ +% machine finguard; +\begin{inline_code} +\begin{verbatim} +# Leave the catch-all machine on the last character of FIN. 
+main := any* :>> 'FIN'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{finguard} +\end{center} +\graphspace + +Finish-guarded concatenation is equivalent to the following, with one +exception. If the right machine's start state is final, the higher priority is +also embedded into it as a leaving priority. This prevents the left machine +from persisting via the zero-length string. + +\begin{verbatim} +expr $(unique_name,0) . expr @(unique_name,1) +\end{verbatim} +\verbspace + +\subsection{Left-Guarded Concatenation} + +\verb|expr <: expr| + +This operator places +a higher priority on the left expression. It is useful if you want to prefix a +sequence with another sequence composed of some of the same characters. For +example, one can consume leading whitespace before tokenizing a sequence of +whitespace-separated words as in: + +% GENERATE: leftguard +% OPT: -p +% %%{ +% machine leftguard; +% action alpha {} +% action ws {} +% action start {} +% action fin {} +\begin{inline_code} +\begin{verbatim} +main := ( ' '* >start %fin ) <: ( ' ' $ws | [a-z] $alpha )*; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{leftguard} +\end{center} +\graphspace + +Left-guarded concatenation is equivalent to the following: + +\begin{verbatim} +expr $(unique_name,1) . expr >(unique_name,0) +\end{verbatim} +\verbspace + +\subsection{Longest-Match Kleene Star} +\label{longest_match_kleene_star} + +\verb|expr**| + +This version of kleene star puts a higher priority on staying in the +machine versus wrapping around and starting over. The LM kleene star is useful +when writing simple tokenizers. These machines are built by applying the +longest-match kleene star to an alternation of token patterns, as in the +following. 
+ +% GENERATE: lmkleene +% OPT: -p +% %%{ +% machine exfinpri; +% action A {} +% action B {} +\begin{inline_code} +\begin{verbatim} +# Repeat tokens, but make sure to get the longest match. +main := ( + lower ( lower | digit )* %A | + digit+ %B | + ' ' +)**; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{lmkleene} +\end{center} +\graphspace + +If a regular kleene star were used the machine above would not be able to +distinguish between extending a word and beginning a new one. This operator is +equivalent to: + +\begin{verbatim} +( expr $(unique_name,1) %(unique_name,0) )* +\end{verbatim} +\verbspace + +When the kleene star is applied, transitions that go out of the machine and +back into it are made. These are assigned a priority of zero by the leaving +transition mechanism. This is less than the priority of one assigned to the +transitions leaving the final states but not leaving the machine. When +these transitions clash on the same character, the +transition that stays in the machine takes precedence. The transition +that wraps around is dropped. + +Note that this operator does not build a scanner in the traditional sense +because there is never any backtracking. To build a scanner with backtracking +use the Longest-Match machine construction described in Section +\ref{generating-scanners}. + +\chapter{Interface to Host Program} + +The Ragel code generator is very flexible. The generated code has no +dependencies and can be inserted in any function, perhaps inside a loop if +desired. The user is responsible for declaring and initializing a number of +required variables, including the current state and the pointer to the input +stream. These can live in any scope. Control of the input processing loop is +also possible: the user may break out of the processing loop and return to it +at any time. 
+ +In the case of the C, D, and Go host languages, Ragel is able to generate very +fast-running code that implements state machines as directly executable code. +Since very large files strain the host language compiler, table-based code +generation is also supported. In the future we hope to provide a partitioned, +directly executable format that is able to reduce the burden on the host +compiler by splitting large machines across multiple functions. + +In the case of Java and Ruby, table-based code generation is the only code +style supported. In the future this may be expanded to include other code +styles. + +Ragel can be used to parse input in one block, or it can be used to parse input +in a sequence of blocks as it arrives from a file or socket. Parsing the input +in a sequence of blocks brings with it a few responsibilities. If the parser +utilizes a scanner, care must be taken to not break the input stream anywhere +but token boundaries. If pointers to the input stream are taken during +parsing, care must be taken to not use a pointer that has been invalidated by +movement to a subsequent block. If the current input data pointer is moved +backwards it must not be moved past the beginning of the current block. + +Figure \ref{basic-example} shows a simple Ragel program that does not have any +actions. The example tests the first argument of the program against a number +pattern and then prints the machine's acceptance status. + +\begin{figure} +\small +\begin{verbatim} +#include <stdio.h> +#include <string.h> +%%{ + machine foo; + write data; +}%% +int main( int argc, char **argv ) +{ + int cs; + if ( argc > 1 ) { + char *p = argv[1]; + char *pe = p + strlen( p ); + %%{ + main := [0-9]+ ( '.' [0-9]+ )?; + + write init; + write exec; + }%% + } + printf("result = %i\n", cs >= foo_first_final ); + return 0; +} +\end{verbatim} +\verbspace +\caption{A basic Ragel example without any actions. 
+} +\label{basic-example} +\end{figure} + +\section{Variables Used by Ragel} + +There are a number of variables that Ragel expects the user to declare. At a +very minimum the \verb|cs|, \verb|p| and \verb|pe| variables must be declared. +In Go, Java and Ruby code the \verb|data| variable must also be declared. If +EOF actions are used then the \verb|eof| variable is required. If +stack-based state machine control flow statements are used then the +\verb|stack| and \verb|top| variables are required. If a scanner is declared +then the \verb|act|, \verb|ts| and \verb|te| variables must be +declared. + +\begin{itemize} + +\item \verb|cs| - Current state. This must be an integer and it should persist +across invocations of the machine when the data is broken into blocks that are +processed independently. This variable may be modified from outside the +execution loop, but not from within. + +\item \verb|p| - Data pointer. In C/D code this variable is expected to be a +pointer to the character data to process. It should be initialized to the +beginning of the data block on every run of the machine. In Go, Java and Ruby it is +used as an offset to \verb|data| and must be an integer. In this case it should +be initialized to zero on every run of the machine. + +\item \verb|pe| - Data end pointer. This should be initialized to \verb|p| plus +the data length on every run of the machine. In Go, Java and Ruby code this should +be initialized to the data length. + +\item \verb|eof| - End of file pointer. This should be set to \verb|pe| when +the buffer block being processed is the last one, otherwise it should be set to +null. In Go, Java and Ruby code \verb|-1| must be used instead of null. If the EOF +event can be known only after the final buffer block has been processed, then +it is possible to set \verb|p = pe = eof| and run the execute block. + +\item \verb|data| - This variable is only required in Go, Java and Ruby code. 
It
+must be an array containing the data to process.
+
+\item \verb|stack| - This must be an array of integers. It is used to store
+integer values representing states. If the stack must resize dynamically the
+Pre-Push and Post-Pop statements can be used to do this (Sections
+\ref{prepush} and \ref{postpop}).
+
+\item \verb|top| - This must be an integer value and will be used as an offset
+to \verb|stack|, giving the next available spot on the top of the stack.
+
+\item \verb|act| - This must be an integer value. It is a variable sometimes
+used by scanner code to keep track of the most recent successful pattern match.
+
+\item \verb|ts| - This must be a pointer to character data. In Go, Java and
+Ruby code this must be an integer. See Section \ref{generating-scanners} for
+more information.
+
+\item \verb|te| - Also a pointer to character data.
+
+\end{itemize}
+
+\section{Alphtype Statement}
+
+\begin{verbatim}
+alphtype unsigned int;
+\end{verbatim}
+\verbspace
+
+The alphtype statement specifies the alphabet data type that the machine
+operates on. During the compilation of the machine, integer literals are
+expected to be in the range of possible values of the alphtype. The default
+is \verb|char| for all languages except Go where the default is \verb|byte|.
+
+\begin{multicols}{2}
+C/C++/Objective-C:
+\begin{verbatim}
+ char unsigned char
+ short unsigned short
+ int unsigned int
+ long unsigned long
+\end{verbatim}
+\verbspace
+
+Go:
+\begin{verbatim}
+ byte
+ int8 uint8
+ int16 uint16
+ int32 uint32
+ int
+\end{verbatim}
+\verbspace
+
+Ruby:
+\begin{verbatim}
+ char
+ int
+\end{verbatim}
+\verbspace
+
+\columnbreak
+
+Java:
+\begin{verbatim}
+ char
+ byte
+ short
+ int
+\end{verbatim}
+\verbspace
+
+D:
+\begin{verbatim}
+ char
+ byte ubyte
+ short ushort
+ wchar
+ int uint
+ dchar
+\end{verbatim}
+\verbspace
+
+\end{multicols}
+
+\section{Getkey Statement}
+
+\begin{verbatim}
+getkey fpc->id;
+\end{verbatim}
+\verbspace
+
+This statement specifies to Ragel how to retrieve the current character
+from the pointer to the current element (\verb|p|). Any expression that returns
+a value of the alphabet type
+may be used. The getkey statement may be used for looking into element
+structures or for translating the character to process. The getkey expression
+defaults to \verb|(*p)|. In goto-driven machines the getkey expression may be
+evaluated more than once per element processed, therefore it should not incur a
+large cost nor preclude optimization.
+
+\section{Access Statement}
+
+\begin{verbatim}
+access fsm->;
+\end{verbatim}
+\verbspace
+
+The access statement specifies how the generated code should
+access the machine data that is persistent across processing buffer blocks.
+This applies to all variables except \verb|p|, \verb|pe| and \verb|eof|. This includes
+\verb|cs|, \verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|.
+The access statement is useful if a machine is to be encapsulated inside a
+structure in C code. It can be used to give the name of
+a pointer to the structure.
+
+\section{Variable Statement}
+
+\begin{verbatim}
+variable p fsm->p;
+\end{verbatim}
+\verbspace
+
+The variable statement specifies how to access a specific
+variable.
All of the variables that are declared by the user and +used by Ragel can be changed. This includes \verb|p|, \verb|pe|, \verb|eof|, \verb|cs|, +\verb|top|, \verb|stack|, \verb|ts|, \verb|te| and \verb|act|. +In Go, Ruby and Java code generation the \verb|data| variable can also be changed. + +\section{Pre-Push Statement} +\label{prepush} + +\begin{verbatim} +prepush { + /* stack growing code */ +} +\end{verbatim} +\verbspace + +The prepush statement allows the user to supply stack management code that is +written out during the generation of fcall, immediately before the current +state is pushed to the stack. This statement can be used to test the number of +available spaces and dynamically grow the stack if necessary. + +\section{Post-Pop Statement} +\label{postpop} + +\begin{verbatim} +postpop { + /* stack shrinking code */ +} +\end{verbatim} +\verbspace + +The postpop statement allows the user to supply stack management code that is +written out during the generation of fret, immediately after the next state is +popped from the stack. This statement can be used to dynamically shrink the +stack. + +\section{Write Statement} +\label{write-statement} + +\begin{verbatim} +write <component> [options]; +\end{verbatim} +\verbspace + +The write statement is used to generate parts of the machine. +There are seven +components that can be generated by a write statement. These components make up the +state machine's data, initialization code, execution code, and export definitions. +A write statement may appear before a machine is fully defined. +This allows one to write out the data first then later define the machine where +it is used. An example of this is shown in Figure \ref{fbreak-example}. + +\subsection{Write Data} +\begin{verbatim} +write data [options]; +\end{verbatim} +\verbspace + +The write data statement causes Ragel to emit the constant static data needed +by the machine. 
In table-driven output styles (see Section \ref{genout}) this +is a collection of arrays that represent the states and transitions of the +machine. In goto-driven machines much less data is emitted. At the very +minimum a start state \verb|name_start| is generated. All variables written +out in machine data have both the \verb|static| and \verb|const| properties and +are prefixed with the name of the machine and an +underscore. The data can be placed inside a class, inside a function, or it can +be defined as global data. + +Two variables are written that may be used to test the state of the machine +after a buffer block has been processed. The \verb|name_error| variable gives +the id of the state that the machine moves into when it cannot find a valid +transition to take. The machine immediately breaks out of the processing loop when +it finds itself in the error state. The error variable can be compared to the +current state to determine if the machine has failed to parse the input. If the +machine is complete, that is from every state there is a transition to a proper +state on every possible character of the alphabet, then no error state is required +and this variable will be set to -1. + +The \verb|name_first_final| variable stores the id of the first final state. +All of the machine's states are sorted by their final state status before +having their ids assigned. Checking if the machine has accepted its input can +then be done by checking if the current state is greater-than or equal to the +first final state. + +Data generation has several options: + +\noindent\hspace*{24pt}\verb|noerror | - Do not generate the integer variable that gives the id of the error state.\\ +\noindent\hspace*{24pt}\verb|nofinal | - Do not generate the integer variable that gives the id of the first final state.\\ +\noindent\hspace*{24pt}\verb|noprefix | - Do not prefix the variable names with the name of the machine. 
+\vspace{12pt}
+
+\begin{figure}
+\small
+\begin{verbatim}
+#include <stdio.h>
+%% machine foo;
+%% write data;
+int main( int argc, char **argv )
+{
+ int cs, res = 0;
+ if ( argc > 1 ) {
+ char *p = argv[1];
+ %%{
+ main :=
+ [a-z]+
+ 0 @{ res = 1; fbreak; };
+ write init;
+ write exec noend;
+ }%%
+ }
+ printf("execute = %i\n", res );
+ return 0;
+}
+\end{verbatim}
+\verbspace
+\caption{Use of {\tt noend} write option and the {\tt fbreak} statement for
+processing a string.
+}
+\label{fbreak-example}
+\end{figure}
+
+\subsection{Write Start, First Final and Error}
+
+\begin{verbatim}
+write start;
+write first_final;
+write error;
+\end{verbatim}
+\verbspace
+
+These three write statements provide an alternative means of accessing the
+\verb|start|, \verb|first_final| and \verb|error| states. If there are many
+different machine specifications in one file it is easy to get the prefix for
+these wrong. This is especially true if the state machine boilerplate is
+frequently made by a copy-paste-edit process. These write statements allow the
+problem to be avoided. They can be used as follows:
+
+\begin{verbatim}
+/* Did parsing succeed? */
+if ( cs < %%{ write first_final; }%% ) {
+ result = ERR_PARSE_ERROR;
+ goto fail;
+}
+\end{verbatim}
+\verbspace
+
+\subsection{Write Init}
+\begin{verbatim}
+write init [options];
+\end{verbatim}
+\verbspace
+
+The write init statement causes Ragel to emit initialization code. This should
+be executed once before the machine is started. At a very minimum this sets the
+current state to the start state. If other variables are needed by the
+generated code, such as call stack variables or scanner management
+variables, they are also initialized here.
+
+The \verb|nocs| option to the write init statement will cause Ragel to skip
+initialization of the cs variable. This is useful if the user wishes to use
+custom logic to decide which state the specification should start in.
+
+\subsection{Write Exec}
+\begin{verbatim}
+write exec [options];
+\end{verbatim}
+\verbspace
+
+The write exec statement causes Ragel to emit the state machine's execution code.
+Ragel expects several variables to be available to this code. At a very minimum, the
+generated code needs access to the current character position \verb|p|, the ending
+position \verb|pe| and the current state \verb|cs| (though \verb|pe|
+can be omitted using the \verb|noend| write option).
+The \verb|p| variable is the cursor that the execute code will
+use to traverse the input. The \verb|pe| variable should be set up to point to one
+position past the last valid character in the buffer.
+
+Other variables are needed when certain features are used. For example using
+the \verb|fcall| or \verb|fret| statements requires \verb|stack| and
+\verb|top| variables to be defined. If a longest-match construction is used,
+variables for managing backtracking are required.
+
+The write exec statement has one option. The \verb|noend| option tells Ragel
+to generate code that ignores the end position \verb|pe|. In this
+case the user must explicitly break out of the processing loop using
+\verb|fbreak|, otherwise the machine will continue to process characters until
+it moves into the error state. This option is useful if one wishes to process a
+null terminated string. Rather than traverse the string to discover the length
+before processing the input, the user can break out when the null character is
+seen. The example in Figure \ref{fbreak-example} shows the use of the
+\verb|noend| write option and the \verb|fbreak| statement for processing a string.
+
+\subsection{Write Exports}
+\label{export}
+
+\begin{verbatim}
+write exports;
+\end{verbatim}
+\verbspace
+
+The export feature can be used to export simple machine definitions. Machine definitions
+are marked for export using the \verb|export| keyword.
+ +\begin{verbatim} +export machine_to_export = 0x44; +\end{verbatim} +\verbspace + +When the write exports statement is used these machines are +written out in the generated code. Defines are used for C and constant integers +are used for D, Java and Ruby. See Section \ref{import} for a description of the +import statement. + +\section{Maintaining Pointers to Input Data} + +In the creation of any parser it is not uncommon to require the collection of +the data being parsed. It is always possible to collect data into a growable +buffer as the machine moves over it, however the copying of data is a somewhat +wasteful use of processor cycles. The most efficient way to collect data from +the parser is to set pointers into the input then later reference them. This +poses a problem for uses of Ragel where the input data arrives in blocks, such +as over a socket or from a file. If a pointer is set in one buffer block but +must be used while parsing a following buffer block, some extra consideration +to correctness must be made. + +The scanner constructions exhibit this problem, requiring the maintenance +code described in Section \ref{generating-scanners}. If a longest-match +construction has been used somewhere in the machine then it is possible to +take advantage of the required prefix maintenance code in the driver program to +ensure pointers to the input are always valid. If laying down a pointer one can +set \verb|ts| at the same spot or ahead of it. When data is shifted in +between loops the user must also shift the pointer. In this way it is possible +to maintain pointers to the input that will always be consistent. + +\begin{figure} +\small +\begin{verbatim} + int have = 0; + while ( 1 ) { + char *p, *pe, *data = buf + have; + int len, space = BUFSIZE - have; + + if ( space == 0 ) { + fprintf(stderr, "BUFFER OUT OF SPACE\n"); + exit(1); + } + + len = fread( data, 1, space, stdin ); + if ( len == 0 ) + break; + + /* Find the last newline by searching backwards. 
*/ + p = buf; + pe = data + len - 1; + while ( *pe != '\n' && pe >= buf ) + pe--; + pe += 1; + + %% write exec; + + /* How much is still in the buffer? */ + have = data + len - pe; + if ( have > 0 ) + memmove( buf, pe, have ); + + if ( len < space ) + break; + } +\end{verbatim} +\verbspace +\caption{An example of line-oriented processing. +} +\label{line-oriented} +\end{figure} + +In general, there are two approaches for guaranteeing the consistency of +pointers to input data. The first approach is the one just described; +lay down a marker from an action, +then later ensure that the data the marker points to is preserved ahead of +the buffer on the next execute invocation. This approach is good because it +allows the parser to decide on the pointer-use boundaries, which can be +arbitrarily complex parsing conditions. A downside is that it requires any +pointers that are set to be corrected in between execute invocations. + +The alternative is to find the pointer-use boundaries before invoking the execute +routine, then pass in the data using these boundaries. For example, if the +program must perform line-oriented processing, the user can scan backwards from +the end of an input block that has just been read in and process only up to the +first found newline. On the next input read, the new data is placed after the +partially read line and processing continues from the beginning of the line. +An example of line-oriented processing is given in Figure \ref{line-oriented}. + +\section{Specifying the Host Language} + +The \verb|ragel| program has a number of options for specifying the host +language. The host-language options are: + +\begin{itemize} +\item \verb|-C | for C/C++/Objective-C code (default) +\item \verb|-D | for D code. +\item \verb|-Z | for Go code. +\item \verb|-J | for Java code. +\item \verb|-R | for Ruby code. +\item \verb|-A | for C\# code. 
+\end{itemize} + +\section{Choosing a Generated Code Style} +\label{genout} + +There are three styles of code output to choose from. Code style affects the +size and speed of the compiled binary. Changing code style does not require any +change to the Ragel program. There are two table-driven formats and a goto +driven format. + +In addition to choosing a style to emit, there are various levels of action +code reuse to choose from. The maximum reuse levels (\verb|-T0|, \verb|-F0| +and \verb|-G0|) ensure that no FSM action code is ever duplicated by encoding +each transition's action list as static data and iterating +through the lists on every transition. This will normally result in a smaller +binary. The less action reuse options (\verb|-T1|, \verb|-F1| and \verb|-G1|) +will usually produce faster running code by expanding each transition's action +list into a single block of code, eliminating the need to iterate through the +lists. This duplicates action code instead of generating the logic necessary +for reuse. Consequently the binary will be larger. However, this tradeoff applies to +machines with moderate to dense action lists only. If a machine's transitions +frequently have less than two actions then the less reuse options will actually +produce both a smaller and a faster running binary due to less action sharing +overhead. The best way to choose the appropriate code style for your +application is to perform your own tests. + +The table-driven FSM represents the state machine as constant static data. There are +tables of states, transitions, indices and actions. The current state is +stored in a variable. The execution is simply a loop that looks up the current +state, looks up the transition to take, executes any actions and moves to the +target state. In general, the table-driven FSM can handle any machine, produces +a smaller binary and requires a less expensive host language compile, but +results in slower running code. 
Since the table-driven format is the most +flexible it is the default code style. + +The flat table-driven machine is a table-based machine that is optimized for +small alphabets. Where the regular table machine uses the current character as +the key in a binary search for the transition to take, the flat table machine +uses the current character as an index into an array of transitions. This is +faster in general, however is only suitable if the span of possible characters +is small. + +The goto-driven FSM represents the state machine using goto and switch +statements. The execution is a flat code block where the transition to take is +computed using switch statements and directly executable binary searches. In +general, the goto FSM produces faster code but results in a larger binary and a +more expensive host language compile. + +The goto-driven format has an additional action reuse level (\verb|-G2|) that +writes actions directly into the state transitioning logic rather than putting +all the actions together into a single switch. Generally this produces faster +running code because it allows the machine to encode the current state using +the processor's instruction pointer. Again, sparse machines may actually +compile to smaller binaries when \verb|-G2| is used due to less state and +action management overhead. For many parsing applications \verb|-G2| is the +preferred output format. 
+ +\begin{center} + +Code Output Style Options + +\begin{tabular}{|c|c|c|} +\hline +\verb|-T0|&binary search table-driven&C/D/Java/Ruby/C\#\\ +\hline +\verb|-T1|&binary search, expanded actions&C/D/Ruby/C\#\\ +\hline +\verb|-F0|&flat table-driven&C/D/Ruby/C\#\\ +\hline +\verb|-F1|&flat table, expanded actions&C/D/Ruby/C\#\\ +\hline +\verb|-G0|&goto-driven&C/D/C\#\\ +\hline +\verb|-G1|&goto, expanded actions&C/D/C\#\\ +\hline +\verb|-G2|&goto, in-place actions&C/D/Go\\ +\hline +\end{tabular} +\end{center} + +\chapter{Beyond the Basic Model} + +\section{Parser Modularization} +\label{modularization} + +It is possible to use Ragel's machine construction and action embedding +operators to specify an entire parser using a single regular expression. In +many cases this is the desired way to specify a parser in Ragel. However, in +some scenarios the language to parse may be so large that it is difficult to +think about it as a single regular expression. It may also shift between distinct +parsing strategies, in which case modularization into several coherent blocks +of the language may be appropriate. + +It may also be the case that patterns that compile to a large number of states +must be used in a number of different contexts and referencing them in each +context results in a very large state machine. In this case, an ability to reuse +parsers would reduce code size. + +To address this, distinct regular expressions may be instantiated and linked +together by means of a jumping and calling mechanism. This mechanism is +analogous to the jumping to and calling of processor instructions. A jump +command, given in action code, causes control to be immediately passed to +another portion of the machine by way of setting the current state variable. A +call command causes the target state of the current transition to be pushed to +a state stack before control is transferred. Later on, the original location +may be returned to with a return statement. 
In the following example, distinct
+state machines are used to handle the parsing of two types of headers.
+
+% GENERATE: call
+% %%{
+% machine call;
+\begin{inline_code}
+\begin{verbatim}
+action return { fret; }
+action call_date { fcall date; }
+action call_name { fcall name; }
+
+# A parser for date strings.
+date := [0-9][0-9] '/'
+ [0-9][0-9] '/'
+ [0-9][0-9][0-9][0-9] '\n' @return;
+
+# A parser for name strings.
+name := ( [a-zA-Z]+ | ' ' )** '\n' @return;
+
+# The main parser.
+headers =
+ ( 'from' | 'to' ) ':' @call_name |
+ ( 'departed' | 'arrived' ) ':' @call_date;
+
+main := headers*;
+\end{verbatim}
+\end{inline_code}
+\verbspace
+% }%%
+% %% write data;
+% void f()
+% {
+% %% write init;
+% %% write exec;
+% }
+% END GENERATE
+
+Calling and jumping should be used carefully as they are operations that take
+one out of the domain of regular languages. A machine that contains a call or
+jump statement in one of its actions should be used as an argument to a machine
+construction operator only with considerable care. Since DFA transitions may
+actually represent several NFA transitions, a call or jump embedded in one
+machine can inadvertently terminate another machine that it shares prefixes
+with. Despite this danger, these statements have proven useful for tying
+together sub-parsers of a language into a parser for the full language,
+especially for the purpose of modularizing code and reducing the number of
+states when the machine contains frequently recurring patterns.
+
+Section \ref{vals} describes the jump and call statements that are used to
+transfer control. These statements make use of two variables that must be
+declared by the user, \verb|stack| and \verb|top|. The \verb|stack| variable
+must be an array of integers and \verb|top| must be a single integer, which
+will point to the next available space in \verb|stack|. 
Sections \ref{prepush} +and \ref{postpop} describe the Pre-Push and Post-Pop statements which can be +used to implement a dynamically resizable array. + +\section{Referencing Names} +\label{labels} + +This section describes how to reference names in epsilon transitions (Section +\ref{state-charts}) and +action-based control-flow statements such as \verb|fgoto|. There is a hierarchy +of names implied in a Ragel specification. At the top level are the machine +instantiations. Beneath the instantiations are labels and references to machine +definitions. Beneath those are more labels and references to definitions, and +so on. + +Any name reference may contain multiple components separated with the \verb|::| +compound symbol. The search for the first component of a name reference is +rooted at the join expression that the epsilon transition or action embedding +is contained in. If the name reference is not contained in a join, +the search is rooted at the machine definition that the epsilon transition or +action embedding is contained in. Each component after the first is searched +for beginning at the location in the name tree that the previous reference +component refers to. + +In the case of action-based references, if the action is embedded more than +once, the local search is performed for each embedding and the result is the +union of all the searches. If no result is found for action-based references then +the search is repeated at the root of the name tree. Any action-based name +search may be forced into a strictly global search by prefixing the name +reference with \verb|::|. + +The final component of the name reference must resolve to a unique entry point. +If a name is unique in the entire name tree it can be referenced as is. If it +is not unique it can be specified by qualifying it with names above it in the +name tree. However, it can always be renamed. + +% FIXME: Should fit this in somewhere. +% Some kinds of name references are illegal. 
Cannot call into longest-match +% machine, can only call its start state. Cannot make a call to anywhere from +% any part of a longest-match machine except a rule's action. This would result +% in an eventual return to some point inside a longest-match other than the +% start state. This is banned for the same reason a call into the LM machine is +% banned. + + +\section{Scanners} +\label{generating-scanners} + +Scanners are very much intertwined with regular-languages and their +corresponding processors. For this reason Ragel supports the definition of +scanners. The generated code will repeatedly attempt to match patterns from a +list, favouring longer patterns over shorter patterns. In the case of +equal-length matches, the generated code will favour patterns that appear ahead +of others. When a scanner makes a match it executes the user code associated +with the match, consumes the input then resumes scanning. + +\begin{verbatim} +<machine_name> := |* + pattern1 => action1; + pattern2 => action2; + ... + *|; +\end{verbatim} +\verbspace + +On the surface, Ragel scanners are similar to those defined by Lex. Though +there is a key distinguishing feature: patterns may be arbitrary Ragel +expressions and can therefore contain embedded code. With a Ragel-based scanner +the user need not wait until the end of a pattern before user code can be +executed. + +Scanners can be used to process sub-languages, as well as for tokenizing +programming languages. In the following example a scanner is used to tokenize +the contents of a header field. + +\begin{inline_code} +\begin{verbatim} +word = [a-z]+; +head_name = 'Header'; + +header := |* + word; + ' '; + '\n' => { fret; }; +*|; + +main := ( head_name ':' @{ fcall header; } )*; +\end{verbatim} +\end{inline_code} +\verbspace + +The scanner construction has a purpose similar to the longest-match kleene star +operator \verb|**|. 
The key +difference is that a scanner is able to backtrack to match a previously matched +shorter string when the pursuit of a longer string fails. For this reason the +scanner construction operator is not a pure state machine construction +operator. It relies on several variables that enable it to backtrack and make +pointers to the matched input text available to the user. For this reason +scanners must be immediately instantiated. They cannot be defined inline or +referenced by another expression. Scanners must be jumped to or called. + +Scanners rely on the \verb|ts|, \verb|te| and \verb|act| +variables to be present so that they can backtrack and make pointers to the +matched text available to the user. If input is processed using multiple calls +to the execute code then the user must ensure that when a token is only +partially matched that the prefix is preserved on the subsequent invocation of +the execute code. + +The \verb|ts| variable must be defined as a pointer to the input data. +It is used for recording where the current token match begins. This variable +may be used in action code for retrieving the text of the current match. Ragel +ensures that in between tokens and outside of the longest-match machines that +this pointer is set to null. In between calls to the execute code the user must +check if \verb|ts| is set and if so, ensure that the data it points to is +preserved ahead of the next buffer block. This is described in more detail +below. + +The \verb|te| variable must also be defined as a pointer to the input data. +It is used for recording where a match ends and where scanning of the next +token should begin. This can also be used in action code for retrieving the +text of the current match. + +The \verb|act| variable must be defined as an integer type. It is used for +recording the identity of the last pattern matched when the scanner must go +past a matched pattern in an attempt to make a longer match. 
If the longer +match fails it may need to consult the \verb|act| variable. In some cases, use +of the \verb|act| +variable can be avoided because the value of the current state is enough +information to determine which token to accept, however in other cases this is +not enough and so the \verb|act| variable is used. + +When the longest-match operator is in use, the user's driver code must take on +some buffer management functions. The following algorithm gives an overview of +the steps that should be taken to properly use the longest-match operator. + +\begin{itemize} +\item Read a block of input data. +\item Run the execute code. +\item If \verb|ts| is set, the execute code will expect the incomplete +token to be preserved ahead of the buffer on the next invocation of the execute +code. +\begin{itemize} +\item Shift the data beginning at \verb|ts| and ending at \verb|pe| to the +beginning of the input buffer. +\item Reset \verb|ts| to the beginning of the buffer. +\item Shift \verb|te| by the distance from the old value of \verb|ts| +to the new value. The \verb|te| variable may or may not be valid. There is +no way to know if it holds a meaningful value because it is not kept at null +when it is not in use. It can be shifted regardless. +\end{itemize} +\item Read another block of data into the buffer, immediately following any +preserved data. +\item Run the scanner on the new data. +\end{itemize} + +Figure \ref{preserve_example} shows the required handling of an input stream in +which a token is broken by the input block boundaries. After processing up to +and including the ``t'' of ``characters'', the prefix of the string token must be +retained and processing should resume at the ``e'' on the next iteration of +the execute code. + +If one uses a large input buffer for collecting input then the number of times +the shifting must be done will be small. 
Furthermore, if one takes care not to +define tokens that are allowed to be very long and instead processes these +items using pure state machines or sub-scanners, then only a small amount of +data will ever need to be shifted. + +\begin{figure} +\small +\begin{verbatim} + a) A stream "of characters" to be scanned. + | | | + p ts pe + + b) "of characters" to be scanned. + | | | + ts p pe +\end{verbatim} +\verbspace +\caption{Following an invocation of the execute code there may be a partially +matched token (a). The data of the partially matched token +must be preserved ahead of the new data on the next invocation (b). +} +\label{preserve_example} +\end{figure} + +Since scanners attempt to make the longest possible match of input, patterns +such as identifiers require one character of lookahead in order to trigger a +match. In the case of the last token in the input stream the user must ensure +that the \verb|eof| variable is set so that the final token is flushed out. + +An example scanner processing loop is given in Figure \ref{scanner-loop}. + +\begin{figure} +\small +\begin{verbatim} + int have = 0; + bool done = false; + while ( !done ) { + /* How much space is in the buffer? */ + int space = BUFSIZE - have; + if ( space == 0 ) { + /* Buffer is full. */ + cerr << "TOKEN TOO BIG" << endl; + exit(1); + } + + /* Read in a block after any data we already have. */ + char *p = inbuf + have; + cin.read( p, space ); + int len = cin.gcount(); + + char *pe = p + len; + char *eof = 0; + + /* If no data was read indicate EOF. */ + if ( len == 0 ) { + eof = pe; + done = true; + } + + %% write exec; + + if ( cs == Scanner_error ) { + /* Machine failed before finding a token. */ + cerr << "PARSE ERROR" << endl; + exit(1); + } + + if ( ts == 0 ) + have = 0; + else { + /* There is a prefix to preserve, shift it over. 
*/
+ have = pe - ts;
+ memmove( inbuf, ts, have );
+ te = inbuf + (te-ts);
+ ts = inbuf;
+ }
+ }
+\end{verbatim}
+\verbspace
+\caption{A processing loop for a scanner.
+}
+\label{scanner-loop}
+\end{figure}
+
+\section{State Charts}
+\label{state-charts}
+
+In addition to supporting the construction of state machines using regular
+languages, Ragel provides a way to manually specify state machines using
+state charts. The comma operator combines machines together without any
+implied transitions. The user can then manually link machines by specifying
+epsilon transitions with the \verb|->| operator. Epsilon transitions are drawn
+between the final states of a machine and entry points defined by labels. This
+makes it possible to build machines using the explicit state-chart method while
+making minimal changes to the Ragel language.
+
+An interesting feature of Ragel's state chart construction method is that it
+can be mixed freely with regular expression constructions. A state chart may be
+referenced from within a regular expression, or a regular expression may be
+used in the definition of a state chart transition.
+
+\subsection{Join}
+
+\verb|expr , expr , ...|
+
+Join a list of machines together without
+drawing any transitions, without setting up a start state, and without
+designating any final states. Transitions between the machines may be specified
+using labels and epsilon transitions. The start state must be explicitly
+specified with the ``start'' label. Final states may be specified with an
+epsilon transition to the implicitly created ``final'' state. The join
+operation allows one to build machines using a state chart model.
+
+\subsection{Label}
+
+\verb|label: expr|
+
+Attaches a label to an expression. Labels can be
+used as the target of epsilon transitions and explicit control transfer
+statements such as \verb|fgoto| and \verb|fnext| in action
+code. 
+ +\subsection{Epsilon} + +\verb|expr -> label| + +Draws an epsilon transition to the state defined +by \verb|label|. Epsilon transitions are made deterministic when join +operators are evaluated. Epsilon transitions that are not in a join operation +are made deterministic when the machine definition that contains the epsilon is +complete. See Section \ref{labels} for information on referencing labels. + +\subsection{Simplifying State Charts} + +There are two benefits to providing state charts in Ragel. The first is that it +allows us to take a state chart with a full listing of states and transitions +and simplify it in selective places using regular expressions. + +The state chart method of specifying parsers is very common. It is an +effective programming technique for producing robust code. The key disadvantage +becomes clear when one attempts to comprehend a large parser specified in this +way. These programs usually require many lines, causing logic to be spread out +over large distances in the source file. Remembering the function of a large +number of states can be difficult and organizing the parser in a sensible way +requires discipline because branches and repetition present many file layout +options. This kind of programming takes a specification with inherent +structure such as looping, alternation and concatenation and expresses it in a +flat form. + +If we could take an isolated component of a manually programmed state chart, +that is, a subset of states that has only one entry point, and implement it +using regular language operators then we could eliminate all the explicit +naming of the states contained in it. By eliminating explicitly named states +and replacing them with higher-level specifications we simplify a state machine +specification. + +For example, sometimes chains of states are needed, with only a small number of +possible characters appearing along the chain. These can easily be replaced +with a concatenation of characters. 
Sometimes a group of common states +implement a loop back to another single portion of the machine. Rather than +manually duplicate all the transitions that loop back, we may be able to +express the loop using a kleene star operator. + +Ragel allows one to take this state map simplification approach. We can build +state machines using a state map model and implement portions of the state map +using regular languages. In place of any transition in the state machine, +entire sub-machines can be given. These can encapsulate functionality +defined elsewhere. An important aspect of the Ragel approach is that when we +wrap up a collection of states using a regular expression we do not lose +access to the states and transitions. We can still execute code on the +transitions that we have encapsulated. + +\subsection{Dropping Down One Level of Abstraction} +\label{down} + +The second benefit of incorporating state charts into Ragel is that it permits +us to bypass the regular language abstraction if we need to. Ragel's action +embedding operators are sometimes insufficient for expressing certain parsing +tasks. In the same way that is useful for C language programmers to drop down +to assembly language programming using embedded assembler, it is sometimes +useful for the Ragel programmer to drop down to programming with state charts. + +In the following example, we wish to buffer the characters of an XML CDATA +sequence. The sequence is terminated by the string \verb|]]>|. The challenge +in our application is that we do not wish the terminating characters to be +buffered. An expression of the form \verb|any* @buffer :>> ']]>'| will not work +because the buffer will always contain the characters \verb|]]| on the end. +Instead, what we need is to delay the buffering of \verb|]| +characters until a time when we +abandon the terminating sequence and go back into the main loop. 
There is no +easy way to express this using Ragel's regular expression and action embedding +operators, and so an ability to drop down to the state chart method is useful. + +% GENERATE: dropdown +% OPT: -p +% %%{ +% machine dropdown; +\begin{inline_code} +\begin{verbatim} +action bchar { buff( fpc ); } # Buffer the current character. +action bbrack1 { buff( "]" ); } +action bbrack2 { buff( "]]" ); } + +CDATA_body = +start: ( + ']' -> one | + (any-']') @bchar ->start +), +one: ( + ']' -> two | + [^\]] @bbrack1 @bchar ->start +), +two: ( + '>' -> final | + ']' @bbrack1 -> two | + [^>\]] @bbrack2 @bchar ->start +); +\end{verbatim} +\end{inline_code} +\verbspace +% main := CDATA_body; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{dropdown} +\end{center} +\graphspace + + +\section{Semantic Conditions} +\label{semantic} + +Many communication protocols contain variable-length fields, where the length +of the field is given ahead of the field as a value. This +problem cannot be expressed using regular languages because of its +context-dependent nature. The prevalence of variable-length fields in +communication protocols motivated us to introduce semantic conditions into +the Ragel language. + +A semantic condition is a block of user code that is interpreted as an +expression and evaluated immediately +before a transition is taken. If the code returns a value of true, the +transition may be taken. We can now embed code that extracts the length of a +field, then proceed to match $n$ data values. 
+ +% GENERATE: conds1 +% OPT: -p +% %%{ +% machine conds1; +% number = digit+; +\begin{inline_code} +\begin{verbatim} +action rec_num { i = 0; n = getnumber(); } +action test_len { i++ < n } +data_fields = ( + 'd' + [0-9]+ %rec_num + ':' + ( [a-z] when test_len )* +)**; +\end{verbatim} +\end{inline_code} +\verbspace +% main := data_fields; +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{conds1} +\end{center} +\graphspace + +The Ragel implementation of semantic conditions does not force us to give up the +compositional property of Ragel definitions. For example, a machine that tests +the length of a field using conditions can be unioned with another machine +that accepts some of the same strings, without the two machines interfering with +one another. The user need not be concerned about whether or not the result of the +semantic condition will affect the matching of the second machine. + +To see this, first consider that when a user associates a condition with an +existing transition, the transition's label is translated from the base character +to its corresponding value in the space that represents ``condition $c$ true''. Should +the determinization process combine a state that has a conditional transition +with another state that has a transition on the same input character but +without a condition, then the condition-less transition first has its label +translated into two values, one to its corresponding value in the space that +represents ``condition $c$ true'' and another to its corresponding value in the +space that represents ``condition $c$ false''. It +is then safe to combine the two transitions. This is shown in the following +example. Two intersecting patterns are unioned, one with a condition and one +without. The condition embedded in the first pattern does not affect the second +pattern. 
+ +% GENERATE: conds2 +% OPT: -p +% %%{ +% machine conds2; +% number = digit+; +\begin{inline_code} +\begin{verbatim} +action test_len { i++ < n } +action one { /* accept pattern one */ } +action two { /* accept pattern two */ } +patterns = + ( [a-z] when test_len )+ %one | + [a-z][a-z0-9]* %two; +main := patterns '\n'; +\end{verbatim} +\end{inline_code} +\verbspace +% }%% +% END GENERATE + +\graphspace +\begin{center} +\includegraphics[scale=0.55]{conds2} +\end{center} +\graphspace + +There are many more potential uses for semantic conditions. The user is free to +use arbitrary code and may therefore perform actions such as looking up names +in dictionaries, validating input using external parsing mechanisms or +performing checks on the semantic structure of input seen so far. In the next +section we describe how Ragel accommodates several common parser engineering +problems. + +The semantic condition feature works only with alphabet types that are smaller +in width than the \verb|long| type. To implement semantic conditions Ragel +needs to be able to allocate characters from the alphabet space. Ragel uses +these allocated characters to express "character C with condition P true" or "C +with P false." Since internally Ragel uses longs to store characters there is +no room left in the alphabet space unless an alphabet type smaller than long is +used. + +\section{Implementing Lookahead} + +There are a few strategies for implementing lookahead in Ragel programs. +Leaving actions, which are described in Section \ref{out-actions}, can be +used as a form of lookahead. Ragel also provides the \verb|fhold| directive +which can be used in actions to prevent the machine from advancing over the +current character. It is also possible to manually adjust the current character +position by shifting it backwards using \verb|fexec|, however when this is +done, care must be taken not to overstep the beginning of the current buffer +block. 
In both the use of \verb|fhold| and \verb|fexec| the user must be
+cautious of combining the resulting machine with another in such a way that the
+transition on which the current position is adjusted is not combined with a
+transition from the other machine.
+
+\section{Parsing Recursive Language Structures}
+
+In general Ragel cannot handle recursive structures because the grammar is
+interpreted as a regular language. However, depending on what needs to be
+parsed it is sometimes practical to implement the recursive parts using manual
+coding techniques. This often works in cases where the recursive structures are
+simple and easy to recognize, such as in the balancing of parentheses.
+
+One approach to parsing recursive structures is to use actions that increment
+and decrement counters or otherwise recognize the entry to and exit from
+recursive structures and then jump to the appropriate machine definition using
+\verb|fcall| and \verb|fret|. Alternatively, semantic conditions can be used to
+test counter variables.
+
+A more traditional approach is to call a separate parsing function (expressed
+in the host language) when a recursive structure is entered, then later return
+when the end is recognized. 
+ +\end{document} diff --git a/test/rhsref1.lm b/test/rhsref1.lm new file mode 100644 index 0000000..c905d2a --- /dev/null +++ b/test/rhsref1.lm @@ -0,0 +1,117 @@ +##### LM ##### +lex + literal `var `if `then `else `while `do `for `read `write + `end `to `goto + literal `:= `!= `; `+ `- `* `/ `= `( `) `: + + ignore /'//' [^\n]* '\n'/ + ignore /[\n\t ]+/ + token id /[a-zA-Z_]+/ + token integernumber /[0-9]+/ + token stringlit /'"' [^"]* '"'/ +end + +def program + [statement*] + +def statement + [declaration] +| [assignment_statement] +| [if_statement] +| [while_statement] +| [do_statement] +| [for_statement] +| [read_statement] +| [write_statement] +| [labelled_statement] +| [goto_statement] + +def declaration + [`var id `;] + +def assignment_statement + [id `:= expression `;] + +def if_statement + [`if expression `then statement* opt_else_statement `end] + +def opt_else_statement + [`else statement*] +| [] + +def while_statement + [`while expression `do statement* `end] + +def do_statement + [`do statement* `while expression `;] + +def for_statement + [`for id `:= expression `to expression `do statement* `end] + +def read_statement + [`read id `;] + +def write_statement + [`write expression `;] + +def expression + [Term: term] +| [expression eqop Term: term] + +def eqop [`=] | [`!=] + +def term + [Factor: factor] +| [term addop Factor: factor] + +def addop [`+] | [`-] + +def factor + [Primary: primary] +| [factor mulop Primary: primary] + +def mulop [`*] | [`/] + +def primary + [id] +| [lit] +| [`( expression `)] + +def lit + [integernumber] +| [stringlit] + +def labelled_statement + [id `: statement] + +def goto_statement + [`goto id `;] + +parse P: program[stdin] + +for E: expression in P { + print( ^(E.Term.Factor.Primary) '\n' ) +} + +##### IN ##### + +var a; +a := 1; + +head: + +a := a + 1; +c := d; + +if a = 10 then + goto head; +end + +hi := there; +##### EXP ##### +1 +1 +d +10 +a +there diff --git a/test/rubyhere.lm b/test/rubyhere.lm new file mode 100644 
index 0000000..836b18c --- /dev/null +++ b/test/rubyhere.lm @@ -0,0 +1,123 @@ +##### LM ##### +context rubyhere + rl ident_pattern /[a-zA-Z_][a-zA-Z_0-9]*/ + rl number_pattern /[0-9]+/ + + lex + ignore /[ \t\n]+/ + token id /ident_pattern/ + token number /number_pattern/ + literal `<< `* `, `( `) `! + end + + HereId: str + + token rest_of_line /[^\n]*'\n'/ + + lex + ignore /[ \t\n]+/ + token here_id + HereData: here_data + /ident_pattern/ + { + # Take the text of the here_id from the input stream. + HereId = input.pull( match_length ) + + # Get the data up to the rest of the line. + parse_stop ROL: rest_of_line(ctx)[ input ] + + # Parse the heredoc data. + parse_stop HereData: here_data(ctx)[ input ] + + # Push the rest-of-line data back to the input stream. + input.push( $ROL ) + + # Send the here_id token. Attach the heredoc data as an attribute. + input.push( make_token( typeid<here_id> HereId HereData ) ) + } + end + + lex + token here_close_id + / ident_pattern '\n' / + { + if match_text == HereId + '\n' { + input.push( make_token( + typeid<here_close_id> + input.pull( match_length ) ) ) + } + else + input.push( make_token( typeid<here_line> input.pull(match_length) ) ) + } + + token here_line + / [^\n]* '\n' / + end + + def here_data + [here_line* here_close_id] + + def heredoc + [`<< here_id] + + def primary + [id] + | [number] + | [heredoc] + + def arglist + [primary arglist_more*] + + def arglist_more + [`, primary] + + def call + [id `( arglist? `)] + + def statement + [primary] + | [call] + + token foobar /any+/ + + def item + [statement `!] + | [foobar] + + def start + [item*] +end # rubyhere + +cons RubyHere: rubyhere[] + +parse S: rubyhere::start(RubyHere)[ stdin ] + +print_xml(S) +print('\n') +##### IN ##### +print( <<DATA1, more, <<DATA2, 99 ) +"&^#(@ almost +!arbitrary text! +DATA1 +hello +world +DATA2 +! +print( <<DATA1, more, <<DATA2, 99 ) +"&^#(@ almost +!arbitrary text! 
+DATA1 +hello +world +DATA2 +# error here +##### EXP ##### +<rubyhere::start><rubyhere::_repeat_item><rubyhere::item><rubyhere::statement><rubyhere::call><rubyhere::id>print</rubyhere::id><rubyhere::_literal_000d>(</rubyhere::_literal_000d><rubyhere::_opt_arglist><rubyhere::arglist><rubyhere::primary><rubyhere::heredoc><rubyhere::_literal_0007><<</rubyhere::_literal_0007><rubyhere::here_id>DATA1</rubyhere::here_id></rubyhere::heredoc></rubyhere::primary><rubyhere::_repeat_arglist_more><rubyhere::arglist_more><rubyhere::_literal_000b>,</rubyhere::_literal_000b><rubyhere::primary><rubyhere::id>more</rubyhere::id></rubyhere::primary></rubyhere::arglist_more><rubyhere::arglist_more><rubyhere::_literal_000b>,</rubyhere::_literal_000b><rubyhere::primary><rubyhere::heredoc><rubyhere::_literal_0007><<</rubyhere::_literal_0007><rubyhere::here_id>DATA2</rubyhere::here_id></rubyhere::heredoc></rubyhere::primary></rubyhere::arglist_more><rubyhere::arglist_more><rubyhere::_literal_000b>,</rubyhere::_literal_000b><rubyhere::primary><rubyhere::number>99</rubyhere::number></rubyhere::primary></rubyhere::arglist_more></rubyhere::_repeat_arglist_more></rubyhere::arglist></rubyhere::_opt_arglist><rubyhere::_literal_000f>)</rubyhere::_literal_000f></rubyhere::call></rubyhere::statement><rubyhere::_literal_0011>!</rubyhere::_literal_0011></rubyhere::item><rubyhere::item><rubyhere::foobar>print( <<DATA1, more, <<DATA2, 99 ) +"&^#(@ almost +!arbitrary text! +DATA1 +hello +world +DATA2 +# error here +</rubyhere::foobar></rubyhere::item></rubyhere::_repeat_item></rubyhere::start> diff --git a/test/runtests.sh b/test/runtests.sh new file mode 100755 index 0000000..bed1401 --- /dev/null +++ b/test/runtests.sh @@ -0,0 +1,244 @@ +#!/bin/bash +# + +# Test cases contain sections giving the program, input and expected output. 
+ +###### LM ##### +# +# colm program +# +###### ARGS ##### +# +# program arguments +# +###### IN ##### +# +# program input +# +###### EXP ##### +# +# expected output +# +###### EXIT ###### +# +# expected exit value +# + +####################################### + +WORKING=working +COLM=../src/colm +ERRORS=0 + +cd `dirname $0` +test -d $WORKING || mkdir $WORKING + +function die() +{ + echo + echo "$@" + echo + exit 1 +} + +function sig_exit() +{ + echo + exit 1; +} + +# Parse args. +while getopts vdm opt; do + case $opt in + v) + verbose=true; + ;; + d) + diff=true; + ;; + m) + VALGRIND="valgrind --leak-check=full --show-reachable=yes " + ;; + esac +done +shift $(($OPTIND - 1)) + +# The files to process. If none given then glob all functions and pcap test confs. +if [ $# != 0 ]; then + TEST_PAT="$*" +else + TEST_PAT='*.lm' +fi + +function cat_section +{ + local section=$1 + local nth=$2 + local in=$3 + + # Print Nth instance of the section + awk -vsection=$section -vnth=$nth ' + /#+ *[a-zA-Z]+ *#+/ { + gsub( "[ #\n]", "", $0 ); + in_section = 0 + if ( $0 == section ) { + if ( n == nth ) { + in_section = 1; + found = 1; + } + n += 1 + } + next; + } + + in_section { + print $0; + } + + END { + exit( found ? 0 : 1 ) + } + ' $in | awk ' + /--noeol$/ { + gsub(/--noeol$/,""); + printf("%s", $0); + next; + } + { print $0 } + ' + return ${PIPESTATUS[0]}; +} + +function section +{ + local section=$1 + local nth=$2 + local in=$3 + local out=$4 + + cat_section $section $nth $in > $out + + # Remove the file if no section was found + [ $? = 0 ] || rm $out +} + +function runtests() +{ + for TST in $TEST_PAT; do + ROOT=${TST/.lm} + LM=$WORKING/$ROOT.lm + ARGS=$WORKING/$ROOT.args + IN=$WORKING/$ROOT.in + EXP=$WORKING/$ROOT.exp + + section LM 0 $TST $LM + + BIN=$WORKING/$ROOT + OUT=$WORKING/$ROOT.out + DIFF=$WORKING/$ROOT.diff + LOG=$WORKING/$ROOT.log + + if [ '!' 
-f $LM ]; then + echo "ERROR: $TST cannot be run: no LM section" + ERRORS=$(( ERRORS + 1 )) + continue + fi + + # Compilation. + $COLM $LM &> $LOG + if [ $? != 0 ]; then + echo "ERROR: $TST cannot be run: compilation error" + ERRORS=$(( ERRORS + 1 )) + continue + fi + + Nth=0 + while true; do + section EXP $Nth $TST $EXP + + # Stop when we have no Nth expected output. + if [ '!' -f $EXP ]; then + break; + fi + + section ARGS $Nth $TST $ARGS + section IN $Nth $TST $IN + EXIT=`cat_section EXIT $Nth $TST` + if [ -z "$EXIT" ]; then + EXIT=0 + fi + + cmdargs="" + if [ -f $ARGS ]; then + cmdargs=`cat $ARGS` + fi + + echo -n "running test $TST ($Nth)... " + + if [ "$verbose" = true ]; then + echo + echo $COLM $TST + fi + + if [ '!' -f $IN ] && [ -f $ROOT.in ]; then + IN=$ROOT.in; + fi + + if [ "$verbose" = true ]; then + if [ -f $IN ]; then + echo "${VALGRIND}./$BIN $cmdargs < $IN > $OUT 2>> $LOG" + else + echo "${VALGRIND}./$BIN $cmdargs > $OUT 2>>$LOG" + fi + fi + + # Execution + if [ -f $IN ]; then + ${VALGRIND}./$BIN $cmdargs < $IN > $OUT 2>> $LOG + else + ${VALGRIND}./$BIN $cmdargs > $OUT 2>>$LOG + fi + + e=$? + if [ $e != "$EXIT" ]; then + echo "FAILED: exit value error: got: $e expected: $EXIT" + ERRORS=$(( ERRORS + 1 )) + Nth=$((Nth + 1)) + continue + fi + + + # Diff of output + diff -u $EXP $OUT > $DIFF + if [ $? 
!= 0 ]; then + echo "FAILED: output differs from expected output" + ERRORS=$(( ERRORS + 1 )) + Nth=$((Nth + 1)) + if [ "$diff" = true ]; then + echo + cat $DIFF + echo + fi + continue + fi + + echo ok + Nth=$((Nth + 1)) + done + done + + if [ $ERRORS != 0 ]; then + [ $ERRORS != 1 ] && plural="s"; + echo + echo "TESTING FAILED: $ERRORS failure$plural" + echo + EXIT=1 + fi +} + +[ -d $workingdir ] || mkdir $workingdir + +runtests; + +exit $EXIT; + diff --git a/test/scope1.lm b/test/scope1.lm new file mode 100644 index 0000000..e0886d3 --- /dev/null +++ b/test/scope1.lm @@ -0,0 +1,36 @@ +##### LM ##### +int f() +{ + i: int = 0 + j: int = 100 + + while i < 4 { + j: int = 200 + if ( i < 1 ) { + j: int = 300 + print( "i: [$i] j: [$j]\n" ) + } + elsif ( i < 2 ) { + j: int = 300 + print( "i: [$i] j: [$j]\n" ) + } + elsif ( i < 3 ) { + print( "i: [$i] j: [$j]\n" ) + } + else { + print( "i: [$i] j: [$j]\n" ) + } + + i = i + 1 + } + + print( "j: [$j]\n" ) +} + +f() +##### EXP ##### +i: 0 j: 300 +i: 1 j: 300 +i: 2 j: 200 +i: 3 j: 200 +j: 100 diff --git a/test/sprintf.lm b/test/sprintf.lm new file mode 100644 index 0000000..b2a65fa --- /dev/null +++ b/test/sprintf.lm @@ -0,0 +1,4 @@ +##### LM ##### +print( sprintf( "%08x\n" (256 + 11 * 16) ) ) +##### EXP ##### +000001b0 diff --git a/test/string.lm b/test/string.lm new file mode 100644 index 0000000..ea41cb2 --- /dev/null +++ b/test/string.lm @@ -0,0 +1,60 @@ +##### LM ##### +lex + token str_escape /'\\' any/ + token str_chr /[^\\"]+/ +end + +def str_item + [str_escape] +| [str_chr] + +def string + [`" str_item* `"] + +lex + token ident /[a-zA-Z_]+/ + token number /[0-9]+/ + + literal `+ `* `; `" `' `( `) + literal `+= `-= `*= + + ignore wp /[ \t\n]+/ +end + +def expr + [expr `+ term] +| [term] + +def term + [term `* primary] +| [primary] + +def primary + [number] +| [ident] +| [string] +| [`( expr `)] + +def expr_list + [expr_list expr `;] +| [] + +def start + [expr_list] + { + if match lhs + ~a + "%{{"; 1 * 2; + { + print( 
'yes\n' ) + } + } + +parse S: start[stdin] +print_xml( S ) +print( '\n' ) +##### IN ##### +a + "%{{"; 1 * 2; + +##### EXP ##### +yes +<start><expr_list><expr_list><expr_list></expr_list><expr><expr><term><primary><ident>a</ident></primary></term></expr><_literal_0009>+</_literal_0009><term><primary><string><_literal_000f>"</_literal_000f><_repeat_str_item><str_item><str_chr>%{{</str_chr></str_item></_repeat_str_item><_literal_000f>"</_literal_000f></string></primary></term></expr><_literal_000d>;</_literal_000d></expr_list><expr><term><term><primary><number>1</number></primary></term><_literal_000b>*</_literal_000b><primary><number>2</number></primary></term></expr><_literal_000d>;</_literal_000d></expr_list></start> diff --git a/test/superid.lm b/test/superid.lm new file mode 100644 index 0000000..eb19020 --- /dev/null +++ b/test/superid.lm @@ -0,0 +1,76 @@ +##### LM ##### +context si + lex + literal `! `a + + token SEMI_NL /';\n'/ + + token id /'a'|'b'/ + { + input.push( make_token( trans_id_to input.pull(match_length) ) ) + } + + token super_id // + token foo // + + ignore ws / [ \n\t]+ / + end + + trans_id_to: int + + def e1 + [] + { + print( 'old_id = ' trans_id_to '\n' ) + trans_id_to = typeid<foo> + print( 'new_id = ' trans_id_to '\n' ) + } + + def item1 + msg: str + + [ e1 `! `a super_id super_id `a] + { + lhs.msg = 'this is item1\n' + } + + def e2 + [] + { + print( 'old_id = ' trans_id_to '\n' ) + trans_id_to = typeid<super_id> + print( 'new_id = ' trans_id_to '\n' ) + } + + def item2 + msg: str + + [ e2 `! 
`a super_id super_id `a] + { + lhs.msg = 'this is item2\n' + } + + + def start + [item1 SEMI_NL] + | [item2 SEMI_NL] + { + match lhs [Item2:item2 ';\n'] + print( Item2.msg ) + } +end # si + +cons SuperId: si[] +parse S: si::start(SuperId)[stdin] +print_xml( S ) +print( '\n' ) +##### IN ##### +!a b b a; +##### EXP ##### +old_id = NIL +new_id = 13 +old_id = NIL +new_id = 12 +this is item2 +<si::start><si::item2><si::e2></si::e2><si::_literal_0001>!</si::_literal_0001><si::_literal_0003>a</si::_literal_0003><si::super_id>b</si::super_id><si::super_id>b</si::super_id><si::_literal_0003>a</si::_literal_0003></si::item2><si::SEMI_NL>; +</si::SEMI_NL></si::start> diff --git a/test/tags1.lm b/test/tags1.lm new file mode 100644 index 0000000..ef17c46 --- /dev/null +++ b/test/tags1.lm @@ -0,0 +1,93 @@ +##### LM ##### +context tags + # Open and close tags by rewriting to generic close tags. Won't work if + # interested in unclosed tags because a token can start as not close_id, but + # then become a close id during the course of parsing. + + # + # Regular Definitions + # + rl rl_ws /[ \t\n\r\v]+/ + rl rl_id /[a-zA-Z_][a-zA-Z0-9_]*/ + + # + # Tokens + # + + # Any single character can be a literal + lex + token BANG_NL /'!\n'/ + token SEMI_NL /';\n'/ + + # Ignore whitespace. 
+ ignore /rl_ws/ + + # Open and close id + token id /rl_id/ + end + + # + # Global Data + # + + def tag_stack + [id tag_stack] + | [] + + TS: tag_stack + + # + # Productions + # + + def open_tag + [id] + { + match lhs [Id:id] + match TS [Top:id Rest:tag_stack] + if Id.data == Top.data { + reject + } else { + TS = construct tag_stack [Id TS] + } + } + + def close_tag + [id] + { + match lhs [Id: id] + match TS [Top: id Rest: tag_stack] + + if Id.data == Top.data + TS = construct tag_stack [Rest] + else + reject + } + + def tag + [open_tag tag* close_tag] + + def start + [tag* SEMI_NL] + { + print_xml( TS ) + print_xml( lhs ) + print( 'got structure\n' ) + } + + | [id* SEMI_NL] + { + print_xml( TS ) + print_xml( lhs ) + print( 'failed\n' ) + } +end # tags + +cons Tags: tags[] +Tags.TS = cons tags::tag_stack ["sentinal"] +parse tags::start(Tags)[stdin] +##### IN ##### +y y a i i b c c m m n n b a; +##### EXP ##### +<tags::tag_stack><tags::id>sentinal</tags::id><tags::tag_stack></tags::tag_stack></tags::tag_stack><tags::start><tags::_repeat_tag><tags::tag><tags::open_tag><tags::id>y</tags::id></tags::open_tag><tags::_repeat_tag></tags::_repeat_tag><tags::close_tag><tags::id>y</tags::id></tags::close_tag></tags::tag><tags::tag><tags::open_tag><tags::id>a</tags::id></tags::open_tag><tags::_repeat_tag><tags::tag><tags::open_tag><tags::id>i</tags::id></tags::open_tag><tags::_repeat_tag></tags::_repeat_tag><tags::close_tag><tags::id>i</tags::id></tags::close_tag></tags::tag><tags::tag><tags::open_tag><tags::id>b</tags::id></tags::open_tag><tags::_repeat_tag><tags::tag><tags::open_tag><tags::id>c</tags::id></tags::open_tag><tags::_repeat_tag></tags::_repeat_tag><tags::close_tag><tags::id>c</tags::id></tags::close_tag></tags::tag><tags::tag><tags::open_tag><tags::id>m</tags::id></tags::open_tag><tags::_repeat_tag></tags::_repeat_tag><tags::close_tag><tags::id>m</tags::id></tags::close_tag></tags::tag><tags::tag><tags::open_tag><tags::id>n</tags::id></tags::open_tag><tags::_repe
at_tag></tags::_repeat_tag><tags::close_tag><tags::id>n</tags::id></tags::close_tag></tags::tag></tags::_repeat_tag><tags::close_tag><tags::id>b</tags::id></tags::close_tag></tags::tag></tags::_repeat_tag><tags::close_tag><tags::id>a</tags::id></tags::close_tag></tags::tag></tags::_repeat_tag><tags::SEMI_NL>; +</tags::SEMI_NL></tags::start>got structure diff --git a/test/tags2.lm b/test/tags2.lm new file mode 100644 index 0000000..e83b113 --- /dev/null +++ b/test/tags2.lm @@ -0,0 +1,4183 @@ +##### LM ##### +# +# Definitions +# + +rl xml_digit / (0x30..0x39) / + +rl base_char / 0x41..0x5A | 0x61..0x7A / + +rl char / 0x9 | 0xA | 0xD | 0x20..0x7f / + +rl letter / base_char / + +rl name_char / letter | digit | '.' | '-' | '_' | ':' | 0xb7 / + +rl name / (letter | '_' | ':') name_char* / + +# +# Reference definitions. These appear in the +# top level and also in strings. +# + +rl entity_ref_pat / '&' name ';' / + +rl char_ref_pat / '&#' [0-9]+ ';' | '&0x' [0-9a-fA-F]+ ';' / + +# +# Single quotes. +# +lex + token sq_close /'\''/ + + # References in single quotes + token sq_entity_ref /entity_ref_pat/ + token sq_char_ref /char_ref_pat/ + + token sq_data / [^<&']+ / + + def sq_item + [ sq_data ] + | [ sq_entity_ref ] + | [ sq_char_ref ] + + # The opening quote belongs to the tag region. + def sq_string + [ `' sq_item* sq_close ] +end + +# +# Double quotes. +# +lex + token dq_close /'"'/ + + # References in double quotes + token dq_entity_ref /entity_ref_pat/ + token dq_char_ref /char_ref_pat/ + + token dq_data / [^<&"]+ / + + def dq_item + [ dq_data ] + | [ dq_entity_ref ] + | [ dq_char_ref ] + + # The opening quote belongs to the tag region. + def dq_string + [ `" dq_item* dq_close ] +end + +# +# Tag elements. +# +lex + literal `' `" `= `/ + + # Within this region whitespace is not significant. 
+ ignore xml_space / (0x20 | 0x9 | 0xD | 0xA)+ / + + # + # Attributes + # + token attr_name / name / +end + +literal `> + +# +# Top Level +# +lex + # + # Comments + # + + # Cannot contain '--' + rl char_no_dash / char - '-' / + token comment / '<!--' ( char_no_dash | '-' char_no_dash )* '-->' / + + + # Opening a tag. + literal `< + + # + # Character Data + # + + token cdata / '<![CDATA[' char* :> ']]>'/ + token char_data / [^<&]+ / + token entity_ref /entity_ref_pat/ + token char_ref /char_ref_pat/ +end + + +def attribute_value + [ sq_string ] +| [ dq_string ] + +def attribute + [ attr_name `= attribute_value ] + +def empty_tag + [ `< attr_name attribute* `/ `> ] + +def close_tag + [ `< `/ attr_name `> ] + +def open_tag + [ `< attr_name attribute* `> ] + +def tag + [open_tag content close_tag] + +def content_item + [tag] +| [empty_tag] +| [char_data] +| [entity_ref] +| [char_ref] +| [cdata] +| [comment] + +def content + [content_item*] + +def document + [content] + +def start + [document] + +parse S: start[stdin] + +for Switch:tag in S { + if match Switch + ["<lm_switch>" SwitchContent:content "</lm_switch>"] + { + print( 'SWITCH\n' ) + for Text:tag in SwitchContent { + if match Text + ["<text>" TextContent:content "</text>"] + { + print( ' ' TextContent '\n' ) + } + } + } +} +##### IN ##### +<ragel version="5.24" filename="../colm/lmscan.rl" lang="C"> +<ragel_def name="rlscan"> + <alphtype>char</alphtype> + <machine> + <action_list length="166"> + <action id="0" name="inc_nl" line="217" col="16"><text> + lastnl = p; + column = 0; + line++; + </text></action> + <action id="1" name="initts" line="1" col="1"><init_tokstart></init_tokstart></action> + <action id="2" name="tokstart" line="1" col="1"><set_tokstart></set_tokstart></action> + <action id="3" name="tokend" line="1" col="1"><set_tokend>1</set_tokend></action> + <action id="4" name="last1" line="238" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, '\0' ); </text></sub_action></action> + 
<action id="5" name="last2" line="239" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, '\a' ); </text></sub_action></action> + <action id="6" name="last3" line="240" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, '\b' ); </text></sub_action></action> + <action id="7" name="last4" line="241" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, '\t' ); </text></sub_action></action> + <action id="8" name="last5" line="242" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, '\n' ); </text></sub_action></action> + <action id="9" name="last6" line="243" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, '\v' ); </text></sub_action></action> + <action id="10" name="last7" line="244" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, '\f' ); </text></sub_action></action> + <action id="11" name="last8" line="245" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, '\r' ); </text></sub_action></action> + <action id="12" name="last9" line="246" col="13"><set_tokend>1</set_tokend><sub_action><text> updateCol(); </text></sub_action></action> + <action id="13" name="last10" line="247" col="15"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, tokstart+1, tokend ); </text></sub_action></action> + <action id="14" name="last11" line="250" col="10"><set_tokend>1</set_tokend><sub_action><text> token( RE_Dash, 0, 0 ); </text></sub_action></action> + <action id="15" name="last12" line="253" col="10"><set_tokend>1</set_tokend><sub_action><text> token( RE_SqClose ); </text><ret></ret><text> </text></sub_action></action> + <action id="16" name="last13" line="255" col="10"><set_tokend>1</set_tokend><sub_action><text> + scan_error() << "unterminated OR literal" << endl; + </text></sub_action></action> + <action id="17" name="last14" line="260" col="12"><set_tokend>1</set_tokend><sub_action><text> token( RE_Char, tokstart, tokend ); 
</text></sub_action></action> + <action id="18" name="store15" line="265" col="13"><set_act>15</set_act></action> + <action id="19" name="store16" line="266" col="12"><set_act>16</set_act></action> + <action id="20" name="store17" line="267" col="12"><set_act>17</set_act></action> + <action id="21" name="store18" line="268" col="13"><set_act>18</set_act></action> + <action id="22" name="store19" line="269" col="11"><set_act>19</set_act></action> + <action id="23" name="store20" line="270" col="13"><set_act>20</set_act></action> + <action id="24" name="store21" line="273" col="12"><set_act>21</set_act></action> + <action id="25" name="last24" line="281" col="7"><set_tokend>1</set_tokend><sub_action><text> token( TK_Literal, tokstart, tokend ); </text></sub_action></action> + <action id="26" name="last26" line="284" col="11"><set_tokend>1</set_tokend><sub_action><text> token( RE_SqOpenNeg ); </text><call>166</call><text> </text></sub_action></action> + <action id="27" name="last27" line="286" col="10"><set_tokend>1</set_tokend><sub_action><text> token( '/'); </text><ret></ret><text> </text></sub_action></action> + <action id="28" name="last28" line="289" col="20"><set_tokend>1</set_tokend><sub_action><text> updateCol(); </text></sub_action></action> + <action id="29" name="last29" line="291" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_ColonEquals ); </text></sub_action></action> + <action id="30" name="last30" line="294" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_StartToState ); </text></sub_action></action> + <action id="31" name="last31" line="295" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_AllToState ); </text></sub_action></action> + <action id="32" name="last32" line="296" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_FinalToState ); </text></sub_action></action> + <action id="33" name="last33" line="297" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotStartToState 
); </text></sub_action></action> + <action id="34" name="last34" line="298" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotFinalToState ); </text></sub_action></action> + <action id="35" name="last35" line="299" col="12"><set_tokend>1</set_tokend><sub_action><text> token( TK_MiddleToState ); </text></sub_action></action> + <action id="36" name="last36" line="302" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_StartFromState ); </text></sub_action></action> + <action id="37" name="last37" line="303" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_AllFromState ); </text></sub_action></action> + <action id="38" name="last38" line="304" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_FinalFromState ); </text></sub_action></action> + <action id="39" name="last39" line="305" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotStartFromState ); </text></sub_action></action> + <action id="40" name="last40" line="306" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotFinalFromState ); </text></sub_action></action> + <action id="41" name="last41" line="307" col="12"><set_tokend>1</set_tokend><sub_action><text> token( TK_MiddleFromState ); </text></sub_action></action> + <action id="42" name="last42" line="310" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_StartEOF ); </text></sub_action></action> + <action id="43" name="last43" line="311" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_AllEOF ); </text></sub_action></action> + <action id="44" name="last44" line="312" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_FinalEOF ); </text></sub_action></action> + <action id="45" name="last45" line="313" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotStartEOF ); </text></sub_action></action> + <action id="46" name="last46" line="314" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotFinalEOF ); 
</text></sub_action></action> + <action id="47" name="last47" line="315" col="12"><set_tokend>1</set_tokend><sub_action><text> token( TK_MiddleEOF ); </text></sub_action></action> + <action id="48" name="last48" line="318" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_StartGblError ); </text></sub_action></action> + <action id="49" name="last49" line="319" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_AllGblError ); </text></sub_action></action> + <action id="50" name="last50" line="320" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_FinalGblError ); </text></sub_action></action> + <action id="51" name="last51" line="321" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotStartGblError ); </text></sub_action></action> + <action id="52" name="last52" line="322" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotFinalGblError ); </text></sub_action></action> + <action id="53" name="last53" line="323" col="12"><set_tokend>1</set_tokend><sub_action><text> token( TK_MiddleGblError ); </text></sub_action></action> + <action id="54" name="last54" line="326" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_StartLocalError ); </text></sub_action></action> + <action id="55" name="last55" line="327" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_AllLocalError ); </text></sub_action></action> + <action id="56" name="last56" line="328" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_FinalLocalError ); </text></sub_action></action> + <action id="57" name="last57" line="329" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotStartLocalError ); </text></sub_action></action> + <action id="58" name="last58" line="330" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotFinalLocalError ); </text></sub_action></action> + <action id="59" name="last59" line="331" col="12"><set_tokend>1</set_tokend><sub_action><text> token( 
TK_MiddleLocalError ); </text></sub_action></action> + <action id="60" name="last61" line="337" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_StartCond ); </text></sub_action></action> + <action id="61" name="last62" line="338" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_AllCond ); </text></sub_action></action> + <action id="62" name="last63" line="339" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_LeavingCond ); </text></sub_action></action> + <action id="63" name="last64" line="341" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_DotDot ); </text></sub_action></action> + <action id="64" name="last65" line="342" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_StarStar ); </text></sub_action></action> + <action id="65" name="last66" line="343" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_DashDash ); </text></sub_action></action> + <action id="66" name="last67" line="344" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_Arrow ); </text></sub_action></action> + <action id="67" name="last69" line="347" col="12"><set_tokend>1</set_tokend><sub_action><text> token( TK_ColonGtGt ); </text></sub_action></action> + <action id="68" name="last70" line="348" col="12"><set_tokend>1</set_tokend><sub_action><text> token( TK_LtColon ); </text></sub_action></action> + <action id="69" name="last72" line="354" col="9"><set_tokend>1</set_tokend><sub_action><text> updateCol(); </text></sub_action></action> + <action id="70" name="last73" line="357" col="6"><set_tokend>1</set_tokend></action> + <action id="71" name="last74" line="359" col="10"><set_tokend>1</set_tokend><sub_action><text> token( *tokstart ); </text></sub_action></action> + <action id="72" name="next21" line="273" col="12"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( TK_Word, tokstart, tokend ); </text></sub_action></action> + <action id="73" name="next22" line="276" 
col="13"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( TK_UInt, tokstart, tokend ); </text></sub_action></action> + <action id="74" name="next23" line="277" col="17"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( TK_Hex, tokstart, tokend ); </text></sub_action></action> + <action id="75" name="next24" line="281" col="7"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( TK_Literal, tokstart, tokend ); </text></sub_action></action> + <action id="76" name="next25" line="283" col="10"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( RE_SqOpen ); </text><call>166</call><text> </text></sub_action></action> + <action id="77" name="next60" line="334" col="11"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( TK_Middle ); </text></sub_action></action> + <action id="78" name="next68" line="346" col="12"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( TK_ColonGt ); </text></sub_action></action> + <action id="79" name="next71" line="351" col="15"><set_tokend>0</set_tokend><hold></hold><sub_action><text> updateCol(); </text></sub_action></action> + <action id="80" name="next74" line="359" col="10"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( *tokstart ); </text></sub_action></action> + <action id="81" name="lag22" line="276" col="13"><exec><get_tokend></get_tokend></exec><sub_action><text> token( TK_UInt, tokstart, tokend ); </text></sub_action></action> + <action id="82" name="switch" line="1" col="1"><lm_switch> + <sub_action id="15"><exec><get_tokend></get_tokend></exec><text> token( KW_When ); </text></sub_action> + <sub_action id="16"><exec><get_tokend></get_tokend></exec><text> token( KW_Eof ); </text></sub_action> + <sub_action id="17"><exec><get_tokend></get_tokend></exec><text> token( KW_Err ); </text></sub_action> + <sub_action id="18"><exec><get_tokend></get_tokend></exec><text> token( KW_Lerr ); </text></sub_action> + <sub_action 
id="19"><exec><get_tokend></get_tokend></exec><text> token( KW_To ); </text></sub_action> + <sub_action id="20"><exec><get_tokend></get_tokend></exec><text> token( KW_From ); </text></sub_action> + <sub_action id="21"><exec><get_tokend></get_tokend></exec><text> token( TK_Word, tokstart, tokend ); </text></sub_action> + </lm_switch></action> + <action id="83" name="last75" line="363" col="12"><set_tokend>1</set_tokend><sub_action><text> litBuf.append( '\a' ); </text></sub_action></action> + <action id="84" name="last76" line="364" col="12"><set_tokend>1</set_tokend><sub_action><text> litBuf.append( '\b' ); </text></sub_action></action> + <action id="85" name="last77" line="365" col="12"><set_tokend>1</set_tokend><sub_action><text> litBuf.append( '\t' ); </text></sub_action></action> + <action id="86" name="last78" line="366" col="12"><set_tokend>1</set_tokend><sub_action><text> litBuf.append( '\n' ); </text></sub_action></action> + <action id="87" name="last79" line="367" col="12"><set_tokend>1</set_tokend><sub_action><text> litBuf.append( '\v' ); </text></sub_action></action> + <action id="88" name="last80" line="368" col="12"><set_tokend>1</set_tokend><sub_action><text> litBuf.append( '\f' ); </text></sub_action></action> + <action id="89" name="last81" line="369" col="12"><set_tokend>1</set_tokend><sub_action><text> litBuf.append( '\r' ); </text></sub_action></action> + <action id="90" name="last82" line="371" col="12"><set_tokend>1</set_tokend><sub_action><text> + litBuf.append( tokstart[1] ); + </text></sub_action></action> + <action id="91" name="last83" line="374" col="10"><set_tokend>1</set_tokend><sub_action><text> + if ( litBuf.length > 0 ) { + token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length ); + litBuf.clear(); + } + token( '"' ); + </text><ret></ret><text> + </text></sub_action></action> + <action id="92" name="last84" line="382" col="9"><set_tokend>1</set_tokend><sub_action><text> + if ( litBuf.length > 0 ) { + litBuf.append( '\n' ); + token( 
TK_LitPat, litBuf.data, litBuf.data+litBuf.length ); + litBuf.clear(); + } + token( '"' ); + </text><ret></ret><text> + </text></sub_action></action> + <action id="93" name="last85" line="391" col="10"><set_tokend>1</set_tokend><sub_action><text> + if ( litBuf.length > 0 ) { + token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length ); + litBuf.clear(); + } + token( '[' ); + </text><call>10</call><text> + </text></sub_action></action> + <action id="94" name="last86" line="399" col="10"><set_tokend>1</set_tokend><sub_action><text> + litBuf.append( *tokstart ); + </text></sub_action></action> + <action id="95" name="store87" line="406" col="12"><set_act>87</set_act></action> + <action id="96" name="store88" line="407" col="15"><set_act>88</set_act></action> + <action id="97" name="store89" line="408" col="17"><set_act>89</set_act></action> + <action id="98" name="store90" line="409" col="15"><set_act>90</set_act></action> + <action id="99" name="store91" line="410" col="13"><set_act>91</set_act></action> + <action id="100" name="store92" line="411" col="14"><set_act>92</set_act></action> + <action id="101" name="store93" line="412" col="18"><set_act>93</set_act></action> + <action id="102" name="store94" line="413" col="14"><set_act>94</set_act></action> + <action id="103" name="store95" line="414" col="16"><set_act>95</set_act></action> + <action id="104" name="store96" line="415" col="16"><set_act>96</set_act></action> + <action id="105" name="store97" line="416" col="13"><set_act>97</set_act></action> + <action id="106" name="store98" line="417" col="15"><set_act>98</set_act></action> + <action id="107" name="store99" line="418" col="16"><set_act>99</set_act></action> + <action id="108" name="store101" line="420" col="14"><set_act>101</set_act></action> + <action id="109" name="store102" line="421" col="12"><set_act>102</set_act></action> + <action id="110" name="store103" line="422" col="12"><set_act>103</set_act></action> + <action id="111" name="store104" 
line="424" col="11"><set_act>104</set_act></action> + <action id="112" name="store105" line="425" col="12"><set_act>105</set_act></action> + <action id="113" name="store106" line="426" col="15"><set_act>106</set_act></action> + <action id="114" name="store107" line="427" col="12"><set_act>107</set_act></action> + <action id="115" name="store108" line="428" col="16"><set_act>108</set_act></action> + <action id="116" name="store109" line="429" col="18"><set_act>109</set_act></action> + <action id="117" name="store110" line="430" col="12"><set_act>110</set_act></action> + <action id="118" name="store112" line="432" col="16"><set_act>112</set_act></action> + <action id="119" name="store113" line="433" col="17"><set_act>113</set_act></action> + <action id="120" name="store114" line="434" col="11"><set_act>114</set_act></action> + <action id="121" name="store115" line="435" col="13"><set_act>115</set_act></action> + <action id="122" name="store116" line="436" col="15"><set_act>116</set_act></action> + <action id="123" name="store117" line="437" col="14"><set_act>117</set_act></action> + <action id="124" name="store118" line="438" col="13"><set_act>118</set_act></action> + <action id="125" name="store119" line="439" col="18"><set_act>119</set_act></action> + <action id="126" name="store120" line="440" col="13"><set_act>120</set_act></action> + <action id="127" name="store121" line="441" col="14"><set_act>121</set_act></action> + <action id="128" name="store122" line="442" col="12"><set_act>122</set_act></action> + <action id="129" name="store123" line="443" col="13"><set_act>123</set_act></action> + <action id="130" name="store124" line="444" col="13"><set_act>124</set_act></action> + <action id="131" name="store125" line="445" col="13"><set_act>125</set_act></action> + <action id="132" name="store126" line="446" col="18"><set_act>126</set_act></action> + <action id="133" name="store127" line="447" col="13"><set_act>127</set_act></action> + <action id="134" 
name="store128" line="448" col="11"><set_act>128</set_act></action> + <action id="135" name="store129" line="449" col="18"><set_act>129</set_act></action> + <action id="136" name="store130" line="450" col="16"><set_act>130</set_act></action> + <action id="137" name="store131" line="453" col="12"><set_act>131</set_act></action> + <action id="138" name="last133" line="457" col="10"><set_tokend>1</set_tokend><sub_action><text> + token( '/' ); + </text><call>168</call><text> + </text></sub_action></action> + <action id="139" name="last134" line="462" col="20"><set_tokend>1</set_tokend><sub_action><text> + token( '"' ); + token( TK_LitPat, tokstart+1, tokend ); + token( '"' ); + </text></sub_action></action> + <action id="140" name="last135" line="468" col="16"><set_tokend>1</set_tokend><sub_action><text> + token( TK_Literal, tokstart, tokend ); + </text></sub_action></action> + <action id="141" name="last136" line="472" col="10"><set_tokend>1</set_tokend><sub_action><text> + token( '"' ); + litBuf.clear(); + </text><call>203</call><text> + </text></sub_action></action> + <action id="142" name="last137" line="477" col="10"><set_tokend>1</set_tokend><sub_action><text> + token( '[' ); + </text><call>10</call><text> + </text></sub_action></action> + <action id="143" name="last138" line="482" col="10"><set_tokend>1</set_tokend><sub_action><text> + token( ']' ); + if ( top > 0 ) + </text><ret></ret><text> + </text></sub_action></action> + <action id="144" name="last139" line="489" col="20"><set_tokend>1</set_tokend><sub_action><text> updateCol(); </text></sub_action></action> + <action id="145" name="last140" line="491" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_ColonEquals ); </text></sub_action></action> + <action id="146" name="last141" line="492" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_DoubleArrow ); </text></sub_action></action> + <action id="147" name="last142" line="493" col="11"><set_tokend>1</set_tokend><sub_action><text> 
token( TK_DoubleEquals ); </text></sub_action></action> + <action id="148" name="last143" line="494" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_NotEquals ); </text></sub_action></action> + <action id="149" name="last144" line="495" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_DoubleColon ); </text></sub_action></action> + <action id="150" name="last145" line="496" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_LessEquals ); </text></sub_action></action> + <action id="151" name="last146" line="497" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_GreaterEquals ); </text></sub_action></action> + <action id="152" name="last147" line="498" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_LeftArrow ); </text></sub_action></action> + <action id="153" name="last148" line="499" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_AmpAmp ); </text></sub_action></action> + <action id="154" name="last149" line="500" col="11"><set_tokend>1</set_tokend><sub_action><text> token( TK_BarBar ); </text></sub_action></action> + <action id="155" name="last150" line="502" col="43"><set_tokend>1</set_tokend><sub_action><text> token( *tokstart ); </text></sub_action></action> + <action id="156" name="last152" line="509" col="9"><set_tokend>1</set_tokend><sub_action><text> updateCol(); </text></sub_action></action> + <action id="157" name="last153" line="512" col="6"><set_tokend>1</set_tokend></action> + <action id="158" name="last154" line="514" col="10"><set_tokend>1</set_tokend><sub_action><text> token( *tokstart ); </text></sub_action></action> + <action id="159" name="next100" line="419" col="12"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( KW_Pri ); </text></sub_action></action> + <action id="160" name="next111" line="431" col="14"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( KW_Print ); </text></sub_action></action> + <action id="161" 
name="next131" line="453" col="12"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( TK_Word, tokstart, tokend ); </text></sub_action></action> + <action id="162" name="next132" line="455" col="13"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( TK_Number, tokstart, tokend ); </text></sub_action></action> + <action id="163" name="next151" line="506" col="15"><set_tokend>0</set_tokend><hold></hold><sub_action><text> updateCol(); </text></sub_action></action> + <action id="164" name="next154" line="514" col="10"><set_tokend>0</set_tokend><hold></hold><sub_action><text> token( *tokstart ); </text></sub_action></action> + <action id="165" name="switch" line="1" col="1"><lm_switch> + <sub_action id="87"><exec><get_tokend></get_tokend></exec><text> token( KW_Lex ); </text></sub_action> + <sub_action id="88"><exec><get_tokend></get_tokend></exec><text> token( KW_Action ); </text></sub_action> + <sub_action id="89"><exec><get_tokend></get_tokend></exec><text> token( KW_AlphType ); </text></sub_action> + <sub_action id="90"><exec><get_tokend></get_tokend></exec><text> token( KW_Commit ); </text></sub_action> + <sub_action id="91"><exec><get_tokend></get_tokend></exec><text> token( KW_Undo ); </text></sub_action> + <sub_action id="92"><exec><get_tokend></get_tokend></exec><text> token( KW_Final ); </text></sub_action> + <sub_action id="93"><exec><get_tokend></get_tokend></exec><text> token( KW_Translate ); </text></sub_action> + <sub_action id="94"><exec><get_tokend></get_tokend></exec><text> token( KW_Token ); </text></sub_action> + <sub_action id="95"><exec><get_tokend></get_tokend></exec><text> token( KW_Literal ); </text></sub_action> + <sub_action id="96"><exec><get_tokend></get_tokend></exec><text> token( KW_NonTerm ); </text></sub_action> + <sub_action id="97"><exec><get_tokend></get_tokend></exec><text> token( KW_Uses ); </text></sub_action> + <sub_action id="98"><exec><get_tokend></get_tokend></exec><text> token( KW_Parser ); 
</text></sub_action> + <sub_action id="99"><exec><get_tokend></get_tokend></exec><text> token( KW_Include ); </text></sub_action> + <sub_action id="101"><exec><get_tokend></get_tokend></exec><text> token( KW_Write ); </text></sub_action> + <sub_action id="102"><exec><get_tokend></get_tokend></exec><text> token( KW_Nfa ); </text></sub_action> + <sub_action id="103"><exec><get_tokend></get_tokend></exec><text> token( KW_Pda ); </text></sub_action> + <sub_action id="104"><exec><get_tokend></get_tokend></exec><text> token( KW_Rl ); </text></sub_action> + <sub_action id="105"><exec><get_tokend></get_tokend></exec><text> token( KW_Cfl ); </text></sub_action> + <sub_action id="106"><exec><get_tokend></get_tokend></exec><text> token( KW_Ignore ); </text></sub_action> + <sub_action id="107"><exec><get_tokend></get_tokend></exec><text> token( KW_End ); </text></sub_action> + <sub_action id="108"><exec><get_tokend></get_tokend></exec><text> token( KW_Pattern ); </text></sub_action> + <sub_action id="109"><exec><get_tokend></get_tokend></exec><text> token( KW_Construct ); </text></sub_action> + <sub_action id="110"><exec><get_tokend></get_tokend></exec><text> token( KW_Red ); </text></sub_action> + <sub_action id="112"><exec><get_tokend></get_tokend></exec><text> token( KW_TypeId ); </text></sub_action> + <sub_action id="113"><exec><get_tokend></get_tokend></exec><text> token( KW_TypeDef ); </text></sub_action> + <sub_action id="114"><exec><get_tokend></get_tokend></exec><text> token( KW_If ); </text></sub_action> + <sub_action id="115"><exec><get_tokend></get_tokend></exec><text> token( KW_Init ); </text></sub_action> + <sub_action id="116"><exec><get_tokend></get_tokend></exec><text> token( KW_Reject ); </text></sub_action> + <sub_action id="117"><exec><get_tokend></get_tokend></exec><text> token( KW_While ); </text></sub_action> + <sub_action id="118"><exec><get_tokend></get_tokend></exec><text> token( KW_Else ); </text></sub_action> + <sub_action 
id="119"><exec><get_tokend></get_tokend></exec><text> token( KW_SubParser ); </text></sub_action> + <sub_action id="120"><exec><get_tokend></get_tokend></exec><text> token( KW_Next ); </text></sub_action> + <sub_action id="121"><exec><get_tokend></get_tokend></exec><text> token( KW_Match ); </text></sub_action> + <sub_action id="122"><exec><get_tokend></get_tokend></exec><text> token( KW_For ); </text></sub_action> + <sub_action id="123"><exec><get_tokend></get_tokend></exec><text> token( KW_Iter ); </text></sub_action> + <sub_action id="124"><exec><get_tokend></get_tokend></exec><text> token( KW_Find ); </text></sub_action> + <sub_action id="125"><exec><get_tokend></get_tokend></exec><text> token( KW_Root ); </text></sub_action> + <sub_action id="126"><exec><get_tokend></get_tokend></exec><text> token( KW_PrintXML ); </text></sub_action> + <sub_action id="127"><exec><get_tokend></get_tokend></exec><text> token( KW_Then ); </text></sub_action> + <sub_action id="128"><exec><get_tokend></get_tokend></exec><text> token( KW_Do ); </text></sub_action> + <sub_action id="129"><exec><get_tokend></get_tokend></exec><text> token( KW_Namespace ); </text></sub_action> + <sub_action id="130"><exec><get_tokend></get_tokend></exec><text> token( KW_Scanner ); </text></sub_action> + <sub_action id="131"><exec><get_tokend></get_tokend></exec><text> token( TK_Word, tokstart, tokend ); </text></sub_action> + </lm_switch></action> + </action_list> + <action_table_list length="166"> + <action_table id="0" length="2">0 144</action_table> + <action_table id="1" length="1">0</action_table> + <action_table id="2" length="1">140</action_table> + <action_table id="3" length="2">0 139</action_table> + <action_table id="4" length="2">0 28</action_table> + <action_table id="5" length="1">81</action_table> + <action_table id="6" length="1">1</action_table> + <action_table id="7" length="1">2</action_table> + <action_table id="8" length="1">158</action_table> + <action_table id="9" 
length="1">157</action_table> + <action_table id="10" length="2">0 156</action_table> + <action_table id="11" length="1">141</action_table> + <action_table id="12" length="1">3</action_table> + <action_table id="13" length="1">155</action_table> + <action_table id="14" length="1">138</action_table> + <action_table id="15" length="2">3 137</action_table> + <action_table id="16" length="1">142</action_table> + <action_table id="17" length="1">143</action_table> + <action_table id="18" length="1">163</action_table> + <action_table id="19" length="1">164</action_table> + <action_table id="20" length="1">148</action_table> + <action_table id="21" length="1">153</action_table> + <action_table id="22" length="1">162</action_table> + <action_table id="23" length="1">149</action_table> + <action_table id="24" length="1">145</action_table> + <action_table id="25" length="1">152</action_table> + <action_table id="26" length="1">150</action_table> + <action_table id="27" length="1">147</action_table> + <action_table id="28" length="1">146</action_table> + <action_table id="29" length="1">151</action_table> + <action_table id="30" length="1">165</action_table> + <action_table id="31" length="1">161</action_table> + <action_table id="32" length="2">3 96</action_table> + <action_table id="33" length="2">3 97</action_table> + <action_table id="34" length="2">3 112</action_table> + <action_table id="35" length="2">3 98</action_table> + <action_table id="36" length="2">3 116</action_table> + <action_table id="37" length="2">3 134</action_table> + <action_table id="38" length="2">3 124</action_table> + <action_table id="39" length="2">3 114</action_table> + <action_table id="40" length="2">3 130</action_table> + <action_table id="41" length="2">3 100</action_table> + <action_table id="42" length="2">3 128</action_table> + <action_table id="43" length="2">3 120</action_table> + <action_table id="44" length="2">3 113</action_table> + <action_table id="45" length="2">3 
107</action_table> + <action_table id="46" length="2">3 121</action_table> + <action_table id="47" length="2">3 129</action_table> + <action_table id="48" length="2">3 95</action_table> + <action_table id="49" length="2">3 103</action_table> + <action_table id="50" length="2">3 127</action_table> + <action_table id="51" length="2">3 135</action_table> + <action_table id="52" length="2">3 126</action_table> + <action_table id="53" length="2">3 109</action_table> + <action_table id="54" length="2">3 104</action_table> + <action_table id="55" length="2">3 106</action_table> + <action_table id="56" length="2">3 115</action_table> + <action_table id="57" length="2">3 110</action_table> + <action_table id="58" length="1">159</action_table> + <action_table id="59" length="1">160</action_table> + <action_table id="60" length="2">3 132</action_table> + <action_table id="61" length="2">3 111</action_table> + <action_table id="62" length="2">3 117</action_table> + <action_table id="63" length="2">3 122</action_table> + <action_table id="64" length="2">3 131</action_table> + <action_table id="65" length="2">3 136</action_table> + <action_table id="66" length="2">3 125</action_table> + <action_table id="67" length="2">3 133</action_table> + <action_table id="68" length="2">3 102</action_table> + <action_table id="69" length="2">3 101</action_table> + <action_table id="70" length="2">3 119</action_table> + <action_table id="71" length="2">3 118</action_table> + <action_table id="72" length="2">3 99</action_table> + <action_table id="73" length="2">3 105</action_table> + <action_table id="74" length="2">3 123</action_table> + <action_table id="75" length="2">3 108</action_table> + <action_table id="76" length="1">154</action_table> + <action_table id="77" length="1">17</action_table> + <action_table id="78" length="1">16</action_table> + <action_table id="79" length="1">14</action_table> + <action_table id="80" length="1">15</action_table> + <action_table id="81" 
length="1">13</action_table> + <action_table id="82" length="1">12</action_table> + <action_table id="83" length="1">4</action_table> + <action_table id="84" length="1">5</action_table> + <action_table id="85" length="1">6</action_table> + <action_table id="86" length="1">10</action_table> + <action_table id="87" length="1">8</action_table> + <action_table id="88" length="1">11</action_table> + <action_table id="89" length="1">7</action_table> + <action_table id="90" length="1">9</action_table> + <action_table id="91" length="1">71</action_table> + <action_table id="92" length="1">70</action_table> + <action_table id="93" length="2">0 69</action_table> + <action_table id="94" length="1">27</action_table> + <action_table id="95" length="2">3 24</action_table> + <action_table id="96" length="1">79</action_table> + <action_table id="97" length="1">75</action_table> + <action_table id="98" length="1">25</action_table> + <action_table id="99" length="1">80</action_table> + <action_table id="100" length="1">49</action_table> + <action_table id="101" length="1">37</action_table> + <action_table id="102" length="1">43</action_table> + <action_table id="103" length="1">61</action_table> + <action_table id="104" length="1">55</action_table> + <action_table id="105" length="1">31</action_table> + <action_table id="106" length="1">50</action_table> + <action_table id="107" length="1">38</action_table> + <action_table id="108" length="1">44</action_table> + <action_table id="109" length="1">62</action_table> + <action_table id="110" length="1">56</action_table> + <action_table id="111" length="1">32</action_table> + <action_table id="112" length="1">64</action_table> + <action_table id="113" length="1">65</action_table> + <action_table id="114" length="1">66</action_table> + <action_table id="115" length="1">63</action_table> + <action_table id="116" length="1">73</action_table> + <action_table id="117" length="1">74</action_table> + <action_table id="118" 
length="1">29</action_table> + <action_table id="119" length="1">78</action_table> + <action_table id="120" length="1">67</action_table> + <action_table id="121" length="1">51</action_table> + <action_table id="122" length="1">39</action_table> + <action_table id="123" length="1">45</action_table> + <action_table id="124" length="1">68</action_table> + <action_table id="125" length="1">57</action_table> + <action_table id="126" length="1">33</action_table> + <action_table id="127" length="1">77</action_table> + <action_table id="128" length="1">53</action_table> + <action_table id="129" length="1">41</action_table> + <action_table id="130" length="1">47</action_table> + <action_table id="131" length="1">59</action_table> + <action_table id="132" length="1">35</action_table> + <action_table id="133" length="1">48</action_table> + <action_table id="134" length="1">36</action_table> + <action_table id="135" length="1">42</action_table> + <action_table id="136" length="1">60</action_table> + <action_table id="137" length="1">54</action_table> + <action_table id="138" length="1">30</action_table> + <action_table id="139" length="1">52</action_table> + <action_table id="140" length="1">40</action_table> + <action_table id="141" length="1">46</action_table> + <action_table id="142" length="1">58</action_table> + <action_table id="143" length="1">34</action_table> + <action_table id="144" length="1">82</action_table> + <action_table id="145" length="1">76</action_table> + <action_table id="146" length="1">26</action_table> + <action_table id="147" length="1">72</action_table> + <action_table id="148" length="2">3 19</action_table> + <action_table id="149" length="2">3 20</action_table> + <action_table id="150" length="2">3 23</action_table> + <action_table id="151" length="2">3 21</action_table> + <action_table id="152" length="2">3 22</action_table> + <action_table id="153" length="2">3 18</action_table> + <action_table id="154" length="1">94</action_table> + 
<action_table id="155" length="2">0 92</action_table> + <action_table id="156" length="1">91</action_table> + <action_table id="157" length="1">93</action_table> + <action_table id="158" length="1">90</action_table> + <action_table id="159" length="1">83</action_table> + <action_table id="160" length="1">84</action_table> + <action_table id="161" length="1">88</action_table> + <action_table id="162" length="1">86</action_table> + <action_table id="163" length="1">89</action_table> + <action_table id="164" length="1">85</action_table> + <action_table id="165" length="1">87</action_table> + </action_table_list> + <start_state>10</start_state> + <entry_points> + <entry name="or_literal">166</entry> + <entry name="regular_type">168</entry> + <entry name="literal_pattern">203</entry> + <entry name="main">10</entry> + </entry_points> + <state_list length="205"> + <state id="0"> + <trans_list length="3"> + <t>-128 9 0 x</t> + <t>10 10 10 0</t> + <t>11 127 0 x</t> + </trans_list> + </state> + + <state id="1"> + <trans_list length="7"> + <t>-128 9 1 x</t> + <t>10 10 1 1</t> + <t>11 38 1 x</t> + <t>39 39 10 2</t> + <t>40 91 1 x</t> + <t>92 92 2 x</t> + <t>93 127 1 x</t> + </trans_list> + </state> + + <state id="2"> + <trans_list length="3"> + <t>-128 9 1 x</t> + <t>10 10 1 1</t> + <t>11 127 1 x</t> + </trans_list> + </state> + + <state id="3"> + <trans_list length="3"> + <t>-128 9 3 x</t> + <t>10 10 10 3</t> + <t>11 127 3 x</t> + </trans_list> + </state> + + <state id="4"> + <trans_list length="7"> + <t>-128 9 4 x</t> + <t>10 10 4 1</t> + <t>11 33 4 x</t> + <t>34 34 171 x</t> + <t>35 91 4 x</t> + <t>92 92 5 x</t> + <t>93 127 4 x</t> + </trans_list> + </state> + + <state id="5"> + <trans_list length="3"> + <t>-128 9 4 x</t> + <t>10 10 4 1</t> + <t>11 127 4 x</t> + </trans_list> + </state> + + <state id="6"> + <trans_list length="3"> + <t>-128 9 6 x</t> + <t>10 10 168 4</t> + <t>11 127 6 x</t> + </trans_list> + </state> + + <state id="7"> + <trans_list length="7"> + <t>-128 9 
7 x</t> + <t>10 10 7 1</t> + <t>11 38 7 x</t> + <t>39 39 171 x</t> + <t>40 91 7 x</t> + <t>92 92 8 x</t> + <t>93 127 7 x</t> + </trans_list> + </state> + + <state id="8"> + <trans_list length="3"> + <t>-128 9 7 x</t> + <t>10 10 7 1</t> + <t>11 127 7 x</t> + </trans_list> + </state> + + <state id="9"> + <trans_list length="7"> + <t>-128 47 168 5</t> + <t>48 57 181 x</t> + <t>58 64 168 5</t> + <t>65 70 181 x</t> + <t>71 96 168 5</t> + <t>97 102 181 x</t> + <t>103 127 168 5</t> + </trans_list> + </state> + + <state id="10" final="t"> + <state_actions>6 7 x</state_actions> + <trans_list length="61"> + <t>-128 -1 10 8</t> + <t>0 0 10 9</t> + <t>1 8 10 8</t> + <t>9 9 11 x</t> + <t>10 10 10 10</t> + <t>11 12 10 8</t> + <t>13 13 11 x</t> + <t>14 31 10 8</t> + <t>32 32 11 x</t> + <t>33 33 12 x</t> + <t>34 34 10 11</t> + <t>35 35 13 12</t> + <t>36 37 10 8</t> + <t>38 38 14 x</t> + <t>39 39 15 12</t> + <t>40 43 10 13</t> + <t>44 44 10 8</t> + <t>45 45 10 13</t> + <t>46 46 10 8</t> + <t>47 47 10 14</t> + <t>48 57 16 x</t> + <t>58 58 17 x</t> + <t>59 59 10 8</t> + <t>60 60 18 x</t> + <t>61 61 19 x</t> + <t>62 62 20 x</t> + <t>63 64 10 8</t> + <t>65 90 21 15</t> + <t>91 91 10 16</t> + <t>92 92 10 8</t> + <t>93 93 10 17</t> + <t>94 94 10 8</t> + <t>95 95 21 15</t> + <t>96 96 10 8</t> + <t>97 97 22 x</t> + <t>98 98 21 15</t> + <t>99 99 33 x</t> + <t>100 100 45 x</t> + <t>101 101 46 x</t> + <t>102 102 50 x</t> + <t>103 104 21 15</t> + <t>105 105 55 x</t> + <t>106 107 21 15</t> + <t>108 108 68 x</t> + <t>109 109 75 x</t> + <t>110 110 79 x</t> + <t>111 111 21 15</t> + <t>112 112 95 x</t> + <t>113 113 21 15</t> + <t>114 114 112 x</t> + <t>115 115 119 x</t> + <t>116 116 132 x</t> + <t>117 117 152 x</t> + <t>118 118 21 15</t> + <t>119 119 157 x</t> + <t>120 122 21 15</t> + <t>123 123 10 8</t> + <t>124 124 164 x</t> + <t>125 125 10 8</t> + <t>126 126 165 12</t> + <t>127 127 10 8</t> + </trans_list> + </state> + + <state id="11" final="t"> + <trans_list length="7"> + <t>-128 8 10 18</t> + 
<t>9 9 11 x</t> + <t>10 12 10 18</t> + <t>13 13 11 x</t> + <t>14 31 10 18</t> + <t>32 32 11 x</t> + <t>33 127 10 18</t> + </trans_list> + </state> + + <state id="12" final="t"> + <trans_list length="3"> + <t>-128 60 10 19</t> + <t>61 61 10 20</t> + <t>62 127 10 19</t> + </trans_list> + </state> + + <state id="13" final="t"> + <trans_list length="3"> + <t>-128 9 0 x</t> + <t>10 10 10 0</t> + <t>11 127 0 x</t> + </trans_list> + </state> + + <state id="14" final="t"> + <trans_list length="3"> + <t>-128 37 10 19</t> + <t>38 38 10 21</t> + <t>39 127 10 19</t> + </trans_list> + </state> + + <state id="15" final="t"> + <trans_list length="7"> + <t>-128 9 1 x</t> + <t>10 10 1 1</t> + <t>11 38 1 x</t> + <t>39 39 10 2</t> + <t>40 91 1 x</t> + <t>92 92 2 x</t> + <t>93 127 1 x</t> + </trans_list> + </state> + + <state id="16" final="t"> + <trans_list length="3"> + <t>-128 47 10 22</t> + <t>48 57 16 x</t> + <t>58 127 10 22</t> + </trans_list> + </state> + + <state id="17" final="t"> + <trans_list length="5"> + <t>-128 57 10 19</t> + <t>58 58 10 23</t> + <t>59 60 10 19</t> + <t>61 61 10 24</t> + <t>62 127 10 19</t> + </trans_list> + </state> + + <state id="18" final="t"> + <trans_list length="5"> + <t>-128 44 10 19</t> + <t>45 45 10 25</t> + <t>46 60 10 19</t> + <t>61 61 10 26</t> + <t>62 127 10 19</t> + </trans_list> + </state> + + <state id="19" final="t"> + <trans_list length="4"> + <t>-128 60 10 19</t> + <t>61 61 10 27</t> + <t>62 62 10 28</t> + <t>63 127 10 19</t> + </trans_list> + </state> + + <state id="20" final="t"> + <trans_list length="3"> + <t>-128 60 10 19</t> + <t>61 61 10 29</t> + <t>62 127 10 19</t> + </trans_list> + </state> + + <state id="21" final="t"> + <trans_list length="9"> + <t>-128 47 10 30</t> + <t>48 57 21 15</t> + <t>58 64 10 30</t> + <t>65 90 21 15</t> + <t>91 94 10 30</t> + <t>95 95 21 15</t> + <t>96 96 10 30</t> + <t>97 122 21 15</t> + <t>123 127 10 30</t> + </trans_list> + </state> + + <state id="22" final="t"> + <trans_list length="13"> + <t>-128 
47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 98 21 15</t> + <t>99 99 23 x</t> + <t>100 107 21 15</t> + <t>108 108 27 x</t> + <t>109 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="23" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 24 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="24" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 104 21 15</t> + <t>105 105 25 x</t> + <t>106 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="25" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 110 21 15</t> + <t>111 111 26 x</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="26" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 21 32</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="27" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 111 21 15</t> + <t>112 112 28 x</t> + <t>113 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="28" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + 
<t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 103 21 15</t> + <t>104 104 29 x</t> + <t>105 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="29" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 30 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="30" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 120 21 15</t> + <t>121 121 31 x</t> + <t>122 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="31" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 111 21 15</t> + <t>112 112 32 x</t> + <t>113 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="32" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 21 33</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="33" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 101 21 15</t> + <t>102 102 34 x</t> + <t>103 110 21 15</t> + <t>111 111 35 x</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="34" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 
15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 107 21 15</t> + <t>108 108 21 34</t> + <t>109 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="35" final="t"> + <trans_list length="12"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 108 21 15</t> + <t>109 109 36 x</t> + <t>110 110 39 x</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="36" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 108 21 15</t> + <t>109 109 37 x</t> + <t>110 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="37" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 104 21 15</t> + <t>105 105 38 x</t> + <t>106 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="38" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 21 35</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="39" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 114 21 15</t> + <t>115 115 40 x</t> + <t>116 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="40" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 
90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 41 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="41" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 42 x</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="42" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 116 21 15</t> + <t>117 117 43 x</t> + <t>118 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="43" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 98 21 15</t> + <t>99 99 44 x</t> + <t>100 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="44" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 21 36</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="45" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 110 21 15</t> + <t>111 111 21 37</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="46" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 
96 10 31</t> + <t>97 107 21 15</t> + <t>108 108 47 x</t> + <t>109 109 21 15</t> + <t>110 110 49 x</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="47" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 114 21 15</t> + <t>115 115 48 x</t> + <t>116 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="48" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 21 38</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="49" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 99 21 15</t> + <t>100 100 21 39</t> + <t>101 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="50" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 104 21 15</t> + <t>105 105 51 x</t> + <t>106 110 21 15</t> + <t>111 111 54 x</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="51" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 52 x</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="52" final="t"> + <trans_list length="12"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> 
+ <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 53 x</t> + <t>98 99 21 15</t> + <t>100 100 21 40</t> + <t>101 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="53" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 107 21 15</t> + <t>108 108 21 41</t> + <t>109 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="54" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 21 42</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="55" final="t"> + <trans_list length="16"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 101 21 15</t> + <t>102 102 21 43</t> + <t>103 103 56 x</t> + <t>104 109 21 15</t> + <t>110 110 60 x</t> + <t>111 115 21 15</t> + <t>116 116 66 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="56" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 57 x</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="57" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 110 21 15</t> + <t>111 111 58 x</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="58" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 
21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 59 x</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="59" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 21 44</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="60" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 98 21 15</t> + <t>99 99 61 x</t> + <t>100 104 21 15</t> + <t>105 105 65 x</t> + <t>106 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="61" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 107 21 15</t> + <t>108 108 62 x</t> + <t>109 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="62" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 116 21 15</t> + <t>117 117 63 x</t> + <t>118 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="63" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 99 21 15</t> + <t>100 100 64 x</t> + <t>101 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="64" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 
64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 21 45</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="65" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 21 46</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="66" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 67 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="67" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 21 47</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="68" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 69 x</t> + <t>102 104 21 15</t> + <t>105 105 70 x</t> + <t>106 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="69" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 119 21 15</t> + <t>120 120 21 48</t> + <t>121 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="70" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> 
+ <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 71 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="71" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 72 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="72" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 73 x</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="73" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 74 x</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="74" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 107 21 15</t> + <t>108 108 21 49</t> + <t>109 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="75" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 76 x</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="76" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + 
<t>116 116 77 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="77" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 98 21 15</t> + <t>99 99 78 x</t> + <t>100 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="78" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 103 21 15</t> + <t>104 104 21 50</t> + <t>105 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="79" final="t"> + <trans_list length="15"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 80 x</t> + <t>98 100 21 15</t> + <t>101 101 87 x</t> + <t>102 102 89 x</t> + <t>103 110 21 15</t> + <t>111 111 90 x</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="80" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 108 21 15</t> + <t>109 109 81 x</t> + <t>110 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="81" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 82 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="82" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> 
+ <t>97 114 21 15</t> + <t>115 115 83 x</t> + <t>116 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="83" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 111 21 15</t> + <t>112 112 84 x</t> + <t>113 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="84" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 85 x</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="85" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 98 21 15</t> + <t>99 99 86 x</t> + <t>100 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="86" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 21 51</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="87" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 119 21 15</t> + <t>120 120 88 x</t> + <t>121 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="88" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 21 52</t> + <t>117 122 21 15</t> + <t>123 127 10 
31</t> + </trans_list> + </state> + + <state id="89" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 21 53</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="90" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 91 x</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="91" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 92 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="92" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 93 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="93" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 94 x</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="94" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 108 21 15</t> + <t>109 109 21 54</t> + <t>110 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="95" final="t"> + <trans_list 
length="14"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 96 x</t> + <t>98 99 21 15</t> + <t>100 100 104 x</t> + <t>101 113 21 15</t> + <t>114 114 105 x</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="96" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 97 x</t> + <t>115 115 21 15</t> + <t>116 116 100 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="97" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 114 21 15</t> + <t>115 115 98 x</t> + <t>116 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="98" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 99 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="99" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 21 55</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="100" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 101 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + 
</state> + + <state id="101" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 102 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="102" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 103 x</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="103" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 21 56</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="104" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 21 57</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="105" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 104 21 15</t> + <t>105 105 106 x</t> + <t>106 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="106" final="t"> + <trans_list length="11"> + <t>-128 47 10 58</t> + <t>48 57 21 15</t> + <t>58 64 10 58</t> + <t>65 90 21 15</t> + <t>91 94 10 58</t> + <t>95 95 21 15</t> + <t>96 96 10 58</t> + <t>97 109 21 15</t> + <t>110 110 107 x</t> + <t>111 122 21 15</t> + <t>123 127 10 58</t> + </trans_list> + </state> + + <state id="107" final="t"> + <trans_list length="11"> + 
<t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 108 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="108" final="t"> + <trans_list length="9"> + <t>-128 47 10 59</t> + <t>48 57 21 15</t> + <t>58 64 10 59</t> + <t>65 90 21 15</t> + <t>91 94 10 59</t> + <t>95 95 109 x</t> + <t>96 96 10 59</t> + <t>97 122 21 15</t> + <t>123 127 10 59</t> + </trans_list> + </state> + + <state id="109" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 119 21 15</t> + <t>120 120 110 x</t> + <t>121 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="110" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 108 21 15</t> + <t>109 109 111 x</t> + <t>110 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="111" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 107 21 15</t> + <t>108 108 21 60</t> + <t>109 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="112" final="t"> + <trans_list length="15"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 113 x</t> + <t>102 107 21 15</t> + <t>108 108 21 61</t> + <t>109 110 21 15</t> + <t>111 111 117 x</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="113" final="t"> + <trans_list length="13"> + 
<t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 99 21 15</t> + <t>100 100 21 62</t> + <t>101 105 21 15</t> + <t>106 106 114 x</t> + <t>107 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="114" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 115 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="115" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 98 21 15</t> + <t>99 99 116 x</t> + <t>100 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="116" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 21 63</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="117" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 110 21 15</t> + <t>111 111 118 x</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="118" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 21 64</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="119" final="t"> + <trans_list length="13"> + <t>-128 
47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 98 21 15</t> + <t>99 99 120 x</t> + <t>100 116 21 15</t> + <t>117 117 125 x</t> + <t>118 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="120" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 121 x</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="121" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 122 x</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="122" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 123 x</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="123" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 124 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="124" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 21 65</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="125" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 
15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 21 15</t> + <t>98 98 126 x</t> + <t>99 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="126" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 111 21 15</t> + <t>112 112 127 x</t> + <t>113 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="127" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 128 x</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="128" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 129 x</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="129" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 114 21 15</t> + <t>115 115 130 x</t> + <t>116 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="130" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 131 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="131" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 
95 21 15</t> + <t>96 96 10 31</t> + <t>97 113 21 15</t> + <t>114 114 21 66</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="132" final="t"> + <trans_list length="17"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 103 21 15</t> + <t>104 104 133 x</t> + <t>105 110 21 15</t> + <t>111 111 135 x</t> + <t>112 113 21 15</t> + <t>114 114 138 x</t> + <t>115 120 21 15</t> + <t>121 121 145 x</t> + <t>122 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="133" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 134 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="134" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 21 67</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="135" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 106 21 15</t> + <t>107 107 136 x</t> + <t>108 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="136" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 137 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="137" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + 
<t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 21 68</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="138" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 139 x</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="139" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 140 x</t> + <t>111 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="140" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 114 21 15</t> + <t>115 115 141 x</t> + <t>116 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="141" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 107 21 15</t> + <t>108 108 142 x</t> + <t>109 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="142" final="t"> + <trans_list length="10"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 97 143 x</t> + <t>98 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="143" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 
15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 144 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="144" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 21 69</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="145" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 111 21 15</t> + <t>112 112 146 x</t> + <t>113 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="146" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 147 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="147" final="t"> + <trans_list length="9"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 148 x</t> + <t>96 96 10 31</t> + <t>97 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="148" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 99 21 15</t> + <t>100 100 149 x</t> + <t>101 104 21 15</t> + <t>105 105 151 x</t> + <t>106 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="149" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + 
<t>101 101 150 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="150" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 101 21 15</t> + <t>102 102 21 70</t> + <t>103 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="151" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 99 21 15</t> + <t>100 100 21 71</t> + <t>101 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="152" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 109 21 15</t> + <t>110 110 153 x</t> + <t>111 114 21 15</t> + <t>115 115 155 x</t> + <t>116 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="153" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 99 21 15</t> + <t>100 100 154 x</t> + <t>101 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="154" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 110 21 15</t> + <t>111 111 21 72</t> + <t>112 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="155" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + 
<t>101 101 156 x</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="156" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 114 21 15</t> + <t>115 115 21 73</t> + <t>116 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="157" final="t"> + <trans_list length="13"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 103 21 15</t> + <t>104 104 158 x</t> + <t>105 113 21 15</t> + <t>114 114 161 x</t> + <t>115 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="158" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 104 21 15</t> + <t>105 105 159 x</t> + <t>106 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="159" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 107 21 15</t> + <t>108 108 160 x</t> + <t>109 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="160" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 21 74</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="161" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 104 21 15</t> + 
<t>105 105 162 x</t> + <t>106 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="162" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 115 21 15</t> + <t>116 116 163 x</t> + <t>117 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="163" final="t"> + <trans_list length="11"> + <t>-128 47 10 31</t> + <t>48 57 21 15</t> + <t>58 64 10 31</t> + <t>65 90 21 15</t> + <t>91 94 10 31</t> + <t>95 95 21 15</t> + <t>96 96 10 31</t> + <t>97 100 21 15</t> + <t>101 101 21 75</t> + <t>102 122 21 15</t> + <t>123 127 10 31</t> + </trans_list> + </state> + + <state id="164" final="t"> + <trans_list length="3"> + <t>-128 123 10 19</t> + <t>124 124 10 76</t> + <t>125 127 10 19</t> + </trans_list> + </state> + + <state id="165" final="t"> + <trans_list length="3"> + <t>-128 9 3 x</t> + <t>10 10 10 3</t> + <t>11 127 3 x</t> + </trans_list> + </state> + + <state id="166" final="t"> + <state_actions>6 7 x</state_actions> + <trans_list length="8"> + <t>-128 -1 166 77</t> + <t>0 0 166 78</t> + <t>1 44 166 77</t> + <t>45 45 166 79</t> + <t>46 91 166 77</t> + <t>92 92 167 x</t> + <t>93 93 166 80</t> + <t>94 127 166 77</t> + </trans_list> + </state> + + <state id="167" final="t"> + <trans_list length="18"> + <t>-128 9 166 81</t> + <t>10 10 166 82</t> + <t>11 47 166 81</t> + <t>48 48 166 83</t> + <t>49 96 166 81</t> + <t>97 97 166 84</t> + <t>98 98 166 85</t> + <t>99 101 166 81</t> + <t>102 102 166 86</t> + <t>103 109 166 81</t> + <t>110 110 166 87</t> + <t>111 113 166 81</t> + <t>114 114 166 88</t> + <t>115 115 166 81</t> + <t>116 116 166 89</t> + <t>117 117 166 81</t> + <t>118 118 166 90</t> + <t>119 127 166 81</t> + </trans_list> + </state> + + <state id="168" final="t"> + <state_actions>6 7 x</state_actions> + <trans_list length="47"> + <t>-128 -1 168 91</t> + <t>0 0 168 92</t> + <t>1 8 168 
91</t> + <t>9 9 169 x</t> + <t>10 10 168 93</t> + <t>11 12 168 91</t> + <t>13 13 169 x</t> + <t>14 31 168 91</t> + <t>32 32 169 x</t> + <t>33 33 168 91</t> + <t>34 34 170 12</t> + <t>35 35 172 12</t> + <t>36 36 173 x</t> + <t>37 37 174 x</t> + <t>38 38 168 91</t> + <t>39 39 175 12</t> + <t>40 41 168 91</t> + <t>42 42 176 x</t> + <t>43 44 168 91</t> + <t>45 45 177 x</t> + <t>46 46 178 x</t> + <t>47 47 168 94</t> + <t>48 48 179 12</t> + <t>49 57 180 x</t> + <t>58 58 182 x</t> + <t>59 59 168 91</t> + <t>60 60 184 x</t> + <t>61 61 168 91</t> + <t>62 62 186 x</t> + <t>63 63 168 91</t> + <t>64 64 187 x</t> + <t>65 90 188 95</t> + <t>91 91 189 x</t> + <t>92 94 168 91</t> + <t>95 95 188 95</t> + <t>96 96 168 91</t> + <t>97 100 188 95</t> + <t>101 101 190 x</t> + <t>102 102 193 x</t> + <t>103 107 188 95</t> + <t>108 108 196 x</t> + <t>109 115 188 95</t> + <t>116 116 199 x</t> + <t>117 118 188 95</t> + <t>119 119 200 x</t> + <t>120 122 188 95</t> + <t>123 127 168 91</t> + </trans_list> + </state> + + <state id="169" final="t"> + <trans_list length="7"> + <t>-128 8 168 96</t> + <t>9 9 169 x</t> + <t>10 12 168 96</t> + <t>13 13 169 x</t> + <t>14 31 168 96</t> + <t>32 32 169 x</t> + <t>33 127 168 96</t> + </trans_list> + </state> + + <state id="170" final="t"> + <trans_list length="7"> + <t>-128 9 4 x</t> + <t>10 10 4 1</t> + <t>11 33 4 x</t> + <t>34 34 171 x</t> + <t>35 91 4 x</t> + <t>92 92 5 x</t> + <t>93 127 4 x</t> + </trans_list> + </state> + + <state id="171" final="t"> + <trans_list length="3"> + <t>-128 104 168 97</t> + <t>105 105 168 98</t> + <t>106 127 168 97</t> + </trans_list> + </state> + + <state id="172" final="t"> + <trans_list length="3"> + <t>-128 9 6 x</t> + <t>10 10 168 4</t> + <t>11 127 6 x</t> + </trans_list> + </state> + + <state id="173" final="t"> + <trans_list length="13"> + <t>-128 32 168 99</t> + <t>33 33 168 100</t> + <t>34 41 168 99</t> + <t>42 42 168 101</t> + <t>43 46 168 99</t> + <t>47 47 168 102</t> + <t>48 62 168 99</t> + <t>63 63 168 103</t> 
+ <t>64 93 168 99</t> + <t>94 94 168 104</t> + <t>95 125 168 99</t> + <t>126 126 168 105</t> + <t>127 127 168 99</t> + </trans_list> + </state> + + <state id="174" final="t"> + <trans_list length="13"> + <t>-128 32 168 99</t> + <t>33 33 168 106</t> + <t>34 41 168 99</t> + <t>42 42 168 107</t> + <t>43 46 168 99</t> + <t>47 47 168 108</t> + <t>48 62 168 99</t> + <t>63 63 168 109</t> + <t>64 93 168 99</t> + <t>94 94 168 110</t> + <t>95 125 168 99</t> + <t>126 126 168 111</t> + <t>127 127 168 99</t> + </trans_list> + </state> + + <state id="175" final="t"> + <trans_list length="7"> + <t>-128 9 7 x</t> + <t>10 10 7 1</t> + <t>11 38 7 x</t> + <t>39 39 171 x</t> + <t>40 91 7 x</t> + <t>92 92 8 x</t> + <t>93 127 7 x</t> + </trans_list> + </state> + + <state id="176" final="t"> + <trans_list length="3"> + <t>-128 41 168 99</t> + <t>42 42 168 112</t> + <t>43 127 168 99</t> + </trans_list> + </state> + + <state id="177" final="t"> + <trans_list length="5"> + <t>-128 44 168 99</t> + <t>45 45 168 113</t> + <t>46 61 168 99</t> + <t>62 62 168 114</t> + <t>63 127 168 99</t> + </trans_list> + </state> + + <state id="178" final="t"> + <trans_list length="3"> + <t>-128 45 168 99</t> + <t>46 46 168 115</t> + <t>47 127 168 99</t> + </trans_list> + </state> + + <state id="179" final="t"> + <trans_list length="5"> + <t>-128 47 168 116</t> + <t>48 57 180 x</t> + <t>58 119 168 116</t> + <t>120 120 9 x</t> + <t>121 127 168 116</t> + </trans_list> + </state> + + <state id="180" final="t"> + <trans_list length="3"> + <t>-128 47 168 116</t> + <t>48 57 180 x</t> + <t>58 127 168 116</t> + </trans_list> + </state> + + <state id="181" final="t"> + <trans_list length="7"> + <t>-128 47 168 117</t> + <t>48 57 181 x</t> + <t>58 64 168 117</t> + <t>65 70 181 x</t> + <t>71 96 168 117</t> + <t>97 102 181 x</t> + <t>103 127 168 117</t> + </trans_list> + </state> + + <state id="182" final="t"> + <trans_list length="4"> + <t>-128 60 168 99</t> + <t>61 61 168 118</t> + <t>62 62 183 x</t> + <t>63 127 168 
99</t> + </trans_list> + </state> + + <state id="183" final="t"> + <trans_list length="3"> + <t>-128 61 168 119</t> + <t>62 62 168 120</t> + <t>63 127 168 119</t> + </trans_list> + </state> + + <state id="184" final="t"> + <trans_list length="15"> + <t>-128 32 168 99</t> + <t>33 33 168 121</t> + <t>34 41 168 99</t> + <t>42 42 168 122</t> + <t>43 46 168 99</t> + <t>47 47 168 123</t> + <t>48 57 168 99</t> + <t>58 58 168 124</t> + <t>59 61 168 99</t> + <t>62 62 185 x</t> + <t>63 93 168 99</t> + <t>94 94 168 125</t> + <t>95 125 168 99</t> + <t>126 126 168 126</t> + <t>127 127 168 99</t> + </trans_list> + </state> + + <state id="185" final="t"> + <trans_list length="11"> + <t>-128 32 168 127</t> + <t>33 33 168 128</t> + <t>34 41 168 127</t> + <t>42 42 168 129</t> + <t>43 46 168 127</t> + <t>47 47 168 130</t> + <t>48 93 168 127</t> + <t>94 94 168 131</t> + <t>95 125 168 127</t> + <t>126 126 168 132</t> + <t>127 127 168 127</t> + </trans_list> + </state> + + <state id="186" final="t"> + <trans_list length="13"> + <t>-128 32 168 99</t> + <t>33 33 168 133</t> + <t>34 41 168 99</t> + <t>42 42 168 134</t> + <t>43 46 168 99</t> + <t>47 47 168 135</t> + <t>48 62 168 99</t> + <t>63 63 168 136</t> + <t>64 93 168 99</t> + <t>94 94 168 137</t> + <t>95 125 168 99</t> + <t>126 126 168 138</t> + <t>127 127 168 99</t> + </trans_list> + </state> + + <state id="187" final="t"> + <trans_list length="11"> + <t>-128 32 168 99</t> + <t>33 33 168 139</t> + <t>34 41 168 99</t> + <t>42 42 168 140</t> + <t>43 46 168 99</t> + <t>47 47 168 141</t> + <t>48 93 168 99</t> + <t>94 94 168 142</t> + <t>95 125 168 99</t> + <t>126 126 168 143</t> + <t>127 127 168 99</t> + </trans_list> + </state> + + <state id="188" final="t"> + <trans_list length="9"> + <t>-128 47 168 144</t> + <t>48 57 188 95</t> + <t>58 64 168 144</t> + <t>65 90 188 95</t> + <t>91 94 168 144</t> + <t>95 95 188 95</t> + <t>96 96 168 144</t> + <t>97 122 188 95</t> + <t>123 127 168 144</t> + </trans_list> + </state> + + <state id="189" 
final="t"> + <trans_list length="3"> + <t>-128 93 168 145</t> + <t>94 94 168 146</t> + <t>95 127 168 145</t> + </trans_list> + </state> + + <state id="190" final="t"> + <trans_list length="13"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 110 188 95</t> + <t>111 111 191 x</t> + <t>112 113 188 95</t> + <t>114 114 192 x</t> + <t>115 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="191" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 101 188 95</t> + <t>102 102 188 148</t> + <t>103 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="192" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 113 188 95</t> + <t>114 114 188 149</t> + <t>115 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="193" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 113 188 95</t> + <t>114 114 194 x</t> + <t>115 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="194" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 110 188 95</t> + <t>111 111 195 x</t> + <t>112 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="195" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 
64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 108 188 95</t> + <t>109 109 188 150</t> + <t>110 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="196" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 100 188 95</t> + <t>101 101 197 x</t> + <t>102 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="197" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 113 188 95</t> + <t>114 114 198 x</t> + <t>115 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="198" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 113 188 95</t> + <t>114 114 188 151</t> + <t>115 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="199" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 110 188 95</t> + <t>111 111 188 152</t> + <t>112 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="200" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 103 188 95</t> + <t>104 104 201 x</t> + <t>105 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="201" final="t"> + <trans_list length="11"> + 
<t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 100 188 95</t> + <t>101 101 202 x</t> + <t>102 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="202" final="t"> + <trans_list length="11"> + <t>-128 47 168 147</t> + <t>48 57 188 95</t> + <t>58 64 168 147</t> + <t>65 90 188 95</t> + <t>91 94 168 147</t> + <t>95 95 188 95</t> + <t>96 96 168 147</t> + <t>97 109 188 95</t> + <t>110 110 188 153</t> + <t>111 122 188 95</t> + <t>123 127 168 147</t> + </trans_list> + </state> + + <state id="203" final="t"> + <state_actions>6 7 x</state_actions> + <trans_list length="8"> + <t>-128 9 203 154</t> + <t>10 10 203 155</t> + <t>11 33 203 154</t> + <t>34 34 203 156</t> + <t>35 90 203 154</t> + <t>91 91 203 157</t> + <t>92 92 204 x</t> + <t>93 127 203 154</t> + </trans_list> + </state> + + <state id="204" final="t"> + <trans_list length="14"> + <t>-128 96 203 158</t> + <t>97 97 203 159</t> + <t>98 98 203 160</t> + <t>99 101 203 158</t> + <t>102 102 203 161</t> + <t>103 109 203 158</t> + <t>110 110 203 162</t> + <t>111 113 203 158</t> + <t>114 114 203 163</t> + <t>115 115 203 158</t> + <t>116 116 203 164</t> + <t>117 117 203 158</t> + <t>118 118 203 165</t> + <t>119 127 203 158</t> + </trans_list> + </state> + </state_list> + </machine> +</ragel_def> +<ragel_def name="section_parse"> + <alphtype>int</alphtype> + <machine> + <action_list length="5"> + <action id="0" name="clear_words" line="97" col="21"><text> word = lit = 0; word_len = lit_len = 0; </text></action> + <action id="1" name="store_lit" line="98" col="19"><text> lit = tokdata; lit_len = toklen; </text></action> + <action id="2" name="incl_err" line="101" col="18"><text> scan_error() << "bad include statement" << endl; </text></action> + <action id="3" name="handle_include" line="105" col="2"><text> + #if 0 + char *inclSectionName = word; + char *inclFileName = 0; + + /* 
Implement defaults for the input file and section name. */ + if ( inclSectionName == 0 ) + inclSectionName = parser->sectionName; + + if ( lit != 0 ) + inclFileName = prepareFileName( lit, lit_len ); + else + inclFileName = fileName; + + /* Check for a recursive include structure. Add the current file/section + * name then check if what we are including is already in the stack. */ + includeStack.append( IncludeStackItem( fileName, parser->sectionName ) ); + + if ( recursiveInclude( inclFileName, inclSectionName ) ) + scan_error() << "include: this is a recursive include operation" << endl; + else { + /* Open the input file for reading. */ + ifstream *inFile = new ifstream( inclFileName ); + if ( ! inFile->is_open() ) { + scan_error() << "include: could not open " << + inclFileName << " for reading" << endl; + } + + Scanner scanner( inclFileName, *inFile, output, parser, + inclSectionName, includeDepth+1 ); + scanner.do_scan( ); + delete inFile; + } + + /* Remove the last element (len-1) */ + includeStack.remove( -1 ); + #endif + </text></action> + <action id="4" name="handle_token" line="152" col="2"><text> + InputLoc loc; + + #ifdef PRINT_TOKENS + cerr << "scanner:" << line << ":" << column << + ": sending token to the parser " << Parser_lelNames[*p]; + cerr << " " << toklen; + if ( tokdata != 0 ) + cerr << " " << tokdata; + cerr << endl; + #endif + + loc.fileName = fileName; + loc.line = line; + loc.col = column; + + parser->token( loc, type, tokdata, toklen ); + </text></action> + </action_list> + <action_table_list length="4"> + <action_table id="0" length="1">2</action_table> + <action_table id="1" length="2">0 1</action_table> + <action_table id="2" length="1">3</action_table> + <action_table id="3" length="1">4</action_table> + </action_table_list> + <start_state>3</start_state> + <error_state>0</error_state> + <entry_points> + <entry name="main">3</entry> + </entry_points> + <state_list length="4"> + <state id="0"> + <trans_list length="0"> + </trans_list> 
+ </state> + + <state id="1"> + <state_actions>x x 0</state_actions> + <trans_list length="3"> + <t>-2147483648 131 x 0</t> + <t>132 132 2 1</t> + <t>133 2147483647 x 0</t> + </trans_list> + </state> + + <state id="2"> + <state_actions>x x 0</state_actions> + <trans_list length="3"> + <t>-2147483648 58 x 0</t> + <t>59 59 3 2</t> + <t>60 2147483647 x 0</t> + </trans_list> + </state> + + <state id="3" final="t"> + <trans_list length="3"> + <t>-2147483648 128 3 3</t> + <t>129 129 1 x</t> + <t>130 2147483647 3 3</t> + </trans_list> + </state> + </state_list> + </machine> +</ragel_def> +<host line="1" col="1">/* + * Copyright 2006-2007 Adrian Thurston <thurston@complang.org> + */ + +/* This file is part of Ragel. + * + * Ragel is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Ragel is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Ragel; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <iostream> +#include <fstream> +#include <string.h> + +#include "colm.h" +#include "lmscan.h" +#include "lmparse.h" +#include "parsedata.h" +#include "avltree.h" +#include "vector.h" + +//#define PRINT_TOKENS + +using std::ifstream; +using std::istream; +using std::ostream; +using std::cout; +using std::cerr; +using std::endl; + +</host> +<write def_name="section_parse" line="45" col="2"><arg>data</arg></write> +<host line="46"> + +void Scanner::sectionParseInit() +{ + </host> +<write def_name="section_parse" line="50" col="5"><arg>init</arg></write> +<host line="51">} + +ostream &Scanner::scan_error() +{ + /* Maintain the error count. */ + gblErrorCount += 1; + cerr << fileName << ":" << line << ":" << column << ": "; + return cerr; +} + +bool Scanner::recursiveInclude( char *inclFileName, char *inclSectionName ) +{ + for ( IncludeStack::Iter si = includeStack; si.lte(); si++ ) { + if ( strcmp( si->fileName, inclFileName ) == 0 && + strcmp( si->sectionName, inclSectionName ) == 0 ) + { + return true; + } + } + return false; +} + +void Scanner::updateCol() +{ + char *from = lastnl; + if ( from == 0 ) + from = tokstart; + //cerr << "adding " << tokend - from << " to column" << endl; + column += tokend - from; + lastnl = 0; +} + +void Scanner::token( int type, char c ) +{ + token( type, &c, &c + 1 ); +} + +void Scanner::token( int type ) +{ + token( type, 0, 0 ); +} + +</host> +<host line="178"> + +void Scanner::token( int type, char *start, char *end ) +{ + char *tokdata = 0; + int toklen = 0; + int *p = &type; + int *pe = &type + 1; + + if ( start != 0 ) { + toklen = end-start; + tokdata = new char[toklen+1]; + memcpy( tokdata, start, toklen ); + tokdata[toklen] = 0; + } + + </host> +<write def_name="section_parse" line="196" 
col="3"><arg>exec</arg></write> +<host line="197"> + + updateCol(); +} + +void Scanner::endSection( ) +{ + /* Execute the eof actions for the section parser. */ + </host> +<write def_name="section_parse" line="207" col="3"><arg>eof</arg></write> +<host line="208"> +} + +</host> +<host line="516"> + +</host> +<write def_name="rlscan" line="518" col="4"><arg>data</arg></write> +<host line="519"> +void Scanner::do_scan() +{ + int bufsize = 8; + char *buf = new char[bufsize]; + const char last_char = 0; + int cs, act, have = 0; + int top, stack[32]; + bool execute = true; + + sectionParseInit(); + </host> +<write def_name="rlscan" line="530" col="5"><arg>init</arg></write> +<host line="531"> + while ( execute ) { + char *p = buf + have; + int space = bufsize - have; + + if ( space == 0 ) { + /* We filled up the buffer trying to scan a token. Grow it. */ + bufsize = bufsize * 2; + char *newbuf = new char[bufsize]; + + /* Recompute p and space. */ + p = newbuf + have; + space = bufsize - have; + + /* Patch up pointers possibly in use. */ + if ( tokstart != 0 ) + tokstart = newbuf + ( tokstart - buf ); + tokend = newbuf + ( tokend - buf ); + + /* Copy the new buffer in. */ + memcpy( newbuf, buf, have ); + delete[] buf; + buf = newbuf; + } + + input.read( p, space ); + int len = input.gcount(); + + /* If we see eof then append the EOF char. */ + if ( len == 0 ) { + p[0] = last_char, len = 1; + execute = false; + } + + char *pe = p + len; + </host> +<write def_name="rlscan" line="566" col="6"><arg>exec</arg></write> +<host line="567"> + /* Check if we failed. */ + if ( cs == rlscan_error ) { + /* Machine failed before finding a token. I'm not yet sure if this + * is reachable. */ + scan_error() << "scanner error" << endl; + exit(1); + } + + /* Decide if we need to preserve anything. */ + char *preserve = tokstart; + + /* Now set up the prefix. */ + if ( preserve == 0 ) + have = 0; + else { + /* There is data that needs to be shifted over. 
*/ + have = pe - preserve; + memmove( buf, preserve, have ); + unsigned int shiftback = preserve - buf; + if ( tokstart != 0 ) + tokstart -= shiftback; + tokend -= shiftback; + + preserve = buf; + } + } + delete[] buf; + + InputLoc loc; + loc.fileName = "<EOF>"; + loc.line = line; + loc.col = 1; + parser->token( loc, _eof, 0, 0 ); +} + +void scan( char *fileName, istream &input, ostream &output ) +{ + Scanner scanner( fileName, input, output, 0, 0, 0 ); +} +</host> +</ragel> +##### EXP ##### +SWITCH + token( KW_When ); + token( KW_Eof ); + token( KW_Err ); + token( KW_Lerr ); + token( KW_To ); + token( KW_From ); + token( TK_Word, tokstart, tokend ); +SWITCH + token( KW_Lex ); + token( KW_Action ); + token( KW_AlphType ); + token( KW_Commit ); + token( KW_Undo ); + token( KW_Final ); + token( KW_Translate ); + token( KW_Token ); + token( KW_Literal ); + token( KW_NonTerm ); + token( KW_Uses ); + token( KW_Parser ); + token( KW_Include ); + token( KW_Write ); + token( KW_Nfa ); + token( KW_Pda ); + token( KW_Rl ); + token( KW_Cfl ); + token( KW_Ignore ); + token( KW_End ); + token( KW_Pattern ); + token( KW_Construct ); + token( KW_Red ); + token( KW_TypeId ); + token( KW_TypeDef ); + token( KW_If ); + token( KW_Init ); + token( KW_Reject ); + token( KW_While ); + token( KW_Else ); + token( KW_SubParser ); + token( KW_Next ); + token( KW_Match ); + token( KW_For ); + token( KW_Iter ); + token( KW_Find ); + token( KW_Root ); + token( KW_PrintXML ); + token( KW_Then ); + token( KW_Do ); + token( KW_Namespace ); + token( KW_Scanner ); + token( TK_Word, tokstart, tokend ); diff --git a/test/tags3.lm b/test/tags3.lm new file mode 100644 index 0000000..645b00b --- /dev/null +++ b/test/tags3.lm @@ -0,0 +1,322 @@ +##### LM ##### +context tags + + # + # Regular Definitions + # + rl def_name_char /[\-A-Za-z0-9._:?]/ + rl def_name /[A-Za-z_:] def_name_char*/ + rl def_system_literal /'"' [^"]* '"' | "'" [^']* "'"/ + + # + # Scanner for tag names. 
+ # + lex + ignore /space+/ + token tag_id /def_name/ + end + + # + # Scanner for attributes names + # + lex + ignore /space+/ + token attr_name /def_name_char+/ + literal `= + end + + # Scanner for attribute values. + lex + ignore /space+/ + token dquote_val /'"' ([^"] | '\\' any)* '"'/ + token squote_val /"'" ([^'] | '\\' any)* "'"/ + token unq_val /[^ \t\r\n<>"'] [^ \t\r\n<>]*/ + end + + literal `> `/> + + # + # Tokens + # + + lex + ignore /space+/ + literal `< `</ `<!DOCTYPE + token doc_data /[^<]+/ + token comment /"<!--" any* :>> "-->"/ + end + + # + # Tags + # + + # This scanner is just for the id in close tags. The id needs to be looked up + # in the tag stack so we can determine if it is a stray. + lex + # Ignore whitespace. + ignore /space+/ + + token stray_close_id // + token close_id /def_name/ + { + # If it is in the tag stack then it is a close_id. If not then it's a + # stray_close_id. + send_id: int = typeid<stray_close_id> + + LocalTagStack: tag_stack = TagStack + for Tag: tag_id in LocalTagStack { + T: tag_id = Tag + if match_text == T.data { + send_id = typeid<close_id> + break + } + } + + input.push( make_token( send_id input.pull(match_length) ) ) + } + end + + # + # Tag Stack + # + + def tag_stack + [tag_id tag_stack] + | [] + + TagStack: tag_stack + + # + # Document Type + # + # This scanner handles inside DOCTYPE tags (except keywords). + lex + ignore /space+/ + token dt_name /def_name/ + token dt_literal /def_system_literal/ + token dt_bl /"[" [^\]]* "]"/ + end + + token dt_close /'>'/ + + # Using a separate scanner for the keywords in DOCTYPE prevents them from + # covering dt_name + lex + ignore /space+/ + literal `SYSTEM `PUBLIC + end + + def DOCTYPE [`<!DOCTYPE dt_name external_id dt_bl? dt_close] + + def external_id + [`SYSTEM dt_literal?] + | [`PUBLIC dt_literal dt_literal?] + + # + # Tags, with optionanal close. 
+ # + + def tag + [open_tag item* opt_close_tag] + + def open_tag + [`< tag_id attr* `>] + { + TagStack = construct tag_stack + [r2 TagStack] + } + + def opt_close_tag + [`</ close_id `>] + { + match TagStack [Top:tag_id Rest:tag_stack] + if r2.data == Top.data + TagStack = Rest + else + reject + } + + | [] + { + match TagStack [Top:tag_id Rest:tag_stack] + TagStack = Rest + } + + # + # Empty tags + # + def empty_tag + [`< tag_id attr* `/>] + + # + # Stray close tags + # + def stray_close + [`</ stray_close_id `>] + + + # + # Attributes + # + + def attr + [attr_name eql_attr_val?] + + def eql_attr_val [`= attr_val] + + def attr_val + [squote_val] + | [dquote_val] + | [unq_val] + | [] + + # + # Items + # + + def item + [DOCTYPE] + | [tag] + | [empty_tag] + | [stray_close] + | [doc_data] + | [comment] + + + token trailing /any*/ + + def start + [item* trailing] + + # + # END GRAMMAR + # + + int addDefaultAltTags( Start: ref<start> ) + { + for T: open_tag in Start { + require T + ["<img" AttrList: attr* ">"] + + haveAlt: bool = false + for A: attr in T { + if match A ["alt=" attr_val] + haveAlt = true + } + + if !haveAlt { + for AL: attr* in T { + if match AL [] { + AL = construct attr* + [" alt=\"default alt\""] + break + } + } + } + } + } + + int printLinks( Start: start ) + { + for A:tag in Start { + require A + ["<a" AttrList: attr* ">" I: item* "</a>"] + + for Attr: attr in AttrList { + if match Attr ["href = " AttrVal: attr_val] + print( 'link: ' I '\ntarget: ' AttrVal '\n\n' ) + } + } + } + + + bool should_close( TI: tag_id ) + { + return true + } + + bool should_flatten( TI: tag_id ) + { + return true + } + + # Finds unclosed tags and puts the content after the tag. Afterwards + # all unclosed tags will be empty 'inside'. 
+ int flatten( Start: ref<start> ) + { + for TL: item* in Start { + require TL + [OT: open_tag Inside: item* Trailing: item*] + + match OT + ['<' TagId: tag_id attr* '>'] + + if should_flatten( TagId ) + { + require Inside + [item item*] + + # Put Trailing at the end of inside. + for END: item* in Inside { + if match END [] { + END = Trailing + break + } + } + + EmptyCloseTag: opt_close_tag = + construct opt_close_tag [] + + # Close the tag and put inside after it. + TL = construct item* + [OT EmptyCloseTag Inside] + } + } + } + +# int close( Start: ref<start> ) +# { +# for TL: item in Start { +# require TL +# [OpenTag: open_tag Inside: item*] +# +# match OpenTag +# ['<' TagId: tag_id attr* '>'] +# +# if should_close( TagId ) +# { +# parse CloseId: close_id[ TagId.data ] +# +# CloseTag: opt_close_tag = +# construct opt_close_tag ['</' CloseId '>'] +# +# # Close the tag and put inside after it. +# TL = construct item +# [OpenTag Inside CloseTag] +# } +# } +# } +end # tags + +cons Tags: tags[] +Tags.TagStack = construct tags::tag_stack [] + +parse HTML_P: tags::start(Tags)[ stdin ] +HTML: tags::start = HTML_P +flatten( HTML ) +print_xml( HTML ) +printLinks( HTML ) +##### IN ##### +<t1> + + <t2> + <a href="foo">FOO</a> + <t3> + </t3> + +</t1> +##### EXP ##### 
+<tags::start><tags::_repeat_item><tags::item><tags::tag><tags::open_tag><tags::_literal_0019><</tags::_literal_0019><tags::tag_id>t1</tags::tag_id><tags::_repeat_attr></tags::_repeat_attr><tags::_literal_0013>></tags::_literal_0013></tags::open_tag><tags::_repeat_item><tags::item><tags::tag><tags::open_tag><tags::_literal_0019><</tags::_literal_0019><tags::tag_id>t2</tags::tag_id><tags::_repeat_attr></tags::_repeat_attr><tags::_literal_0013>></tags::_literal_0013></tags::open_tag><tags::_repeat_item></tags::_repeat_item><tags::opt_close_tag></tags::opt_close_tag></tags::tag></tags::item><tags::item><tags::tag><tags::open_tag><tags::_literal_0019><</tags::_literal_0019><tags::tag_id>a</tags::tag_id><tags::_repeat_attr><tags::attr><tags::attr_name>href</tags::attr_name><tags::_opt_eql_attr_val><tags::eql_attr_val><tags::_literal_0009>=</tags::_literal_0009><tags::attr_val><tags::dquote_val>"foo"</tags::dquote_val></tags::attr_val></tags::eql_attr_val></tags::_opt_eql_attr_val></tags::attr></tags::_repeat_attr><tags::_literal_0013>></tags::_literal_0013></tags::open_tag><tags::_repeat_item><tags::item><tags::doc_data>FOO</tags::doc_data></tags::item></tags::_repeat_item><tags::opt_close_tag><tags::_literal_001b></</tags::_literal_001b><tags::close_id>a</tags::close_id><tags::_literal_0013>></tags::_literal_0013></tags::opt_close_tag></tags::tag></tags::item><tags::item><tags::tag><tags::open_tag><tags::_literal_0019><</tags::_literal_0019><tags::tag_id>t3</tags::tag_id><tags::_repeat_attr></tags::_repeat_attr><tags::_literal_0013>></tags::_literal_0013></tags::open_tag><tags::_repeat_item></tags::_repeat_item><tags::opt_close_tag><tags::_literal_001b></</tags::_literal_001b><tags::close_id>t3</tags::close_id><tags::_literal_0013>></tags::_literal_0013></tags::opt_close_tag></tags::tag></tags::item></tags::_repeat_item><tags::opt_close_tag><tags::_literal_001b></</tags::_literal_001b><tags::close_id>t1</tags::close_id><tags::_literal_0013>></tags::_literal_0013></tags:
:opt_close_tag></tags::tag></tags::item></tags::_repeat_item><tags::trailing> +</tags::trailing></tags::start>link: FOO +target: "foo" + diff --git a/test/tags4.lm b/test/tags4.lm new file mode 100644 index 0000000..f710378 --- /dev/null +++ b/test/tags4.lm @@ -0,0 +1,350 @@ +##### LM ##### +# +# +# This is somewhat broken. missing_close_id is cuasing close ids to be parseed +# when they shouldn't. Maybe remove it. +# +# + +context tags + # + # Regular Definitions + # + rl def_name_char /[\-A-Za-z0-9._:?]/ + rl def_name /[A-Za-z_:] def_name_char*/ + rl def_system_literal /'"' [^"]* '"' | "'" [^']* "'"/ + + # + # Scanner for tag names. + # + lex + ignore /space+/ + token tag_id /def_name/ + end + + # + # Scanner for attributes names + # + lex + ignore /space+/ + token attr_name /def_name_char+/ + literal `= + end + + literal `> `/> + + # Scanner for attribute values. + lex + ignore /space+/ + token dquote_val /'"' ([^"] | '\\' any)* '"'/ + token squote_val /"'" ([^'] | '\\' any)* "'"/ + token unq_val /[^ \t\r\n<>"'] [^ \t\r\n<>]*/ + end + + # + # Tokens + # + + lex + ignore /space+/ + + literal `< `</ `<!DOCTYPE + token close_tag + /'</' [\t ]* [a-zA-Z]+ '>'/ + + token doc_data /[^<]+/ + token comment /'<!--' any* :>> '-->'/ + end + + # + # Tags + # + + bool inTagStack( id: str ) + { + LocalTagStack: tag_stack = TagStack + for Tag: tag_id in LocalTagStack { + if id == Tag.data + return true + } + return false + } + + # This scanner is just for the id in close tags. The id needs to be looked up + # in the tag stack so we can determine if it is a stray. + lex + # Ignore whitespace. + ignore /space+/ + + token stray_close_id // + token missing_close_id // + + token close_id /def_name/ + { + # If it is in the tag stack then it is a close_id. If not then it's a + # stray_close_id. 
+ send_id: int = typeid<stray_close_id> + + if ( inTagStack( match_text ) ) { + print( 'CLOSE \'' match_text '\' IN TAG STACK\n' ) + + # The tag is in the stack, send missing close tags until we get to it. + match TagStack [Top:tag_id Rest:tag_stack] + TagStack = Rest + while ( Top.data != match_text ) { + print( 'SENDING missing close\n' ) + input.push( make_token( typeid<missing_close_id> '' ) ) + match TagStack [Top2:tag_id Rest2:tag_stack] + Top = Top2 + TagStack = Rest2 + } + + print( 'SENDING close\n' ) + input.push( make_token( typeid<close_id> input.pull( match_length ) ) ) + } + else { + print( 'CLOSE \'' match_text '\' NOT IN TAG STACK\n' ) + # The tag is not in the tag stack so send the id as a stray close. + input.push( make_token( typeid<stray_close> input.pull( match_length ) ) ) + } + } + end + + # + # Tag Stack + # + + def tag_stack + [tag_id tag_stack] + | [] + + TagStack: tag_stack + + # + # Document Type + # + # This scanner handles inside DOCTYPE tags (except keywords). + lex + ignore /space+/ + token dt_name /def_name/ + token dt_literal /def_system_literal/ + token dt_bl /"[" [^\]]* "]"/ + token dt_close /'>'/ + end + + # Using a separate scanner for the keywords in DOCTYPE prevents them from + # covering dt_name + lex + ignore /space+/ + literal `SYSTEM `PUBLIC + end + + def DOCTYPE [`<!DOCTYPE dt_name external_id dt_bl? dt_close] + + def external_id + [`SYSTEM dt_literal?] + | [`PUBLIC dt_literal dt_literal?] + + # + # Tags, with optionanal close. + # + + def tag + [open_tag item* close_tag] + + def unclosed_tag + [open_tag item* missing_close_id] + + def open_tag + [`< tag_id attr* `>] + { + TagStack = construct tag_stack + [r2 TagStack] + } + + # + # Empty tags + # + def empty_tag + [`< tag_id attr* `/>] + + # + # Stray close tags + # + def stray_close + [close_tag] + + + # + # Attributes + # + + def attr + [attr_name eql_attr_val?] 
+ + def eql_attr_val [`= attr_val] + + def attr_val + [squote_val] + | [dquote_val] + | [unq_val] + | [] + + # + # Items + # + + def item + [DOCTYPE] + | [tag] + | [unclosed_tag] + | [empty_tag] + | [stray_close] + | [doc_data] + | [comment] + + + token trailing /any*/ + + def start + [item* trailing] + + # + # END GRAMMAR + # + + int addDefaultAltTags( Start: ref<start> ) + { + for T: open_tag in Start { + require T + ["<img" AttrList: attr* '>'] + + haveAlt: bool = false + for A: attr in T { + if match A ["alt=" attr_val] + haveAlt = true + } + + if !haveAlt { + for AL: attr* in T { + if match AL [] { + AL = construct attr* + [" alt=\"default alt\""] + break + } + } + } + } + } + + int printLinks( Start: start ) + { + for A:tag in Start { + require A + ["<a" AttrList: attr* ">" I: item* "</a>"] + + for Attr: attr in AttrList { + if match Attr ["href = " AttrVal: attr_val] + print( 'link: ' I '\ntarget: ' AttrVal '\n\n' ) + } + } + } + + + bool should_close( TI: tag_id ) + { + return true + } + + bool should_flatten( TI: tag_id ) + { + return true + } +end # tags + +# Finds unclosed tags and puts the content after the tag. Afterwards +# all unclosed tags will be empty 'inside'. +#int flatten( Start: ref<start> ) +#{ +# for TL: item* in Start { +# require TL +# [OT: open_tag Inside: item* Trailing: item*] +# +# match OT +# ['<' TagId: tag_id attr* '>'] +# +# if should_flatten( TagId ) +# { +# require Inside +# [item item*] +# +# # Put Trailing at the end of inside. +# for END: item* in Inside { +# if match END [] { +# END = Trailing +# break +# } +# } +# +# str empty = '' +# missing_close_id Missing = construct missing_close_id [empty] +# opt_close_tag EmptyCloseTag = +# construct opt_close_tag [Missing] +# +# # Close the tag and put inside after it. 
+# TL = construct item* +# [OT EmptyCloseTag Inside] +# } +# } +#} +# +#int close( Start: ref<start> ) +#{ +# for TL: item in Start { +# require TL +# [OpenTag: open_tag Inside: item*] +# +# match OpenTag +# ['<' TagId: tag_id attr* '>'] +# +# if should_close( TagId ) +# { +# close_id CloseId = construct close_id +# [TagId.data] +# +# opt_close_tag CloseTag = +# construct opt_close_tag ['</' CloseId '>'] +# +# # Close the tag and put inside after it. +# TL = construct item +# [OpenTag Inside CloseTag] +# } +# } +#} + +cons Tags: tags[] +Tags.TagStack = construct tags::tag_stack [] +parse HTML: tags::start(Tags)[ stdin ] +print( HTML ) + +#print_xml( HTML ) +#for C: close_tag in HTML +# print( C '\n' ) +##### IN ##### +<t1> + + <t2> + <a href="foo">&FOO</a> + <t3> + </t3> + +</t1> +##### EXP ##### +<t1> + + <t2> + <a href="foo">&FOO</a> + <t3> + </t3> + +</t1> diff --git a/test/tcontext1.lm b/test/tcontext1.lm new file mode 100644 index 0000000..86db718 --- /dev/null +++ b/test/tcontext1.lm @@ -0,0 +1,35 @@ +##### LM ##### + +lex + token tIDENTIFIER /[a-z][a-zA-Z_]*/ -ni + ignore /[ \t\n]+/ + ignore comment /'#' [^\n]* '\n'/ +end + +lex + ignore /[\t ]+/ + ignore /'#' [^\n]*/ + literal `; + token NL /'\n'/ +end + +# Required whitespace, but newline is not allowed. 
+token ws_no_nl + /[ \t]+ [^ \t\n]/ + { + input.push( make_token( typeid<ws_no_nl> input.pull(match_length-1) ) ) + } + +def method_call + [tIDENTIFIER ws_no_nl tIDENTIFIER `; NL] + +parse R: method_call[stdin] + +print_xml( R ) +print( '\n' ) + +##### IN ##### +a bc; +##### EXP ##### +<method_call><tIDENTIFIER>a</tIDENTIFIER><ws_no_nl> </ws_no_nl><tIDENTIFIER>bc</tIDENTIFIER><_literal_000b>;</_literal_000b><NL> +</NL></method_call> diff --git a/test/til.lm b/test/til.lm new file mode 100644 index 0000000..6a93ab3 --- /dev/null +++ b/test/til.lm @@ -0,0 +1,194 @@ +##### LM ##### +lex + literal `var `if `then `else `while `do `for `read `write + `end `to `goto + + literal `:= `!= `; `+ `- `* `/ `= `( `) `: + + ignore /'//' [^\n]* '\n'/ + ignore /[\n\t ]+/ + token id /[a-zA-Z_]+/ + token integernumber /[0-9]+/ + token stringlit /'"' [^"]* '"'/ +end + +def program + [statement*] + +def statement + [declaration] +| [assignment_statement] +| [if_statement] +| [while_statement] +| [do_statement] +| [for_statement] +| [read_statement] +| [write_statement] +| [labelled_statement] +| [goto_statement] + +def declaration + [`var id `;] + +def assignment_statement + [id `:= expression `;] + +def if_statement + [`if expression `then statement* opt_else_statement `end] + +def opt_else_statement + [`else statement*] +| [] + +def while_statement + [`while expression `do statement* `end] + +def do_statement + [`do statement* `while expression `;] + +def for_statement + [`for id `:= expression `to expression `do statement* `end] + +def read_statement + [`read id `;] + +def write_statement + [`write expression `;] + +def expression + [term] +| [expression eqop term] + +def eqop [`=] | [`!=] + +def term + [factor] +| [term addop factor] + +def addop [`+] | [`-] + +def factor + [primary] +| [factor mulop primary] + +def mulop [`*] | [`/] + +def primary + [id] +| [lit] +| [`( expression `)] + +def lit + [integernumber] +| [stringlit] + +def labelled_statement + [id `: statement] + +def 
goto_statement + [`goto id `;] + +parse P: program[stdin] + +#for S:statement* in P +#{ +# if match S [L0: id ':' +# First: statement +# Rest: statement*] +# { +# for Check: statement* in Rest +# { +# if match Check +# ['if' E: expression 'then' +# 'goto' Targ: id ';' +# 'end' +# T: statement*] +# { +# # This truncates Rest +# Check = construct statement* [] +# +# # Replace the labeled statement through to the goto with a +# # do ... while. +# S = construct statement* +# ['do' +# First +# Rest +# 'while' E ';' +# T] +# break +# } +# } +# } +#} + +for S: statement* in P +{ + if match S [Label: id `: + First: statement + Rest: statement*] + { + Expr: expression + Following: statement* + + # Look though the remaining statements for a goto back to the label. + # The repeat iterator yields only top-level statement lists. It + # restricts our search to the same nesting depth as the label. + for Check: statement* in Rest + { + if match Check + [`if E: expression `then + `goto L:id `; + `end + SL: statement*] + { + Expr = E + Following = SL + + # Check iterates over tails of Rest. Assigning an empty list + # to check truncates the Rest list. What we cut off is saved in + # Following (excluding the if statement). + Check = construct statement* [] + } + } + + # If a goto was found, then perform the rewrite. + if ( Expr ) + { + # Replace the labelled statement through to the goto + # with a do ... while. 
+ S = construct statement* [ + "do + " [^First] + " [^Rest] + "while [^Expr]; + Following] + } + } +} + +print( P ) +##### IN ##### + +var a; +a := 1; + +head: + +a := a + 1; +c := d; + +if a = 10 then + goto head; +end + +hi := there; +##### EXP ##### + +var a; +a := 1; + +do + a := a + 1; + c := d; +while a = 10; +hi := there; diff --git a/test/translate1.lm b/test/translate1.lm new file mode 100644 index 0000000..4403ca6 --- /dev/null +++ b/test/translate1.lm @@ -0,0 +1,28 @@ +##### LM ##### +lex + ignore /space+/ + literal `* `( `) + token id /[a-zA-Z_]+/ + { + t: str = input.pull( match_length ) + input.push( make_token( typeid<id> t ) ) + } +end + +def foo [id] + +def item + [id] +| [foo] +| [`( item* `)] + +def start + [item*] + +parse Input: start[ stdin ] +print( Input ) + +##### IN ##### +a b ( c d ) e f +##### EXP ##### +a b ( c d ) e f diff --git a/test/translate2.lm b/test/translate2.lm new file mode 100644 index 0000000..47bda35 --- /dev/null +++ b/test/translate2.lm @@ -0,0 +1,62 @@ +##### LM ##### +lex + ignore /space+/ + literal `# `{ `} + token id2 /[a-zA-Z_]+/ +end + +def item2 + [id2] +| [`{ item2* `}] + +def start2 + [item2*] + +context ctx + + lex + ignore /space+/ + literal `* `( `) `! + token SEMI_NL /';\n'/ + token id /[a-zA-Z_0-9]+/ + + token ddd /'...'/ { + print('translating\n') + input.pull( match_length ) + input.push( make_token( typeid<id> "dot" ) ) + input.push( make_token( typeid<id> "dot" ) ) + input.push( make_token( typeid<id> "dot" ) ) + } + end + + def item + [id] + | [`( item* `)] + + def A [] { + print( 'A\n' ) + } + + def B [] { + print( 'B\n' ) + } + + + def start + [A item* `!] + | [B item* SEMI_NL] + +end # ctx + +CTX: ctx = cons ctx [] +parse Input: ctx::start( CTX ) [ stdin ] +print( Input ) + +##### IN ##### +a b c ( d1 ... 
d2 ) e f g ; +##### EXP ##### +A +translating +B +translating +a b c ( d1 dotdotdot d2 ) e f g ; diff --git a/test/travs1.lm b/test/travs1.lm new file mode 100644 index 0000000..c2f7171 --- /dev/null +++ b/test/travs1.lm @@ -0,0 +1,286 @@ +##### LM ##### +lex + ignore /[\t\n ]+/ + literal `^ `| `- `, `: `! `? `. + literal `( `) `{ `} `* `& `+ + + literal `-- `:> `:>> `<: `-> `** + + token word /[a-zA-Z_][a-zA-Z0-9_]*/ + token uint /[0-9]+/ +end + + +def start [expression] + +def expression [term expression_op*] + +def expression_op + [`| term] +| [`& term] +| [`- term] +| [`-- term] + +def term [factor_rep term_rest] + +# This list is done manually to get shortest match. +def term_rest + [] +| [term_op term_rest] + +def term_op + [factor_rep] +| [`. factor_rep] +| [`:> factor_rep] +| [`:>> factor_rep] +| [`<: factor_rep] + +def factor_rep + [factor_neg factor_rep_op*] + +def factor_rep_op + [`*] +| [`**] +| [`?] +| [`+] +| [`{ factor_rep_num `}] +| [`{ `, factor_rep_num `}] +| [`{ factor_rep_num `, `}] +| [`{ factor_rep_num `, factor_rep_num `}] + +def factor_rep_num [uint] + +def factor_neg + [`! 
factor_neg] +| [`^ factor_neg] +| [factor] + +def factor + [alphabet_num] +| [word] +| [`( expression `)] + +def alphabet_num + [uint] + +parse S: start[stdin] + +# +# Top-Down, Left-Right +# + +int do_topdown_leftright( T: ref<any> ) +{ + for C:any in child(T) { + yield C + do_topdown_leftright( C ) + } +} + +iter topdown_leftright( T: ref<any> ) +{ + do_topdown_leftright( T ) +} + +# +# Bottom-Up, Left-Right +# + +int do_bottomup_leftright( T: ref<any> ) +{ + for C:any in child(T) { + do_bottomup_leftright( C ) + yield C + } +} + +iter bottomup_leftright( T: ref<any> ) +{ + do_bottomup_leftright( T ) +} + + +# +# Top-Down, Right-Left +# + +int do_topdown_rightleft( T: ref<any> ) +{ + for C:any in rev_child(T) { + yield C + do_topdown_rightleft( C ) + } +} + +iter topdown_rightleft( T: ref<any> ) +{ + do_topdown_rightleft( T ) +} + +# +# Bottom-Up, Right-Left +# + +int do_bottomup_rightleft( T: ref<any> ) +{ + for C:any in rev_child(T) { + do_bottomup_rightleft( C ) + yield C + } +} + +iter bottomup_rightleft( T: ref<any> ) +{ + do_bottomup_rightleft( T ) +} + +# +# Testing +# + +print( 'bottomup_leftright\n' ) +for T1: any in bottomup_leftright( S ) +{ + print( ^T1 '\n' ) +} + +print( 'bottomup_rightleft\n' ) +for T2: any in bottomup_rightleft( S ) +{ + print( ^T2 '\n' ) +} + +print( 'topdown_leftright\n' ) +for T3: any in topdown_leftright( S ) +{ + print( ^T3 '\n' ) +} + +print( 'topdown_rightleft\n' ) +for T4: any in topdown_rightleft( S ) +{ + print( ^T4 '\n' ) +} +##### IN ##### +1 | 2 3 +##### EXP ##### +bottomup_leftright +1 +1 +1 +1 + +1 + +1 +| +2 +2 +2 +2 + +2 +3 +3 +3 +3 + +3 +3 + +3 +2 3 +| 2 3 + +| 2 3 +1 | 2 3 +bottomup_rightleft + + + +3 +3 +3 +3 +3 +3 +3 + +2 +2 +2 +2 +2 +2 3 +| +| 2 3 +| 2 3 + + +1 +1 +1 +1 +1 +1 +1 | 2 3 +topdown_leftright +1 | 2 3 +1 +1 +1 +1 +1 +1 + + +| 2 3 +| 2 3 +| +2 3 +2 +2 +2 +2 +2 + +3 +3 +3 +3 +3 +3 +3 + + + +topdown_rightleft +1 | 2 3 +| 2 3 + +| 2 3 +2 3 +3 + +3 +3 + +3 +3 +3 +3 +2 + +2 +2 +2 +2 +| +1 + +1 + +1 +1 +1 
+1 diff --git a/test/treecmp1.lm b/test/treecmp1.lm new file mode 100644 index 0000000..3bd5b23 --- /dev/null +++ b/test/treecmp1.lm @@ -0,0 +1,25 @@ +##### LM ##### +rl ident_pattern /[a-zA-Z_][a-zA-Z_0-9]*/ +rl number_pattern /[0-9]+/ + +lex + ignore /[ \t\n]+/ + token id /ident_pattern/ + token number /number_pattern/ +end + +def four_ids + [id id id id] + +B: id = construct id "b" + +parse Input: four_ids[ stdin ] + +for Id: id in Input { + if ( Id == B ) + print( B '\n' ) +} +##### IN ##### +a b c d +##### EXP ##### +b diff --git a/test/typeref1.lm b/test/typeref1.lm new file mode 100644 index 0000000..de1fa26 --- /dev/null +++ b/test/typeref1.lm @@ -0,0 +1,33 @@ +##### LM ##### +namespace n1 + + namespace n2 + lex + token id / 'a' .. 'z' / + ignore / '\n' | '\t' | ' ' / + end + + def start + [id*] + end +end + +parse P: n1::n2::id*[stdin] +print( P ) +##### IN ##### + +##### EXP ##### +##### IN ##### +a +##### EXP ##### +a +##### IN ##### +a + b + c +d +##### EXP ##### +a + b + c +d diff --git a/test/typeref2.lm b/test/typeref2.lm new file mode 100644 index 0000000..d95e3f1 --- /dev/null +++ b/test/typeref2.lm @@ -0,0 +1,34 @@ +##### LM ##### +namespace n1 + + namespace n2 + lex + token id / 'a' .. 'z' / + ignore / '\n' | '\t' | ' ' / + end + + def start + [id*] + end +end + +parse P: n1::n2::id+[stdin] +print( P ) +##### IN ##### + +##### EXP ##### +NIL--noeol +##### IN ##### +a +##### EXP ##### +a +##### IN ##### +a + b + c +d +##### EXP ##### +a + b + c +d diff --git a/test/typeref3.lm b/test/typeref3.lm new file mode 100644 index 0000000..dedff0a --- /dev/null +++ b/test/typeref3.lm @@ -0,0 +1,27 @@ +##### LM ##### +namespace n1 + + namespace n2 + lex + token id / 'a' .. 
'z' / + ignore / '\n' | '\t' | ' ' / + end + + def start + [id*] + end +end + +parse P: n1::n2::id?[stdin] +print( P ) +##### IN ##### + +##### EXP ##### +##### IN ##### +a +##### EXP ##### +a +##### IN ##### +a b +##### EXP ##### +NIL--noeol diff --git a/test/undofrag1.lm b/test/undofrag1.lm new file mode 100644 index 0000000..a997cb8 --- /dev/null +++ b/test/undofrag1.lm @@ -0,0 +1,67 @@ +##### LM ##### + +lex + ignore /space+/ + literal `# `{ `} + token id2 /[a-zA-Z_]+/ +end + +def item2 + [id2] +| [`{ item2* `}] + +def start2 + [item2*] + + +context ctx + + SP: parser<start2> + + lex + ignore /space+/ + literal `* `( `) `! + token semi_nl /';\n'/ + token id /[a-zA-Z_]+/ + end + + def item + [id] + | [`( item* `)] + + + def A [] { + print( 'A\n' ) + send SP "{ A{d} }" + } + + def B [] { + print( 'B\n' ) + send SP "{ B{d} }" + } + + def start1 + [A item* `!] + | [B item* semi_nl] + +end # ctx + + +CTX: ctx = cons ctx [] +CTX.SP = cons parser<start2> [] +send CTX.SP "a b{c}" + +parse Input: ctx::start1( CTX )[stdin] + +send CTX.SP "{e}f g" + +print( Input ) +print( CTX.SP() '\n' ) + +##### IN ##### +a b c ( d ) e f g ; +##### EXP ##### +A +B +a b c ( d ) e f g ; +a b{c}{ B{d} }{e}f g diff --git a/test/undofrag2.lm b/test/undofrag2.lm new file mode 100644 index 0000000..bbade5e --- /dev/null +++ b/test/undofrag2.lm @@ -0,0 +1,50 @@ +##### LM ##### +context undo + + lex + ignore /( ' ' | '\t')+/ + literal `* `( `) `^ `; + token NL /'\n'/ + token id /[a-zA-Z_]+/ + end + + Out: parser<out> + + def out_item + [id] + | [`( item* `)] + + def out + [out_item*] + + def item + [id] + { + send Out [r1] + } + | [`( item* `)] + { + send Out ['('] + send Out [r2] + send Out [')'] + } + + def A1 [] + def A2 [] + + def start + [A1 item* `^] + | [A2 item* `; NL] + +end # undo + +cons Undo: undo[] +Undo.Out = construct parser<undo::out> [] + +parse Input: undo::start(Undo)[ stdin ] +print( Input ) + +##### IN ##### +a b c; +##### EXP ##### +a b c; diff --git a/test/undofrag3.lm 
b/test/undofrag3.lm new file mode 100644 index 0000000..ed96cba --- /dev/null +++ b/test/undofrag3.lm @@ -0,0 +1,56 @@ +##### LM ##### +context undo + + lex + ignore /( ' ' | '\t' )+/ + literal `* `( `) `^ `; `. + token NL /'\n'/ + token id /[a-zA-Z_]+/ + end + + Out: parser<out> + + def out_item + [id] + | [`( item* `)] + + def out + [out_item*] + + def item + [id] + { + send Out [r1] + } + | [`( item* `)] + { + send Out ['('] + send Out [r2] + send Out [')'] + } + + def A1 [] + def A2 [] + + def F + [] + { + print_xml( Out() ) + } + + def start + [A1 item* F `. `^] + | [A2 item* F `. `; NL] + +end # undo + +cons Undo: undo[] +Undo.Out = construct parser<undo::out> [] + +parse Input: undo::start(Undo)[ stdin ] +print( Input ) + +##### IN ##### +a . ; +##### EXP ##### +<undo::out><undo::_repeat_out_item><undo::out_item><undo::id>a</undo::id></undo::out_item></undo::_repeat_out_item></undo::out><undo::out><undo::_repeat_out_item><undo::out_item><undo::id>a</undo::id></undo::out_item></undo::_repeat_out_item></undo::out>a . ; diff --git a/test/while1.lm b/test/while1.lm new file mode 100644 index 0000000..645c28d --- /dev/null +++ b/test/while1.lm @@ -0,0 +1,52 @@ +##### LM ##### +while 0 + print( '0\n' ) + +global I: int = 3 + +int f() +{ + I = I - 1 + print( ' ' I ) +} + +# simple expr and stmt +while I + f() +print( '\n' ) + +# compound stmt list +I = 3 +while I +{ + I = I - 1 + print( ' ' I ) +} +print( '\n' ) + +# paren expr +I = 3 +while ( I ) + f() +print( '\n' ) + +# expr with computation +I = 3 +while ( I + 1 ) + f() +print( '\n' ) + +# computation and stmt list +I = 3 +while ( I + 2 ) +{ + I = I - 1 + print( ' ' I ) +} +print( '\n' ) +##### EXP ##### + 2 1 0 + 2 1 0 + 2 1 0 + 2 1 0 -1 + 2 1 0 -1 -2 |