From eafd7a3974e8605fd02794269db6114a3446e016 Mon Sep 17 00:00:00 2001 From: Lorry Tar Creator Date: Mon, 13 Oct 2014 19:14:30 +0000 Subject: ragel-6.9 --- contrib/Makefile.am | 2 + contrib/Makefile.in | 398 +++++++++++++++++++++++++++++++++++++++++++++++ contrib/ragel.m4 | 53 +++++++ contrib/ragel.make | 6 + contrib/unicode2ragel.rb | 305 ++++++++++++++++++++++++++++++++++++ 5 files changed, 764 insertions(+) create mode 100644 contrib/Makefile.am create mode 100644 contrib/Makefile.in create mode 100644 contrib/ragel.m4 create mode 100644 contrib/ragel.make create mode 100644 contrib/unicode2ragel.rb (limited to 'contrib') diff --git a/contrib/Makefile.am b/contrib/Makefile.am new file mode 100644 index 0000000..7ef7e8d --- /dev/null +++ b/contrib/Makefile.am @@ -0,0 +1,2 @@ + +EXTRA_DIST = ragel.make ragel.m4 unicode2ragel.rb diff --git a/contrib/Makefile.in b/contrib/Makefile.in new file mode 100644 index 0000000..1dffb01 --- /dev/null +++ b/contrib/Makefile.in @@ -0,0 +1,398 @@ +# Makefile.in generated by automake 1.14.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2013 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ +VPATH = @srcdir@ +am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +subdir = contrib +DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.in +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/ragel/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +SOURCES = +DIST_SOURCES = +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EXEEXT = @EXEEXT@ +FIG2DEV = @FIG2DEV@ +GDC = @GDC@ +GMCS = @GMCS@ +GOBIN = @GOBIN@ +GOBJC = @GOBJC@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +JAVAC = @JAVAC@ +KELBT = @KELBT@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PDFLATEX = @PDFLATEX@ +PUBDATE = @PUBDATE@ +RAGEL = @RAGEL@ +RANLIB = @RANLIB@ +RUBY = @RUBY@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +TXL = @TXL@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build_alias = @build_alias@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host_alias = @host_alias@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +EXTRA_DIST = ragel.make ragel.m4 unicode2ragel.rb +all: all-am + +.SUFFIXES: +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign contrib/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign contrib/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): +tags TAGS: + +ctags CTAGS: + +cscope cscopelist: + + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-generic + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-generic + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: all all-am check check-am clean clean-generic cscopelist-am \ + ctags-am distclean distclean-generic distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ + pdf-am ps ps-am tags-am uninstall uninstall-am + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/contrib/ragel.m4 b/contrib/ragel.m4 new file mode 100644 index 0000000..72ce4b9 --- /dev/null +++ b/contrib/ragel.m4 @@ -0,0 +1,53 @@ +dnl Check for presence of the Ragel State Machine generator. +dnl +dnl This macro checks for the presence of the ragel tool in the system, +dnl and whether the ragel tool is absolutely needed for a complete +dnl build. +dnl +dnl To check for the need for Ragel, you have to provide the relative +dnl path of a source file generated through Ragel: if the file is +dnl present in the source tree, a missing ragel command will not cause +dnl the configure to abort. + +AC_DEFUN([_RAGEL_VARS], [ + AC_ARG_VAR([RAGEL], [Ragel generator command]) + AC_ARG_VAR([RAGELFLAGS], [Ragel generator flags]) +]) + +AC_DEFUN([CHECK_RAGEL], [ + AC_REQUIRE([_RAGEL_VARS]) + AC_CHECK_PROG([RAGEL], [ragel], [ragel], [no]) + + dnl We set RAGEL to false so that it would execute the "false" + dnl command if needed. + AS_IF([test x"$RAGEL" = x"no"], + [RAGEL=false], + AS_IF([test x"$2" != "x"], + [ragel_version=`$RAGEL --version | sed -n -e '1s:.*version \(@<:@0-9@:>@\.@<:@0-9@:>@\) .*:\1:p'` + ragel_version_compare=`echo $ragel_version | tr -d .` + ragel_wanted_version=`echo $2 | tr -d .` + AS_IF([test $ragel_version_compare -lt $ragel_wanted_version], + [AC_MSG_WARN([Found Ragel $ragel_version but Ragel $2 requested]) + RAGEL=false + ]) + ])) + + dnl Only test the need if not found + AS_IF([test x"$RAGEL" = x"false"], [ + AC_MSG_CHECKING([whether we need ragel to regenerate sources]) + AS_IF([test -a "${srcdir}/$1"], [ragel_needed=no], [ragel_needed=yes]) + AC_MSG_RESULT([$ragel_needed]) + + AS_IF([test x"$ragel_needed" = x"yes"], + [AC_MSG_ERROR([dnl +You need Ragel to build from development sources. +You can find Ragel at http://www.complang.org/ragel/dnl + ])]) + ]) +]) + +AC_DEFUN([CHECK_RAGEL_AM], [ + CHECK_RAGEL([$1], [$2]) + + AM_CONDITIONAL([HAVE_RAGEL], [test x"$RAGEL" != x"false"]) +]) diff --git a/contrib/ragel.make b/contrib/ragel.make new file mode 100644 index 0000000..f7a71b5 --- /dev/null +++ b/contrib/ragel.make @@ -0,0 +1,6 @@ +# -*- Makefile -*- + +SUFFIXES = .rl + +.rl.c: + $(RAGEL) $(RAGELFLAGS) -C $< -o $@ diff --git a/contrib/unicode2ragel.rb b/contrib/unicode2ragel.rb new file mode 100644 index 0000000..d64e601 --- /dev/null +++ b/contrib/unicode2ragel.rb @@ -0,0 +1,305 @@ +#!/usr/bin/env ruby +# +# This script uses the unicode spec to generate a Ragel state machine +# that recognizes unicode alphanumeric characters. It generates 5 +# character classes: uupper, ulower, ualpha, udigit, and ualnum. +# Currently supported encodings are UTF-8 [default] and UCS-4. +# +# Usage: unicode2ragel.rb [options] +# -e, --encoding [ucs4 | utf8] Data encoding +# -h, --help Show this message +# +# This script was originally written as part of the Ferret search +# engine library. +# +# Author: Rakan El-Khalil + +require 'optparse' +require 'open-uri' + +ENCODINGS = [ :utf8, :ucs4 ] +ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" } +CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt" + +### +# Display vars & default option + +TOTAL_WIDTH = 80 +RANGE_WIDTH = 23 +@encoding = :utf8 + +### +# Option parsing + +cli_opts = OptionParser.new do |opts| + opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o| + @encoding = o.downcase.to_sym + end + opts.on("-h", "--help", "Show this message") do + puts opts + exit + end +end + +cli_opts.parse(ARGV) +unless ENCODINGS.member? @encoding + puts "Invalid encoding: #{@encoding}" + puts cli_opts + exit +end + +## +# Downloads the document at url and yields every alpha line's hex +# range and description. + +def each_alpha( url, property ) + open( url ) do |file| + file.each_line do |line| + next if line =~ /^#/; + next if line !~ /; #{property} #/; + + range, description = line.split(/;/) + range.strip! + description.gsub!(/.*#/, '').strip! + + if range =~ /\.\./ + start, stop = range.split '..' + else start = stop = range + end + + yield start.hex .. stop.hex, description + end + end +end + +### +# Formats to hex at minimum width + +def to_hex( n ) + r = "%0X" % n + r = "0#{r}" unless (r.length % 2).zero? + r +end + +### +# UCS4 is just a straight hex conversion of the unicode codepoint. + +def to_ucs4( range ) + rangestr = "0x" + to_hex(range.begin) + rangestr << "..0x" + to_hex(range.end) if range.begin != range.end + [ rangestr ] +end + +## +# 0x00 - 0x7f -> 0zzzzzzz[7] +# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6] +# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6] +# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6] + +UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff] + +def to_utf8_enc( n ) + r = 0 + if n <= 0x7f + r = n + elsif n <= 0x7ff + y = 0xc0 | (n >> 6) + z = 0x80 | (n & 0x3f) + r = y << 8 | z + elsif n <= 0xffff + x = 0xe0 | (n >> 12) + y = 0x80 | (n >> 6) & 0x3f + z = 0x80 | n & 0x3f + r = x << 16 | y << 8 | z + elsif n <= 0x10ffff + w = 0xf0 | (n >> 18) + x = 0x80 | (n >> 12) & 0x3f + y = 0x80 | (n >> 6) & 0x3f + z = 0x80 | n & 0x3f + r = w << 24 | x << 16 | y << 8 | z + end + + to_hex(r) +end + +def from_utf8_enc( n ) + n = n.hex + r = 0 + if n <= 0x7f + r = n + elsif n <= 0xdfff + y = (n >> 8) & 0x1f + z = n & 0x3f + r = y << 6 | z + elsif n <= 0xefffff + x = (n >> 16) & 0x0f + y = (n >> 8) & 0x3f + z = n & 0x3f + r = x << 10 | y << 6 | z + elsif n <= 0xf7ffffff + w = (n >> 24) & 0x07 + x = (n >> 16) & 0x3f + y = (n >> 8) & 0x3f + z = n & 0x3f + r = w << 18 | x << 12 | y << 6 | z + end + r +end + +### +# Given a range, splits it up into ranges that can be continuously +# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff] +# This is not strictly needed since the current [5.1] unicode standard +# doesn't have ranges that straddle utf8 boundaries. This is included +# for completeness as there is no telling if that will ever change. + +def utf8_ranges( range ) + ranges = [] + UTF8_BOUNDARIES.each do |max| + if range.begin <= max + return ranges << range if range.end <= max + + ranges << range.begin .. max + range = (max + 1) .. range.end + end + end + ranges +end + +def build_range( start, stop ) + size = start.size/2 + left = size - 1 + return [""] if size < 1 + + a = start[0..1] + b = stop[0..1] + + ### + # Shared prefix + + if a == b + return build_range(start[2..-1], stop[2..-1]).map do |elt| + "0x#{a} " + elt + end + end + + ### + # Unshared prefix, end of run + + return ["0x#{a}..0x#{b} "] if left.zero? + + ### + # Unshared prefix, not end of run + # Range can be 0x123456..0x56789A + # Which is equivalent to: + # 0x123456 .. 0x12FFFF + # 0x130000 .. 0x55FFFF + # 0x560000 .. 0x56789A + + ret = [] + ret << build_range(start, a + "FF" * left) + + ### + # Only generate middle range if need be. + + if a.hex+1 != b.hex + max = to_hex(b.hex - 1) + max = "FF" if b == "FF" + ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left + end + + ### + # Don't generate last range if it is covered by first range + + ret << build_range(b + "00" * left, stop) unless b == "FF" + ret.flatten! +end + +def to_utf8( range ) + utf8_ranges( range ).map do |r| + build_range to_utf8_enc(r.begin), to_utf8_enc(r.end) + end.flatten! +end + +## +# Perform a 3-way comparison of the number of codepoints advertised by +# the unicode spec for the given range, the originally parsed range, +# and the resulting utf8 encoded range. + +def count_codepoints( code ) + code.split(' ').inject(1) do |acc, elt| + if elt =~ /0x(.+)\.\.0x(.+)/ + if @encoding == :utf8 + acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1) + else + acc * ($2.hex - $1.hex + 1) + end + else + acc + end + end +end + +def is_valid?( range, desc, codes ) + spec_count = 1 + spec_count = $1.to_i if desc =~ /\[(\d+)\]/ + range_count = range.end - range.begin + 1 + + sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) } + sum == spec_count and sum == range_count +end + +## +# Generate the state maching to stdout + +def generate_machine( name, property ) + pipe = " " + puts " #{name} = " + each_alpha( CHART_URL, property ) do |range, desc| + + codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range) + + raise "Invalid encoding of range #{range}: #{codes.inspect}" unless + is_valid? range, desc, codes + + range_width = codes.map { |a| a.size }.max + range_width = RANGE_WIDTH if range_width < RANGE_WIDTH + + desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11 + desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH + + if desc.size > desc_width + desc = desc[0..desc_width - 4] + "..." + end + + codes.each_with_index do |r, idx| + desc = "" unless idx.zero? + code = "%-#{range_width}s" % r + puts " #{pipe} #{code} ##{desc}" + pipe = "|" + end + end + puts " ;" + puts "" +end + +puts <