summary refs log tree commit diff
path: root/contrib
diff options
context:
space:
mode:
author Adrian Thurston <thurston@colm.net> 2020-03-14 11:27:58 +0200
committer Adrian Thurston <thurston@colm.net> 2020-03-14 11:27:58 +0200
commit 2ba036ed94c826a0b814cf15181a4a9e6f89b178 (patch)
tree 7a778c2b06c3d4a059cd8806ff4cdccf4a07b475 /contrib
parent 78e7949ca590b273c2c152a0abe0d51e590a52fd (diff)
download colm-2ba036ed94c826a0b814cf15181a4a9e6f89b178.tar.gz
removed ragel docs, old makefiles, todo, vim, etc
Diffstat (limited to 'contrib')
-rw-r--r-- contrib/Makefile.am 2
-rw-r--r-- contrib/ragel.m4 53
-rw-r--r-- contrib/ragel.make 6
-rw-r--r-- contrib/unicode2ragel.rb 305
4 files changed, 1 insertions, 365 deletions
diff --git a/contrib/Makefile.am b/contrib/Makefile.am
index 7ef7e8d9..4c8ef12d 100644
--- a/contrib/Makefile.am
+++ b/contrib/Makefile.am
@@ -1,2 +1,2 @@
-EXTRA_DIST = ragel.make ragel.m4 unicode2ragel.rb
+EXTRA_DIST =
diff --git a/contrib/ragel.m4 b/contrib/ragel.m4
deleted file mode 100644
index e7b42425..00000000
--- a/contrib/ragel.m4
+++ /dev/null
@@ -1,53 +0,0 @@
-dnl Check for presence of the Ragel State Machine generator.
-dnl
-dnl This macro checks for the presence of the ragel tool in the system,
-dnl and whether the ragel tool is absolutely needed for a complete
-dnl build.
-dnl
-dnl To check for the need for Ragel, you have to provide the relative
-dnl path of a source file generated through Ragel: if the file is
-dnl present in the source tree, a missing ragel command will not cause
-dnl the configure to abort.
-
-AC_DEFUN([_RAGEL_VARS], [
- AC_ARG_VAR([RAGEL], [Ragel generator command])
- AC_ARG_VAR([RAGELFLAGS], [Ragel generator flags])
-])
-
-AC_DEFUN([CHECK_RAGEL], [
- AC_REQUIRE([_RAGEL_VARS])
- AC_CHECK_PROG([RAGEL], [ragel], [ragel], [no])
-
- dnl We set RAGEL to false so that it would execute the "false"
- dnl command if needed.
- AS_IF([test x"$RAGEL" = x"no"],
- [RAGEL=false],
- AS_IF([test x"$2" != "x"],
- [ragel_version=`$RAGEL --version | sed -n -e '1s:.*version \(@<:@0-9@:>@\.@<:@0-9@:>@\)\(\.@<:@0-9@:>@\)* .*:\1:p'`
- ragel_version_compare=`echo $ragel_version | tr -d .`
- ragel_wanted_version=`echo $2 | tr -d .`
- AS_IF([test $ragel_version_compare -lt $ragel_wanted_version],
- [AC_MSG_WARN([Found Ragel $ragel_version but Ragel $2 requested])
- RAGEL=false
- ])
- ]))
-
- dnl Only test the need if not found
- AS_IF([test x"$RAGEL" = x"false"], [
- AC_MSG_CHECKING([whether we need ragel to regenerate sources])
- AS_IF([test -a "${srcdir}/$1"], [ragel_needed=no], [ragel_needed=yes])
- AC_MSG_RESULT([$ragel_needed])
-
- AS_IF([test x"$ragel_needed" = x"yes"],
- [AC_MSG_ERROR([dnl
-You need Ragel to build from development sources.
-You can find Ragel at http://www.colm.net/open-source/ragel/dnl
- ])])
- ])
-])
-
-AC_DEFUN([CHECK_RAGEL_AM], [
- CHECK_RAGEL([$1], [$2])
-
- AM_CONDITIONAL([HAVE_RAGEL], [test x"$RAGEL" != x"false"])
-])
diff --git a/contrib/ragel.make b/contrib/ragel.make
deleted file mode 100644
index f7a71b5b..00000000
--- a/contrib/ragel.make
+++ /dev/null
@@ -1,6 +0,0 @@
-# -*- Makefile -*-
-
-SUFFIXES = .rl
-
-.rl.c:
- $(RAGEL) $(RAGELFLAGS) -C $< -o $@
diff --git a/contrib/unicode2ragel.rb b/contrib/unicode2ragel.rb
deleted file mode 100644
index d64e601d..00000000
--- a/contrib/unicode2ragel.rb
+++ /dev/null
@@ -1,305 +0,0 @@
-#!/usr/bin/env ruby
-#
-# This script uses the unicode spec to generate a Ragel state machine
-# that recognizes unicode alphanumeric characters. It generates 5
-# character classes: uupper, ulower, ualpha, udigit, and ualnum.
-# Currently supported encodings are UTF-8 [default] and UCS-4.
-#
-# Usage: unicode2ragel.rb [options]
-# -e, --encoding [ucs4 | utf8] Data encoding
-# -h, --help Show this message
-#
-# This script was originally written as part of the Ferret search
-# engine library.
-#
-# Author: Rakan El-Khalil <rakan@well.com>
-
-require 'optparse'
-require 'open-uri'
-
-ENCODINGS = [ :utf8, :ucs4 ]
-ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" }
-CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
-
-###
-# Display vars & default option
-
-TOTAL_WIDTH = 80
-RANGE_WIDTH = 23
-@encoding = :utf8
-
-###
-# Option parsing
-
-cli_opts = OptionParser.new do |opts|
- opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
- @encoding = o.downcase.to_sym
- end
- opts.on("-h", "--help", "Show this message") do
- puts opts
- exit
- end
-end
-
-cli_opts.parse(ARGV)
-unless ENCODINGS.member? @encoding
- puts "Invalid encoding: #{@encoding}"
- puts cli_opts
- exit
-end
-
-##
-# Downloads the document at url and yields every alpha line's hex
-# range and description.
-
-def each_alpha( url, property )
- open( url ) do |file|
- file.each_line do |line|
- next if line =~ /^#/;
- next if line !~ /; #{property} #/;
-
- range, description = line.split(/;/)
- range.strip!
- description.gsub!(/.*#/, '').strip!
-
- if range =~ /\.\./
- start, stop = range.split '..'
- else start = stop = range
- end
-
- yield start.hex .. stop.hex, description
- end
- end
-end
-
-###
-# Formats to hex at minimum width
-
-def to_hex( n )
- r = "%0X" % n
- r = "0#{r}" unless (r.length % 2).zero?
- r
-end
-
-###
-# UCS4 is just a straight hex conversion of the unicode codepoint.
-
-def to_ucs4( range )
- rangestr = "0x" + to_hex(range.begin)
- rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
- [ rangestr ]
-end
-
-##
-# 0x00 - 0x7f -> 0zzzzzzz[7]
-# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
-# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
-# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
-
-UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
-
-def to_utf8_enc( n )
- r = 0
- if n <= 0x7f
- r = n
- elsif n <= 0x7ff
- y = 0xc0 | (n >> 6)
- z = 0x80 | (n & 0x3f)
- r = y << 8 | z
- elsif n <= 0xffff
- x = 0xe0 | (n >> 12)
- y = 0x80 | (n >> 6) & 0x3f
- z = 0x80 | n & 0x3f
- r = x << 16 | y << 8 | z
- elsif n <= 0x10ffff
- w = 0xf0 | (n >> 18)
- x = 0x80 | (n >> 12) & 0x3f
- y = 0x80 | (n >> 6) & 0x3f
- z = 0x80 | n & 0x3f
- r = w << 24 | x << 16 | y << 8 | z
- end
-
- to_hex(r)
-end
-
-def from_utf8_enc( n )
- n = n.hex
- r = 0
- if n <= 0x7f
- r = n
- elsif n <= 0xdfff
- y = (n >> 8) & 0x1f
- z = n & 0x3f
- r = y << 6 | z
- elsif n <= 0xefffff
- x = (n >> 16) & 0x0f
- y = (n >> 8) & 0x3f
- z = n & 0x3f
- r = x << 10 | y << 6 | z
- elsif n <= 0xf7ffffff
- w = (n >> 24) & 0x07
- x = (n >> 16) & 0x3f
- y = (n >> 8) & 0x3f
- z = n & 0x3f
- r = w << 18 | x << 12 | y << 6 | z
- end
- r
-end
-
-###
-# Given a range, splits it up into ranges that can be continuously
-# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
-# This is not strictly needed since the current [5.1] unicode standard
-# doesn't have ranges that straddle utf8 boundaries. This is included
-# for completeness as there is no telling if that will ever change.
-
-def utf8_ranges( range )
- ranges = []
- UTF8_BOUNDARIES.each do |max|
- if range.begin <= max
- return ranges << range if range.end <= max
-
- ranges << range.begin .. max
- range = (max + 1) .. range.end
- end
- end
- ranges
-end
-
-def build_range( start, stop )
- size = start.size/2
- left = size - 1
- return [""] if size < 1
-
- a = start[0..1]
- b = stop[0..1]
-
- ###
- # Shared prefix
-
- if a == b
- return build_range(start[2..-1], stop[2..-1]).map do |elt|
- "0x#{a} " + elt
- end
- end
-
- ###
- # Unshared prefix, end of run
-
- return ["0x#{a}..0x#{b} "] if left.zero?
-
- ###
- # Unshared prefix, not end of run
- # Range can be 0x123456..0x56789A
- # Which is equivalent to:
- # 0x123456 .. 0x12FFFF
- # 0x130000 .. 0x55FFFF
- # 0x560000 .. 0x56789A
-
- ret = []
- ret << build_range(start, a + "FF" * left)
-
- ###
- # Only generate middle range if need be.
-
- if a.hex+1 != b.hex
- max = to_hex(b.hex - 1)
- max = "FF" if b == "FF"
- ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
- end
-
- ###
- # Don't generate last range if it is covered by first range
-
- ret << build_range(b + "00" * left, stop) unless b == "FF"
- ret.flatten!
-end
-
-def to_utf8( range )
- utf8_ranges( range ).map do |r|
- build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
- end.flatten!
-end
-
-##
-# Perform a 3-way comparison of the number of codepoints advertised by
-# the unicode spec for the given range, the originally parsed range,
-# and the resulting utf8 encoded range.
-
-def count_codepoints( code )
- code.split(' ').inject(1) do |acc, elt|
- if elt =~ /0x(.+)\.\.0x(.+)/
- if @encoding == :utf8
- acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
- else
- acc * ($2.hex - $1.hex + 1)
- end
- else
- acc
- end
- end
-end
-
-def is_valid?( range, desc, codes )
- spec_count = 1
- spec_count = $1.to_i if desc =~ /\[(\d+)\]/
- range_count = range.end - range.begin + 1
-
- sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
- sum == spec_count and sum == range_count
-end
-
-##
-# Generate the state machine to stdout
-
-def generate_machine( name, property )
- pipe = " "
- puts " #{name} = "
- each_alpha( CHART_URL, property ) do |range, desc|
-
- codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
-
- raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
- is_valid? range, desc, codes
-
- range_width = codes.map { |a| a.size }.max
- range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
-
- desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11
- desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
-
- if desc.size > desc_width
- desc = desc[0..desc_width - 4] + "..."
- end
-
- codes.each_with_index do |r, idx|
- desc = "" unless idx.zero?
- code = "%-#{range_width}s" % r
- puts " #{pipe} #{code} ##{desc}"
- pipe = "|"
- end
- end
- puts " ;"
- puts ""
-end
-
-puts <<EOF
-# The following Ragel file was autogenerated with #{$0}
-# from: #{CHART_URL}
-#
-# It defines ualpha, udigit, ualnum.
-#
-# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
-# and that your input is in #{@encoding}.
-
-%%{
- machine WChar;
-EOF
-generate_machine( :ualpha, "Alphabetic" )
-generate_machine( :ulower, "Lowercase" )
-generate_machine( :uupper, "Uppercase" )
-puts <<EOF
- udigit = '0'..'9';
- ualnum = ualpha | udigit;
-}%%
-EOF