From 2ba036ed94c826a0b814cf15181a4a9e6f89b178 Mon Sep 17 00:00:00 2001 From: Adrian Thurston Date: Sat, 14 Mar 2020 11:27:58 +0200 Subject: removed ragel docs, old makefiles, todo, vim, etc --- contrib/Makefile.am | 2 +- contrib/ragel.m4 | 53 -------- contrib/ragel.make | 6 - contrib/unicode2ragel.rb | 305 ----------------------------------------------- 4 files changed, 1 insertion(+), 365 deletions(-) delete mode 100644 contrib/ragel.m4 delete mode 100644 contrib/ragel.make delete mode 100644 contrib/unicode2ragel.rb (limited to 'contrib') diff --git a/contrib/Makefile.am b/contrib/Makefile.am index 7ef7e8d9..4c8ef12d 100644 --- a/contrib/Makefile.am +++ b/contrib/Makefile.am @@ -1,2 +1,2 @@ -EXTRA_DIST = ragel.make ragel.m4 unicode2ragel.rb +EXTRA_DIST = diff --git a/contrib/ragel.m4 b/contrib/ragel.m4 deleted file mode 100644 index e7b42425..00000000 --- a/contrib/ragel.m4 +++ /dev/null @@ -1,53 +0,0 @@ -dnl Check for presence of the Ragel State Machine generator. -dnl -dnl This macro checks for the presence of the ragel tool in the system, -dnl and whether the ragel tool is absolutely needed for a complete -dnl build. -dnl -dnl To check for the need for Ragel, you have to provide the relative -dnl path of a source file generated through Ragel: if the file is -dnl present in the source tree, a missing ragel command will not cause -dnl the configure to abort. - -AC_DEFUN([_RAGEL_VARS], [ - AC_ARG_VAR([RAGEL], [Ragel generator command]) - AC_ARG_VAR([RAGELFLAGS], [Ragel generator flags]) -]) - -AC_DEFUN([CHECK_RAGEL], [ - AC_REQUIRE([_RAGEL_VARS]) - AC_CHECK_PROG([RAGEL], [ragel], [ragel], [no]) - - dnl We set RAGEL to false so that it would execute the "false" - dnl command if needed. - AS_IF([test x"$RAGEL" = x"no"], - [RAGEL=false], - AS_IF([test x"$2" != "x"], - [ragel_version=`$RAGEL --version | sed -n -e '1s:.*version \(@<:@0-9@:>@\.@<:@0-9@:>@\)\(\.@<:@0-9@:>@\)* .*:\1:p'` - ragel_version_compare=`echo $ragel_version | tr -d .` - ragel_wanted_version=`echo $2 | tr -d .` - AS_IF([test $ragel_version_compare -lt $ragel_wanted_version], - [AC_MSG_WARN([Found Ragel $ragel_version but Ragel $2 requested]) - RAGEL=false - ]) - ])) - - dnl Only test the need if not found - AS_IF([test x"$RAGEL" = x"false"], [ - AC_MSG_CHECKING([whether we need ragel to regenerate sources]) - AS_IF([test -a "${srcdir}/$1"], [ragel_needed=no], [ragel_needed=yes]) - AC_MSG_RESULT([$ragel_needed]) - - AS_IF([test x"$ragel_needed" = x"yes"], - [AC_MSG_ERROR([dnl -You need Ragel to build from development sources. -You can find Ragel at http://www.colm.net/open-source/ragel/dnl - ])]) - ]) -]) - -AC_DEFUN([CHECK_RAGEL_AM], [ - CHECK_RAGEL([$1], [$2]) - - AM_CONDITIONAL([HAVE_RAGEL], [test x"$RAGEL" != x"false"]) -]) diff --git a/contrib/ragel.make b/contrib/ragel.make deleted file mode 100644 index f7a71b5b..00000000 --- a/contrib/ragel.make +++ /dev/null @@ -1,6 +0,0 @@ -# -*- Makefile -*- - -SUFFIXES = .rl - -.rl.c: - $(RAGEL) $(RAGELFLAGS) -C $< -o $@ diff --git a/contrib/unicode2ragel.rb b/contrib/unicode2ragel.rb deleted file mode 100644 index d64e601d..00000000 --- a/contrib/unicode2ragel.rb +++ /dev/null @@ -1,305 +0,0 @@ -#!/usr/bin/env ruby -# -# This script uses the unicode spec to generate a Ragel state machine -# that recognizes unicode alphanumeric characters. It generates 5 -# character classes: uupper, ulower, ualpha, udigit, and ualnum. -# Currently supported encodings are UTF-8 [default] and UCS-4. -# -# Usage: unicode2ragel.rb [options] -# -e, --encoding [ucs4 | utf8] Data encoding -# -h, --help Show this message -# -# This script was originally written as part of the Ferret search -# engine library. -# -# Author: Rakan El-Khalil - -require 'optparse' -require 'open-uri' - -ENCODINGS = [ :utf8, :ucs4 ] -ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" } -CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt" - -### -# Display vars & default option - -TOTAL_WIDTH = 80 -RANGE_WIDTH = 23 -@encoding = :utf8 - -### -# Option parsing - -cli_opts = OptionParser.new do |opts| - opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o| - @encoding = o.downcase.to_sym - end - opts.on("-h", "--help", "Show this message") do - puts opts - exit - end -end - -cli_opts.parse(ARGV) -unless ENCODINGS.member? @encoding - puts "Invalid encoding: #{@encoding}" - puts cli_opts - exit -end - -## -# Downloads the document at url and yields every alpha line's hex -# range and description. - -def each_alpha( url, property ) - open( url ) do |file| - file.each_line do |line| - next if line =~ /^#/; - next if line !~ /; #{property} #/; - - range, description = line.split(/;/) - range.strip! - description.gsub!(/.*#/, '').strip! - - if range =~ /\.\./ - start, stop = range.split '..' - else start = stop = range - end - - yield start.hex .. stop.hex, description - end - end -end - -### -# Formats to hex at minimum width - -def to_hex( n ) - r = "%0X" % n - r = "0#{r}" unless (r.length % 2).zero? - r -end - -### -# UCS4 is just a straight hex conversion of the unicode codepoint. - -def to_ucs4( range ) - rangestr = "0x" + to_hex(range.begin) - rangestr << "..0x" + to_hex(range.end) if range.begin != range.end - [ rangestr ] -end - -## -# 0x00 - 0x7f -> 0zzzzzzz[7] -# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6] -# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6] -# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6] - -UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff] - -def to_utf8_enc( n ) - r = 0 - if n <= 0x7f - r = n - elsif n <= 0x7ff - y = 0xc0 | (n >> 6) - z = 0x80 | (n & 0x3f) - r = y << 8 | z - elsif n <= 0xffff - x = 0xe0 | (n >> 12) - y = 0x80 | (n >> 6) & 0x3f - z = 0x80 | n & 0x3f - r = x << 16 | y << 8 | z - elsif n <= 0x10ffff - w = 0xf0 | (n >> 18) - x = 0x80 | (n >> 12) & 0x3f - y = 0x80 | (n >> 6) & 0x3f - z = 0x80 | n & 0x3f - r = w << 24 | x << 16 | y << 8 | z - end - - to_hex(r) -end - -def from_utf8_enc( n ) - n = n.hex - r = 0 - if n <= 0x7f - r = n - elsif n <= 0xdfff - y = (n >> 8) & 0x1f - z = n & 0x3f - r = y << 6 | z - elsif n <= 0xefffff - x = (n >> 16) & 0x0f - y = (n >> 8) & 0x3f - z = n & 0x3f - r = x << 10 | y << 6 | z - elsif n <= 0xf7ffffff - w = (n >> 24) & 0x07 - x = (n >> 16) & 0x3f - y = (n >> 8) & 0x3f - z = n & 0x3f - r = w << 18 | x << 12 | y << 6 | z - end - r -end - -### -# Given a range, splits it up into ranges that can be continuously -# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff] -# This is not strictly needed since the current [5.1] unicode standard -# doesn't have ranges that straddle utf8 boundaries. This is included -# for completeness as there is no telling if that will ever change. - -def utf8_ranges( range ) - ranges = [] - UTF8_BOUNDARIES.each do |max| - if range.begin <= max - return ranges << range if range.end <= max - - ranges << range.begin .. max - range = (max + 1) .. range.end - end - end - ranges -end - -def build_range( start, stop ) - size = start.size/2 - left = size - 1 - return [""] if size < 1 - - a = start[0..1] - b = stop[0..1] - - ### - # Shared prefix - - if a == b - return build_range(start[2..-1], stop[2..-1]).map do |elt| - "0x#{a} " + elt - end - end - - ### - # Unshared prefix, end of run - - return ["0x#{a}..0x#{b} "] if left.zero? - - ### - # Unshared prefix, not end of run - # Range can be 0x123456..0x56789A - # Which is equivalent to: - # 0x123456 .. 0x12FFFF - # 0x130000 .. 0x55FFFF - # 0x560000 .. 0x56789A - - ret = [] - ret << build_range(start, a + "FF" * left) - - ### - # Only generate middle range if need be. - - if a.hex+1 != b.hex - max = to_hex(b.hex - 1) - max = "FF" if b == "FF" - ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left - end - - ### - # Don't generate last range if it is covered by first range - - ret << build_range(b + "00" * left, stop) unless b == "FF" - ret.flatten! -end - -def to_utf8( range ) - utf8_ranges( range ).map do |r| - build_range to_utf8_enc(r.begin), to_utf8_enc(r.end) - end.flatten! -end - -## -# Perform a 3-way comparison of the number of codepoints advertised by -# the unicode spec for the given range, the originally parsed range, -# and the resulting utf8 encoded range. - -def count_codepoints( code ) - code.split(' ').inject(1) do |acc, elt| - if elt =~ /0x(.+)\.\.0x(.+)/ - if @encoding == :utf8 - acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1) - else - acc * ($2.hex - $1.hex + 1) - end - else - acc - end - end -end - -def is_valid?( range, desc, codes ) - spec_count = 1 - spec_count = $1.to_i if desc =~ /\[(\d+)\]/ - range_count = range.end - range.begin + 1 - - sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) } - sum == spec_count and sum == range_count -end - -## -# Generate the state maching to stdout - -def generate_machine( name, property ) - pipe = " " - puts " #{name} = " - each_alpha( CHART_URL, property ) do |range, desc| - - codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range) - - raise "Invalid encoding of range #{range}: #{codes.inspect}" unless - is_valid? range, desc, codes - - range_width = codes.map { |a| a.size }.max - range_width = RANGE_WIDTH if range_width < RANGE_WIDTH - - desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11 - desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH - - if desc.size > desc_width - desc = desc[0..desc_width - 4] + "..." - end - - codes.each_with_index do |r, idx| - desc = "" unless idx.zero? - code = "%-#{range_width}s" % r - puts " #{pipe} #{code} ##{desc}" - pipe = "|" - end - end - puts " ;" - puts "" -end - -puts <