From 38d2a988b816471e2cf2f414773470e28b39c93c Mon Sep 17 00:00:00 2001 From: Adrian Thurston Date: Wed, 11 Dec 2019 18:44:47 +0200 Subject: go grammar: started with the unicode definitions Generating the utf8 encoded unicode defintitions that are referenced in the Go specification. Can start to build the grammar with these definitions on hand. --- grammar/Makefile | 2 +- grammar/go/.gitignore | 5 ++ grammar/go/Makefile | 18 ++++++ grammar/go/go.lm | 23 ++++++++ grammar/go/input.til | 13 +++++ grammar/go/insert-semi.lm | 111 +++++++++++++++++++++++++++++++++++ grammar/go/parsego.lm | 11 ++++ grammar/go/utf8pat.rb | 145 ++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 grammar/go/.gitignore create mode 100644 grammar/go/Makefile create mode 100644 grammar/go/go.lm create mode 100644 grammar/go/input.til create mode 100644 grammar/go/insert-semi.lm create mode 100644 grammar/go/parsego.lm create mode 100644 grammar/go/utf8pat.rb diff --git a/grammar/Makefile b/grammar/Makefile index 44ac4032..c77188bc 100644 --- a/grammar/Makefile +++ b/grammar/Makefile @@ -1,4 +1,4 @@ -SUBDIRS = rust pcre dns c++ python +SUBDIRS = rust pcre dns c++ python go all: rust pcre dns for d in $(SUBDIRS); do ( cd $$d && $(MAKE) ); done diff --git a/grammar/go/.gitignore b/grammar/go/.gitignore new file mode 100644 index 00000000..80d228d8 --- /dev/null +++ b/grammar/go/.gitignore @@ -0,0 +1,5 @@ +/utf8.lm +/go.c +/go +/insert-semi.c +/insert-semi diff --git a/grammar/go/Makefile b/grammar/go/Makefile new file mode 100644 index 00000000..f4a39bad --- /dev/null +++ b/grammar/go/Makefile @@ -0,0 +1,18 @@ +COLM = ../../colm/colm +RAGEL = ../../ragel/ragel + +all: go insert-semi + +go: go.lm parsego.lm utf8.lm $(COLM) + $(COLM) -o go parsego.lm + +insert-semi: insert-semi.lm + $(COLM) $< + +utf8.lm: Makefile utf8pat.rb UnicodeData.txt + echo 'rl unicode_letter /' >utf8.lm + ruby utf8pat.rb 'L[ultmo]' 1 UnicodeData.txt >> utf8.lm + 
echo '/' >>utf8.lm + echo 'rl unicode_digit /' >>utf8.lm + ruby utf8pat.rb 'Nd' 1 UnicodeData.txt >> utf8.lm + echo '/' >>utf8.lm diff --git a/grammar/go/go.lm b/grammar/go/go.lm new file mode 100644 index 00000000..0dec2329 --- /dev/null +++ b/grammar/go/go.lm @@ -0,0 +1,23 @@ +include 'utf8.lm' + +token BOM / 0xEF 0xBB 0xBF / + +lex + rl newline / 0x0A / + + rl valid_utf8 / + 0x00 .. 0x7F | + 0xC0 .. 0xDF any | + 0xE0 .. 0xEF any any | + 0xF0 .. 0xF7 any any any + / + + rl unicode_char / valid_utf8 - 0x0A / + + token id / unicode_letter unicode_letter* / + + ignore /[ \t\n\r]+/ +end + +def program + [BOM? id*] diff --git a/grammar/go/input.til b/grammar/go/input.til new file mode 100644 index 00000000..008182b8 --- /dev/null +++ b/grammar/go/input.til @@ -0,0 +1,13 @@ +var a +a := 1; + +head: + +a := ( a + 1 ) +c := d; + +if a = 10 then + goto head +end + +hi := there; friend := yes diff --git a/grammar/go/insert-semi.lm b/grammar/go/insert-semi.lm new file mode 100644 index 00000000..3271216a --- /dev/null +++ b/grammar/go/insert-semi.lm @@ -0,0 +1,111 @@ +lex + literal `var `if `then `else `while `do `for `read `write + `end `to `goto + + literal `:= `!= `+ `- `* `/ `= `( `) `: `; + + ignore /'//' [^\n]* '\n'/ + ignore /[\n\t ]+/ + + token id /[a-zA-Z_]+/ + token integer /[0-9]+/ + token stringlit /'"' [^"]* '"'/ + + token insert_semi / + ( ( id - + 'var' - 'if' - 'then' - 'else' - 'while' - + 'do' - 'for' - 'read' - 'write' - 'end' - + 'to' - 'goto' ) | + integer | stringlit | ')' ) '\n' / + { + Prefix: str = input->pull( match_length - 1 ) + input->push( ";" ) + input->push( Prefix ) + } + +end + +def program + [statement*] + +def statement + [declaration] +| [assignment_statement] +| [if_statement] +| [while_statement] +| [do_statement] +| [for_statement] +| [read_statement] +| [write_statement] +| [labelled_statement] +| [goto_statement] + +def declaration + [`var id `;] + +def assignment_statement + [id `:= expression `;] + +def if_statement + [`if 
expression `then statement* opt_else_statement `end] + +def opt_else_statement + [`else statement*] +| [] + +def while_statement + [`while expression `do statement* `end] + +def do_statement + [`do statement* `while expression `;] + +def for_statement + [`for id `:= expression `to expression `do statement* `end] + +def read_statement + [`read id `;] + +def write_statement + [`write expression `;] + +def expression + [term] +| [expression eqop term] + +def eqop [`=] | [`!=] + +def term + [factor] +| [term addop factor] + +def addop [`+] | [`-] + +def factor + [primary] +| [factor mulop primary] + +def mulop [`*] | [`/] + +def primary + [id] +| [lit] +| [`( expression `)] + +def lit + [integer] +| [stringlit] + +def labelled_statement + [id `: statement] + +def goto_statement + [`goto id `;] + +parse P: program[stdin] + +if P + print[ P ] +else { + print "[error] + exit( 1 ) +} diff --git a/grammar/go/parsego.lm b/grammar/go/parsego.lm new file mode 100644 index 00000000..659fcb0e --- /dev/null +++ b/grammar/go/parsego.lm @@ -0,0 +1,11 @@ +include 'go.lm' + +parse P: program [stdin] + +if !P { + send stderr "parse error: [error] + exit(1) +} +else { + print [P] +} diff --git a/grammar/go/utf8pat.rb b/grammar/go/utf8pat.rb new file mode 100644 index 00000000..12b9df7a --- /dev/null +++ b/grammar/go/utf8pat.rb @@ -0,0 +1,145 @@ +# utf8pat.rb +# +# Generate utf8-encoded ragel or colm patterns for unicode code character sets. +# +# The dict structure is a hash mapping upper ends of ranges to a hash that +# contains: +# 1. The lower end. +# 2. A dict for the tail of the pattern. +# +# We index by the upper end of the range because we assume all unicode points +# to be read in increasing order and we check for extension as we add points by +# looking up the upper end. 
#
# dict: { upper => { :lower => lower, :dict => dict } }
#
# Every level of the structure covers one byte of the UTF-8 encoding. An entry
# stands for the byte range lower..upper followed by the pattern held in :dict
# (an empty :dict means the byte range ends the pattern).
#
# Usage:
#
#   ruby utf8pat.rb 'L[lutmo]' 1 UnicodeData.txt
#
# Arguments: a regular expression that is matched against the general category
# field, the number of tabs used to indent the outermost level, and the path
# of UnicodeData.txt. The resulting alternation is printed on standard output.
#

# Encode the code point n as UTF-8.
#
# Returns an array of one to four byte values. Values in the surrogate range
# receive no special treatment and are encoded as three bytes like any other
# code point below 0x10000.
#
# Raises ArgumentError when n is outside 0..0x10FFFF, so that a bad input
# line is reported here rather than as a nil error in the caller.
def utf8_enc(n)
  unless (0..0x10FFFF).cover?(n)
    raise ArgumentError, "code point out of range: #{n.inspect}"
  end

  if n <= 0x7F
    [n]
  elsif n <= 0x7FF
    [0xC0 | (n >> 6),
     0x80 | (n & 0x3F)]
  elsif n <= 0xFFFF
    [0xE0 | (n >> 12),
     0x80 | ((n >> 6) & 0x3F),
     0x80 | (n & 0x3F)]
  else
    [0xF0 | (n >> 18),
     0x80 | ((n >> 12) & 0x3F),
     0x80 | ((n >> 6) & 0x3F),
     0x80 | (n & 0x3F)]
  end
end

# Insert the byte sequence utf8val into dict, one byte per level.
#
# Code points must be added in increasing order: on the final byte an entry
# whose upper end is exactly one less than the new byte is extended (re-keyed
# under the new upper end) instead of creating a new entry.
def add_to_dict(dict, utf8val)
  return if utf8val.empty?

  byte, *tail = utf8val

  if tail.empty? && byte > 0 && dict.key?(byte - 1)
    # Last byte and contiguous with the previous range: grow that range.
    dict[byte] = dict.delete(byte - 1)
  elsif !dict.key?(byte)
    dict[byte] = { lower: byte, dict: {} }
  end

  add_to_dict(dict[byte][:dict], tail)
end

# Return true when dict1 and dict2 describe the same pattern: the same upper
# ends, the same lower ends and, recursively, the same tails.
def compare(dict1, dict2)
  return false if dict1.size != dict2.size

  dict1.all? do |upper, entry|
    dict2.key?(upper) &&
      entry[:lower] == dict2[upper][:lower] &&
      compare(entry[:dict], dict2[upper][:dict])
  end
end

# Collapse neighbouring entries of dict, at every level, when their byte
# ranges are contiguous and their tails are identical patterns. The merged
# entry keeps the later upper end and takes over the earlier lower end.
#
# Modifies dict in place and returns it. Relies on the entries having been
# inserted in increasing key order.
def merge(dict)
  previous = nil

  # Walk a snapshot of the keys so entries can be removed while looping.
  dict.keys.each do |upper|
    entry = dict[upper]

    # Merge the tail first so that the comparison below sees final patterns.
    merge(entry[:dict])

    if previous && previous + 1 == entry[:lower] &&
       compare(dict[previous][:dict], entry[:dict])
      entry[:lower] = dict.delete(previous)[:lower]
    end

    previous = upper
  end

  dict
end

# Print level tab characters (nothing when level is zero or negative).
def indent(level)
  print("\t" * level) if level > 0
end

# Print dict as an alternation of byte ranges, one alternative per line,
# indented by level tabs. A non-empty tail is printed in parentheses one level
# deeper. The output always ends with a newline, also for an empty dict.
def print_level(level, dict)
  dict.each_with_index do |(upper, entry), index|
    print(" |\n") unless index.zero?

    indent(level)
    print(format('0x%02X .. ', entry[:lower])) if entry[:lower] != upper
    print(format('0x%02X', upper))
    next if entry[:dict].empty?

    print(" (\n")
    print_level(level + 1, entry[:dict])
    indent(level)
    print(')')
  end
  print("\n")
end

# Build the (unmerged) dict for every code point in lines whose general
# category matches target_category.
#
# lines is anything that responds to each and yields UnicodeData.txt lines.
# Comment lines and blank lines are skipped. UnicodeData.txt abbreviates large
# blocks to a pair of entries named "<..., First>" and "<..., Last>"; such a
# pair is expanded to every code point in between, which would otherwise be
# missing from the pattern.
def build_dict(lines, target_category)
  dict = {}
  range_first = nil

  lines.each do |line|
    next if line =~ /^[ \t\v]*#/
    next if line =~ /^[ \t\v]*$/

    code, name, category = line.split(';')
    name = name.to_s
    last = code.to_s.hex

    # The First entry itself is added as a normal point, so a Last entry only
    # has to contribute the code points after it.
    first = (range_first && name.end_with?(', Last>')) ? range_first + 1 : last
    range_first = name.end_with?(', First>') ? last : nil

    next unless category =~ target_category

    (first..last).each { |point| add_to_dict(dict, utf8_enc(point)) }
  end

  dict
end

# Command line entry point: see the usage notes at the top of the file.
def main(argv)
  if argv.size < 3
    abort 'usage: ruby utf8pat.rb CATEGORY_REGEX INDENT_LEVEL UnicodeData.txt'
  end

  target_category = Regexp.new(argv[0])
  indentation_level = argv[1].to_i
  unicode_data = argv[2]

  # File.foreach closes the file when done and, unlike Kernel#open, never
  # treats a leading "|" in the path as a command to run.
  dict = build_dict(File.foreach(unicode_data), target_category)
  merge(dict)
  print_level(indentation_level, dict)
end

# Without arguments the file only defines its helpers, so it can be loaded
# from tests or other scripts.
main(ARGV) if __FILE__ == $PROGRAM_NAME && !ARGV.empty?