go grammar: started with the unicode definitions

Generating the utf8-encoded unicode definitions that are referenced in the Go specification. Can start to build the grammar with these definitions on hand.
author: Adrian Thurston <thurston@colm.net> 2019-12-11 18:44:47 +0200
committer: Adrian Thurston <thurston@colm.net> 2019-12-11 18:44:47 +0200
commit: 38d2a988b816471e2cf2f414773470e28b39c93c (patch)
tree: 38ebf32f330c905ef24f1c3502d1d62a9964f287 /grammar
parent: 900189a4d23f0180970f22479f44377973a2d330 (diff)
download: colm-38d2a988b816471e2cf2f414773470e28b39c93c.tar.gz
8 files changed, 327 insertions, 1 deletions
diff --git a/grammar/Makefile b/grammar/Makefile
index 44ac4032..c77188bc 100644
--- a/grammar/Makefile
+++ b/grammar/Makefile
@@ -1,4 +1,4 @@
-SUBDIRS = rust pcre dns c++ python
+SUBDIRS = rust pcre dns c++ python go
 
 all: rust pcre dns
 	for d in $(SUBDIRS); do ( cd $$d && $(MAKE) ); done
diff --git a/grammar/go/.gitignore b/grammar/go/.gitignore
new file mode 100644
index 00000000..80d228d8
--- /dev/null
+++ b/grammar/go/.gitignore
@@ -0,0 +1,5 @@
+/utf8.lm
+/go.c
+/go
+/insert-semi.c
+/insert-semi
diff --git a/grammar/go/Makefile b/grammar/go/Makefile
new file mode 100644
index 00000000..f4a39bad
--- /dev/null
+++ b/grammar/go/Makefile
@@ -0,0 +1,18 @@
+COLM = ../../colm/colm
+RAGEL = ../../ragel/ragel
+
+all: go insert-semi
+
+go: go.lm parsego.lm utf8.lm $(COLM)
+	$(COLM) -o go parsego.lm
+
+insert-semi: insert-semi.lm
+	$(COLM) $<
+
+utf8.lm: Makefile utf8pat.rb UnicodeData.txt
+	echo 'rl unicode_letter /' >utf8.lm
+	ruby utf8pat.rb 'L[ultmo]' 1 UnicodeData.txt >> utf8.lm
+	echo '/' >>utf8.lm
+	echo 'rl unicode_digit /' >>utf8.lm
+	ruby utf8pat.rb 'Nd' 1 UnicodeData.txt >> utf8.lm
+	echo '/' >>utf8.lm
diff --git a/grammar/go/go.lm b/grammar/go/go.lm
new file mode 100644
index 00000000..0dec2329
--- /dev/null
+++ b/grammar/go/go.lm
@@ -0,0 +1,23 @@
+include 'utf8.lm'
+
+token BOM / 0xEF 0xBB 0xBF /
+
+lex
+	rl newline / 0x0A /
+
+	rl valid_utf8 /
+		0x00 .. 0x7F |
+		0xC0 .. 0xDF any |
+		0xE0 .. 0xEF any any |
+		0xF0 .. 0xF7 any any any
+	/
+
+	rl unicode_char / valid_utf8 - 0x0A /
+
+	token id / unicode_letter unicode_letter* /
+
+	ignore /[ \t\n\r]+/
+end
+
+def program
+	[BOM? id*]
diff --git a/grammar/go/input.til b/grammar/go/input.til
new file mode 100644
index 00000000..008182b8
--- /dev/null
+++ b/grammar/go/input.til
@@ -0,0 +1,13 @@
+var a
+a := 1;
+
+head:
+
+a := ( a + 1 )
+c := d;
+
+if a = 10 then
+	goto head
+end
+
+hi := there; friend := yes
diff --git a/grammar/go/insert-semi.lm b/grammar/go/insert-semi.lm
new file mode 100644
index 00000000..3271216a
--- /dev/null
+++ b/grammar/go/insert-semi.lm
@@ -0,0 +1,111 @@
+lex
+	literal `var `if `then `else `while `do `for `read `write
+			`end `to `goto
+
+	literal `:= `!= `+ `- `* `/ `= `( `) `: `;
+
+	ignore /'//' [^\n]* '\n'/
+	ignore /[\n\t ]+/
+
+	token id /[a-zA-Z_]+/
+	token integer /[0-9]+/
+	token stringlit /'"' [^"]* '"'/
+
+	token insert_semi /
+		( ( id -
+			'var' - 'if' - 'then' - 'else' - 'while' -
+			'do' - 'for' - 'read' - 'write' - 'end' -
+			'to' - 'goto' ) |
+		integer | stringlit | ')' ) '\n' /
+	{
+		Prefix: str = input->pull( match_length - 1 )
+		input->push( ";" )
+		input->push( Prefix )
+	}
+
+end
+
+def program
+	[statement*]
+
+def statement
+	[declaration]
+|	[assignment_statement]
+|	[if_statement]
+|	[while_statement]
+|	[do_statement]
+|	[for_statement]
+|	[read_statement]
+|	[write_statement]
+|	[labelled_statement]
+|	[goto_statement]
+
+def declaration
+	[`var id `;]
+
+def assignment_statement
+	[id `:= expression `;]
+
+def if_statement
+	[`if expression `then statement* opt_else_statement `end]
+
+def opt_else_statement
+	[`else statement*]
+|	[]
+
+def while_statement
+	[`while expression `do statement* `end]
+
+def do_statement
+	[`do statement* `while expression `;]
+
+def for_statement
+	[`for id `:= expression `to expression `do statement* `end]
+
+def read_statement
+	[`read id `;]
+
+def write_statement
+	[`write expression `;]
+
+def expression
+	[term]
+|	[expression eqop term]
+
+def eqop [`=] | [`!=]
+
+def term
+	[factor]
+|	[term addop factor]
+
+def addop [`+] | [`-]
+
+def factor
+	[primary]
+|	[factor mulop primary]
+
+def mulop [`*] | [`/]
+
+def primary
+	[id]
+|	[lit]
+|	[`( expression `)]
+
+def lit
+	[integer]
+|	[stringlit]
+
+def labelled_statement
+	[id `: statement]
+
+def goto_statement
+	[`goto id `;]
+
+parse P: program[stdin]
+
+if P 
+	print[ P ]
+else {
+	print "[error]
+	exit( 1 )
+}
diff --git a/grammar/go/parsego.lm b/grammar/go/parsego.lm
new file mode 100644
index 00000000..659fcb0e
--- /dev/null
+++ b/grammar/go/parsego.lm
@@ -0,0 +1,11 @@
+include 'go.lm'
+
+parse P: program [stdin]
+
+if !P {
+	send stderr "parse error: [error]
+	exit(1)
+}
+else {
+	print [P]
+}
diff --git a/grammar/go/utf8pat.rb b/grammar/go/utf8pat.rb
new file mode 100644
index 00000000..12b9df7a
--- /dev/null
+++ b/grammar/go/utf8pat.rb
@@ -0,0 +1,145 @@
+# utf8pat.rb
+#
+# Generate utf8-encoded ragel or colm patterns for sets of unicode code points.
+#
+# The dict structure is a hash mapping upper ends of ranges to a hash that
+# contains:
+#  1. The lower end.
+#  2. A dict for the tail of the pattern.
+#
+# We index by the upper end of the range because we assume all unicode points
+# to be read in increasing order and we check for extension as we add points by
+# looking up the upper end.
+#
+# dict: { upper => { :lower => lower, :dict => dict } }
+#
+
+#
+# utf8pat.rb <category-regex> <indentation-level> <unicode-data-file>
+#
+#
+# ruby utf8pat.rb 'L[lutmo]' 1 UnicodeData.txt
+
+target_category = Regexp.new( ARGV[0] ) # regex matched against each entry's category field
+indentation_level = ARGV[1].to_i # number of tabs printed before each top-level alternative
+unicode_data = ARGV[2] # path of the unicode data file to read
+
+def utf8_enc( n ) # Encode code point n as an array of UTF-8 byte values.
+	if n <= 0x7F # one byte
+		return [ n ]
+	elsif n <= 0x7FF # two bytes
+		return [
+			0xC0 | (n >> 6),
+			0x80 | (n & 0x3F)
+		]
+	elsif n <= 0xFFFF # three bytes
+		return [
+			0xE0 | (n >> 12),
+			0x80 | (n >>  6) & 0x3F, # & binds tighter than |, so the mask is applied first
+			0x80 |  n        & 0x3F
+		]
+	elsif n <= 0x10ffff # four bytes
+		return [
+			0xF0 | (n >> 18),
+			0x80 | (n >> 12) & 0x3F,
+			0x80 | (n >>  6) & 0x3F,
+			0x80 |  n        & 0x3F
+		]
+	end # no branch is taken for n > 0x10ffff, so the method returns nil
+end
+
+def add_to_dict( dict, utf8val ) # Insert the byte sequence utf8val into the nested range dict, one level per byte.
+	return if utf8val.size == 0 # recursion ends once every byte has been consumed
+		
+	nk = utf8val[0]
+
+	if utf8val.size == 1 && nk > 0 && dict.key?( nk - 1 ) # last byte directly follows the upper end of an existing range
+		dict[nk] = dict[nk - 1] # re-key that entry under the new upper end; its :lower is kept
+		dict.delete( nk - 1 )
+	else
+		if ! dict.key?( utf8val[0] )
+			dict[nk] = { :lower => nk, :dict => {} } # new single-value range with an empty tail
+		end
+	end
+
+	add_to_dict( dict[nk][:dict], utf8val[1..-1] ) # continue with the remaining bytes in this entry's tail
+end
+
+def compare( dict1, dict2 ) # Return true when the two range dicts describe identical patterns, false otherwise.
+	# First check that the sizes are equal. If so, iterate dict1 and ensure each
+	# key is present in dict2. Then check that the lower end of the range
+	# matches and recurse on the tails.
+	return false if dict1.size != dict2.size
+
+	dict1.each do |key, value|
+		return false if !dict2.key?( key )
+
+		return false if value[:lower] != dict2[key][:lower]
+
+		return false if !compare( value[:dict], dict2[key][:dict] )
+	end
+
+	return true
+end
+
+def merge( dict ) # Collapse adjacent entries of dict, and recursively of every tail, into ranges (in place).
+	previous = nil
+	dict.each do |key, value|
+		# Merge the tail first so the comparison below sees it in merged form.
+		merge( value[:dict] )
+		if !previous.nil? && ( previous + 1 ) == value[:lower] &&
+				compare( dict[previous][:dict], value[:dict] )
+			# The previous entry ends right below this one AND the two tails are
+			# identical patterns: absorb the previous entry into this one.
+			value[:lower] = dict[previous][:lower]
+			dict[previous][:lower] = -1
+			dict.delete( previous )
+		end
+
+		previous = key
+	end
+end
+
+def indent( level ) # Print level tab characters to stdout.
+	for l in 1..level
+		print "\t"
+	end
+end
+
+def print_level( level, dict ) # Print the entries of dict as alternatives, indented by level tabs, tails nested in parentheses.
+	first = true
+	dict.each do |key, value|
+		print " |\n" if !first # alternation separator goes before every entry except the first
+
+		indent( level )
+		if value[:lower] != key # a real range: print its lower end first
+			print "0x%02X .. " % value[:lower]
+		end
+
+		print "0x%02X" % key
+		if value[:dict].size > 0 # following bytes are printed as a parenthesized group one level deeper
+			print " (\n"
+			print_level( level + 1, value[:dict] )
+			indent( level )
+			print ")"
+		end
+		first = false
+	end
+	print "\n"
+end
+
+file = open( unicode_data ) # NOTE(review): the handle is never explicitly closed
+dict = {}
+
+file.each_line do |line|
+	next if line =~ /^[ \t\v]*#/;
+	next if line =~ /^[ \t\v]*$/;
+	range, description, category = line.split(/;/) # first three ';'-separated fields; the first is parsed as hex below
+
+	if category =~ target_category
+		add_to_dict( dict, utf8_enc( range.hex ) ) # NOTE(review): adds only this line's code point; First/Last pair entries in UnicodeData.txt would yield just their endpoints - confirm
+	end
+end
+
+merge( dict ) # collapse adjacent entries before printing
+print_level( indentation_level, dict )
author	Adrian Thurston <thurston@colm.net>	2019-12-11 18:44:47 +0200
committer	Adrian Thurston <thurston@colm.net>	2019-12-11 18:44:47 +0200
commit	38d2a988b816471e2cf2f414773470e28b39c93c (patch)
tree	38ebf32f330c905ef24f1c3502d1d62a9964f287 /grammar
parent	900189a4d23f0180970f22479f44377973a2d330 (diff)
download	colm-38d2a988b816471e2cf2f414773470e28b39c93c.tar.gz