summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Adrian Thurston <thurston@colm.net> 2019-12-11 18:44:47 +0200
committer Adrian Thurston <thurston@colm.net> 2019-12-11 18:44:47 +0200
commit 38d2a988b816471e2cf2f414773470e28b39c93c (patch)
tree 38ebf32f330c905ef24f1c3502d1d62a9964f287
parent 900189a4d23f0180970f22479f44377973a2d330 (diff)
download colm-38d2a988b816471e2cf2f414773470e28b39c93c.tar.gz
go grammar: started with the unicode definitions
Generating the utf8 encoded unicode definitions that are referenced in the Go specification. Can start to build the grammar with these definitions on hand.
-rw-r--r-- grammar/Makefile 2
-rw-r--r-- grammar/go/.gitignore 5
-rw-r--r-- grammar/go/Makefile 18
-rw-r--r-- grammar/go/go.lm 23
-rw-r--r-- grammar/go/input.til 13
-rw-r--r-- grammar/go/insert-semi.lm 111
-rw-r--r-- grammar/go/parsego.lm 11
-rw-r--r-- grammar/go/utf8pat.rb 145
8 files changed, 327 insertions, 1 deletions
diff --git a/grammar/Makefile b/grammar/Makefile
index 44ac4032..c77188bc 100644
--- a/grammar/Makefile
+++ b/grammar/Makefile
@@ -1,4 +1,4 @@
-SUBDIRS = rust pcre dns c++ python
+SUBDIRS = rust pcre dns c++ python go
all: rust pcre dns
for d in $(SUBDIRS); do ( cd $$d && $(MAKE) ); done
diff --git a/grammar/go/.gitignore b/grammar/go/.gitignore
new file mode 100644
index 00000000..80d228d8
--- /dev/null
+++ b/grammar/go/.gitignore
@@ -0,0 +1,5 @@
+/utf8.lm
+/go.c
+/go
+/insert-semi.c
+/insert-semi
diff --git a/grammar/go/Makefile b/grammar/go/Makefile
new file mode 100644
index 00000000..f4a39bad
--- /dev/null
+++ b/grammar/go/Makefile
@@ -0,0 +1,18 @@
+COLM = ../../colm/colm
+RAGEL = ../../ragel/ragel
+
+all: go insert-semi
+
+go: go.lm parsego.lm utf8.lm $(COLM)
+ $(COLM) -o go parsego.lm
+
+insert-semi: insert-semi.lm
+ $(COLM) $<
+
+utf8.lm: Makefile utf8pat.rb UnicodeData.txt
+ echo 'rl unicode_letter /' >utf8.lm
+ ruby utf8pat.rb 'L[ultmo]' 1 UnicodeData.txt >> utf8.lm
+ echo '/' >>utf8.lm
+ echo 'rl unicode_digit /' >>utf8.lm
+ ruby utf8pat.rb 'Nd' 1 UnicodeData.txt >> utf8.lm
+ echo '/' >>utf8.lm
diff --git a/grammar/go/go.lm b/grammar/go/go.lm
new file mode 100644
index 00000000..0dec2329
--- /dev/null
+++ b/grammar/go/go.lm
@@ -0,0 +1,23 @@
+include 'utf8.lm'
+
+token BOM / 0xEF 0xBB 0xBF /
+
+lex
+ rl newline / 0x0A /
+
+ rl valid_utf8 /
+ 0x00 .. 0x7F |
+ 0xC0 .. 0xDF any |
+ 0xE0 .. 0xEF any any |
+ 0xF0 .. 0xF7 any any any
+ /
+
+ rl unicode_char / valid_utf8 - 0x0A /
+
+ token id / unicode_letter unicode_letter* /
+
+ ignore /[ \t\n\r]+/
+end
+
+def program
+ [BOM? id*]
diff --git a/grammar/go/input.til b/grammar/go/input.til
new file mode 100644
index 00000000..008182b8
--- /dev/null
+++ b/grammar/go/input.til
@@ -0,0 +1,13 @@
+var a
+a := 1;
+
+head:
+
+a := ( a + 1 )
+c := d;
+
+if a = 10 then
+ goto head
+end
+
+hi := there; friend := yes
diff --git a/grammar/go/insert-semi.lm b/grammar/go/insert-semi.lm
new file mode 100644
index 00000000..3271216a
--- /dev/null
+++ b/grammar/go/insert-semi.lm
@@ -0,0 +1,111 @@
+lex
+ literal `var `if `then `else `while `do `for `read `write
+ `end `to `goto
+
+ literal `:= `!= `+ `- `* `/ `= `( `) `: `;
+
+ ignore /'//' [^\n]* '\n'/
+ ignore /[\n\t ]+/
+
+ token id /[a-zA-Z_]+/
+ token integer /[0-9]+/
+ token stringlit /'"' [^"]* '"'/
+
+ token insert_semi /
+ ( ( id -
+ 'var' - 'if' - 'then' - 'else' - 'while' -
+ 'do' - 'for' - 'read' - 'write' - 'end' -
+ 'to' - 'goto' ) |
+ integer | stringlit | ')' ) '\n' /
+ {
+ Prefix: str = input->pull( match_length - 1 )
+ input->push( ";" )
+ input->push( Prefix )
+ }
+
+end
+
+def program
+ [statement*]
+
+def statement
+ [declaration]
+| [assignment_statement]
+| [if_statement]
+| [while_statement]
+| [do_statement]
+| [for_statement]
+| [read_statement]
+| [write_statement]
+| [labelled_statement]
+| [goto_statement]
+
+def declaration
+ [`var id `;]
+
+def assignment_statement
+ [id `:= expression `;]
+
+def if_statement
+ [`if expression `then statement* opt_else_statement `end]
+
+def opt_else_statement
+ [`else statement*]
+| []
+
+def while_statement
+ [`while expression `do statement* `end]
+
+def do_statement
+ [`do statement* `while expression `;]
+
+def for_statement
+ [`for id `:= expression `to expression `do statement* `end]
+
+def read_statement
+ [`read id `;]
+
+def write_statement
+ [`write expression `;]
+
+def expression
+ [term]
+| [expression eqop term]
+
+def eqop [`=] | [`!=]
+
+def term
+ [factor]
+| [term addop factor]
+
+def addop [`+] | [`-]
+
+def factor
+ [primary]
+| [factor mulop primary]
+
+def mulop [`*] | [`/]
+
+def primary
+ [id]
+| [lit]
+| [`( expression `)]
+
+def lit
+ [integer]
+| [stringlit]
+
+def labelled_statement
+ [id `: statement]
+
+def goto_statement
+ [`goto id `;]
+
+parse P: program[stdin]
+
+if P
+ print[ P ]
+else {
+ print "[error]
+ exit( 1 )
+}
diff --git a/grammar/go/parsego.lm b/grammar/go/parsego.lm
new file mode 100644
index 00000000..659fcb0e
--- /dev/null
+++ b/grammar/go/parsego.lm
@@ -0,0 +1,11 @@
+include 'go.lm'
+
+parse P: program [stdin]
+
+if !P {
+ send stderr "parse error: [error]
+ exit(1)
+}
+else {
+ print [P]
+}
diff --git a/grammar/go/utf8pat.rb b/grammar/go/utf8pat.rb
new file mode 100644
index 00000000..12b9df7a
--- /dev/null
+++ b/grammar/go/utf8pat.rb
@@ -0,0 +1,145 @@
+# utf8pat.rb
+#
+# Generate utf8-encoded ragel or colm patterns for sets of unicode code points.
+#
+# The dict structure is a hash mapping upper ends of ranges to a hash that
+# contains:
+# 1. The lower end.
+# 2. A dict for the tail of the pattern.
+#
+# We index by the upper end of the range because we assume all unicode points
+# to be read in increasing order and we check for extension as we add points by
+# looking up the upper end.
+#
+# dict: { upper => { :lower => lower, :dict => dict } }
+#
+
+#
+# utf8pat.rb <category-regex> <indentation-level> <unicode-data-file>
+#
+#
+# ruby utf8pat.rb 'L[lutmo]' 1 UnicodeData.txt
+
+target_category = Regexp.new( ARGV[0] )  # 1st argument: regex tested against the third ';'-separated field of each input line
+indentation_level = ARGV[1].to_i  # 2nd argument: number of tabs printed before each top-level output line
+unicode_data = ARGV[2]  # 3rd argument: path of the data file to read
+
+def utf8_enc( n )  # Encode code point n as an array of one to four UTF-8 byte values (integers).
+ if n <= 0x7F  # one byte: the value itself
+ return [ n ]
+ elsif n <= 0x7FF  # two bytes: lead 0xC0 | top bits, then one 0x80 | 6-bit continuation
+ return [
+ 0xC0 | (n >> 6),
+ 0x80 | (n & 0x3F)
+ ]
+ elsif n <= 0xFFFF  # three bytes: lead 0xE0 | top bits, then two continuations
+ return [
+ 0xE0 | (n >> 12),
+ 0x80 | (n >> 6) & 0x3F,  # & binds tighter than |, so this is 0x80 | ((n >> 6) & 0x3F)
+ 0x80 | n & 0x3F
+ ]
+ elsif n <= 0x10ffff  # four bytes: lead 0xF0 | top bits, then three continuations
+ return [
+ 0xF0 | (n >> 18),
+ 0x80 | (n >> 12) & 0x3F,
+ 0x80 | (n >> 6) & 0x3F,
+ 0x80 | n & 0x3F
+ ]
+ end  # no branch matches above 0x10FFFF, so the method returns nil there
+end
+
+def add_to_dict( dict, utf8val )  # Insert one encoded byte sequence (array of integers) into the nested range dict; mutates dict.
+ return if utf8val.size == 0  # recursion ends once every byte has been consumed
+
+ nk = utf8val[0]  # first remaining byte; entries are keyed by the upper end of their range
+
+ if utf8val.size == 1 && nk > 0 && dict.key?( nk - 1 )  # last byte directly follows an existing upper end
+ dict[nk] = dict[nk - 1]  # extend that range: re-key the same entry under the new upper end
+ dict.delete( nk - 1 )
+ else
+ if ! dict.key?( utf8val[0] )
+ dict[nk] = { :lower => nk, :dict => {} }  # new single-value range with an empty tail
+ end
+ end
+
+ add_to_dict( dict[nk][:dict], utf8val[1..-1] )  # place the remaining bytes in this entry's tail dict
+end
+
+def compare( dict1, dict2 )  # Return true when the two range dicts describe identical patterns, false otherwise.
+ # First check that the sizes are equal. If so, iterate dict1 and ensure each
+ # key is present in dict2. Then check that the lower end of the range matches
+ # and recurse on the tails.
+ return false if dict1.size != dict2.size
+
+ dict1.each do |key, value|
+ return false if !dict2.key?( key )  # upper ends must match
+
+ return false if value[:lower] != dict2[key][:lower]  # lower ends must match
+
+ return false if !compare( value[:dict], dict2[key][:dict] )  # tails must match recursively
+ end
+
+ return true
+end
+
+def merge( dict )  # Join adjacent entries whose tails are identical into one range, at every nesting level; mutates dict.
+ previous = nil  # key (upper end) of the entry visited just before the current one
+ dict.each do |key, value|
+ # First recurse, ensuring the tail dict is merged.
+ merge( value[:dict] )
+ if !previous.nil? && ( previous + 1 ) == value[:lower] &&
+ compare( dict[previous][:dict], value[:dict] )
+ # The previous and current entries make a contiguous range AND the two
+ # tails are identical patterns.
+ value[:lower] = dict[previous][:lower]  # current entry absorbs the previous entry's range
+ dict[previous][:lower] = -1  # NOTE(review): the entry is deleted on the next line, so this sentinel looks redundant — confirm
+ dict.delete( previous )  # NOTE(review): deletes from dict while dict.each is running — confirm this is safe on the Ruby version in use
+ end
+
+ previous = key
+ end
+end
+
+def indent( level )  # Print `level` tab characters to stdout; prints nothing when level is below 1.
+ for l in 1..level
+ print "\t"
+ end
+end
+
+def print_level( level, dict )  # Print dict as byte ranges joined by " |", one per line, with each non-empty tail nested in parentheses.
+ first = true  # suppresses the separator before the first entry
+ dict.each do |key, value|
+ print " |\n" if !first  # separator between sibling entries
+
+ indent( level )
+ if value[:lower] != key
+ print "0x%02X .. " % value[:lower]  # lower end is printed only for multi-value ranges
+ end
+
+ print "0x%02X" % key  # upper end, or the single value
+ if value[:dict].size > 0
+ print " (\n"
+ print_level( level + 1, value[:dict] )  # tail bytes, indented one level deeper
+ indent( level )
+ print ")"
+ end
+ first = false
+ end
+ print "\n"  # ends the last entry's line
+end
+
+file = open( unicode_data )  # NOTE(review): the handle is never closed explicitly in this script
+dict = {}  # root of the nested range structure built by add_to_dict
+
+file.each_line do |line|
+ next if line =~ /^[ \t\v]*#/;  # skip lines whose first non-blank character is '#'
+ next if line =~ /^[ \t\v]*$/;  # skip blank lines
+ range, description, category = line.split(/;/)  # first three ';'-separated fields; description is not used below
+
+ if category =~ target_category
+ add_to_dict( dict, utf8_enc( range.hex ) )  # NOTE(review): one code point per line; if the data file marks blocks with "First"/"Last" record pairs, the code points between them are not added — confirm
+ end
+end
+
+merge( dict )  # collapse adjacent entries with identical tails before printing
+print_level( indentation_level, dict )