summaryrefslogtreecommitdiff
path: root/grammar
diff options
context:
space:
mode:
authorAdrian Thurston <thurston@colm.net>2019-12-11 18:44:47 +0200
committerAdrian Thurston <thurston@colm.net>2019-12-11 18:44:47 +0200
commit38d2a988b816471e2cf2f414773470e28b39c93c (patch)
tree38ebf32f330c905ef24f1c3502d1d62a9964f287 /grammar
parent900189a4d23f0180970f22479f44377973a2d330 (diff)
downloadcolm-38d2a988b816471e2cf2f414773470e28b39c93c.tar.gz
go grammar: started with the unicode definitions
Generating the UTF-8 encoded Unicode definitions that are referenced in the Go specification. We can start to build the grammar with these definitions on hand.
Diffstat (limited to 'grammar')
-rw-r--r--grammar/Makefile2
-rw-r--r--grammar/go/.gitignore5
-rw-r--r--grammar/go/Makefile18
-rw-r--r--grammar/go/go.lm23
-rw-r--r--grammar/go/input.til13
-rw-r--r--grammar/go/insert-semi.lm111
-rw-r--r--grammar/go/parsego.lm11
-rw-r--r--grammar/go/utf8pat.rb145
8 files changed, 327 insertions, 1 deletions
diff --git a/grammar/Makefile b/grammar/Makefile
index 44ac4032..c77188bc 100644
--- a/grammar/Makefile
+++ b/grammar/Makefile
@@ -1,4 +1,4 @@
-SUBDIRS = rust pcre dns c++ python
+SUBDIRS = rust pcre dns c++ python go
all: rust pcre dns
for d in $(SUBDIRS); do ( cd $$d && $(MAKE) ); done
diff --git a/grammar/go/.gitignore b/grammar/go/.gitignore
new file mode 100644
index 00000000..80d228d8
--- /dev/null
+++ b/grammar/go/.gitignore
@@ -0,0 +1,5 @@
+/utf8.lm
+/go.c
+/go
+/insert-semi.c
+/insert-semi
diff --git a/grammar/go/Makefile b/grammar/go/Makefile
new file mode 100644
index 00000000..f4a39bad
--- /dev/null
+++ b/grammar/go/Makefile
@@ -0,0 +1,18 @@
+# Builds the Go grammar programs with colm and generates utf8.lm, the
+# UTF-8 encoded Unicode character classes that go.lm includes.
+COLM = ../../colm/colm
+RAGEL = ../../ragel/ragel
+
+all: go insert-semi
+
+# parsego.lm is the entry point; it includes go.lm, which includes utf8.lm.
+go: go.lm parsego.lm utf8.lm $(COLM)
+	$(COLM) -o go parsego.lm
+
+# Depends on $(COLM) as well, like the go rule, so that a rebuilt compiler
+# causes the program to be regenerated. $< is still insert-semi.lm because
+# it is the first prerequisite.
+insert-semi: insert-semi.lm $(COLM)
+	$(COLM) $<
+
+# utf8.lm holds two named patterns, unicode_letter and unicode_digit. Each is
+# written as an opening "rl <name> /" line, the alternation printed by
+# utf8pat.rb for the given category regex at indentation level 1, and a
+# closing "/". UnicodeData.txt is not added by this change and has to be
+# present in this directory.
+utf8.lm: Makefile utf8pat.rb UnicodeData.txt
+	echo 'rl unicode_letter /' >utf8.lm
+	ruby utf8pat.rb 'L[ultmo]' 1 UnicodeData.txt >> utf8.lm
+	echo '/' >>utf8.lm
+	echo 'rl unicode_digit /' >>utf8.lm
+	ruby utf8pat.rb 'Nd' 1 UnicodeData.txt >> utf8.lm
+	echo '/' >>utf8.lm
diff --git a/grammar/go/go.lm b/grammar/go/go.lm
new file mode 100644
index 00000000..0dec2329
--- /dev/null
+++ b/grammar/go/go.lm
@@ -0,0 +1,23 @@
+# Start of a Go grammar: so far only the byte order mark, identifiers and
+# whitespace are recognised.
+
+# Provides unicode_letter and unicode_digit (generated by the Makefile).
+include 'utf8.lm'
+
+# The UTF-8 encoding of U+FEFF.
+token BOM / 0xEF 0xBB 0xBF /
+
+lex
+	rl newline / 0x0A /
+
+	# Sequence length is chosen from the lead byte alone; the trailing bytes
+	# are matched with `any` and are not checked to be continuation bytes.
+	rl valid_utf8 /
+		0x00 .. 0x7F |
+		0xC0 .. 0xDF any |
+		0xE0 .. 0xEF any any |
+		0xF0 .. 0xF7 any any any
+	/
+
+	# Any character except newline.
+	rl unicode_char / valid_utf8 - 0x0A /
+
+	# Go specification:
+	#   letter     = unicode_letter | "_" .
+	#   identifier = letter { letter | unicode_digit } .
+	# The underscore counts as a letter, and digits are allowed after the
+	# first character.
+	rl letter / unicode_letter | '_' /
+
+	token id / letter ( letter | unicode_digit )* /
+
+	ignore /[ \t\n\r]+/
+end
+
+# A program is an optional byte order mark followed by identifiers.
+def program
+	[BOM? id*]
diff --git a/grammar/go/input.til b/grammar/go/input.til
new file mode 100644
index 00000000..008182b8
--- /dev/null
+++ b/grammar/go/input.til
@@ -0,0 +1,13 @@
+var a
+a := 1;
+
+head:
+
+a := ( a + 1 )
+c := d;
+
+if a = 10 then
+ goto head
+end
+
+hi := there; friend := yes
diff --git a/grammar/go/insert-semi.lm b/grammar/go/insert-semi.lm
new file mode 100644
index 00000000..3271216a
--- /dev/null
+++ b/grammar/go/insert-semi.lm
@@ -0,0 +1,111 @@
+# Lexer for a small statement language. Besides the ordinary tokens it has an
+# insert_semi token whose action puts a ";" into the input when a line ends
+# right after an identifier, an integer, a string literal or ")".
+lex
+ # Keywords.
+ literal `var `if `then `else `while `do `for `read `write
+ `end `to `goto
+
+ # Operators and punctuation.
+ literal `:= `!= `+ `- `* `/ `= `( `) `: `;
+
+ # "//" comments up to and including the newline, and plain whitespace.
+ ignore /'//' [^\n]* '\n'/
+ ignore /[\n\t ]+/
+
+ token id /[a-zA-Z_]+/
+ token integer /[0-9]+/
+ token stringlit /'"' [^"]* '"'/
+
+ # Pattern: an identifier that is not one of the keywords, or an integer, a
+ # string literal or ")", immediately followed by a newline.
+ token insert_semi /
+ ( ( id -
+ 'var' - 'if' - 'then' - 'else' - 'while' -
+ 'do' - 'for' - 'read' - 'write' - 'end' -
+ 'to' - 'goto' ) |
+ integer | stringlit | ')' ) '\n' /
+ {
+ # Take everything but the final newline off the input, then push ";"
+ # followed by the removed text. Presumably the text pushed last is read
+ # first, so the scanner next sees <text> ";" "\n" - TODO confirm the
+ # push ordering against the colm input stream documentation.
+ Prefix: str = input->pull( match_length - 1 )
+ input->push( ";" )
+ input->push( Prefix )
+ }
+
+end
+
+# A program is a sequence of statements.
+def program
+ [statement*]
+
+# One alternative per statement kind.
+def statement
+ [declaration]
+| [assignment_statement]
+| [if_statement]
+| [while_statement]
+| [do_statement]
+| [for_statement]
+| [read_statement]
+| [write_statement]
+| [labelled_statement]
+| [goto_statement]
+
+# Simple statements end with ";".
+def declaration
+ [`var id `;]
+
+def assignment_statement
+ [id `:= expression `;]
+
+# Block statements are closed by `end rather than by ";".
+def if_statement
+ [`if expression `then statement* opt_else_statement `end]
+
+# The else branch may be absent (empty alternative).
+def opt_else_statement
+ [`else statement*]
+| []
+
+def while_statement
+ [`while expression `do statement* `end]
+
+# do ... while takes its condition after the body and ends with ";".
+def do_statement
+ [`do statement* `while expression `;]
+
+def for_statement
+ [`for id `:= expression `to expression `do statement* `end]
+
+def read_statement
+ [`read id `;]
+
+def write_statement
+ [`write expression `;]
+
+# Expressions are layered as expression -> term -> factor -> primary, each
+# level left-recursive over its own operators: "=" and "!=" at the outermost
+# level, then "+" and "-", then "*" and "/".
+def expression
+ [term]
+| [expression eqop term]
+
+def eqop [`=] | [`!=]
+
+def term
+ [factor]
+| [term addop factor]
+
+def addop [`+] | [`-]
+
+def factor
+ [primary]
+| [factor mulop primary]
+
+def mulop [`*] | [`/]
+
+# Operands: an identifier, a literal, or a parenthesised expression.
+def primary
+ [id]
+| [lit]
+| [`( expression `)]
+
+def lit
+ [integer]
+| [stringlit]
+
+# A label is an identifier and ":" in front of a statement.
+def labelled_statement
+ [id `: statement]
+
+def goto_statement
+ [`goto id `;]
+
+# Parse standard input as a program. On success print the parsed tree;
+# otherwise print the error text and exit with status 1.
+parse P: program[stdin]
+
+if P
+ print[ P ]
+else {
+ print "[error]
+ exit( 1 )
+}
diff --git a/grammar/go/parsego.lm b/grammar/go/parsego.lm
new file mode 100644
index 00000000..659fcb0e
--- /dev/null
+++ b/grammar/go/parsego.lm
@@ -0,0 +1,11 @@
+# Driver for the Go grammar: parse standard input and echo the tree, or
+# report the parse error on stderr and exit with status 1.
+include 'go.lm'
+
+parse P: program [stdin]
+
+if P {
+	print [P]
+}
+else {
+	send stderr "parse error: [error]
+	exit(1)
+}
diff --git a/grammar/go/utf8pat.rb b/grammar/go/utf8pat.rb
new file mode 100644
index 00000000..12b9df7a
--- /dev/null
+++ b/grammar/go/utf8pat.rb
@@ -0,0 +1,145 @@
+# utf8pat.rb
+#
+# Generate UTF-8 encoded ragel or colm patterns for sets of Unicode code points.
+#
+# The dict structure is a hash mapping upper ends of ranges to a hash that
+# contains:
+# 1. The lower end.
+# 2. A dict for the tail of the pattern.
+#
+# We index by the upper end of the range because we assume all unicode points
+# to be read in increasing order and we check for extension as we add points by
+# looking up the upper end.
+#
+# dict: { upper => { :lower => lower, :dict => dict } }
+#
+
+#
+# utf8pat.rb <category-regex> <indentation-level> <unicode-data-file>
+#
+#
+# ruby utf8pat.rb 'L[lutmo]' 1 UnicodeData.txt
+
+target_category = Regexp.new( ARGV[0] )
+indentation_level = ARGV[1].to_i
+unicode_data = ARGV[2]
+
+def utf8_enc( n )
+ if n <= 0x7F
+ return [ n ]
+ elsif n <= 0x7FF
+ return [
+ 0xC0 | (n >> 6),
+ 0x80 | (n & 0x3F)
+ ]
+ elsif n <= 0xFFFF
+ return [
+ 0xE0 | (n >> 12),
+ 0x80 | (n >> 6) & 0x3F,
+ 0x80 | n & 0x3F
+ ]
+ elsif n <= 0x10ffff
+ return [
+ 0xF0 | (n >> 18),
+ 0x80 | (n >> 12) & 0x3F,
+ 0x80 | (n >> 6) & 0x3F,
+ 0x80 | n & 0x3F
+ ]
+ end
+end
+
+def add_to_dict( dict, utf8val )
+ return if utf8val.size == 0
+
+ nk = utf8val[0]
+
+ if utf8val.size == 1 && nk > 0 && dict.key?( nk - 1 )
+ dict[nk] = dict[nk - 1]
+ dict.delete( nk - 1 )
+ else
+ if ! dict.key?( utf8val[0] )
+ dict[nk] = { :lower => nk, :dict => {} }
+ end
+ end
+
+ add_to_dict( dict[nk][:dict], utf8val[1..-1] )
+end
+
+def compare( dict1, dict2 )
+ # First check if we have equal size. If so, iterate dict1 and endsure key
+ # is present in dict2. Then check lower end of the range matches and
+ # recurse on the tails.
+ return false if dict1.size != dict2.size
+
+ dict1.each do |key, value|
+ return false if !dict2.key?( key )
+
+ return false if value[:lower] != dict2[key][:lower]
+
+ return false if !compare( value[:dict], dict2[key][:dict] )
+ end
+
+ return true
+end
+
+# Join neighbouring entries of +dict+ into ranges, in place.
+#
+# Tail dicts are merged first. An entry then absorbs the entry visited just
+# before it when its lower end is one above that entry's upper end and
+# compare() reports that both tails are the same pattern.
+#
+# NOTE(review): "previous" is simply the key visited last, so this presumably
+# depends on the entries having been inserted in increasing order - confirm.
+def merge( dict )
+ # Key (upper end) of the entry visited in the previous iteration.
+ previous = nil
+ dict.each do |key, value|
+ # First recurse, ensuring dict is merged.
+ merge( value[:dict] )
+ if !previous.nil? && ( previous + 1 ) == value[:lower] &&
+ compare( dict[previous][:dict], value[:dict] )
+ # The previous and cur entries make a contiguous range AND the two
+ # tails are identical patterns.
+ value[:lower] = dict[previous][:lower]
+ # The absorbed entry gets a lower end of -1 and is then removed from
+ # the hash while the iteration over that hash is still running.
+ dict[previous][:lower] = -1
+ dict.delete( previous )
+ end
+
+ previous = key
+ end
+end
+
+def indent( level )
+ for l in 1..level
+ print "\t"
+ end
+end
+
+def print_level( level, dict )
+ first = true
+ dict.each do |key, value|
+ print " |\n" if !first
+
+ indent( level )
+ if value[:lower] != key
+ print "0x%02X .. " % value[:lower]
+ end
+
+ print "0x%02X" % key
+ if value[:dict].size > 0
+ print " (\n"
+ print_level( level + 1, value[:dict] )
+ indent( level )
+ print ")"
+ end
+ first = false
+ end
+ print "\n"
+end
+
+file = open( unicode_data )
+dict = {}
+
+file.each_line do |line|
+ next if line =~ /^[ \t\v]*#/;
+ next if line =~ /^[ \t\v]*$/;
+ range, description, category = line.split(/;/)
+
+ if category =~ target_category
+ add_to_dict( dict, utf8_enc( range.hex ) )
+ end
+end
+
+merge( dict )
+print_level( indentation_level, dict )