go grammar: added lexical definitions and semicolon insertion

Oof, this produces a big output file. We may need to do something about it. Refs #82
author: Adrian Thurston <thurston@colm.net> 2019-12-13 04:51:45 +0200
committer: Adrian Thurston <thurston@colm.net> 2019-12-13 04:51:45 +0200
commit: 402fc46c2a8c8f00c3d5d270ee001ce22776dd41 (patch)
tree: 77de29c63c6ab3f74bd87c0eff22c3dc3995ad0f /grammar
parent: 38d2a988b816471e2cf2f414773470e28b39c93c (diff)
download: colm-402fc46c2a8c8f00c3d5d270ee001ce22776dd41.tar.gz
4 files changed, 185 insertions, 118 deletions
diff --git a/grammar/go/Makefile b/grammar/go/Makefile
index f4a39bad..8f5d8447 100644
--- a/grammar/go/Makefile
+++ b/grammar/go/Makefile
@@ -1,14 +1,11 @@
 COLM = ../../colm/colm
 RAGEL = ../../ragel/ragel
 
-all: go insert-semi
+all: go
 
 go: go.lm parsego.lm utf8.lm $(COLM)
 	$(COLM) -o go parsego.lm
 
-insert-semi: insert-semi.lm
-	$(COLM) $<
-
 utf8.lm: Makefile utf8pat.rb UnicodeData.txt
 	echo 'rl unicode_letter /' >utf8.lm
 	ruby utf8pat.rb 'L[ultmo]' 1 UnicodeData.txt >> utf8.lm
diff --git a/grammar/go/go.lm b/grammar/go/go.lm
index 0dec2329..0ab8f7fd 100644
--- a/grammar/go/go.lm
+++ b/grammar/go/go.lm
@@ -3,6 +3,9 @@ include 'utf8.lm'
 token BOM / 0xEF 0xBB 0xBF /
 
 lex
+	#
+	# Definitions.
+	#
 	rl newline / 0x0A /
 
 	rl valid_utf8 /
@@ -12,12 +15,180 @@ lex
 		0xF0 .. 0xF7 any any any
 	/
 
-	rl unicode_char / valid_utf8 - 0x0A /
+	rl unicode_char  / valid_utf8 - 0x0A /
 
-	token id / unicode_letter unicode_letter* /
+	rl letter        / unicode_letter | '_' /
+	rl binary_digit  / '0' | '1' /
+	rl octal_digit   / '0' .. '7' /
+	rl decimal_digit / '0' .. '9' /
+	rl hex_digit     / '0' .. '9' | 'A' .. 'F' | 'a' .. 'f' /
 
+	#
+	# Tokens
+	#
+
+	literal
+		`+    `&     `+=    `&=     `&&    `==    `!=    `(    `)
+		`-    `|     `-=    `|=     `||    `<     `<=    `[    `]
+		`*    `^     `*=    `^=     `<-    `>     `>=    `{    `}
+		`/    `<<    `/=    `<<=    `++    `=     `:=    `,    #;
+		`%    `>>    `%=    `>>=    `--    `!     `...   `.    `:
+			  `&^           `&^=
+
+	token SEMI /';'/
+
+	literal
+		`break        `default      `func         `interface    `select
+		`case         `defer        `go           `map          `struct
+		`chan         `else         `goto         `package      `switch
+		`const        `fallthrough  `if           `range        `type
+		`continue     `for          `import       `return       `var
+
+	token id
+		/ letter ( letter | unicode_digit )* /
+
+	#
+	# Non-float numbers
+	#
+	
+	rl binary_digits  / binary_digit ( '_'? binary_digit )* /
+	rl octal_digits   / octal_digit ( '_'? octal_digit )* /
+	rl decimal_digits / decimal_digit ( '_'? decimal_digit )* /
+	rl hex_digits     / hex_digit ( '_'? hex_digit )* /
+
+	token binary_lit     / '0' ( 'b' | 'B' ) '_'? binary_digits /
+	token octal_lit      / '0' ( 'o' | 'O' )? '_'? octal_digits /
+	token decimal_lit    / '0' | ( '1' .. '9' ) ( '_'? decimal_digits )? /
+	token hex_lit        / '0' ( 'x' | 'X' ) '_'? hex_digits /
+
+	rl int_lit
+		/ decimal_lit | binary_lit | octal_lit | hex_lit /
+
+	def int_lit
+		[decimal_lit] | [binary_lit] | [octal_lit] | [hex_lit]
+
+	rl decimal_exponent  / ( 'e' | 'E' ) ( '+' | '-' )? decimal_digits /
+
+	#
+	# Floats
+	#
+	token decimal_float_lit /
+		decimal_digits '.' decimal_digits? decimal_exponent? |
+		decimal_digits decimal_exponent |
+		'.' decimal_digits decimal_exponent? /
+
+	rl hex_mantissa
+		/ '_'? hex_digits '.' hex_digits? | '_'? hex_digits | '.' hex_digits /
+	rl hex_exponent
+		/ ( 'p' | 'P' ) ( '+' | '-' )? decimal_digits /
+
+	token hex_float_lit
+		/ '0' ( 'x' | 'X' ) hex_mantissa hex_exponent /
+
+	rl float_lit
+		/ decimal_float_lit | hex_float_lit /
+
+	def float_lit
+		[decimal_float_lit] | [hex_float_lit]
+	
+	#
+	# Imaginary
+	#
+	token imaginary_lit / 
+		( decimal_digits | int_lit | float_lit ) 'i' /
+	
+	#
+	# Rune literals
+	#
+
+
+	rl escaped_char
+		/ '\\' ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | '\\' | "'" | '"' ) /
+
+	rl octal_byte_value / '\\' octal_digit octal_digit octal_digit /
+	rl hex_byte_value   / '\\' 'x' hex_digit hex_digit /
+	rl little_u_value   / '\\' 'u' hex_digit hex_digit hex_digit hex_digit /
+	rl big_u_value      / '\\' 'U' hex_digit hex_digit hex_digit hex_digit
+							   hex_digit hex_digit hex_digit hex_digit /
+
+	rl byte_value       / octal_byte_value | hex_byte_value /
+	rl unicode_value    / unicode_char | little_u_value | big_u_value | escaped_char /
+
+	token rune_lit / "'" ( unicode_value | byte_value ) "'" /
+
+	#
+	# String literals
+	#
+	rl raw_string_lit         / "`" ( ( unicode_char | newline ) - '`' )* "`" /
+	rl interpreted_string_lit / '"' ( ( unicode_value | byte_value ) - '"' )* '"' /
+	token string_lit          / raw_string_lit | interpreted_string_lit /
+
+	#
+	# Comments
+	#
+	rl line_comment
+		/ '//' [^\n]* '\n' /
+	
+	rl general_comment
+		/ '/*' any* :> '*/'/
+	# Text after which insert_semi adds a SEMI. Keyword rows must end in '-': juxtaposed literals concatenate.
+	rl pre_insert_semi /
+		( id -
+			'break'     - 'default'     - 'func'      - 'interface' - 'select' -
+			'case'      - 'defer'       - 'go'        - 'map'       - 'struct' -
+			'chan'      - 'else'        - 'goto'      - 'package'   - 'switch' -
+			'const'     - 'fallthrough' - 'if'        - 'range'     - 'type'   -
+			'continue'  - 'for'         - 'import'    - 'return'    - 'var' ) |
+		int_lit |
+		float_lit |
+		imaginary_lit |
+		rune_lit |
+		string_lit
+	/
+
+	#
+	# Semi-colons
+	#
+	token insert_semi /
+		pre_insert_semi
+		( [ \t]+ | general_comment )*
+		( '/*' [^\n]* | line_comment | '\n'  )
+	/
+	{
+		parse BA: break_apart::break_apart[match_text]
+
+		Prefix: str = input->pull( BA.pre_semi.data.length )
+		input->push( make_token( typeid<SEMI>, ';' ) )
+		input->push( Prefix )
+	}
+
+	ignore /line_comment/
+	ignore /general_comment/
 	ignore /[ \t\n\r]+/
 end
 
+namespace break_apart
+	lex
+		token pre_semi / pre_insert_semi /
+		ignore /line_comment/
+		ignore /'/*' any*/
+		ignore /[ \t\n\r]+/
+	end
+
+	def break_apart
+		[pre_semi]
+end
+
+def item
+	[id]
+|	[int_lit]
+|	[float_lit]
+|	[imaginary_lit]
+|	[rune_lit]
+|	[string_lit]
+
+def stmt
+	[item+ SEMI]
+
 def program
-	[BOM? id*]
+	[BOM? stmt*]
diff --git a/grammar/go/insert-semi.lm b/grammar/go/insert-semi.lm
deleted file mode 100644
index 3271216a..00000000
--- a/grammar/go/insert-semi.lm
+++ /dev/null
@@ -1,111 +0,0 @@
-lex
-	literal `var `if `then `else `while `do `for `read `write
-			`end `to `goto
-
-	literal `:= `!= `+ `- `* `/ `= `( `) `: `;
-
-	ignore /'//' [^\n]* '\n'/
-	ignore /[\n\t ]+/
-
-	token id /[a-zA-Z_]+/
-	token integer /[0-9]+/
-	token stringlit /'"' [^"]* '"'/
-
-	token insert_semi /
-		( ( id -
-			'var' - 'if' - 'then' - 'else' - 'while' -
-			'do' - 'for' - 'read' - 'write' - 'end' -
-			'to' - 'goto' ) |
-		integer | stringlit | ')' ) '\n' /
-	{
-		Prefix: str = input->pull( match_length - 1 )
-		input->push( ";" )
-		input->push( Prefix )
-	}
-
-end
-
-def program
-	[statement*]
-
-def statement
-	[declaration]
-|	[assignment_statement]
-|	[if_statement]
-|	[while_statement]
-|	[do_statement]
-|	[for_statement]
-|	[read_statement]
-|	[write_statement]
-|	[labelled_statement]
-|	[goto_statement]
-
-def declaration
-	[`var id `;]
-
-def assignment_statement
-	[id `:= expression `;]
-
-def if_statement
-	[`if expression `then statement* opt_else_statement `end]
-
-def opt_else_statement
-	[`else statement*]
-|	[]
-
-def while_statement
-	[`while expression `do statement* `end]
-
-def do_statement
-	[`do statement* `while expression `;]
-
-def for_statement
-	[`for id `:= expression `to expression `do statement* `end]
-
-def read_statement
-	[`read id `;]
-
-def write_statement
-	[`write expression `;]
-
-def expression
-	[term]
-|	[expression eqop term]
-
-def eqop [`=] | [`!=]
-
-def term
-	[factor]
-|	[term addop factor]
-
-def addop [`+] | [`-]
-
-def factor
-	[primary]
-|	[factor mulop primary]
-
-def mulop [`*] | [`/]
-
-def primary
-	[id]
-|	[lit]
-|	[`( expression `)]
-
-def lit
-	[integer]
-|	[stringlit]
-
-def labelled_statement
-	[id `: statement]
-
-def goto_statement
-	[`goto id `;]
-
-parse P: program[stdin]
-
-if P 
-	print[ P ]
-else {
-	print "[error]
-	exit( 1 )
-}
diff --git a/grammar/go/parsego.lm b/grammar/go/parsego.lm
index 659fcb0e..31eda462 100644
--- a/grammar/go/parsego.lm
+++ b/grammar/go/parsego.lm
@@ -8,4 +8,14 @@ if !P {
 }
 else {
 	print [P]
+
+	for IL: int_lit in P {
+		print "int_lit: [^IL]-
+	}
+	for RL: rune_lit in P {
+		print "rune_lit: [^RL]-
+	}
+	for SL: string_lit in P {
+		print "string_lit: [^SL]-
+	}
 }
author	Adrian Thurston <thurston@colm.net>	2019-12-13 04:51:45 +0200
committer	Adrian Thurston <thurston@colm.net>	2019-12-13 04:51:45 +0200
commit	402fc46c2a8c8f00c3d5d270ee001ce22776dd41 (patch)
tree	77de29c63c6ab3f74bd87c0eff22c3dc3995ad0f /grammar
parent	38d2a988b816471e2cf2f414773470e28b39c93c (diff)
download	colm-402fc46c2a8c8f00c3d5d270ee001ce22776dd41.tar.gz