summaryrefslogtreecommitdiff
path: root/src/cmd/compile/internal/syntax/source.go
diff options
context:
space:
mode:
authorRobert Griesemer <gri@golang.org>2016-03-04 17:09:08 -0800
committerMatthew Dempsky <mdempsky@google.com>2016-08-18 21:33:38 +0000
commitc8683ff7977c526fb48ae007971fed16ef32ff62 (patch)
tree408437654d4fd529b864b27ff66068f646bde746 /src/cmd/compile/internal/syntax/source.go
parent3b967be4219c789ef9c47aa5e9607cab3005e1cd (diff)
downloadgo-git-c8683ff7977c526fb48ae007971fed16ef32ff62.tar.gz
cmd/compile/internal/syntax: fast Go syntax trees, initial commit.
Syntax tree nodes, scanner, parser, basic printers. Builds syntax trees for entire Go std lib at a rate of ~1.8M lines/s in warmed up state (MacMini, 2.3 GHz Intel Core i7, 8GB RAM): $ go test -run StdLib -fast parsed 1074617 lines (2832 files) in 579.66364ms (1853863 lines/s) allocated 282.212Mb (486.854Mb/s) PASS Change-Id: Ie26d9a7bf4e5ff07457aedfcc9b89f0eba72ae3f Reviewed-on: https://go-review.googlesource.com/27195 Run-TryBot: Matthew Dempsky <mdempsky@google.com> Reviewed-by: Robert Griesemer <gri@golang.org>
Diffstat (limited to 'src/cmd/compile/internal/syntax/source.go')
-rw-r--r--src/cmd/compile/internal/syntax/source.go177
1 files changed, 177 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go
new file mode 100644
index 0000000000..87c22fcc26
--- /dev/null
+++ b/src/cmd/compile/internal/syntax/source.go
@@ -0,0 +1,177 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+ "fmt"
+ "io"
+ "unicode/utf8"
+)
+
+// buf [...read...|...|...unread...|s|...free...]
+// ^ ^ ^ ^
+// | | | |
+// suf r0 r w
+
+type source struct {
+ src io.Reader
+ errh ErrorHandler
+
+ // source buffer
+ buf [4 << 10]byte
+ offs int // source offset of buf
+ r0, r, w int // previous/current read and write buf positions, excluding sentinel
+ line0, line int // previous/current line
+ err error // pending io error
+
+ // literal buffer
+ lit []byte // literal prefix
+ suf int // literal suffix; suf >= 0 means we are scanning a literal
+}
+
+func (s *source) init(src io.Reader, errh ErrorHandler) {
+ s.src = src
+ s.errh = errh
+
+ s.buf[0] = utf8.RuneSelf // terminate with sentinel
+ s.offs = 0
+ s.r0, s.r, s.w = 0, 0, 0
+ s.line0, s.line = 1, 1
+ s.err = nil
+
+ s.lit = s.lit[:0]
+ s.suf = -1
+}
+
+func (s *source) error(msg string) {
+ s.error_at(s.pos0(), s.line0, msg)
+}
+
+func (s *source) error_at(pos, line int, msg string) {
+ if s.errh != nil {
+ s.errh(pos, line, msg)
+ return
+ }
+ panic(fmt.Sprintf("%d: %s", line, msg))
+}
+
+// pos0 returns the byte position of the last character read.
+func (s *source) pos0() int {
+ return s.offs + s.r0
+}
+
+func (s *source) ungetr() {
+ s.r, s.line = s.r0, s.line0
+}
+
+func (s *source) getr() rune {
+redo:
+ s.r0, s.line0 = s.r, s.line
+
+ // We could avoid at least one test that is always taken in the
+ // for loop below by duplicating the common case code (ASCII)
+ // here since we always have at least the sentinel (utf8.RuneSelf)
+ // in the buffer. Measure and optimize if necessary.
+
+ // make sure we have at least one rune in buffer, or we are at EOF
+ for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.err == nil && s.w-s.r < len(s.buf) {
+ s.fill() // s.w-s.r < len(s.buf) => buffer is not full
+ }
+
+ // common case: ASCII and enough bytes
+ // (invariant: s.buf[s.w] == utf8.RuneSelf)
+ if b := s.buf[s.r]; b < utf8.RuneSelf {
+ s.r++
+ if b == 0 {
+ s.error("invalid NUL character")
+ goto redo
+ }
+ if b == '\n' {
+ s.line++
+ }
+ return rune(b)
+ }
+
+ // EOF
+ if s.r == s.w {
+ if s.err != io.EOF {
+ s.error(s.err.Error())
+ }
+ return -1
+ }
+
+ // uncommon case: not ASCII
+ r, w := utf8.DecodeRune(s.buf[s.r:s.w])
+ s.r += w
+
+ if r == utf8.RuneError && w == 1 {
+ s.error("invalid UTF-8 encoding")
+ goto redo
+ }
+
+ // BOM's are only allowed as the first character in a file
+ const BOM = 0xfeff
+ if r == BOM {
+ if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
+ s.error("invalid BOM in the middle of the file")
+ }
+ goto redo
+ }
+
+ return r
+}
+
+func (s *source) fill() {
+ // Slide unread bytes to beginning but preserve last read char
+ // (for one ungetr call) plus one extra byte (for a 2nd ungetr
+ // call, only for ".." character sequence and float literals
+ // starting with ".").
+ if s.r0 > 1 {
+ // save literal prefix, if any
+ // (We see at most one ungetr call while reading
+ // a literal, so make sure s.r0 remains in buf.)
+ if s.suf >= 0 {
+ s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
+ s.suf = 1 // == s.r0 after slide below
+ }
+ s.offs += s.r0 - 1
+ r := s.r - s.r0 + 1 // last read char plus one byte
+ s.w = r + copy(s.buf[r:], s.buf[s.r:s.w])
+ s.r = r
+ s.r0 = 1
+ }
+
+ // read more data: try a limited number of times
+ for i := 100; i > 0; i-- {
+ n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
+ if n < 0 {
+ panic("negative read") // incorrect underlying io.Reader implementation
+ }
+ s.w += n
+ if n > 0 || err != nil {
+ s.buf[s.w] = utf8.RuneSelf // sentinel
+ if err != nil {
+ s.err = err
+ }
+ return
+ }
+ }
+
+ s.err = io.ErrNoProgress
+}
+
+func (s *source) startLit() {
+ s.suf = s.r0
+ s.lit = s.lit[:0] // reuse lit
+}
+
+func (s *source) stopLit() []byte {
+ lit := s.buf[s.suf:s.r]
+ if len(s.lit) > 0 {
+ lit = append(s.lit, lit...)
+ }
+ s.suf = -1 // no pending literal
+ return lit
+}