1 files changed, 143 insertions, 144 deletions
diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go
index c671e3c11e..01b592152b 100644
--- a/src/cmd/compile/internal/syntax/source.go
+++ b/src/cmd/compile/internal/syntax/source.go
@@ -3,11 +3,10 @@
 // license that can be found in the LICENSE file.
 
 // This file implements source, a buffered rune reader
-// which is specialized for the needs of the Go scanner:
-// Contiguous sequences of runes (literals) are extracted
-// directly as []byte without the need to re-encode the
-// runes in UTF-8 (as would be necessary with bufio.Reader).
-//
+// specialized for scanning Go code: Reading
+// ASCII characters, maintaining current (line, col)
+// position information, and recording of the most
+// recently read source segment are highly optimized.
 // This file is self-contained (go tool compile source.go
 // compiles) and thus could be made into its own package.
 
@@ -18,202 +17,202 @@ import (
 	"unicode/utf8"
 )
 
-// starting points for line and column numbers
-const linebase = 1
-const colbase = 1
-
-// max. number of bytes to unread
-const maxunread = 10
-
-// buf [...read...|...|...unread...|s|...free...]
-//         ^      ^   ^            ^
-//         |      |   |            |
-//        suf     r0  r            w
+// The source buffer is accessed using three indices b (begin),
+// r (read), and e (end):
+//
+// - If b >= 0, it points to the beginning of a segment of most
+//   recently read characters (typically a Go literal).
+//
+// - r points to the byte immediately following the most recently
+//   read character ch, which starts at r-chw.
+//
+// - e points to the byte immediately following the last byte that
+//   was read into the buffer.
+//
+// The buffer content is terminated at buf[e] with the sentinel
+// character utf8.RuneSelf. This makes it possible to test for
+// the common case of ASCII characters with a single 'if' (see
+// nextch method).
+//
+//                +------ content in use -------+
+//                v                             v
+// buf [...read...|...segment...|ch|...unread...|s|...free...]
+//                ^             ^  ^            ^
+//                |             |  |            |
+//                b         r-chw  r            e
+//
+// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
 
 type source struct {
-	src  io.Reader
-	errh func(line, pos uint, msg string)
-
-	// source buffer
-	buf         [4 << 10]byte
-	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
-	line0, line uint  // previous/current line
-	col0, col   uint  // previous/current column (byte offsets from line start)
-	ioerr       error // pending io error
-
-	// literal buffer
-	lit []byte // literal prefix
-	suf int    // literal suffix; suf >= 0 means we are scanning a literal
+	in   io.Reader
+	errh func(line, col uint, msg string)
+
+	buf       []byte // source buffer
+	ioerr     error  // pending I/O error, or nil
+	b, r, e   int    // buffer indices (see comment above)
+	line, col uint   // source position of ch (0-based)
+	ch        rune   // most recently read character
+	chw       int    // width of ch
 }
 
-// init initializes source to read from src and to report errors via errh.
-// errh must not be nil.
-func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
-	s.src = src
+const sentinel = utf8.RuneSelf
+
+func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) {
+	s.in = in
 	s.errh = errh
 
-	s.buf[0] = utf8.RuneSelf // terminate with sentinel
-	s.r0, s.r, s.w = 0, 0, 0
-	s.line0, s.line = 0, linebase
-	s.col0, s.col = 0, colbase
+	if s.buf == nil {
+		s.buf = make([]byte, nextSize(0))
+	}
+	s.buf[0] = sentinel
 	s.ioerr = nil
-
-	s.lit = s.lit[:0]
-	s.suf = -1
+	s.b, s.r, s.e = -1, 0, 0
+	s.line, s.col = 0, 0
+	s.ch = ' '
+	s.chw = 0
 }
 
-// ungetr sets the reading position to a previous reading
-// position, usually the one of the most recently read
-// rune, but possibly earlier (see unread below).
-func (s *source) ungetr() {
-	s.r, s.line, s.col = s.r0, s.line0, s.col0
-}
+// starting points for line and column numbers
+const linebase = 1
+const colbase = 1
 
-// unread moves the previous reading position to a position
-// that is n bytes earlier in the source. The next ungetr
-// call will set the reading position to that moved position.
-// The "unread" runes must be single byte and not contain any
-// newlines; and 0 <= n <= maxunread must hold.
-func (s *source) unread(n int) {
-	s.r0 -= n
-	s.col0 -= uint(n)
+// pos returns the (line, col) source position of s.ch.
+func (s *source) pos() (line, col uint) {
+	return linebase + s.line, colbase + s.col
 }
 
+// error reports the error msg at source position s.pos().
 func (s *source) error(msg string) {
-	s.errh(s.line0, s.col0, msg)
+	line, col := s.pos()
+	s.errh(line, col, msg)
 }
 
-// getr reads and returns the next rune.
-//
-// If a read or source encoding error occurs, getr
-// calls the error handler installed with init.
-// The handler must exist.
-//
-// The (line, col) position passed to the error handler
-// is always at the current source reading position.
-func (s *source) getr() rune {
-redo:
-	s.r0, s.line0, s.col0 = s.r, s.line, s.col
-
-	// We could avoid at least one test that is always taken in the
-	// for loop below by duplicating the common case code (ASCII)
-	// here since we always have at least the sentinel (utf8.RuneSelf)
-	// in the buffer. Measure and optimize if necessary.
+// start starts a new active source segment (including s.ch).
+// As long as stop has not been called, the active segment's
+// bytes (excluding s.ch) may be retrieved by calling segment.
+func (s *source) start()          { s.b = s.r - s.chw }
+func (s *source) stop()           { s.b = -1 }
+func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
+
+// rewind rewinds the scanner's read position and character s.ch
+// to the start of the currently active segment, which must not
+// contain any newlines (otherwise position information will be
+// incorrect). Currently, rewind is only needed for handling the
+// source sequence ".."; it must not be called outside an active
+// segment.
+func (s *source) rewind() {
+	// ok to verify precondition - rewind is rarely called
+	if s.b < 0 {
+		panic("no active segment")
+	}
+	s.col -= uint(s.r - s.b)
+	s.r = s.b
+	s.nextch()
+}
 
-	// make sure we have at least one rune in buffer, or we are at EOF
-	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
-		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
+func (s *source) nextch() {
+redo:
+	s.col += uint(s.chw)
+	if s.ch == '\n' {
+		s.line++
+		s.col = 0
 	}
 
-	// common case: ASCII and enough bytes
-	// (invariant: s.buf[s.w] == utf8.RuneSelf)
-	if b := s.buf[s.r]; b < utf8.RuneSelf {
+	// fast common case: at least one ASCII character
+	if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
 		s.r++
-		// TODO(gri) Optimization: Instead of adjusting s.col for each character,
-		// remember the line offset instead and then compute the offset as needed
-		// (which is less often).
-		s.col++
-		if b == 0 {
+		s.chw = 1
+		if s.ch == 0 {
 			s.error("invalid NUL character")
 			goto redo
 		}
-		if b == '\n' {
-			s.line++
-			s.col = colbase
-		}
-		return rune(b)
+		return
+	}
+
+	// slower general case: add more bytes to buffer if we don't have a full rune
+	for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
+		s.fill()
 	}
 
 	// EOF
-	if s.r == s.w {
+	if s.r == s.e {
 		if s.ioerr != io.EOF {
 			// ensure we never start with a '/' (e.g., rooted path) in the error message
 			s.error("I/O error: " + s.ioerr.Error())
+			s.ioerr = nil
 		}
-		return -1
+		s.ch = -1
+		s.chw = 0
+		return
 	}
 
-	// uncommon case: not ASCII
-	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
-	s.r += w
-	s.col += uint(w)
+	s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e])
+	s.r += s.chw
 
-	if r == utf8.RuneError && w == 1 {
+	if s.ch == utf8.RuneError && s.chw == 1 {
 		s.error("invalid UTF-8 encoding")
 		goto redo
 	}
 
 	// BOM's are only allowed as the first character in a file
 	const BOM = 0xfeff
-	if r == BOM {
-		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to maxunread)
+	if s.ch == BOM {
+		if s.line > 0 || s.col > 0 {
 			s.error("invalid BOM in the middle of the file")
 		}
 		goto redo
 	}
-
-	return r
 }
 
+// fill reads more source bytes into s.buf.
+// It returns with at least one more byte in the buffer, or with s.ioerr != nil.
 func (s *source) fill() {
-	// Slide unread bytes to beginning but preserve last read char
-	// (for one ungetr call) plus maxunread extra bytes (for one
-	// unread call).
-	if s.r0 > maxunread {
-		n := s.r0 - maxunread // number of bytes to slide down
-		// save literal prefix, if any
-		// (make sure we keep maxunread bytes and the last
-		// read char in the buffer)
-		if s.suf >= 0 {
-			// we have a literal
-			if s.suf < n {
-				// save literal prefix
-				s.lit = append(s.lit, s.buf[s.suf:n]...)
-				s.suf = 0
-			} else {
-				s.suf -= n
-			}
-		}
-		copy(s.buf[:], s.buf[n:s.w])
-		s.r0 = maxunread // eqv: s.r0 -= n
-		s.r -= n
-		s.w -= n
+	// determine content to preserve
+	b := s.r
+	if s.b >= 0 {
+		b = s.b
+		s.b = 0 // after buffer has grown or content has been moved down
 	}
+	content := s.buf[b:s.e]
+
+	// grow buffer or move content down
+	if len(content)*2 > len(s.buf) {
+		s.buf = make([]byte, nextSize(len(s.buf)))
+		copy(s.buf, content)
+	} else if b > 0 {
+		copy(s.buf, content)
+	}
+	s.r -= b
+	s.e -= b
 
 	// read more data: try a limited number of times
-	for i := 100; i > 0; i-- {
-		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
+	for i := 0; i < 10; i++ {
+		var n int
+		n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel
 		if n < 0 {
 			panic("negative read") // incorrect underlying io.Reader implementation
 		}
-		s.w += n
-		if n > 0 || err != nil {
-			s.buf[s.w] = utf8.RuneSelf // sentinel
-			if err != nil {
-				s.ioerr = err
-			}
+		if n > 0 || s.ioerr != nil {
+			s.e += n
+			s.buf[s.e] = sentinel
 			return
 		}
+		// n == 0
 	}
 
-	s.buf[s.w] = utf8.RuneSelf // sentinel
+	s.buf[s.e] = sentinel
 	s.ioerr = io.ErrNoProgress
 }
 
-func (s *source) startLit() {
-	s.suf = s.r0
-	s.lit = s.lit[:0] // reuse lit
-}
-
-func (s *source) stopLit() []byte {
-	lit := s.buf[s.suf:s.r]
-	if len(s.lit) > 0 {
-		lit = append(s.lit, lit...)
+// nextSize returns the next bigger size for a buffer of a given size.
+func nextSize(size int) int {
+	const min = 4 << 10 // 4K: minimum buffer size
+	const max = 1 << 20 // 1M: maximum buffer size which is still doubled
+	if size < min {
+		return min
 	}
-	s.killLit()
-	return lit
-}
-
-func (s *source) killLit() {
-	s.suf = -1 // no pending literal
+	if size <= max {
+		return size << 1
+	}
+	return size + max
 }