1 files changed, 28 insertions, 9 deletions
diff --git a/libgo/go/text/scanner/scanner.go b/libgo/go/text/scanner/scanner.go
index e0d86e343d..5199ee4fc7 100644
--- a/libgo/go/text/scanner/scanner.go
+++ b/libgo/go/text/scanner/scanner.go
@@ -11,7 +11,7 @@
 // By default, a Scanner skips white space and Go comments and recognizes all
 // literals as defined by the Go language specification.  It may be
 // customized to recognize only a subset of those literals and to recognize
-// different white space characters.
+// different identifier and white space characters.
 //
 // Basic usage pattern:
 //
@@ -34,8 +34,6 @@ import (
 	"unicode/utf8"
 )
 
-// TODO(gri): Consider changing this to use the new (token) Position package.
-
 // A source position is represented by a Position value.
 // A position is valid if Line > 0.
 type Position struct {
@@ -68,6 +66,12 @@ func (pos Position) String() string {
 //
 //	ScanIdents | ScanInts | SkipComments
 //
+// With the exceptions of comments, which are skipped if SkipComments is
+// set, unrecognized tokens are not ignored. Instead, the scanner simply
+// returns the respective individual characters (or possibly sub-tokens).
+// For instance, if the mode is ScanIdents (not ScanStrings), the string
+// "foo" is scanned as the token sequence '"' Ident '"'.
+//
 const (
 	ScanIdents     = 1 << -Ident
 	ScanInts       = 1 << -Int
@@ -164,6 +168,13 @@ type Scanner struct {
 	// for values ch > ' '). The field may be changed at any time.
 	Whitespace uint64
 
+	// IsIdentRune is a predicate controlling the characters accepted
+	// as the ith rune in an identifier. The set of valid characters
+	// must not intersect with the set of white space characters.
+	// If no IsIdentRune function is set, regular Go identifiers are
+	// accepted instead. The field may be changed at any time.
+	IsIdentRune func(ch rune, i int) bool
+
 	// Start position of most recently scanned token; set by Scan.
 	// Calling Init or Next invalidates the position (Line == 0).
 	// The Filename field is always left untouched by the Scanner.
@@ -240,6 +251,9 @@ func (s *Scanner) next() rune {
 			s.srcEnd = i + n
 			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
 			if err != nil {
+				if err != io.EOF {
+					s.error(err.Error())
+				}
 				if s.srcEnd == 0 {
 					if s.lastCharLen > 0 {
 						// previous character was not EOF
@@ -248,9 +262,6 @@ func (s *Scanner) next() rune {
 					s.lastCharLen = 0
 					return EOF
 				}
-				if err != io.EOF {
-					s.error(err.Error())
-				}
 				// If err == EOF, we won't be getting more
 				// bytes; break to avoid infinite loop. If
 				// err is something else, we don't know if
@@ -334,9 +345,17 @@ func (s *Scanner) error(msg string) {
 	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
 }
 
+func (s *Scanner) isIdentRune(ch rune, i int) bool {
+	if s.IsIdentRune != nil {
+		return s.IsIdentRune(ch, i)
+	}
+	return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0
+}
+
 func (s *Scanner) scanIdentifier() rune {
-	ch := s.next() // read character after first '_' or letter
-	for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) {
+	// we know the zero'th rune is OK; start scanning at the next one
+	ch := s.next()
+	for i := 1; s.isIdentRune(ch, i); i++ {
 		ch = s.next()
 	}
 	return ch
@@ -563,7 +582,7 @@ redo:
 	// determine token value
 	tok := ch
 	switch {
-	case unicode.IsLetter(ch) || ch == '_':
+	case s.isIdentRune(ch, 0):
 		if s.Mode&ScanIdents != 0 {
 			tok = Ident
 			ch = s.scanIdentifier()