summary refs log tree commit diff
path: root/src/cmd/yacc
diff options
context:
space:
mode:
author Rob Pike <r@golang.org> 2012-09-13 13:59:00 -0700
committer Rob Pike <r@golang.org> 2012-09-13 13:59:00 -0700
commit 2a435a4dac0e24b7d1f47ce64f8e579398eb7481 (patch)
tree 6fe77170e06a5c6a5844e53d5ef71973be65dbb8 /src/cmd/yacc
parent cf6093df7342b3e5c870454d6c8910694aa60d99 (diff)
download go-2a435a4dac0e24b7d1f47ce64f8e579398eb7481.tar.gz
cmd/yacc: allow utf-8 token values
Also clean up the code and allow \U.
Fixes issue 3007.
R=golang-dev, rsc, 0xjnml
CC=golang-dev
http://codereview.appspot.com/6492105
Diffstat (limited to 'src/cmd/yacc')
-rw-r--r-- src/cmd/yacc/units.y 6
-rw-r--r-- src/cmd/yacc/yacc.go 110
2 files changed, 63 insertions, 53 deletions
diff --git a/src/cmd/yacc/units.y b/src/cmd/yacc/units.y
index 32d37e503..00ccaf2ec 100644
--- a/src/cmd/yacc/units.y
+++ b/src/cmd/yacc/units.y
@@ -76,7 +76,7 @@ var vflag bool
%type <node> prog expr expr0 expr1 expr2 expr3 expr4
-%token <vval> VAL
+%token <vval> VÄL // dieresis to test UTF-8
%token <vvar> VAR
%token <numb> _SUP // tests leading underscore in token name
%%
@@ -199,7 +199,7 @@ expr0:
$$ = $1.node
}
}
-| VAL
+| VÄL
{
$$ = one
$$.vval = $1
@@ -275,7 +275,7 @@ numb:
f = 0
}
yylval.vval = f
- return VAL
+ return VÄL
}
func (UnitsLex) Error(s string) {
diff --git a/src/cmd/yacc/yacc.go b/src/cmd/yacc/yacc.go
index a4ae35349..25bd22298 100644
--- a/src/cmd/yacc/yacc.go
+++ b/src/cmd/yacc/yacc.go
@@ -52,6 +52,7 @@ import (
"os"
"strings"
"unicode"
+ "unicode/utf8"
)
// the following are adjustable
@@ -326,7 +327,6 @@ var resrv = []Resrv{
var zznewstate = 0
const EOF = -1
-const UTFmax = 0x3f
func main() {
@@ -719,8 +719,8 @@ func moreprod() {
}
//
-// define s to be a terminal if t=0
-// or a nonterminal if t=1
+// define s to be a terminal if nt==0
+// or a nonterminal if nt==1
//
func defin(nt int, s string) int {
val := 0
@@ -753,56 +753,66 @@ func defin(nt int, s string) int {
// establish value for token
// single character literal
- if s[0] == ' ' && len(s) == 1+1 {
- val = int(s[1])
- } else if s[0] == ' ' && s[1] == '\\' { // escape sequence
- if len(s) == 2+1 {
- // single character escape sequence
- switch s[2] {
- case '\'':
- val = '\''
- case '"':
- val = '"'
- case '\\':
- val = '\\'
- case 'a':
- val = '\a'
- case 'b':
- val = '\b'
- case 'n':
- val = '\n'
- case 'r':
- val = '\r'
- case 't':
- val = '\t'
- case 'v':
- val = '\v'
- default:
- errorf("invalid escape %v", s[1:3])
- }
- } else if s[2] == 'u' && len(s) == 2+1+4 { // \unnnn sequence
- val = 0
- s = s[3:]
- for s != "" {
- c := int(s[0])
- switch {
- case c >= '0' && c <= '9':
- c -= '0'
- case c >= 'a' && c <= 'f':
- c -= 'a' - 10
- case c >= 'A' && c <= 'F':
- c -= 'A' - 10
+ if s[0] == ' ' {
+ s = s[1:]
+ r, size := utf8.DecodeRuneInString(s)
+ if r == utf8.RuneError && size == 1 {
+ errorf("invalid UTF-8 sequence %q", s)
+ }
+ val = int(r)
+ if val == '\\' { // escape sequence
+ switch {
+ case len(s) == 2:
+ // single character escape sequence
+ switch s[1] {
+ case '\'':
+ val = '\''
+ case '"':
+ val = '"'
+ case '\\':
+ val = '\\'
+ case 'a':
+ val = '\a'
+ case 'b':
+ val = '\b'
+ case 'f':
+ val = '\f'
+ case 'n':
+ val = '\n'
+ case 'r':
+ val = '\r'
+ case 't':
+ val = '\t'
+ case 'v':
+ val = '\v'
default:
- errorf("illegal \\unnnn construction")
+ errorf("invalid escape %s", s)
}
- val = val*16 + c
- s = s[1:]
- }
- if val == 0 {
- errorf("'\\u0000' is illegal")
+ case s[1] == 'u' && len(s) == 2+4, // \unnnn sequence
+ s[1] == 'U' && len(s) == 2+8: // \Unnnnnnnn sequence
+ val = 0
+ s = s[2:]
+ for s != "" {
+ c := int(s[0])
+ switch {
+ case c >= '0' && c <= '9':
+ c -= '0'
+ case c >= 'a' && c <= 'f':
+ c -= 'a' - 10
+ case c >= 'A' && c <= 'F':
+ c -= 'A' - 10
+ default:
+ errorf(`illegal \u or \U construction`)
+ }
+ val = val*16 + c
+ s = s[1:]
+ }
+ default:
+ errorf("invalid escape %s", s)
}
- } else {
- errorf("unknown escape")
+ }
+ if val == 0 {
+ errorf("token value 0 is illegal")
}
} else {
val = extval