diff options
author | Rob Pike <r@golang.org> | 2012-09-13 13:59:00 -0700 |
---|---|---|
committer | Rob Pike <r@golang.org> | 2012-09-13 13:59:00 -0700 |
commit | 2a435a4dac0e24b7d1f47ce64f8e579398eb7481 (patch) | |
tree | 6fe77170e06a5c6a5844e53d5ef71973be65dbb8 /src/cmd/yacc | |
parent | cf6093df7342b3e5c870454d6c8910694aa60d99 (diff) | |
download | go-2a435a4dac0e24b7d1f47ce64f8e579398eb7481.tar.gz |
cmd/yacc: allow utf-8 token values
Also clean up the code and allow \U.
Fixes issue 3007.
R=golang-dev, rsc, 0xjnml
CC=golang-dev
http://codereview.appspot.com/6492105
Diffstat (limited to 'src/cmd/yacc')
-rw-r--r-- | src/cmd/yacc/units.y | 6 | ||||
-rw-r--r-- | src/cmd/yacc/yacc.go | 110 |
2 files changed, 63 insertions, 53 deletions
diff --git a/src/cmd/yacc/units.y b/src/cmd/yacc/units.y index 32d37e503..00ccaf2ec 100644 --- a/src/cmd/yacc/units.y +++ b/src/cmd/yacc/units.y @@ -76,7 +76,7 @@ var vflag bool %type <node> prog expr expr0 expr1 expr2 expr3 expr4 -%token <vval> VAL +%token <vval> VÄL // dieresis to test UTF-8 %token <vvar> VAR %token <numb> _SUP // tests leading underscore in token name %% @@ -199,7 +199,7 @@ expr0: $$ = $1.node } } -| VAL +| VÄL { $$ = one $$.vval = $1 @@ -275,7 +275,7 @@ numb: f = 0 } yylval.vval = f - return VAL + return VÄL } func (UnitsLex) Error(s string) { diff --git a/src/cmd/yacc/yacc.go b/src/cmd/yacc/yacc.go index a4ae35349..25bd22298 100644 --- a/src/cmd/yacc/yacc.go +++ b/src/cmd/yacc/yacc.go @@ -52,6 +52,7 @@ import ( "os" "strings" "unicode" + "unicode/utf8" ) // the following are adjustable @@ -326,7 +327,6 @@ var resrv = []Resrv{ var zznewstate = 0 const EOF = -1 -const UTFmax = 0x3f func main() { @@ -719,8 +719,8 @@ func moreprod() { } // -// define s to be a terminal if t=0 -// or a nonterminal if t=1 +// define s to be a terminal if nt==0 +// or a nonterminal if nt==1 // func defin(nt int, s string) int { val := 0 @@ -753,56 +753,66 @@ func defin(nt int, s string) int { // establish value for token // single character literal - if s[0] == ' ' && len(s) == 1+1 { - val = int(s[1]) - } else if s[0] == ' ' && s[1] == '\\' { // escape sequence - if len(s) == 2+1 { - // single character escape sequence - switch s[2] { - case '\'': - val = '\'' - case '"': - val = '"' - case '\\': - val = '\\' - case 'a': - val = '\a' - case 'b': - val = '\b' - case 'n': - val = '\n' - case 'r': - val = '\r' - case 't': - val = '\t' - case 'v': - val = '\v' - default: - errorf("invalid escape %v", s[1:3]) - } - } else if s[2] == 'u' && len(s) == 2+1+4 { // \unnnn sequence - val = 0 - s = s[3:] - for s != "" { - c := int(s[0]) - switch { - case c >= '0' && c <= '9': - c -= '0' - case c >= 'a' && c <= 'f': - c -= 'a' - 10 - case c >= 'A' && c <= 'F': - c -= 'A' - 10 + if s[0] == ' ' { + s = s[1:] + r, size := utf8.DecodeRuneInString(s) + if r == utf8.RuneError && size == 1 { + errorf("invalid UTF-8 sequence %q", s) + } + val = int(r) + if val == '\\' { // escape sequence + switch { + case len(s) == 2: + // single character escape sequence + switch s[1] { + case '\'': + val = '\'' + case '"': + val = '"' + case '\\': + val = '\\' + case 'a': + val = '\a' + case 'b': + val = '\b' + case 'f': + val = '\f' + case 'n': + val = '\n' + case 'r': + val = '\r' + case 't': + val = '\t' + case 'v': + val = '\v' default: - errorf("illegal \\unnnn construction") + errorf("invalid escape %s", s) } - val = val*16 + c - s = s[1:] - } - if val == 0 { - errorf("'\\u0000' is illegal") + case s[1] == 'u' && len(s) == 2+4, // \unnnn sequence + s[1] == 'U' && len(s) == 2+8: // \Unnnnnnnn sequence + val = 0 + s = s[2:] + for s != "" { + c := int(s[0]) + switch { + case c >= '0' && c <= '9': + c -= '0' + case c >= 'a' && c <= 'f': + c -= 'a' - 10 + case c >= 'A' && c <= 'F': + c -= 'A' - 10 + default: + errorf(`illegal \u or \U construction`) + } + val = val*16 + c + s = s[1:] + } + default: + errorf("invalid escape %s", s) } - } else { - errorf("unknown escape") + } + if val == 0 { + errorf("token value 0 is illegal") } } else { val = extval |