build: move package sources from src/pkg to src

Preparation was in CL 134570043. This CL contains only the effect of 'hg mv src/pkg/* src'. For more about the move, see golang.org/s/go14nopkg.
author: Russ Cox <rsc@golang.org> 2014-09-08 00:08:51 -0400
committer: Russ Cox <rsc@golang.org> 2014-09-08 00:08:51 -0400
commit: 8528da672cc093d4dd06732819abc1f7b6b5a46e (patch)
tree: 334be80d4a4c85b77db6f6fdb67cbf0528cba5f5 /src/unicode/utf8
parent: 73bcb69f272cbf34ddcc9daa56427a8683b5a95d (diff)
download: go-8528da672cc093d4dd06732819abc1f7b6b5a46e.tar.gz
3 files changed, 1075 insertions, 0 deletions
diff --git a/src/unicode/utf8/example_test.go b/src/unicode/utf8/example_test.go
new file mode 100644
index 000000000..7b3e7ac74
--- /dev/null
+++ b/src/unicode/utf8/example_test.go
@@ -0,0 +1,196 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8_test
+
+import (
+	"fmt"
+	"unicode/utf8"
+)
+
+func ExampleDecodeLastRune() {
+	b := []byte("Hello, 世界")
+
+	for len(b) > 0 {
+		r, size := utf8.DecodeLastRune(b)
+		fmt.Printf("%c %v\n", r, size)
+
+		b = b[:len(b)-size]
+	}
+	// Output:
+	// 界 3
+	// 世 3
+	//   1
+	// , 1
+	// o 1
+	// l 1
+	// l 1
+	// e 1
+	// H 1
+}
+
+func ExampleDecodeLastRuneInString() {
+	str := "Hello, 世界"
+
+	for len(str) > 0 {
+		r, size := utf8.DecodeLastRuneInString(str)
+		fmt.Printf("%c %v\n", r, size)
+
+		str = str[:len(str)-size]
+	}
+	// Output:
+	// 界 3
+	// 世 3
+	//   1
+	// , 1
+	// o 1
+	// l 1
+	// l 1
+	// e 1
+	// H 1
+
+}
+
+func ExampleDecodeRune() {
+	b := []byte("Hello, 世界")
+
+	for len(b) > 0 {
+		r, size := utf8.DecodeRune(b)
+		fmt.Printf("%c %v\n", r, size)
+
+		b = b[size:]
+	}
+	// Output:
+	// H 1
+	// e 1
+	// l 1
+	// l 1
+	// o 1
+	// , 1
+	//   1
+	// 世 3
+	// 界 3
+}
+
+func ExampleDecodeRuneInString() {
+	str := "Hello, 世界"
+
+	for len(str) > 0 {
+		r, size := utf8.DecodeRuneInString(str)
+		fmt.Printf("%c %v\n", r, size)
+
+		str = str[size:]
+	}
+	// Output:
+	// H 1
+	// e 1
+	// l 1
+	// l 1
+	// o 1
+	// , 1
+	//   1
+	// 世 3
+	// 界 3
+}
+
+func ExampleEncodeRune() {
+	r := '世'
+	buf := make([]byte, 3)
+
+	n := utf8.EncodeRune(buf, r)
+
+	fmt.Println(buf)
+	fmt.Println(n)
+	// Output:
+	// [228 184 150]
+	// 3
+}
+
+func ExampleFullRune() {
+	buf := []byte{228, 184, 150} // 世
+	fmt.Println(utf8.FullRune(buf))
+	fmt.Println(utf8.FullRune(buf[:2]))
+	// Output:
+	// true
+	// false
+}
+
+func ExampleFullRuneInString() {
+	str := "世"
+	fmt.Println(utf8.FullRuneInString(str))
+	fmt.Println(utf8.FullRuneInString(str[:2]))
+	// Output:
+	// true
+	// false
+}
+
+func ExampleRuneCount() {
+	buf := []byte("Hello, 世界")
+	fmt.Println("bytes =", len(buf))
+	fmt.Println("runes =", utf8.RuneCount(buf))
+	// Output:
+	// bytes = 13
+	// runes = 9
+}
+
+func ExampleRuneCountInString() {
+	str := "Hello, 世界"
+	fmt.Println("bytes =", len(str))
+	fmt.Println("runes =", utf8.RuneCountInString(str))
+	// Output:
+	// bytes = 13
+	// runes = 9
+}
+
+func ExampleRuneLen() {
+	fmt.Println(utf8.RuneLen('a'))
+	fmt.Println(utf8.RuneLen('界'))
+	// Output:
+	// 1
+	// 3
+}
+
+func ExampleRuneStart() {
+	buf := []byte("a界")
+	fmt.Println(utf8.RuneStart(buf[0]))
+	fmt.Println(utf8.RuneStart(buf[1]))
+	fmt.Println(utf8.RuneStart(buf[2]))
+	// Output:
+	// true
+	// true
+	// false
+}
+
+func ExampleValid() {
+	valid := []byte("Hello, 世界")
+	invalid := []byte{0xff, 0xfe, 0xfd}
+
+	fmt.Println(utf8.Valid(valid))
+	fmt.Println(utf8.Valid(invalid))
+	// Output:
+	// true
+	// false
+}
+
+func ExampleValidRune() {
+	valid := 'a'
+	invalid := rune(0xfffffff)
+
+	fmt.Println(utf8.ValidRune(valid))
+	fmt.Println(utf8.ValidRune(invalid))
+	// Output:
+	// true
+	// false
+}
+
+func ExampleValidString() {
+	valid := "Hello, 世界"
+	invalid := string([]byte{0xff, 0xfe, 0xfd})
+
+	fmt.Println(utf8.ValidString(valid))
+	fmt.Println(utf8.ValidString(invalid))
+	// Output:
+	// true
+	// false
+}
diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go
new file mode 100644
index 000000000..253295ad3
--- /dev/null
+++ b/src/unicode/utf8/utf8.go
@@ -0,0 +1,435 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package utf8 implements functions and constants to support text encoded in
+// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
+package utf8
+
+// The conditions RuneError==unicode.ReplacementChar and
+// MaxRune==unicode.MaxRune are verified in the tests.
+// Defining them locally avoids this package depending on package unicode.
+
+// Numbers fundamental to the encoding.
+const (
+	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
+	RuneSelf  = 0x80         // characters below Runeself are represented as themselves in a single byte.
+	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
+	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
+)
+
+// Code points in the surrogate range are not valid for UTF-8.
+const (
+	surrogateMin = 0xD800
+	surrogateMax = 0xDFFF
+)
+
+const (
+	t1 = 0x00 // 0000 0000
+	tx = 0x80 // 1000 0000
+	t2 = 0xC0 // 1100 0000
+	t3 = 0xE0 // 1110 0000
+	t4 = 0xF0 // 1111 0000
+	t5 = 0xF8 // 1111 1000
+
+	maskx = 0x3F // 0011 1111
+	mask2 = 0x1F // 0001 1111
+	mask3 = 0x0F // 0000 1111
+	mask4 = 0x07 // 0000 0111
+
+	rune1Max = 1<<7 - 1
+	rune2Max = 1<<11 - 1
+	rune3Max = 1<<16 - 1
+)
+
+func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
+	n := len(p)
+	if n < 1 {
+		return RuneError, 0, true
+	}
+	c0 := p[0]
+
+	// 1-byte, 7-bit sequence?
+	if c0 < tx {
+		return rune(c0), 1, false
+	}
+
+	// unexpected continuation byte?
+	if c0 < t2 {
+		return RuneError, 1, false
+	}
+
+	// need first continuation byte
+	if n < 2 {
+		return RuneError, 1, true
+	}
+	c1 := p[1]
+	if c1 < tx || t2 <= c1 {
+		return RuneError, 1, false
+	}
+
+	// 2-byte, 11-bit sequence?
+	if c0 < t3 {
+		r = rune(c0&mask2)<<6 | rune(c1&maskx)
+		if r <= rune1Max {
+			return RuneError, 1, false
+		}
+		return r, 2, false
+	}
+
+	// need second continuation byte
+	if n < 3 {
+		return RuneError, 1, true
+	}
+	c2 := p[2]
+	if c2 < tx || t2 <= c2 {
+		return RuneError, 1, false
+	}
+
+	// 3-byte, 16-bit sequence?
+	if c0 < t4 {
+		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
+		if r <= rune2Max {
+			return RuneError, 1, false
+		}
+		if surrogateMin <= r && r <= surrogateMax {
+			return RuneError, 1, false
+		}
+		return r, 3, false
+	}
+
+	// need third continuation byte
+	if n < 4 {
+		return RuneError, 1, true
+	}
+	c3 := p[3]
+	if c3 < tx || t2 <= c3 {
+		return RuneError, 1, false
+	}
+
+	// 4-byte, 21-bit sequence?
+	if c0 < t5 {
+		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
+		if r <= rune3Max || MaxRune < r {
+			return RuneError, 1, false
+		}
+		return r, 4, false
+	}
+
+	// error
+	return RuneError, 1, false
+}
+
+func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
+	n := len(s)
+	if n < 1 {
+		return RuneError, 0, true
+	}
+	c0 := s[0]
+
+	// 1-byte, 7-bit sequence?
+	if c0 < tx {
+		return rune(c0), 1, false
+	}
+
+	// unexpected continuation byte?
+	if c0 < t2 {
+		return RuneError, 1, false
+	}
+
+	// need first continuation byte
+	if n < 2 {
+		return RuneError, 1, true
+	}
+	c1 := s[1]
+	if c1 < tx || t2 <= c1 {
+		return RuneError, 1, false
+	}
+
+	// 2-byte, 11-bit sequence?
+	if c0 < t3 {
+		r = rune(c0&mask2)<<6 | rune(c1&maskx)
+		if r <= rune1Max {
+			return RuneError, 1, false
+		}
+		return r, 2, false
+	}
+
+	// need second continuation byte
+	if n < 3 {
+		return RuneError, 1, true
+	}
+	c2 := s[2]
+	if c2 < tx || t2 <= c2 {
+		return RuneError, 1, false
+	}
+
+	// 3-byte, 16-bit sequence?
+	if c0 < t4 {
+		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
+		if r <= rune2Max {
+			return RuneError, 1, false
+		}
+		if surrogateMin <= r && r <= surrogateMax {
+			return RuneError, 1, false
+		}
+		return r, 3, false
+	}
+
+	// need third continuation byte
+	if n < 4 {
+		return RuneError, 1, true
+	}
+	c3 := s[3]
+	if c3 < tx || t2 <= c3 {
+		return RuneError, 1, false
+	}
+
+	// 4-byte, 21-bit sequence?
+	if c0 < t5 {
+		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
+		if r <= rune3Max || MaxRune < r {
+			return RuneError, 1, false
+		}
+		return r, 4, false
+	}
+
+	// error
+	return RuneError, 1, false
+}
+
+// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
+// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
+func FullRune(p []byte) bool {
+	_, _, short := decodeRuneInternal(p)
+	return !short
+}
+
+// FullRuneInString is like FullRune but its input is a string.
+func FullRuneInString(s string) bool {
+	_, _, short := decodeRuneInStringInternal(s)
+	return !short
+}
+
+// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
+// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
+func DecodeRune(p []byte) (r rune, size int) {
+	r, size, _ = decodeRuneInternal(p)
+	return
+}
+
+// DecodeRuneInString is like DecodeRune but its input is a string.
+// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
+func DecodeRuneInString(s string) (r rune, size int) {
+	r, size, _ = decodeRuneInStringInternal(s)
+	return
+}
+
+// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes.
+// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
+func DecodeLastRune(p []byte) (r rune, size int) {
+	end := len(p)
+	if end == 0 {
+		return RuneError, 0
+	}
+	start := end - 1
+	r = rune(p[start])
+	if r < RuneSelf {
+		return r, 1
+	}
+	// guard against O(n^2) behavior when traversing
+	// backwards through strings with long sequences of
+	// invalid UTF-8.
+	lim := end - UTFMax
+	if lim < 0 {
+		lim = 0
+	}
+	for start--; start >= lim; start-- {
+		if RuneStart(p[start]) {
+			break
+		}
+	}
+	if start < 0 {
+		start = 0
+	}
+	r, size = DecodeRune(p[start:end])
+	if start+size != end {
+		return RuneError, 1
+	}
+	return r, size
+}
+
+// DecodeLastRuneInString is like DecodeLastRune but its input is a string.
+// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
+// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
+// out of range, or is not the shortest possible UTF-8 encoding for the
+// value. No other validation is performed.
+func DecodeLastRuneInString(s string) (r rune, size int) {
+	end := len(s)
+	if end == 0 {
+		return RuneError, 0
+	}
+	start := end - 1
+	r = rune(s[start])
+	if r < RuneSelf {
+		return r, 1
+	}
+	// guard against O(n^2) behavior when traversing
+	// backwards through strings with long sequences of
+	// invalid UTF-8.
+	lim := end - UTFMax
+	if lim < 0 {
+		lim = 0
+	}
+	for start--; start >= lim; start-- {
+		if RuneStart(s[start]) {
+			break
+		}
+	}
+	if start < 0 {
+		start = 0
+	}
+	r, size = DecodeRuneInString(s[start:end])
+	if start+size != end {
+		return RuneError, 1
+	}
+	return r, size
+}
+
+// RuneLen returns the number of bytes required to encode the rune.
+// It returns -1 if the rune is not a valid value to encode in UTF-8.
+func RuneLen(r rune) int {
+	switch {
+	case r < 0:
+		return -1
+	case r <= rune1Max:
+		return 1
+	case r <= rune2Max:
+		return 2
+	case surrogateMin <= r && r <= surrogateMax:
+		return -1
+	case r <= rune3Max:
+		return 3
+	case r <= MaxRune:
+		return 4
+	}
+	return -1
+}
+
+// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
+// It returns the number of bytes written.
+func EncodeRune(p []byte, r rune) int {
+	// Negative values are erroneous.  Making it unsigned addresses the problem.
+	switch i := uint32(r); {
+	case i <= rune1Max:
+		p[0] = byte(r)
+		return 1
+	case i <= rune2Max:
+		p[0] = t2 | byte(r>>6)
+		p[1] = tx | byte(r)&maskx
+		return 2
+	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
+		r = RuneError
+		fallthrough
+	case i <= rune3Max:
+		p[0] = t3 | byte(r>>12)
+		p[1] = tx | byte(r>>6)&maskx
+		p[2] = tx | byte(r)&maskx
+		return 3
+	default:
+		p[0] = t4 | byte(r>>18)
+		p[1] = tx | byte(r>>12)&maskx
+		p[2] = tx | byte(r>>6)&maskx
+		p[3] = tx | byte(r)&maskx
+		return 4
+	}
+}
+
+// RuneCount returns the number of runes in p.  Erroneous and short
+// encodings are treated as single runes of width 1 byte.
+func RuneCount(p []byte) int {
+	i := 0
+	var n int
+	for n = 0; i < len(p); n++ {
+		if p[i] < RuneSelf {
+			i++
+		} else {
+			_, size := DecodeRune(p[i:])
+			i += size
+		}
+	}
+	return n
+}
+
+// RuneCountInString is like RuneCount but its input is a string.
+func RuneCountInString(s string) (n int) {
+	for range s {
+		n++
+	}
+	return
+}
+
+// RuneStart reports whether the byte could be the first byte of
+// an encoded rune.  Second and subsequent bytes always have the top
+// two bits set to 10.
+func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
+
+// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
+func Valid(p []byte) bool {
+	i := 0
+	for i < len(p) {
+		if p[i] < RuneSelf {
+			i++
+		} else {
+			_, size := DecodeRune(p[i:])
+			if size == 1 {
+				// All valid runes of size 1 (those
+				// below RuneSelf) were handled above.
+				// This must be a RuneError.
+				return false
+			}
+			i += size
+		}
+	}
+	return true
+}
+
+// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
+func ValidString(s string) bool {
+	for i, r := range s {
+		if r == RuneError {
+			// The RuneError value can be an error
+			// sentinel value (if it's size 1) or the same
+			// value encoded properly. Decode it to see if
+			// it's the 1 byte sentinel value.
+			_, size := DecodeRuneInString(s[i:])
+			if size == 1 {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// ValidRune reports whether r can be legally encoded as UTF-8.
+// Code points that are out of range or a surrogate half are illegal.
+func ValidRune(r rune) bool {
+	switch {
+	case r < 0:
+		return false
+	case surrogateMin <= r && r <= surrogateMax:
+		return false
+	case r > MaxRune:
+		return false
+	}
+	return true
+}
diff --git a/src/unicode/utf8/utf8_test.go b/src/unicode/utf8/utf8_test.go
new file mode 100644
index 000000000..758d7a0f8
--- /dev/null
+++ b/src/unicode/utf8/utf8_test.go
@@ -0,0 +1,444 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8_test
+
+import (
+	"bytes"
+	"testing"
+	"unicode"
+	. "unicode/utf8"
+)
+
+// Validate the constants redefined from unicode.
+func init() {
+	if MaxRune != unicode.MaxRune {
+		panic("utf8.MaxRune is wrong")
+	}
+	if RuneError != unicode.ReplacementChar {
+		panic("utf8.RuneError is wrong")
+	}
+}
+
+// Validate the constants redefined from unicode.
+func TestConstants(t *testing.T) {
+	if MaxRune != unicode.MaxRune {
+		t.Errorf("utf8.MaxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune)
+	}
+	if RuneError != unicode.ReplacementChar {
+		t.Errorf("utf8.RuneError is wrong: %x should be %x", RuneError, unicode.ReplacementChar)
+	}
+}
+
+type Utf8Map struct {
+	r   rune
+	str string
+}
+
+var utf8map = []Utf8Map{
+	{0x0000, "\x00"},
+	{0x0001, "\x01"},
+	{0x007e, "\x7e"},
+	{0x007f, "\x7f"},
+	{0x0080, "\xc2\x80"},
+	{0x0081, "\xc2\x81"},
+	{0x00bf, "\xc2\xbf"},
+	{0x00c0, "\xc3\x80"},
+	{0x00c1, "\xc3\x81"},
+	{0x00c8, "\xc3\x88"},
+	{0x00d0, "\xc3\x90"},
+	{0x00e0, "\xc3\xa0"},
+	{0x00f0, "\xc3\xb0"},
+	{0x00f8, "\xc3\xb8"},
+	{0x00ff, "\xc3\xbf"},
+	{0x0100, "\xc4\x80"},
+	{0x07ff, "\xdf\xbf"},
+	{0x0800, "\xe0\xa0\x80"},
+	{0x0801, "\xe0\xa0\x81"},
+	{0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
+	{0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
+	{0xfffe, "\xef\xbf\xbe"},
+	{0xffff, "\xef\xbf\xbf"},
+	{0x10000, "\xf0\x90\x80\x80"},
+	{0x10001, "\xf0\x90\x80\x81"},
+	{0x10fffe, "\xf4\x8f\xbf\xbe"},
+	{0x10ffff, "\xf4\x8f\xbf\xbf"},
+	{0xFFFD, "\xef\xbf\xbd"},
+}
+
+var surrogateMap = []Utf8Map{
+	{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
+	{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
+}
+
+var testStrings = []string{
+	"",
+	"abcd",
+	"☺☻☹",
+	"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+	"\x80\x80\x80\x80",
+}
+
+func TestFullRune(t *testing.T) {
+	for _, m := range utf8map {
+		b := []byte(m.str)
+		if !FullRune(b) {
+			t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
+		}
+		s := m.str
+		if !FullRuneInString(s) {
+			t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
+		}
+		b1 := b[0 : len(b)-1]
+		if FullRune(b1) {
+			t.Errorf("FullRune(%q) = true, want false", b1)
+		}
+		s1 := string(b1)
+		if FullRuneInString(s1) {
+			t.Errorf("FullRune(%q) = true, want false", s1)
+		}
+	}
+}
+
+func TestEncodeRune(t *testing.T) {
+	for _, m := range utf8map {
+		b := []byte(m.str)
+		var buf [10]byte
+		n := EncodeRune(buf[0:], m.r)
+		b1 := buf[0:n]
+		if !bytes.Equal(b, b1) {
+			t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
+		}
+	}
+}
+
+func TestDecodeRune(t *testing.T) {
+	for _, m := range utf8map {
+		b := []byte(m.str)
+		r, size := DecodeRune(b)
+		if r != m.r || size != len(b) {
+			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
+		}
+		s := m.str
+		r, size = DecodeRuneInString(s)
+		if r != m.r || size != len(b) {
+			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
+		}
+
+		// there's an extra byte that bytes left behind - make sure trailing byte works
+		r, size = DecodeRune(b[0:cap(b)])
+		if r != m.r || size != len(b) {
+			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
+		}
+		s = m.str + "\x00"
+		r, size = DecodeRuneInString(s)
+		if r != m.r || size != len(b) {
+			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
+		}
+
+		// make sure missing bytes fail
+		wantsize := 1
+		if wantsize >= len(b) {
+			wantsize = 0
+		}
+		r, size = DecodeRune(b[0 : len(b)-1])
+		if r != RuneError || size != wantsize {
+			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize)
+		}
+		s = m.str[0 : len(m.str)-1]
+		r, size = DecodeRuneInString(s)
+		if r != RuneError || size != wantsize {
+			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
+		}
+
+		// make sure bad sequences fail
+		if len(b) == 1 {
+			b[0] = 0x80
+		} else {
+			b[len(b)-1] = 0x7F
+		}
+		r, size = DecodeRune(b)
+		if r != RuneError || size != 1 {
+			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
+		}
+		s = string(b)
+		r, size = DecodeRuneInString(s)
+		if r != RuneError || size != 1 {
+			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
+		}
+
+	}
+}
+
+func TestDecodeSurrogateRune(t *testing.T) {
+	for _, m := range surrogateMap {
+		b := []byte(m.str)
+		r, size := DecodeRune(b)
+		if r != RuneError || size != 1 {
+			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
+		}
+		s := m.str
+		r, size = DecodeRuneInString(s)
+		if r != RuneError || size != 1 {
+			t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
+		}
+	}
+}
+
+// Check that DecodeRune and DecodeLastRune correspond to
+// the equivalent range loop.
+func TestSequencing(t *testing.T) {
+	for _, ts := range testStrings {
+		for _, m := range utf8map {
+			for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
+				testSequence(t, s)
+			}
+		}
+	}
+}
+
+// Check that a range loop and a []int conversion visit the same runes.
+// Not really a test of this package, but the assumption is used here and
+// it's good to verify
+func TestIntConversion(t *testing.T) {
+	for _, ts := range testStrings {
+		runes := []rune(ts)
+		if RuneCountInString(ts) != len(runes) {
+			t.Errorf("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts))
+			break
+		}
+		i := 0
+		for _, r := range ts {
+			if r != runes[i] {
+				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
+			}
+			i++
+		}
+	}
+}
+
+func testSequence(t *testing.T, s string) {
+	type info struct {
+		index int
+		r     rune
+	}
+	index := make([]info, len(s))
+	b := []byte(s)
+	si := 0
+	j := 0
+	for i, r := range s {
+		if si != i {
+			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
+			return
+		}
+		index[j] = info{i, r}
+		j++
+		r1, size1 := DecodeRune(b[i:])
+		if r != r1 {
+			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r)
+			return
+		}
+		r2, size2 := DecodeRuneInString(s[i:])
+		if r != r2 {
+			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], r2, r)
+			return
+		}
+		if size1 != size2 {
+			t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2)
+			return
+		}
+		si += size1
+	}
+	j--
+	for si = len(s); si > 0; {
+		r1, size1 := DecodeLastRune(b[0:si])
+		r2, size2 := DecodeLastRuneInString(s[0:si])
+		if size1 != size2 {
+			t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2)
+			return
+		}
+		if r1 != index[j].r {
+			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r1, index[j].r)
+			return
+		}
+		if r2 != index[j].r {
+			t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r)
+			return
+		}
+		si -= size1
+		if si != index[j].index {
+			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
+			return
+		}
+		j--
+	}
+	if si != 0 {
+		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
+	}
+}
+
+// Check that negative runes encode as U+FFFD.
+func TestNegativeRune(t *testing.T) {
+	errorbuf := make([]byte, UTFMax)
+	errorbuf = errorbuf[0:EncodeRune(errorbuf, RuneError)]
+	buf := make([]byte, UTFMax)
+	buf = buf[0:EncodeRune(buf, -1)]
+	if !bytes.Equal(buf, errorbuf) {
+		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf)
+	}
+}
+
+type RuneCountTest struct {
+	in  string
+	out int
+}
+
+var runecounttests = []RuneCountTest{
+	{"abcd", 4},
+	{"☺☻☹", 3},
+	{"1,2,3,4", 7},
+	{"\xe2\x00", 2},
+}
+
+func TestRuneCount(t *testing.T) {
+	for _, tt := range runecounttests {
+		if out := RuneCountInString(tt.in); out != tt.out {
+			t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
+		}
+		if out := RuneCount([]byte(tt.in)); out != tt.out {
+			t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
+		}
+	}
+}
+
+type RuneLenTest struct {
+	r    rune
+	size int
+}
+
+var runelentests = []RuneLenTest{
+	{0, 1},
+	{'e', 1},
+	{'é', 2},
+	{'☺', 3},
+	{RuneError, 3},
+	{MaxRune, 4},
+	{0xD800, -1},
+	{0xDFFF, -1},
+	{MaxRune + 1, -1},
+	{-1, -1},
+}
+
+func TestRuneLen(t *testing.T) {
+	for _, tt := range runelentests {
+		if size := RuneLen(tt.r); size != tt.size {
+			t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
+		}
+	}
+}
+
+type ValidTest struct {
+	in  string
+	out bool
+}
+
+var validTests = []ValidTest{
+	{"", true},
+	{"a", true},
+	{"abc", true},
+	{"Ж", true},
+	{"ЖЖ", true},
+	{"брэд-ЛГТМ", true},
+	{"☺☻☹", true},
+	{string([]byte{66, 250}), false},
+	{string([]byte{66, 250, 67}), false},
+	{"a\uFFFDb", true},
+	{string("\xF4\x8F\xBF\xBF"), true},      // U+10FFFF
+	{string("\xF4\x90\x80\x80"), false},     // U+10FFFF+1; out of range
+	{string("\xF7\xBF\xBF\xBF"), false},     // 0x1FFFFF; out of range
+	{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
+	{string("\xc0\x80"), false},             // U+0000 encoded in two bytes: incorrect
+	{string("\xed\xa0\x80"), false},         // U+D800 high surrogate (sic)
+	{string("\xed\xbf\xbf"), false},         // U+DFFF low surrogate (sic)
+}
+
+func TestValid(t *testing.T) {
+	for _, tt := range validTests {
+		if Valid([]byte(tt.in)) != tt.out {
+			t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out)
+		}
+		if ValidString(tt.in) != tt.out {
+			t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out)
+		}
+	}
+}
+
+type ValidRuneTest struct {
+	r  rune
+	ok bool
+}
+
+var validrunetests = []ValidRuneTest{
+	{0, true},
+	{'e', true},
+	{'é', true},
+	{'☺', true},
+	{RuneError, true},
+	{MaxRune, true},
+	{0xD7FF, true},
+	{0xD800, false},
+	{0xDFFF, false},
+	{0xE000, true},
+	{MaxRune + 1, false},
+	{-1, false},
+}
+
+func TestValidRune(t *testing.T) {
+	for _, tt := range validrunetests {
+		if ok := ValidRune(tt.r); ok != tt.ok {
+			t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
+		}
+	}
+}
+
+func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		RuneCountInString("0123456789")
+	}
+}
+
+func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		RuneCountInString("日本語日本語日本語日")
+	}
+}
+
+func BenchmarkEncodeASCIIRune(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(buf, 'a')
+	}
+}
+
+func BenchmarkEncodeJapaneseRune(b *testing.B) {
+	buf := make([]byte, UTFMax)
+	for i := 0; i < b.N; i++ {
+		EncodeRune(buf, '本')
+	}
+}
+
+func BenchmarkDecodeASCIIRune(b *testing.B) {
+	a := []byte{'a'}
+	for i := 0; i < b.N; i++ {
+		DecodeRune(a)
+	}
+}
+
+func BenchmarkDecodeJapaneseRune(b *testing.B) {
+	nihon := []byte("本")
+	for i := 0; i < b.N; i++ {
+		DecodeRune(nihon)
+	}
+}
author	Russ Cox <rsc@golang.org>	2014-09-08 00:08:51 -0400
committer	Russ Cox <rsc@golang.org>	2014-09-08 00:08:51 -0400
commit	8528da672cc093d4dd06732819abc1f7b6b5a46e (patch)
tree	334be80d4a4c85b77db6f6fdb67cbf0528cba5f5 /src/unicode/utf8
parent	73bcb69f272cbf34ddcc9daa56427a8683b5a95d (diff)
download	go-8528da672cc093d4dd06732819abc1f7b6b5a46e.tar.gz