summaryrefslogtreecommitdiff
path: root/src/unicode
diff options
context:
space:
mode:
authorJoe Tsai <joetsai@digital-static.net>2021-08-11 23:51:09 -0700
committerJoe Tsai <joetsai@digital-static.net>2021-08-28 01:49:50 +0000
commitf371b30f326b66e4c5c13c7ea51358a42c431752 (patch)
tree15cc06413794b4afdf317eb35680a1929a80fa0e /src/unicode
parentef4cb2f776f1c7e861604d1b46ab8fcf2672f10d (diff)
downloadgo-git-f371b30f326b66e4c5c13c7ea51358a42c431752.tar.gz
unicode/utf8: add AppendRune
AppendRune appends the UTF-8 encoding of a rune to a []byte. It is generally more user-friendly than EncodeRune. EncodeASCIIRune-4 2.35ns ± 2% EncodeJapaneseRune-4 4.60ns ± 2% AppendASCIIRune-4 0.30ns ± 3% AppendJapaneseRune-4 4.70ns ± 2% The ASCII case is written to be inlineable. Fixes #47609 Change-Id: If4f71eedffd2bd4ef0d7f960cb55b41c637eec54 Reviewed-on: https://go-review.googlesource.com/c/go/+/345571 Trust: Joe Tsai <joetsai@digital-static.net> Reviewed-by: Rob Pike <r@golang.org> Run-TryBot: Rob Pike <r@golang.org> TryBot-Result: Go Bot <gobot@golang.org>
Diffstat (limited to 'src/unicode')
-rw-r--r--src/unicode/utf8/utf8.go26
-rw-r--r--src/unicode/utf8/utf8_test.go25
2 files changed, 51 insertions, 0 deletions
diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go
index 557e8a7770..6938c7e6a7 100644
--- a/src/unicode/utf8/utf8.go
+++ b/src/unicode/utf8/utf8.go
@@ -369,6 +369,32 @@ func EncodeRune(p []byte, r rune) int {
}
}
+// AppendRune extends p with the UTF-8 encoding of r and returns the
+// resulting slice. A rune that is out of range is encoded as RuneError.
+// Whether p's backing array is reused follows the rules of append.
+func AppendRune(p []byte, r rune) []byte {
+	// Only the single-byte case lives here, keeping the body small enough to inline.
+	if uint32(r) > rune1Max {
+		return appendRuneNonASCII(p, r)
+	}
+	return append(p, byte(r))
+}
+
+// appendRuneNonASCII appends the encoding of r to p for the runes that
+// AppendRune does not handle as a single byte. Surrogates and values
+// above MaxRune are encoded as RuneError.
+func appendRuneNonASCII(p []byte, r rune) []byte {
+	// Viewed as unsigned, a negative rune lands above MaxRune and is
+	// therefore caught by the same range check as any other invalid value.
+	u := uint32(r)
+	if u <= rune2Max {
+		return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
+	}
+	if u > MaxRune || (surrogateMin <= u && u <= surrogateMax) {
+		r = RuneError
+	} else if u > rune3Max {
+		return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
+	}
+	// Three-byte form; also reached after r has been replaced by RuneError.
+	return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
+}
+
// RuneCount returns the number of runes in p. Erroneous and short
// encodings are treated as single runes of width 1 byte.
func RuneCount(p []byte) int {
diff --git a/src/unicode/utf8/utf8_test.go b/src/unicode/utf8/utf8_test.go
index eaf1b5ffee..a60040ecfd 100644
--- a/src/unicode/utf8/utf8_test.go
+++ b/src/unicode/utf8/utf8_test.go
@@ -127,6 +127,17 @@ func TestEncodeRune(t *testing.T) {
}
}
+// TestAppendRune checks AppendRune against every entry of utf8map, once
+// with a nil destination and once with a destination that already holds
+// the bytes "init".
+func TestAppendRune(t *testing.T) {
+	for _, m := range utf8map {
+		if buf := AppendRune(nil, m.r); string(buf) != m.str {
+			t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
+		}
+		// The message names the "init" destination so that a failure here
+		// is not mistaken for a failure of the nil case above.
+		if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str {
+			t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
+		}
+	}
+}
+
func TestDecodeRune(t *testing.T) {
for _, m := range utf8map {
b := []byte(m.str)
@@ -583,6 +594,20 @@ func BenchmarkEncodeJapaneseRune(b *testing.B) {
}
}
+// BenchmarkAppendASCIIRune times AppendRune on the rune 'a'. The
+// destination is empty but has UTFMax bytes of capacity, and the result
+// of each call is discarded.
+func BenchmarkAppendASCIIRune(b *testing.B) {
+	dst := make([]byte, 0, UTFMax)
+	for n := 0; n < b.N; n++ {
+		AppendRune(dst, 'a')
+	}
+}
+
+// BenchmarkAppendJapaneseRune times AppendRune on the rune '本'. The
+// destination is empty but has UTFMax bytes of capacity, and the result
+// of each call is discarded.
+func BenchmarkAppendJapaneseRune(b *testing.B) {
+	dst := make([]byte, 0, UTFMax)
+	for n := 0; n < b.N; n++ {
+		AppendRune(dst, '本')
+	}
+}
+
func BenchmarkDecodeASCIIRune(b *testing.B) {
a := []byte{'a'}
for i := 0; i < b.N; i++ {