diff options
author | qmuntal <quimmuntal@gmail.com> | 2023-05-05 18:17:18 +0200 |
---|---|---|
committer | Quim Muntal <quimmuntal@gmail.com> | 2023-05-15 09:26:16 +0000 |
commit | 974236bda9b9aad87b4b10ec9af2cc01b14e382f (patch) | |
tree | 321cb16cc9fc2eec78fc81060ff64db06115a544 /src/os | |
parent | 91b8cc0dfaae12af1a89e2b7ad3da10728883ee1 (diff) | |
download | go-git-974236bda9b9aad87b4b10ec9af2cc01b14e382f.tar.gz |
os, syscall: support ill-formed UTF-16 strings on Windows
Windows UTF-16 strings can contain unpaired surrogates, which can't be
decoded into a valid UTF-8 string. This file defines a set of functions
that can be used to encode and decode potentially ill-formed UTF-16
strings by using
[the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
WTF-8 is a strict superset of UTF-8, i.e. any string that is
well-formed in UTF-8 is also well-formed in WTF-8 and the content
is unchanged. Also, the conversion never fails and is lossless.
The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16
string is that the conversion is lossless even for ill-formed
UTF-16 strings. This property makes it possible to read an ill-formed UTF-16
string, convert it to a Go string, and convert it back to the same
original UTF-16 string.
Fixes #59971
Change-Id: Id6007f6e537844913402b233e73d698688cd5ba6
Reviewed-on: https://go-review.googlesource.com/c/go/+/493036
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
Run-TryBot: Quim Muntal <quimmuntal@gmail.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Paul Hampson <Paul.Hampson@Pobox.com>
Diffstat (limited to 'src/os')
-rw-r--r-- | src/os/dir_windows.go | 3 | ||||
-rw-r--r-- | src/os/exec/lp_windows_test.go | 3 | ||||
-rw-r--r-- | src/os/file_windows.go | 3 | ||||
-rw-r--r-- | src/os/os_windows_test.go | 69 |
4 files changed, 72 insertions, 6 deletions
diff --git a/src/os/dir_windows.go b/src/os/dir_windows.go index cee05cc729..7792d03040 100644 --- a/src/os/dir_windows.go +++ b/src/os/dir_windows.go @@ -11,7 +11,6 @@ import ( "runtime" "sync" "syscall" - "unicode/utf16" "unsafe" ) @@ -104,7 +103,7 @@ func (file *File) readdir(n int, mode readdirMode) (names []string, dirents []Di d.bufp = 0 } nameslice := unsafe.Slice(&info.FileName[0], info.FileNameLength/2) - name := string(utf16.Decode(nameslice)) + name := syscall.UTF16ToString(nameslice) if name == "." || name == ".." { // Useless names continue } diff --git a/src/os/exec/lp_windows_test.go b/src/os/exec/lp_windows_test.go index 50d522948a..4d85a5f415 100644 --- a/src/os/exec/lp_windows_test.go +++ b/src/os/exec/lp_windows_test.go @@ -587,7 +587,6 @@ package main import ( "os" "syscall" - "unicode/utf16" "unsafe" ) @@ -599,7 +598,7 @@ func getMyName() (string, error) { if n == 0 { return "", err } - return string(utf16.Decode(b[0:n])), nil + return syscall.UTF16ToString(b[0:n]), nil } func main() { diff --git a/src/os/file_windows.go b/src/os/file_windows.go index f5a436e235..37db3f931c 100644 --- a/src/os/file_windows.go +++ b/src/os/file_windows.go @@ -11,7 +11,6 @@ import ( "runtime" "sync" "syscall" - "unicode/utf16" "unsafe" ) @@ -259,7 +258,7 @@ func tempDir() string { // Otherwise remove terminating \. 
n-- } - return string(utf16.Decode(b[:n])) + return syscall.UTF16ToString(b[:n]) } } diff --git a/src/os/os_windows_test.go b/src/os/os_windows_test.go index 21a8c21d1e..fbc8cc1b9f 100644 --- a/src/os/os_windows_test.go +++ b/src/os/os_windows_test.go @@ -18,6 +18,7 @@ import ( "path/filepath" "reflect" "runtime" + "slices" "sort" "strings" "syscall" @@ -1377,3 +1378,71 @@ func TestAppExecLinkStat(t *testing.T) { t.Errorf("exec.LookPath(%q) = %q; want %q", pythonPath, p, pythonPath) } } + +func TestIllformedUTF16FileName(t *testing.T) { + dir := t.TempDir() + const sep = string(os.PathSeparator) + if !strings.HasSuffix(dir, sep) { + dir += sep + } + + // This UTF-16 file name is ill-formed as it contains low surrogates that are not preceded by high surrogates ([1:5]). + namew := []uint16{0x2e, 0xdc6d, 0xdc73, 0xdc79, 0xdc73, 0x30, 0x30, 0x30, 0x31, 0} + + // Create a file whose name contains unpaired surrogates. + // Use syscall.CreateFile instead of os.Create to simulate a file that is created by + // a non-Go program so the file name hasn't gone through syscall.UTF16FromString. + dirw := utf16.Encode([]rune(dir)) + pathw := append(dirw, namew...) + fd, err := syscall.CreateFile(&pathw[0], syscall.GENERIC_ALL, 0, nil, syscall.CREATE_NEW, 0, 0) + if err != nil { + t.Fatal(err) + } + syscall.CloseHandle(fd) + + name := syscall.UTF16ToString(namew) + path := filepath.Join(dir, name) + // Verify that os.Lstat can query the file. + fi, err := os.Lstat(path) + if err != nil { + t.Fatal(err) + } + if got := fi.Name(); got != name { + t.Errorf("got %q, want %q", got, name) + } + // Verify that File.Readdirnames lists the file. + f, err := os.Open(dir) + if err != nil { + t.Fatal(err) + } + files, err := f.Readdirnames(0) + f.Close() + if err != nil { + t.Fatal(err) + } + if !slices.Contains(files, name) { + t.Error("file not listed") + } + // Verify that os.RemoveAll can remove the directory + // and that it doesn't hang. 
+ err = os.RemoveAll(dir) + if err != nil { + t.Error(err) + } +} + +func TestUTF16Alloc(t *testing.T) { + allowsPerRun := func(want int, f func()) { + t.Helper() + got := int(testing.AllocsPerRun(5, f)) + if got != want { + t.Errorf("got %d allocs, want %d", got, want) + } + } + allowsPerRun(1, func() { + syscall.UTF16ToString([]uint16{'a', 'b', 'c'}) + }) + allowsPerRun(1, func() { + syscall.UTF16FromString("abc") + }) +} |