105 files changed, 2800 insertions, 6350 deletions
diff --git a/doc/articles/go_command.html b/doc/articles/go_command.html
index 246b8c956..2978628cd 100644
--- a/doc/articles/go_command.html
+++ b/doc/articles/go_command.html
@@ -78,17 +78,18 @@ well-established conventions.</p>
 source code.  For Bitbucket, GitHub, Google Code, and Launchpad, the
 root directory of the repository is identified by the repository's
 main URL, without the <code>http://</code> prefix.  Subdirectories are named by
-adding to that path.  For example, the supplemental networking
-libraries for Go are obtained by running</p>
+adding to that path.
+For example, the Go example programs are obtained by running</p>
 
 <pre>
-hg clone http://code.google.com/p/go.net
+git clone https://github.com/golang/example
 </pre>
 
 <p>and thus the import path for the root directory of that repository is
-"<code>code.google.com/p/go.net</code>".  The websocket package is stored in a
-subdirectory, so its import path is
-"<code>code.google.com/p/go.net/websocket</code>".</p>
+"<code>github.com/golang/example</code>".
+The <a href="https://godoc.org/github.com/golang/example/stringutil">stringutil</a>
+package is stored in a subdirectory, so its import path is
+"<code>github.com/golang/example/stringutil</code>".</p>
 
 <p>These paths are on the long side, but in exchange we get an
 automatically managed name space for import paths and the ability for
diff --git a/doc/cmd.html b/doc/cmd.html
index 132ea275f..5d20d3887 100644
--- a/doc/cmd.html
+++ b/doc/cmd.html
@@ -62,7 +62,7 @@ details.
 </tr>
 
 <tr>
-<td><a href="//godoc.org/code.google.com/p/go.tools/cmd/cover/">cover</a></td>
+<td><a href="//godoc.org/golang.org/x/tools/cmd/cover/">cover</a></td>
 <td>&nbsp;&nbsp;&nbsp;&nbsp;</td>
 <td>Cover is a program for creating and analyzing the coverage profiles
 generated by <code>"go test -coverprofile"</code>.</td>
@@ -83,13 +83,13 @@ gofmt</a> command with more general options.</td>
 </tr>
 
 <tr>
-<td><a href="//godoc.org/code.google.com/p/go.tools/cmd/godoc/">godoc</a></td>
+<td><a href="//godoc.org/golang.org/x/tools/cmd/godoc/">godoc</a></td>
 <td>&nbsp;&nbsp;&nbsp;&nbsp;</td>
 <td>Godoc extracts and generates documentation for Go packages.</td>
 </tr>
 
 <tr>
-<td><a href="//godoc.org/code.google.com/p/go.tools/cmd/vet/">vet</a></td>
+<td><a href="//godoc.org/golang.org/x/tools/cmd/vet/">vet</a></td>
 <td>&nbsp;&nbsp;&nbsp;&nbsp;</td>
 <td>Vet examines Go source code and reports suspicious constructs, such as Printf
 calls whose arguments do not align with the format string.</td>
diff --git a/doc/code.html b/doc/code.html
index f019306fa..ce9f8636f 100644
--- a/doc/code.html
+++ b/doc/code.html
@@ -60,37 +60,35 @@ To give you an idea of how a workspace looks in practice, here's an example:
 
 <pre>
 bin/
-    streak                         # command executable
-    todo                           # command executable
+    hello                          # command executable
+    outyet                         # command executable
 pkg/
     linux_amd64/
-        code.google.com/p/goauth2/
-            oauth.a                # package object
-        github.com/nf/todo/
-            task.a                 # package object
+        github.com/golang/example/
+            stringutil.a           # package object
 src/
-    code.google.com/p/goauth2/
-        .hg/                       # mercurial repository metadata
-        oauth/
-            oauth.go               # package source
-            oauth_test.go          # test source
-    github.com/nf/
-        streak/
-            .git/                  # git repository metadata
-            oauth.go               # command source
-            streak.go              # command source
-        todo/
-            .git/                  # git repository metadata
-            task/
-                task.go            # package source
-            todo.go                # command source
+    <a href="https://github.com/golang/example/">github.com/golang/example/</a>
+        .git/                      # Git repository metadata
+        hello/
+            hello.go               # command source
+        outyet/
+            main.go                # command source
+            main_test.go           # test source
+        stringutil/
+            reverse.go             # package source
+            reverse_test.go        # test source
 </pre>
 
 <p>
-This workspace contains three repositories (<code>goauth2</code>,
-<code>streak</code>, and <code>todo</code>) comprising two commands
-(<code>streak</code> and <code>todo</code>) and two libraries
-(<code>oauth</code> and <code>task</code>).
+This workspace contains one repository (<code>example</code>)
+comprising two commands (<code>hello</code> and <code>outyet</code>)
+and one library (<code>stringutil</code>).
+</p>
+
+<p>
+A typical workspace would contain many source repositories containing many
+packages and commands. Most Go programmers keep <i>all</i> their Go source code
+and dependencies in a single workspace.
 </p>
 
 <p>
@@ -277,29 +275,29 @@ Let's write a library and use it from the <code>hello</code> program.
 
 <p>
 Again, the first step is to choose a package path (we'll use
-<code>github.com/user/newmath</code>) and create the package directory:
+<code>github.com/user/stringutil</code>) and create the package directory:
 </p>
 
 <pre>
-$ <b>mkdir $GOPATH/src/github.com/user/newmath</b>
+$ <b>mkdir $GOPATH/src/github.com/user/stringutil</b>
 </pre>
 
 <p>
-Next, create a file named <code>sqrt.go</code> in that directory with the
+Next, create a file named <code>reverse.go</code> in that directory with the
 following contents.
 </p>
 
 <pre>
-// Package newmath is a trivial example package.
-package newmath
-
-// Sqrt returns an approximation to the square root of x.
-func Sqrt(x float64) float64 {
-	z := 1.0
-	for i := 0; i &lt; 1000; i++ {
-		z -= (z*z - x) / (2 * z)
+// Package stringutil contains utility functions for working with strings.
+package stringutil
+
+// Reverse returns its argument string reversed rune-wise left to right.
+func Reverse(s string) string {
+	r := []rune(s)
+	for i, j := 0, len(r)-1; i &lt; len(r)/2; i, j = i+1, j-1 {
+		r[i], r[j] = r[j], r[i]
 	}
-	return z
+	return string(r)
 }
 </pre>
 
@@ -308,7 +306,7 @@ Now, test that the package compiles with <code>go build</code>:
 </p>
 
 <pre>
-$ <b>go build github.com/user/newmath</b>
+$ <b>go build github.com/user/stringutil</b>
 </pre>
 
 <p>
@@ -326,7 +324,7 @@ directory of the workspace.
 </p>
 
 <p>
-After confirming that the <code>newmath</code> package builds,
+After confirming that the <code>stringutil</code> package builds,
 modify your original <code>hello.go</code> (which is in
 <code>$GOPATH/src/github.com/user/hello</code>) to use it:
 </p>
@@ -337,18 +335,18 @@ package main
 import (
 	"fmt"
 
-	<b>"github.com/user/newmath"</b>
+	<b>"github.com/user/stringutil"</b>
 )
 
 func main() {
-	fmt.Printf("Hello, world.  <b>Sqrt(2) = %v\n", newmath.Sqrt(2)</b>)
+	fmt.Println(stringutil.Reverse("!oG ,olleH"))
 }
 </pre>
 
 <p>
 Whenever the <code>go</code> tool installs a package or binary, it also
-installs whatever dependencies it has. So when you install the <code>hello</code>
-program
+installs whatever dependencies it has.
+So when you install the <code>hello</code> program
 </p>
 
 <pre>
@@ -356,16 +354,16 @@ $ <b>go install github.com/user/hello</b>
 </pre>
 
 <p>
-the <code>newmath</code> package will be installed as well, automatically.
+the <code>stringutil</code> package will be installed as well, automatically.
 </p>
 
 <p>
-Running the new version of the program, you should see some numerical output:
+Running the new version of the program, you should see a new, reversed message:
 </p>
 
 <pre>
 $ <b>hello</b>
-Hello, world.  Sqrt(2) = 1.414213562373095
+Hello, Go!
 </pre>
 
 <p>
@@ -374,22 +372,22 @@ After the steps above, your workspace should look like this:
 
 <pre>
 bin/
-    hello              # command executable
+    hello                 # command executable
 pkg/
-    linux_amd64/       # this will reflect your OS and architecture
+    linux_amd64/          # this will reflect your OS and architecture
         github.com/user/
-            newmath.a  # package object
+            stringutil.a  # package object
 src/
     github.com/user/
         hello/
-            hello.go   # command source
-        newmath/
-            sqrt.go    # package source
+            hello.go      # command source
+        stringutil/
+            reverse.go    # package source
 </pre>
 
 <p>
-Note that <code>go install</code> placed the <code>newmath.a</code> object in a
-directory inside <code>pkg/linux_amd64</code> that mirrors its source
+Note that <code>go install</code> placed the <code>stringutil.a</code> object
+in a directory inside <code>pkg/linux_amd64</code> that mirrors its source
 directory.
 This is so that future invocations of the <code>go</code> tool can find the
 package object and avoid recompiling the package unnecessarily.
@@ -457,20 +455,29 @@ if the function calls a failure function such as <code>t.Error</code> or
 </p>
 
 <p>
-Add a test to the <code>newmath</code> package by creating the file
-<code>$GOPATH/src/github.com/user/newmath/sqrt_test.go</code> containing the
-following Go code.
+Add a test to the <code>stringutil</code> package by creating the file
+<code>$GOPATH/src/github.com/user/stringutil/reverse_test.go</code> containing
+the following Go code.
 </p>
 
 <pre>
-package newmath
+package stringutil
 
 import "testing"
 
-func TestSqrt(t *testing.T) {
-	const in, out = 4, 2
-	if x := Sqrt(in); x != out {
-		t.Errorf("Sqrt(%v) = %v, want %v", in, x, out)
+func TestReverse(t *testing.T) {
+	cases := []struct {
+		in, want string
+	}{
+		{"Hello, world", "dlrow ,olleH"},
+		{"Hello, 世界", "界世 ,olleH"},
+		{"", ""},
+	}
+	for _, c := range cases {
+		got := Reverse(c.in)
+		if got != c.want {
+			t.Errorf("Reverse(%q) == %q, want %q", c.in, got, c.want)
+		}
 	}
 }
 </pre>
@@ -480,8 +487,8 @@ Then run the test with <code>go test</code>:
 </p>
 
 <pre>
-$ <b>go test github.com/user/newmath</b>
-ok  	github.com/user/newmath 0.165s
+$ <b>go test github.com/user/stringutil</b>
+ok  	github.com/user/stringutil 0.165s
 </pre>
 
 <p>
@@ -491,7 +498,7 @@ directory, you can omit the package path:
 
 <pre>
 $ <b>go test</b>
-ok  	github.com/user/newmath 0.165s
+ok  	github.com/user/stringutil 0.165s
 </pre>
 
 <p>
@@ -507,16 +514,16 @@ An import path can describe how to obtain the package source code using a
 revision control system such as Git or Mercurial. The <code>go</code> tool uses
 this property to automatically fetch packages from remote repositories.
 For instance, the examples described in this document are also kept in a
-Mercurial repository hosted at Google Code,
-<code><a href="//code.google.com/p/go.example">code.google.com/p/go.example</a></code>.
+Git repository hosted at GitHub,
+<code><a href="https://github.com/golang/example">github.com/golang/example</a></code>.
 If you include the repository URL in the package's import path,
 <code>go get</code> will fetch, build, and install it automatically:
 </p>
 
 <pre>
-$ <b>go get code.google.com/p/go.example/hello</b>
+$ <b>go get github.com/golang/example/hello</b>
 $ <b>$GOPATH/bin/hello</b>
-Hello, world.  Sqrt(2) = 1.414213562373095
+Hello, Go examples!
 </pre>
 
 <p>
@@ -533,37 +540,39 @@ tree should now look like this:
 
 <pre>
 bin/
-    hello                 # command executable
+    hello                           # command executable
 pkg/
     linux_amd64/
-        code.google.com/p/go.example/
-            newmath.a     # package object
+        github.com/golang/example/
+            stringutil.a            # package object
         github.com/user/
-            newmath.a     # package object
+            stringutil.a            # package object
 src/
-    code.google.com/p/go.example/
+    github.com/golang/example/
+        .git/                       # Git repository metadata
         hello/
-            hello.go      # command source
-        newmath/
-            sqrt.go       # package source
-            sqrt_test.go  # test source
+            hello.go                # command source
+        stringutil/
+            reverse.go              # package source
+            reverse_test.go         # test source
     github.com/user/
         hello/
-            hello.go      # command source
-        newmath/
-            sqrt.go       # package source
-            sqrt_test.go  # test source
+            hello.go                # command source
+        stringutil/
+            reverse.go              # package source
+            reverse_test.go         # test source
 </pre>
 
 <p>
-The <code>hello</code> command hosted at Google Code depends on the
-<code>newmath</code> package within the same repository. The imports in
-<code>hello.go</code> file use the same import path convention, so the <code>go
-get</code> command is able to locate and install the dependent package, too.
+The <code>hello</code> command hosted at GitHub depends on the
+<code>stringutil</code> package within the same repository. The imports in the
+<code>hello.go</code> file use the same import path convention, so the
+<code>go get</code> command is able to locate and install the dependent
+package, too.
 </p>
 
 <pre>
-import "code.google.com/p/go.example/newmath"
+import "github.com/golang/example/stringutil"
 </pre>
 
 <p>
diff --git a/doc/contribute.html b/doc/contribute.html
index 90c3f10a1..92fd88b48 100644
--- a/doc/contribute.html
+++ b/doc/contribute.html
@@ -121,7 +121,7 @@ are inside the go directory when issuing commands.
 
 <p>To contribute to subrepositories, edit the <code>.hg/hgrc</code> for each
 subrepository in the same way. For example, add the codereview extension to
-<code>code.google.com/p/go.tools/.hg/hgrc</code>.
+<code>golang.org/x/tools/.hg/hgrc</code>.
 </p>
 
 <h3>Understanding the extension</h3>
diff --git a/doc/go1.4.html b/doc/go1.4.html
index 3310117a4..ac63ade60 100644
--- a/doc/go1.4.html
+++ b/doc/go1.4.html
@@ -120,9 +120,9 @@ compile but is easy to fix by adding an explicit dereference.
 <p>
 Go 1.4 can build binaries for ARM processors running the Android operating system.
 It can also build a <code>.so</code> library that can be loaded by an Android application
-using the supporting packages in the <a href="http://code.google.com/p/go.mobile">go.mobile</a> repository.
+using the supporting packages in the <a href="https://golang.org/x/mobile">mobile</a> subrepository.
 A brief description of the plans for this experimental port are available
-<a href="/s/go14android">here</a>.
+<a href="https://golang.org/s/go14android">here</a>.
 </p>
 
 <h3 id="naclarm">NaCl on ARM</h3>
@@ -193,13 +193,12 @@ A consequence is that stacks are no longer segmented, eliminating the "hot split
 When a stack limit is reached, a new, larger stack is allocated, all active frames for
 the goroutine are copied there, and any pointers into the stack are updated.
 Performance can be noticeably better in some cases and is always more predictable.
-Details are available in <a href="/s/contigstacks">the design document</a>.
+Details are available in <a href="https://golang.org/s/contigstacks">the design document</a>.
 </p>
 
 <p>
 The use of contiguous stacks means that stacks can start smaller without triggering performance issues,
 so the default starting size for a goroutine's stack in 1.4 has been reduced to 2048 bytes from 8192 bytes.
-TODO: It may be bumped to 4096 for the release.
 </p>
 
 <p>
@@ -320,7 +319,7 @@ from 1.5 and onward it will be enforced for any repository.
 
 <p>
 Full details of the mechanism are in
-<a href="http://golang.org/s/go14internal">the design document</a>.
+<a href="https://golang.org/s/go14internal">the design document</a>.
 </p>
 
 <h3 id="canonicalimports">Canonical import paths</h3>
@@ -382,7 +381,25 @@ The new <code>-f</code> flag overrides this check.
 
 <p>
 Further information is in
-<a href="http://golang.org/s/go14customimport">the design document</a>.
+<a href="https://golang.org/s/go14customimport">the design document</a>.
+</p>
+
+<h3 id="subrepo">Import paths for the subrepositories</h3>
+
+<p>
+The Go project subrepositories (<code>code.google.com/p/go.tools</code> and so on)
+are now available under custom import paths replacing <code>code.google.com/p/go.</code> with <code>golang.org/x/</code>,
+as in <code>golang.org/x/tools</code>.
+We will add canonical import comments to the code around June 1, 2015,
+at which point Go 1.4 and later will stop accepting the old <code>code.google.com</code> paths.
+</p>
+
+<p>
+<em>Updating</em>: All code that imports from subrepositories should change
+to use the new <code>golang.org</code> paths.
+Go 1.0 and later can resolve and import the new paths, so updating will not break
+compatibility with older releases.
+Code that has not updated will stop compiling with Go 1.4 around June 1, 2015.
 </p>
 
 <h3 id="gogenerate">The go generate subcommand</h3>
@@ -394,13 +411,13 @@ to automate the running of tools to generate source code before compilation.
 For example, it can be used to run the <a href="/cmd/yacc"><code>yacc</code></a>
 compiler-compiler on a <code>.y</code> file to produce the Go source file implementing the grammar,
 or to automate the generation of <code>String</code> methods for typed constants using the new
-<a href="http://godoc.org/code.google.com/p/go.tools/cmd/stringer">stringer</a>
-tool in the <code>go.tools</code> repository.
+<a href="http://godoc.org/golang.org/x/tools/cmd/stringer">stringer</a>
+tool in the <code>golang.org/x/tools</code> subrepository.
 </p>
 
 <p>
 For more information, see the 
-<a href="http://golang.org/s/go1.4-generate">design document</a>.
+<a href="https://golang.org/s/go1.4-generate">design document</a>.
 </p>
 
 <h3 id="filenames">Change to file name handling</h3>
@@ -480,7 +497,7 @@ rebuild the standard library and commands, to avoid overwriting the installation
 <p>
 In the main Go source repository, the source code for the packages was kept in
 the directory <code>src/pkg</code>, which made sense but differed from
-other repositories, including the Go sub-repositories such as <code>go.tools</code>.
+other repositories, including the Go subrepositories.
 In Go 1.4, the<code> pkg</code> level of the source tree is now gone, so for example
 the <a href="/pkg/fmt/"><code>fmt</code></a> package's source, once kept in
 directory <code>src/pkg/fmt</code>, now lives one level higher in <code>src/fmt</code>.
@@ -586,19 +603,19 @@ The <a href="/pkg/syscall/"><code>syscall</code></a> package is now frozen excep
 for changes needed to maintain the core repository.
 In particular, it will no longer be extended to support new or different system calls
 that are not used by the core.
-The reasons are described at length in <a href="http://golang.org/s/go1.4-syscall">a
+The reasons are described at length in <a href="https://golang.org/s/go1.4-syscall">a
 separate document</a>.
 </p>
 
 <p>
-A new subrepository, <a href="http://code.google.com/p/go.sys">go.sys</a>,
+A new subrepository, <a href="https://golang.org/x/sys">golang.org/x/sys</a>,
 has been created to serve as the location for new developments to support system
 calls on all kernels.
 It has a nicer structure, with three packages that each hold the implementation of
 system calls for one of
-<a href="http://godoc.org/code.google.com/p/go.sys/unix">Unix</a>,
-<a href="http://godoc.org/code.google.com/p/go.sys/windows">Windows</a> and
-<a href="http://godoc.org/code.google.com/p/go.sys/plan9">Plan 9</a>.
+<a href="http://godoc.org/golang.org/x/sys/unix">Unix</a>,
+<a href="http://godoc.org/golang.org/x/sys/windows">Windows</a> and
+<a href="http://godoc.org/golang.org/x/sys/plan9">Plan 9</a>.
 These packages will be curated more generously, accepting all reasonable changes
 that reflect kernel interfaces in those operating systems.
 See the documentation and the article mentioned above for more information.
@@ -608,7 +625,7 @@ See the documentation and the article mentioned above for more information.
 <em>Updating</em>: Existing programs are not affected as the <code>syscall</code>
 package is largely unchanged from the 1.3 release.
 Future development that requires system calls not in the <code>syscall</code> package
-should build on <code>go.sys</code> instead.
+should build on <code>golang.org/x/sys</code> instead.
 </p>
 
 <h3 id="minor_library_changes">Minor changes to the library</h3>
diff --git a/doc/go1compat.html b/doc/go1compat.html
index 94c48d2ce..d800dec0c 100644
--- a/doc/go1compat.html
+++ b/doc/go1compat.html
@@ -153,7 +153,7 @@ developed software based on Go 1.
 
 <p>
 Code in sub-repositories of the main go tree, such as
-<a href="//code.google.com/p/go.net">code.google.com/p/go.net</a>,
+<a href="//golang.org/x/net">golang.org/x/net</a>,
 may be developed under
 looser compatibility requirements. However, the sub-repositories
 will be tagged as appropriate to identify versions that are compatible
@@ -170,9 +170,9 @@ is therefore outside the purview of the guarantees made here.
 As of Go version 1.4, the <code>syscall</code> package is frozen.
 Any evolution of the system call interface must be supported elsewhere,
 such as in the
-<a href="http://godoc.org/code.google.com/p/go.sys">go.sys</a> subrepository.
+<a href="//golang.org/x/sys">golang.org/x/sys</a> subrepository.
 For details and background, see
-<a href="https://golang.org/s/go1.4-syscall">this document</a>.
+<a href="//golang.org/s/go1.4-syscall">this document</a>.
 </p>
 
 <h2 id="tools">Tools</h2>
diff --git a/doc/go_faq.html b/doc/go_faq.html
index 9aac05838..759799779 100644
--- a/doc/go_faq.html
+++ b/doc/go_faq.html
@@ -1616,7 +1616,7 @@ Go is a
 fine language in which to implement a self-hosting compiler: a native lexer and
 parser are already available in the <a href="/pkg/go/"><code>go</code></a> package
 and a separate type checking
-<a href="http://godoc.org/code.google.com/p/go.tools/go/types">package</a>
+<a href="http://godoc.org/golang.org/x/tools/go/types">package</a>
 has also been written.
 </p>
 
@@ -1715,7 +1715,7 @@ func main() {
 
 <p>
 Nowadays, most Go programmers use a tool,
-<a href="http://godoc.org/code.google.com/p/go.tools/cmd/goimports">goimports</a>,
+<a href="http://godoc.org/golang.org/x/tools/cmd/goimports">goimports</a>,
 which automatically rewrites a Go source file to have the correct imports,
 eliminating the unused imports issue in practice.
 This program is easily connected to most editors to run automatically when a Go source file is written.
diff --git a/doc/install-source.html b/doc/install-source.html
index 82859b50f..f53deb404 100644
--- a/doc/install-source.html
+++ b/doc/install-source.html
@@ -241,12 +241,12 @@ provides <b>essential setup instructions</b> for using the Go tools.
 
 <p>
 The source code for several Go tools (including <a href="/cmd/godoc/">godoc</a>)
-is kept in <a href="https://code.google.com/p/go.tools">the go.tools repository</a>.
+is kept in <a href="https://golang.org/x/tools">the golang.org/x/tools repository</a>.
 To install all of them, run the <code>go</code> <code>get</code> command:
 </p>
 
 <pre>
-$ go get code.google.com/p/go.tools/cmd/...
+$ go get golang.org/x/tools/cmd/...
 </pre>
 
 <p>
@@ -254,7 +254,7 @@ Or if you just want to install a specific command (<code>godoc</code> in this ca
 </p>
 
 <pre>
-$ go get code.google.com/p/go.tools/cmd/godoc
+$ go get golang.org/x/tools/cmd/godoc
 </pre>
 
 <p>
diff --git a/include/link.h b/include/link.h
index 06f3ebb48..80f3f4d82 100644
--- a/include/link.h
+++ b/include/link.h
@@ -89,7 +89,7 @@ struct	Prog
 	int32	lineno;
 	Prog*	link;
 	short	as;
-	uchar	scond; // arm only; condition codes
+	uchar	scond; // arm only
 
 	// operands
 	Addr	from;
diff --git a/lib/codereview/codereview.py b/lib/codereview/codereview.py
index 263385b79..0c9b27a31 100644
--- a/lib/codereview/codereview.py
+++ b/lib/codereview/codereview.py
@@ -1631,7 +1631,7 @@ def clpatch_or_undo(ui, repo, clname, opts, mode):
 	try:
 		cmd = subprocess.Popen(argv, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None, close_fds=sys.platform != "win32")
 	except:
-		return "hgapplydiff: " + ExceptionDetail() + "\nInstall hgapplydiff with:\n$ go get code.google.com/p/go.codereview/cmd/hgapplydiff\n"
+		return "hgapplydiff: " + ExceptionDetail() + "\nInstall hgapplydiff with:\n$ go get golang.org/x/codereview/cmd/hgapplydiff\n"
 
 	out, err = cmd.communicate(patch)
 	if cmd.returncode != 0 and not opts["ignore_hgapplydiff_failure"]:
@@ -2024,13 +2024,13 @@ def submit(ui, repo, *pats, **opts):
 	# push to remote; if it fails for any reason, roll back
 	try:
 		new_heads = len(hg_heads(ui, repo).split())
-		if old_heads != new_heads and not (old_heads == 0 and new_heads == 1):
+		if cl.desc.find("create new branch") < 0 and old_heads != new_heads and not (old_heads == 0 and new_heads == 1):
 			# Created new head, so we weren't up to date.
 			need_sync()
 
 		# Push changes to remote.  If it works, we're committed.  If not, roll back.
 		try:
-			if hg_push(ui, repo):
+			if hg_push(ui, repo, new_branch=cl.desc.find("create new branch")>=0):
 				raise hg_util.Abort("push error")
 		except hg_error.Abort, e:
 			if e.message.find("push creates new heads") >= 0:
@@ -3451,6 +3451,7 @@ class FakeMercurialUI(object):
 	def __init__(self):
 		self.quiet = True
 		self.output = ''
+		self.debugflag = False
 	
 	def write(self, *args, **opts):
 		self.output += ' '.join(args)
diff --git a/misc/benchcmp b/misc/benchcmp
index 28a37392d..84d92eefd 100755
--- a/misc/benchcmp
+++ b/misc/benchcmp
@@ -1,5 +1,5 @@
 #!/bin/bash
 
 echo 'misc/benchcmp has moved:' >&2
-echo '	go get -u code.google.com/p/go.tools/cmd/benchcmp' >&2
+echo '	go get -u golang.org/x/tools/cmd/benchcmp' >&2
 exit 2
diff --git a/misc/cgo/test/issue9026.go b/misc/cgo/test/issue9026.go
index b5d975f17..8848d0e81 100644
--- a/misc/cgo/test/issue9026.go
+++ b/misc/cgo/test/issue9026.go
@@ -1,33 +1,9 @@
 package cgotest
 
-/*
-typedef struct {} git_merge_file_input;
-
-typedef struct {} git_merge_file_options;
-
-void git_merge_file(
-        git_merge_file_input *in,
-        git_merge_file_options *opts) {}
-*/
-import "C"
 import (
-	"fmt"
 	"testing"
-)
 
-func test9026(t *testing.T) {
-	var in C.git_merge_file_input
-	var opts *C.git_merge_file_options
-	C.git_merge_file(&in, opts)
+	"./issue9026"
+)
 
-	// Test that the generated type names are deterministic.
-	// (Previously this would fail about 10% of the time.)
-	//
-	// Brittle: the assertion may fail spuriously when the algorithm
-	// changes, but should remain stable otherwise.
-	got := fmt.Sprintf("%T %T", in, opts)
-	want := "cgotest._Ctype_struct___12 *cgotest._Ctype_struct___13"
-	if got != want {
-		t.Errorf("Non-deterministic type names: got %s, want %s", got, want)
-	}
-}
+func test9026(t *testing.T) { issue9026.Test(t) }
diff --git a/misc/cgo/test/issue9026/issue9026.go b/misc/cgo/test/issue9026/issue9026.go
new file mode 100644
index 000000000..0af86e64d
--- /dev/null
+++ b/misc/cgo/test/issue9026/issue9026.go
@@ -0,0 +1,36 @@
+package issue9026
+
+// This file appears in its own package since the assertion tests the
+// per-package counter used to create fresh identifiers.
+
+/*
+typedef struct {} git_merge_file_input;
+
+typedef struct {} git_merge_file_options;
+
+void git_merge_file(
+        git_merge_file_input *in,
+        git_merge_file_options *opts) {}
+*/
+import "C"
+import (
+	"fmt"
+	"testing"
+)
+
+func Test(t *testing.T) {
+	var in C.git_merge_file_input
+	var opts *C.git_merge_file_options
+	C.git_merge_file(&in, opts)
+
+	// Test that the generated type names are deterministic.
+	// (Previously this would fail about 10% of the time.)
+	//
+	// Brittle: the assertion may fail spuriously when the algorithm
+	// changes, but should remain stable otherwise.
+	got := fmt.Sprintf("%T %T", in, opts)
+	want := "issue9026._Ctype_struct___0 *issue9026._Ctype_struct___1"
+	if got != want {
+		t.Errorf("Non-deterministic type names: got %s, want %s", got, want)
+	}
+}
diff --git a/misc/makerelease/makerelease.go b/misc/makerelease/makerelease.go
index 9b2373307..e94efdbce 100644
--- a/misc/makerelease/makerelease.go
+++ b/misc/makerelease/makerelease.go
@@ -53,8 +53,8 @@ var (
 )
 
 const (
-	blogPath       = "code.google.com/p/go.blog"
-	toolPath       = "code.google.com/p/go.tools"
+	blogPath       = "golang.org/x/blog"
+	toolPath       = "golang.org/x/tools"
 	tourPath       = "code.google.com/p/go-tour"
 	defaultToolTag = "release-branch.go1.3"
 	defaultTourTag = "release-branch.go1.3"
@@ -64,9 +64,9 @@ const (
 // These must be the command that cmd/go knows to install to $GOROOT/bin
 // or $GOROOT/pkg/tool.
 var toolPaths = []string{
-	"code.google.com/p/go.tools/cmd/cover",
-	"code.google.com/p/go.tools/cmd/godoc",
-	"code.google.com/p/go.tools/cmd/vet",
+	"golang.org/x/tools/cmd/cover",
+	"golang.org/x/tools/cmd/godoc",
+	"golang.org/x/tools/cmd/vet",
 }
 
 var preBuildCleanFiles = []string{
diff --git a/misc/pprof b/misc/pprof
deleted file mode 100755
index f83e6fb65..000000000
--- a/misc/pprof
+++ /dev/null
@@ -1,5100 +0,0 @@
-#! /usr/bin/env perl
-
-# This is a copy of http://google-perftools.googlecode.com/svn/trunk/src/pprof
-# with local modifications to handle generation of SVG images and
-# the Go-style pprof paths.  These modifications will probably filter
-# back into the official source before long.
-# It's convenient to have a copy here because we need just the one
-# Perl script, not all the C++ libraries that surround it.
-
-# Copyright (c) 1998-2007, Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#     * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#     * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# ---
-# Program for printing the profile generated by common/profiler.cc,
-# or by the heap profiler (common/debugallocation.cc)
-#
-# The profile contains a sequence of entries of the form:
-#       <count> <stack trace>
-# This program parses the profile, and generates user-readable
-# output.
-#
-# Examples:
-#
-# % tools/pprof "program" "profile"
-#   Enters "interactive" mode
-#
-# % tools/pprof --text "program" "profile"
-#   Generates one line per procedure
-#
-# % tools/pprof --gv "program" "profile"
-#   Generates annotated call-graph and displays via "gv"
-#
-# % tools/pprof --gv --focus=Mutex "program" "profile"
-#   Restrict to code paths that involve an entry that matches "Mutex"
-#
-# % tools/pprof --gv --focus=Mutex --ignore=string "program" "profile"
-#   Restrict to code paths that involve an entry that matches "Mutex"
-#   and does not match "string"
-#
-# % tools/pprof --list=IBF_CheckDocid "program" "profile"
-#   Generates disassembly listing of all routines with at least one
-#   sample that match the --list=<regexp> pattern.  The listing is
-#   annotated with the flat and cumulative sample counts at each line.
-#
-# % tools/pprof --disasm=IBF_CheckDocid "program" "profile"
-#   Generates disassembly listing of all routines with at least one
-#   sample that match the --disasm=<regexp> pattern.  The listing is
-#   annotated with the flat and cumulative sample counts at each PC value.
-#
-# TODO: Use color to indicate files?
-
-use strict;
-use warnings;
-use Getopt::Long;
-use File::Temp;
-use File::Copy;
-
-my $PPROF_VERSION = "1.5";
-
-# NOTE: All mentions of c++filt have been expunged from this script
-# because (1) we don't use C++, and (2) the copy of c++filt that ships
-# on OS X is from 2007 and destroys nm output by "demangling" the
-# first two columns (address and symbol type).
-
-# These are the object tools we use which can come from a
-# user-specified location using --tools, from the PPROF_TOOLS
-# environment variable, or from the environment.
-my %obj_tool_map = (
-  "objdump" => "objdump",
-  "nm" => "nm",
-  "addr2line" => "addr2line",
-  ## ConfigureObjTools may add architecture-specific entries:
-  #"nm_pdb" => "nm-pdb",       # for reading windows (PDB-format) executables
-  #"addr2line_pdb" => "addr2line-pdb",                                # ditto
-  #"otool" => "otool",         # equivalent of objdump on OS X
-);
-my $DOT = "dot";          # leave non-absolute, since it may be in /usr/local
-my $GV = "gv";
-my $KCACHEGRIND = "kcachegrind";
-my $PS2PDF = "ps2pdf";
-# These are used for dynamic profiles
-
-# These are the web pages that servers need to support for dynamic profiles
-my $HEAP_PAGE = "/pprof/heap";
-my $THREAD_PAGE = "/pprof/thread";
-my $PROFILE_PAGE = "/pprof/profile";   # must support cgi-param "?seconds=#"
-my $BLOCK_PAGE = "/pprof/block";
-my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param
-                                                # ?seconds=#&event=x&period=n
-my $GROWTH_PAGE = "/pprof/growth";
-my $CONTENTION_PAGE = "/pprof/contention";
-my $WALL_PAGE = "/pprof/wall(?:\\?.*)?";  # accepts options like namefilter
-my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?";
-my $SYMBOL_PAGE = "/pprof/symbol";     # must support symbol lookup via POST
-my $PROGRAM_NAME_PAGE = "/pprof/cmdline";
-
-# default binary name
-my $UNKNOWN_BINARY = "(unknown)";
-
-# There is a pervasive dependency on the length (in hex characters,
-# i.e., nibbles) of an address, distinguishing between 32-bit and
-# 64-bit profiles.  To err on the safe size, default to 64-bit here:
-my $address_length = 16;
-
-# A list of paths to search for shared object files
-my @prefix_list = ();
-
-# Special routine name that should not have any symbols.
-# Used as separator to parse "addr2line -i" output.
-my $sep_symbol = '_fini';
-my $sep_address = undef;
-
-my $OS = $^O;
-my $DEVNULL = "/dev/null";
-if ($^O =~ /MSWin32|cygwin|msys/) {
-  $OS = "windows";
-  $DEVNULL = "NUL";
-}
-
-##### Argument parsing #####
-
-sub usage_string {
-  return <<EOF;
-Usage:
-pprof [options] <program> <profiles>
-   <profiles> is a space separated list of profile names.
-pprof [options] <symbolized-profiles>
-   <symbolized-profiles> is a list of profile files where each file contains
-   the necessary symbol mappings  as well as profile data (likely generated
-   with --raw).
-pprof [options] <profile>
-   <profile> is a remote form.  Symbols are obtained from host:port$SYMBOL_PAGE
-
-   Each name can be:
-   /path/to/profile        - a path to a profile file
-   host:port[/<service>]   - a location of a service to get profile from
-
-   The /<service> can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile,
-                         $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall,
-                         $THREAD_PAGE, $BLOCK_PAGE or /pprof/filteredprofile.
-   For instance:
-     pprof http://myserver.com:80$HEAP_PAGE
-   If /<service> is omitted, the service defaults to $PROFILE_PAGE (cpu profiling).
-pprof --symbols <program>
-   Maps addresses to symbol names.  In this mode, stdin should be a
-   list of library mappings, in the same format as is found in the heap-
-   and cpu-profile files (this loosely matches that of /proc/self/maps
-   on linux), followed by a list of hex addresses to map, one per line.
-
-   For more help with querying remote servers, including how to add the
-   necessary server-side support code, see this filename (or one like it):
-
-   /usr/doc/google-perftools-$PPROF_VERSION/pprof_remote_servers.html
-
-Options:
-   --cum               Sort by cumulative data
-   --base=<base>       Subtract <base> from <profile> before display
-   --interactive       Run in interactive mode (interactive "help" gives help) [default]
-   --seconds=<n>       Length of time for dynamic profiles [default=30 secs]
-   --add_lib=<file>    Read additional symbols and line info from the given library
-   --lib_prefix=<dir>  Comma separated list of library path prefixes
-
-Reporting Granularity:
-   --addresses         Report at address level
-   --lines             Report at source line level
-   --functions         Report at function level [default]
-   --files             Report at source file level
-
-Output type:
-   --text              Generate text report
-   --callgrind         Generate callgrind format to stdout
-   --gv                Generate Postscript and display
-   --web               Generate SVG and display
-   --list=<regexp>     Generate source listing of matching routines
-   --disasm=<regexp>   Generate disassembly of matching routines
-   --symbols           Print demangled symbol names found at given addresses
-   --dot               Generate DOT file to stdout
-   --ps                Generate Postcript to stdout
-   --pdf               Generate PDF to stdout
-   --svg               Generate SVG to stdout
-   --gif               Generate GIF to stdout
-   --raw               Generate symbolized pprof data (useful with remote fetch)
-
-Heap-Profile Options:
-   --inuse_space       Display in-use (mega)bytes [default]
-   --inuse_objects     Display in-use objects
-   --alloc_space       Display allocated (mega)bytes
-   --alloc_objects     Display allocated objects
-   --show_bytes        Display space in bytes
-   --drop_negative     Ignore negative differences
-
-Contention-profile options:
-   --total_delay       Display total delay at each region [default]
-   --contentions       Display number of delays at each region
-   --mean_delay        Display mean delay at each region
-
-Call-graph Options:
-   --nodecount=<n>     Show at most so many nodes [default=80]
-   --nodefraction=<f>  Hide nodes below <f>*total [default=.005]
-   --edgefraction=<f>  Hide edges below <f>*total [default=.001]
-   --focus=<regexp>    Focus on nodes matching <regexp>
-   --ignore=<regexp>   Ignore nodes matching <regexp>
-   --scale=<n>         Set GV scaling [default=0]
-   --heapcheck         Make nodes with non-0 object counts
-                       (i.e. direct leak generators) more visible
-
-Miscellaneous:
-   --tools=<prefix>    Prefix for object tool pathnames
-   --test              Run unit tests
-   --help              This message
-   --version           Version information
-
-Environment Variables:
-   PPROF_TMPDIR        Profiles directory. Defaults to \$HOME/pprof
-   PPROF_TOOLS         Prefix for object tools pathnames
-
-Examples:
-
-pprof /bin/ls ls.prof
-                       Enters "interactive" mode
-pprof --text /bin/ls ls.prof
-                       Outputs one line per procedure
-pprof --web /bin/ls ls.prof
-                       Displays annotated call-graph in web browser
-pprof --gv /bin/ls ls.prof
-                       Displays annotated call-graph via 'gv'
-pprof --gv --focus=Mutex /bin/ls ls.prof
-                       Restricts to code paths including a .*Mutex.* entry
-pprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
-                       Code paths including Mutex but not string
-pprof --list=getdir /bin/ls ls.prof
-                       (Per-line) annotated source listing for getdir()
-pprof --disasm=getdir /bin/ls ls.prof
-                       (Per-PC) annotated disassembly for getdir()
-
-pprof http://localhost:1234/
-                       Enters "interactive" mode
-pprof --text localhost:1234
-                       Outputs one line per procedure for localhost:1234
-pprof --raw localhost:1234 > ./local.raw
-pprof --text ./local.raw
-                       Fetches a remote profile for later analysis and then
-                       analyzes it in text mode.
-EOF
-}
-
-sub version_string {
-  return <<EOF
-pprof (part of google-perftools $PPROF_VERSION)
-
-Copyright 1998-2007 Google Inc.
-
-This is BSD licensed software; see the source for copying conditions
-and license information.
-There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE.
-EOF
-}
-
-sub usage {
-  my $msg = shift;
-  print STDERR "$msg\n\n";
-  print STDERR usage_string();
-  print STDERR "\nFATAL ERROR: $msg\n";    # just as a reminder
-  exit(1);
-}
-
-sub Init() {
-  # Setup tmp-file name and handler to clean it up.
-  # We do this in the very beginning so that we can use
-  # error() and cleanup() function anytime here after.
-  $main::tmpfile_sym = File::Temp->new()->filename;
-  $main::tmpfile_ps = File::Temp->new()->filename;
-  
-  $main::next_tmpfile = 0;
-  $SIG{'INT'} = \&sighandler;
-
-  # Cache from filename/linenumber to source code
-  $main::source_cache = ();
-
-  $main::opt_help = 0;
-  $main::opt_version = 0;
-
-  $main::opt_cum = 0;
-  $main::opt_base = '';
-  $main::opt_addresses = 0;
-  $main::opt_lines = 0;
-  $main::opt_functions = 0;
-  $main::opt_files = 0;
-  $main::opt_lib_prefix = "";
-
-  $main::opt_text = 0;
-  $main::opt_callgrind = 0;
-  $main::opt_list = "";
-  $main::opt_disasm = "";
-  $main::opt_symbols = 0;
-  $main::opt_gv = 0;
-  $main::opt_web = 0;
-  $main::opt_dot = 0;
-  $main::opt_ps = 0;
-  $main::opt_pdf = 0;
-  $main::opt_gif = 0;
-  $main::opt_svg = 0;
-  $main::opt_raw = 0;
-
-  $main::opt_nodecount = 80;
-  $main::opt_nodefraction = 0.005;
-  $main::opt_edgefraction = 0.001;
-  $main::opt_focus = '';
-  $main::opt_ignore = '';
-  $main::opt_scale = 0;
-  $main::opt_heapcheck = 0;
-  $main::opt_seconds = 30;
-  $main::opt_lib = "";
-
-  $main::opt_inuse_space   = 0;
-  $main::opt_inuse_objects = 0;
-  $main::opt_alloc_space   = 0;
-  $main::opt_alloc_objects = 0;
-  $main::opt_show_bytes    = 0;
-  $main::opt_drop_negative = 0;
-  $main::opt_interactive   = 0;
-
-  $main::opt_total_delay = 0;
-  $main::opt_contentions = 0;
-  $main::opt_mean_delay = 0;
-
-  $main::opt_tools   = "";
-  $main::opt_debug   = 0;
-  $main::opt_test    = 0;
-
-  # These are undocumented flags used only by unittests.
-  $main::opt_test_stride = 0;
-
-  # Are we using $SYMBOL_PAGE?
-  $main::use_symbol_page = 0;
-
-  # Files returned by TempName.
-  %main::tempnames = ();
-
-  # Type of profile we are dealing with
-  # Supported types:
-  #     cpu
-  #     heap
-  #     growth
-  #     contention
-  $main::profile_type = '';     # Empty type means "unknown"
-
-  GetOptions("help!"          => \$main::opt_help,
-             "version!"       => \$main::opt_version,
-             "cum!"           => \$main::opt_cum,
-             "base=s"         => \$main::opt_base,
-             "seconds=i"      => \$main::opt_seconds,
-             "add_lib=s"      => \$main::opt_lib,
-             "lib_prefix=s"   => \$main::opt_lib_prefix,
-             "functions!"     => \$main::opt_functions,
-             "lines!"         => \$main::opt_lines,
-             "addresses!"     => \$main::opt_addresses,
-             "files!"         => \$main::opt_files,
-             "text!"          => \$main::opt_text,
-             "callgrind!"     => \$main::opt_callgrind,
-             "list=s"         => \$main::opt_list,
-             "disasm=s"       => \$main::opt_disasm,
-             "symbols!"       => \$main::opt_symbols,
-             "gv!"            => \$main::opt_gv,
-             "web!"           => \$main::opt_web,
-             "dot!"           => \$main::opt_dot,
-             "ps!"            => \$main::opt_ps,
-             "pdf!"           => \$main::opt_pdf,
-             "svg!"           => \$main::opt_svg,
-             "gif!"           => \$main::opt_gif,
-             "raw!"           => \$main::opt_raw,
-             "interactive!"   => \$main::opt_interactive,
-             "nodecount=i"    => \$main::opt_nodecount,
-             "nodefraction=f" => \$main::opt_nodefraction,
-             "edgefraction=f" => \$main::opt_edgefraction,
-             "focus=s"        => \$main::opt_focus,
-             "ignore=s"       => \$main::opt_ignore,
-             "scale=i"        => \$main::opt_scale,
-             "heapcheck"      => \$main::opt_heapcheck,
-             "inuse_space!"   => \$main::opt_inuse_space,
-             "inuse_objects!" => \$main::opt_inuse_objects,
-             "alloc_space!"   => \$main::opt_alloc_space,
-             "alloc_objects!" => \$main::opt_alloc_objects,
-             "show_bytes!"    => \$main::opt_show_bytes,
-             "drop_negative!" => \$main::opt_drop_negative,
-             "total_delay!"   => \$main::opt_total_delay,
-             "contentions!"   => \$main::opt_contentions,
-             "mean_delay!"    => \$main::opt_mean_delay,
-             "tools=s"        => \$main::opt_tools,
-             "test!"          => \$main::opt_test,
-             "debug!"         => \$main::opt_debug,
-             # Undocumented flags used only by unittests:
-             "test_stride=i"  => \$main::opt_test_stride,
-      ) || usage("Invalid option(s)");
-
-  # Deal with the standard --help and --version
-  if ($main::opt_help) {
-    print usage_string();
-    exit(0);
-  }
-
-  if ($main::opt_version) {
-    print version_string();
-    exit(0);
-  }
-
-  # Disassembly/listing/symbols mode requires address-level info
-  if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) {
-    $main::opt_functions = 0;
-    $main::opt_lines = 0;
-    $main::opt_addresses = 1;
-    $main::opt_files = 0;
-  }
-
-  # Check heap-profiling flags
-  if ($main::opt_inuse_space +
-      $main::opt_inuse_objects +
-      $main::opt_alloc_space +
-      $main::opt_alloc_objects > 1) {
-    usage("Specify at most on of --inuse/--alloc options");
-  }
-
-  # Check output granularities
-  my $grains =
-      $main::opt_functions +
-      $main::opt_lines +
-      $main::opt_addresses +
-      $main::opt_files +
-      0;
-  if ($grains > 1) {
-    usage("Only specify one output granularity option");
-  }
-  if ($grains == 0) {
-    $main::opt_functions = 1;
-  }
-
-  # Check output modes
-  my $modes =
-      $main::opt_text +
-      $main::opt_callgrind +
-      ($main::opt_list eq '' ? 0 : 1) +
-      ($main::opt_disasm eq '' ? 0 : 1) +
-      ($main::opt_symbols == 0 ? 0 : 1) +
-      $main::opt_gv +
-      $main::opt_web +
-      $main::opt_dot +
-      $main::opt_ps +
-      $main::opt_pdf +
-      $main::opt_svg +
-      $main::opt_gif +
-      $main::opt_raw +
-      $main::opt_interactive +
-      0;
-  if ($modes > 1) {
-    usage("Only specify one output mode");
-  }
-  if ($modes == 0) {
-    if (-t STDOUT) {  # If STDOUT is a tty, activate interactive mode
-      $main::opt_interactive = 1;
-    } else {
-      $main::opt_text = 1;
-    }
-  }
-
-  if ($main::opt_test) {
-    RunUnitTests();
-    # Should not return
-    exit(1);
-  }
-
-  # Binary name and profile arguments list
-  $main::prog = "";
-  @main::pfile_args = ();
-
-  # Remote profiling without a binary (using $SYMBOL_PAGE instead)
-  if (IsProfileURL($ARGV[0])) {
-    $main::use_symbol_page = 1;
-  } elsif ($ARGV[0] && IsSymbolizedProfileFile($ARGV[0])) {
-    $main::use_symbolized_profile = 1;
-    $main::prog = $UNKNOWN_BINARY;  # will be set later from the profile file
-  }
-
-  if ($main::use_symbol_page || $main::use_symbolized_profile) {
-    # We don't need a binary!
-    my %disabled = ('--lines' => $main::opt_lines,
-                    '--disasm' => $main::opt_disasm);
-    for my $option (keys %disabled) {
-      usage("$option cannot be used without a binary") if $disabled{$option};
-    }
-    # Set $main::prog later...
-    scalar(@ARGV) || usage("Did not specify profile file");
-  } elsif ($main::opt_symbols) {
-    # --symbols needs a binary-name (to run nm on, etc) but not profiles
-    $main::prog = shift(@ARGV) || usage("Did not specify program");
-  } else {
-    $main::prog = shift(@ARGV) || usage("Did not specify program");
-    scalar(@ARGV) || usage("Did not specify profile file");
-  }
-
-  # Parse profile file/location arguments
-  foreach my $farg (@ARGV) {
-    if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) {
-      my $machine = $1;
-      my $num_machines = $2;
-      my $path = $3;
-      for (my $i = 0; $i < $num_machines; $i++) {
-        unshift(@main::pfile_args, "$i.$machine$path");
-      }
-    } else {
-      unshift(@main::pfile_args, $farg);
-    }
-  }
-
-  if ($main::use_symbol_page) {
-    unless (IsProfileURL($main::pfile_args[0])) {
-      error("The first profile should be a remote form to use $SYMBOL_PAGE\n");
-    }
-    CheckSymbolPage();
-    $main::prog = FetchProgramName();
-  } elsif (!$main::use_symbolized_profile) {  # may not need objtools!
-    ConfigureObjTools($main::prog)
-  }
-
-  # Break the opt_lib_prefix into the prefix_list array
-  @prefix_list = split (',', $main::opt_lib_prefix);
-
-  # Remove trailing / from the prefixes, in the list to prevent
-  # searching things like /my/path//lib/mylib.so
-  foreach (@prefix_list) {
-    s|/+$||;
-  }
-}
-
-sub Main() {
-  Init();
-  $main::collected_profile = undef;
-  @main::profile_files = ();
-  $main::op_time = time();
-
-  # Printing symbols is special and requires a lot less info that most.
-  if ($main::opt_symbols) {
-    PrintSymbols(*STDIN);   # Get /proc/maps and symbols output from stdin
-    return;
-  }
-
-  # Fetch all profile data
-  FetchDynamicProfiles();
-
-  # this will hold symbols that we read from the profile files
-  my $symbol_map = {};
-
-  # Read one profile, pick the last item on the list
-  my $data = ReadProfile($main::prog, pop(@main::profile_files));
-  my $profile = $data->{profile};
-  my $pcs = $data->{pcs};
-  my $libs = $data->{libs};   # Info about main program and shared libraries
-  $symbol_map = MergeSymbols($symbol_map, $data->{symbols});
-
-  # Add additional profiles, if available.
-  if (scalar(@main::profile_files) > 0) {
-    foreach my $pname (@main::profile_files) {
-      my $data2 = ReadProfile($main::prog, $pname);
-      $profile = AddProfile($profile, $data2->{profile});
-      $pcs = AddPcs($pcs, $data2->{pcs});
-      $symbol_map = MergeSymbols($symbol_map, $data2->{symbols});
-    }
-  }
-
-  # Subtract base from profile, if specified
-  if ($main::opt_base ne '') {
-    my $base = ReadProfile($main::prog, $main::opt_base);
-    $profile = SubtractProfile($profile, $base->{profile});
-    $pcs = AddPcs($pcs, $base->{pcs});
-    $symbol_map = MergeSymbols($symbol_map, $base->{symbols});
-  }
-
-  # Get total data in profile
-  my $total = TotalProfile($profile);
-
-  # Collect symbols
-  my $symbols;
-  if ($main::use_symbolized_profile) {
-    $symbols = FetchSymbols($pcs, $symbol_map);
-  } elsif ($main::use_symbol_page) {
-    $symbols = FetchSymbols($pcs);
-  } else {
-    $symbols = ExtractSymbols($libs, $pcs);
-  }
-
-  # Remove uniniteresting stack items
-  $profile = RemoveUninterestingFrames($symbols, $profile);
-
-  # Focus?
-  if ($main::opt_focus ne '') {
-    $profile = FocusProfile($symbols, $profile, $main::opt_focus);
-  }
-
-  # Ignore?
-  if ($main::opt_ignore ne '') {
-    $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
-  }
-
-  my $calls = ExtractCalls($symbols, $profile);
-
-  # Reduce profiles to required output granularity, and also clean
-  # each stack trace so a given entry exists at most once.
-  my $reduced = ReduceProfile($symbols, $profile);
-
-  # Get derived profiles
-  my $flat = FlatProfile($reduced);
-  my $cumulative = CumulativeProfile($reduced);
-
-  # Print
-  if (!$main::opt_interactive) {
-    if ($main::opt_disasm) {
-      PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm, $total);
-    } elsif ($main::opt_list) {
-      PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0);
-    } elsif ($main::opt_text) {
-      # Make sure the output is empty when have nothing to report
-      # (only matters when --heapcheck is given but we must be
-      # compatible with old branches that did not pass --heapcheck always):
-      if ($total != 0) {
-        Infof("Total: %s %s\n", Unparse($total), Units());
-      }
-      PrintText($symbols, $flat, $cumulative, $total, -1);
-    } elsif ($main::opt_raw) {
-      PrintSymbolizedProfile($symbols, $profile, $main::prog);
-    } elsif ($main::opt_callgrind) {
-      PrintCallgrind($calls);
-    } else {
-      if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
-        if ($main::opt_gv) {
-          RunGV(TempName($main::next_tmpfile, "ps"), "");
-        } elsif ($main::opt_web) {
-          my $tmp = TempName($main::next_tmpfile, "svg");
-          RunWeb($tmp);
-          # The command we run might hand the file name off
-          # to an already running browser instance and then exit.
-          # Normally, we'd remove $tmp on exit (right now),
-          # but fork a child to remove $tmp a little later, so that the
-          # browser has time to load it first.
-          delete $main::tempnames{$tmp};
-          if (fork() == 0) {
-            sleep 5;
-            unlink($tmp);
-            exit(0);
-          }
-        }
-      } else {
-        exit(1);
-      }
-    }
-  } else {
-    InteractiveMode($profile, $symbols, $libs, $total);
-  }
-
-  cleanup();
-  exit(0);
-}
-
-##### Entry Point #####
-
-Main();
-
-# Temporary code to detect if we're running on a Goobuntu system.
-# These systems don't have the right stuff installed for the special
-# Readline libraries to work, so as a temporary workaround, we default
-# to using the normal stdio code, rather than the fancier readline-based
-# code
-sub ReadlineMightFail {
-  if (-e '/lib/libtermcap.so.2') {
-    return 0;  # libtermcap exists, so readline should be okay
-  } else {
-    return 1;
-  }
-}
-
-sub RunGV {
-  my $fname = shift;
-  my $bg = shift;       # "" or " &" if we should run in background
-  if (!system("$GV --version >$DEVNULL 2>&1")) {
-    # Options using double dash are supported by this gv version.
-    # Also, turn on noantialias to better handle bug in gv for
-    # postscript files with large dimensions.
-    # TODO: Maybe we should not pass the --noantialias flag
-    # if the gv version is known to work properly without the flag.
-    system("$GV --scale=$main::opt_scale --noantialias " . $fname . $bg);
-  } else {
-    # Old gv version - only supports options that use single dash.
-    print STDERR "$GV -scale $main::opt_scale\n";
-    system("$GV -scale $main::opt_scale " . $fname . $bg);
-  }
-}
-
-sub RunWeb {
-  my $fname = shift;
-  print STDERR "Loading web page file:///$fname\n";
-
-  my $uname = `uname`;
-  if ($uname =~ /Darwin/) {
-    # OS X: open will use standard preference for SVG files.
-    system("/usr/bin/open", $fname);
-    return;
-  }
-
-  if ($uname =~ /CYGWIN/) {
-    # Windows(cygwin): open will use standard preference for SVG files.
-    my $winname = `cygpath -wa $fname`;
-    system("explorer.exe", $winname);
-    return;
-  }
-  if ($uname =~ /MINGW/) {
-    # Windows(MinGW): open will use standard preference for SVG files.
-    system("cmd", "/c", "start", $fname);
-    return;
-  }
-
-  # Some kind of Unix; try generic symlinks, then specific browsers.
-  # (Stop once we find one.)
-  # Works best if the browser is already running.
-  my @alt = (
-    "/etc/alternatives/gnome-www-browser",
-    "/etc/alternatives/x-www-browser",
-    "google-chrome",
-    "firefox",
-  );
-  foreach my $b (@alt) {
-    if (system($b, $fname) == 0) {
-      return;
-    }
-  }
-
-  print STDERR "Could not load web browser.\n";
-}
-
-sub RunKcachegrind {
-  my $fname = shift;
-  my $bg = shift;       # "" or " &" if we should run in background
-  print STDERR "Starting '$KCACHEGRIND " . $fname . $bg . "'\n";
-  system("$KCACHEGRIND " . $fname . $bg);
-}
-
-
-##### Interactive helper routines #####
-
-sub InteractiveMode {
-  $| = 1;  # Make output unbuffered for interactive mode
-  my ($orig_profile, $symbols, $libs, $total) = @_;
-
-  print STDERR "Welcome to pprof!  For help, type 'help'.\n";
-
-  # Use ReadLine if it's installed and input comes from a console.
-  if ( -t STDIN &&
-       !ReadlineMightFail() &&
-       defined(eval {require Term::ReadLine}) ) {
-    my $term = new Term::ReadLine 'pprof';
-    while ( defined ($_ = $term->readline('(pprof) '))) {
-      $term->addhistory($_) if /\S/;
-      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
-        last;    # exit when we get an interactive command to quit
-      }
-    }
-  } else {       # don't have readline
-    while (1) {
-      print STDERR "(pprof) ";
-      $_ = <STDIN>;
-      last if ! defined $_ ;
-      s/\r//g;         # turn windows-looking lines into unix-looking lines
-
-      # Save some flags that might be reset by InteractiveCommand()
-      my $save_opt_lines = $main::opt_lines;
-
-      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
-        last;    # exit when we get an interactive command to quit
-      }
-
-      # Restore flags
-      $main::opt_lines = $save_opt_lines;
-    }
-  }
-}
-
-# Takes two args: orig profile, and command to run.
-# Returns 1 if we should keep going, or 0 if we were asked to quit
-sub InteractiveCommand {
-  my($orig_profile, $symbols, $libs, $total, $command) = @_;
-  $_ = $command;                # just to make future m//'s easier
-  if (!defined($_)) {
-    print STDERR "\n";
-    return 0;
-  }
-  if (m/^\s*quit/) {
-    return 0;
-  }
-  if (m/^\s*help/) {
-    InteractiveHelpMessage();
-    return 1;
-  }
-  # Clear all the mode options -- mode is controlled by "$command"
-  $main::opt_text = 0;
-  $main::opt_callgrind = 0;
-  $main::opt_disasm = 0;
-  $main::opt_list = 0;
-  $main::opt_gv = 0;
-  $main::opt_cum = 0;
-
-  if (m/^\s*(text|top)(\d*)\s*(.*)/) {
-    $main::opt_text = 1;
-
-    my $line_limit = ($2 ne "") ? int($2) : 10;
-
-    my $routine;
-    my $ignore;
-    ($routine, $ignore) = ParseInteractiveArgs($3);
-
-    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
-    my $reduced = ReduceProfile($symbols, $profile);
-
-    # Get derived profiles
-    my $flat = FlatProfile($reduced);
-    my $cumulative = CumulativeProfile($reduced);
-
-    PrintText($symbols, $flat, $cumulative, $total, $line_limit);
-    return 1;
-  }
-  if (m/^\s*callgrind\s*([^ \n]*)/) {
-    $main::opt_callgrind = 1;
-
-    # Get derived profiles
-    my $calls = ExtractCalls($symbols, $orig_profile);
-    my $filename = $1;
-    if ( $1 eq '' ) {
-      $filename = TempName($main::next_tmpfile, "callgrind");
-    }
-    PrintCallgrind($calls, $filename);
-    if ( $1 eq '' ) {
-      RunKcachegrind($filename, " & ");
-      $main::next_tmpfile++;
-    }
-
-    return 1;
-  }
-  if (m/^\s*(web)?list\s*(.+)/) {
-    my $html = (defined($1) && ($1 eq "web"));
-    $main::opt_list = 1;
-
-    my $routine;
-    my $ignore;
-    ($routine, $ignore) = ParseInteractiveArgs($2);
-
-    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
-    my $reduced = ReduceProfile($symbols, $profile);
-
-    # Get derived profiles
-    my $flat = FlatProfile($reduced);
-    my $cumulative = CumulativeProfile($reduced);
-
-    PrintListing($total, $libs, $flat, $cumulative, $routine, $html);
-    return 1;
-  }
-  if (m/^\s*disasm\s*(.+)/) {
-    $main::opt_disasm = 1;
-
-    my $routine;
-    my $ignore;
-    ($routine, $ignore) = ParseInteractiveArgs($1);
-
-    # Process current profile to account for various settings
-    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
-    my $reduced = ReduceProfile($symbols, $profile);
-
-    # Get derived profiles
-    my $flat = FlatProfile($reduced);
-    my $cumulative = CumulativeProfile($reduced);
-
-    PrintDisassembly($libs, $flat, $cumulative, $routine, $total);
-    return 1;
-  }
-  if (m/^\s*(gv|web)\s*(.*)/) {
-    $main::opt_gv = 0;
-    $main::opt_web = 0;
-    if ($1 eq "gv") {
-      $main::opt_gv = 1;
-    } elsif ($1 eq "web") {
-      $main::opt_web = 1;
-    }
-
-    my $focus;
-    my $ignore;
-    ($focus, $ignore) = ParseInteractiveArgs($2);
-
-    # Process current profile to account for various settings
-    my $profile = ProcessProfile($total, $orig_profile, $symbols, $focus, $ignore);
-    my $reduced = ReduceProfile($symbols, $profile);
-
-    # Get derived profiles
-    my $flat = FlatProfile($reduced);
-    my $cumulative = CumulativeProfile($reduced);
-
-    if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
-      if ($main::opt_gv) {
-        RunGV(TempName($main::next_tmpfile, "ps"), " &");
-      } elsif ($main::opt_web) {
-        RunWeb(TempName($main::next_tmpfile, "svg"));
-      }
-      $main::next_tmpfile++;
-    }
-    return 1;
-  }
-  if (m/^\s*$/) {
-    return 1;
-  }
-  print STDERR "Unknown command: try 'help'.\n";
-  return 1;
-}
-
-
-sub ProcessProfile {
-  my $total_count = shift;
-  my $orig_profile = shift;
-  my $symbols = shift;
-  my $focus = shift;
-  my $ignore = shift;
-
-  # Process current profile to account for various settings
-  my $profile = $orig_profile;
-  printf("Total: %s %s\n", Unparse($total_count), Units());
-  if ($focus ne '') {
-    $profile = FocusProfile($symbols, $profile, $focus);
-    my $focus_count = TotalProfile($profile);
-    Infof("After focusing on '%s': %s %s of %s (%0.1f%%)\n",
-           $focus,
-           Unparse($focus_count), Units(),
-           Unparse($total_count), ($focus_count*100.0) / $total_count);
-  }
-  if ($ignore ne '') {
-    $profile = IgnoreProfile($symbols, $profile, $ignore);
-    my $ignore_count = TotalProfile($profile);
-    Infof("After ignoring '%s': %s %s of %s (%0.1f%%)\n",
-           $ignore,
-           Unparse($ignore_count), Units(),
-           Unparse($total_count),
-           ($ignore_count*100.0) / $total_count);
-  }
-
-  return $profile;
-}
-
-sub InteractiveHelpMessage {
-  print STDERR <<ENDOFHELP;
-Interactive pprof mode
-
-Commands:
-  gv
-  gv [focus] [-ignore1] [-ignore2]
-      Show graphical hierarchical display of current profile.  Without
-      any arguments, shows all samples in the profile.  With the optional
-      "focus" argument, restricts the samples shown to just those where
-      the "focus" regular expression matches a routine name on the stack
-      trace.
-
-  web
-  web [focus] [-ignore1] [-ignore2]
-      Like GV, but displays profile in your web browser instead of using
-      Ghostview. Works best if your web browser is already running.
-      To change the browser that gets used:
-      On Linux, set the /etc/alternatives/gnome-www-browser symlink.
-      On OS X, change the Finder association for SVG files.
-
-  list [routine_regexp] [-ignore1] [-ignore2]
-      Show source listing of routines whose names match "routine_regexp"
-
-  weblist [routine_regexp] [-ignore1] [-ignore2]
-      Displays a source listing of routines whose names match "routine_regexp"
-      in a web browser.  You can click on source lines to view the
-      corresponding disassembly.
-
-  top [--cum] [-ignore1] [-ignore2]
-  top20 [--cum] [-ignore1] [-ignore2]
-  top37 [--cum] [-ignore1] [-ignore2]
-      Show top lines ordered by flat profile count, or cumulative count
-      if --cum is specified.  If a number is present after 'top', the
-      top K routines will be shown (defaults to showing the top 10)
-
-  disasm [routine_regexp] [-ignore1] [-ignore2]
-      Show disassembly of routines whose names match "routine_regexp",
-      annotated with sample counts.
-
-  callgrind
-  callgrind [filename]
-      Generates callgrind file. If no filename is given, kcachegrind is called.
-
-  help - This listing
-  quit or ^D - End pprof
-
-For commands that accept optional -ignore tags, samples where any routine in
-the stack trace matches the regular expression in any of the -ignore
-parameters will be ignored.
-
-Further pprof details are available at this location (or one similar):
-
- /usr/doc/google-perftools-$PPROF_VERSION/cpu_profiler.html
- /usr/doc/google-perftools-$PPROF_VERSION/heap_profiler.html
-
-ENDOFHELP
-}
-sub ParseInteractiveArgs {
-  my $args = shift;
-  my $focus = "";
-  my $ignore = "";
-  my @x = split(/ +/, $args);
-  foreach $a (@x) {
-    if ($a =~ m/^(--|-)lines$/) {
-      $main::opt_lines = 1;
-    } elsif ($a =~ m/^(--|-)cum$/) {
-      $main::opt_cum = 1;
-    } elsif ($a =~ m/^-(.*)/) {
-      $ignore .= (($ignore ne "") ? "|" : "" ) . $1;
-    } else {
-      $focus .= (($focus ne "") ? "|" : "" ) . $a;
-    }
-  }
-  if ($ignore ne "") {
-    print STDERR "Ignoring samples in call stacks that match '$ignore'\n";
-  }
-  return ($focus, $ignore);
-}
-
-##### Output code #####
-
-sub TempName {
-  my $fnum = shift;
-  my $ext = shift;
-  my $file = "$main::tmpfile_ps.$fnum.$ext";
-  $main::tempnames{$file} = 1;
-  return $file;
-}
-
-# Print profile data in packed binary format (64-bit) to standard out
-sub PrintProfileData {
-  my $profile = shift;
-
-  # print header (64-bit style)
-  # (zero) (header-size) (version) (sample-period) (zero)
-  print pack('L*', 0, 0, 3, 0, 0, 0, 1, 0, 0, 0);
-
-  foreach my $k (keys(%{$profile})) {
-    my $count = $profile->{$k};
-    my @addrs = split(/\n/, $k);
-    if ($#addrs >= 0) {
-      my $depth = $#addrs + 1;
-      # int(foo / 2**32) is the only reliable way to get rid of bottom
-      # 32 bits on both 32- and 64-bit systems.
-      print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32));
-      print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32));
-
-      foreach my $full_addr (@addrs) {
-        my $addr = $full_addr;
-        $addr =~ s/0x0*//;  # strip off leading 0x, zeroes
-        if (length($addr) > 16) {
-          print STDERR "Invalid address in profile: $full_addr\n";
-          next;
-        }
-        my $low_addr = substr($addr, -8);       # get last 8 hex chars
-        my $high_addr = substr($addr, -16, 8);  # get up to 8 more hex chars
-        print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr));
-      }
-    }
-  }
-}
-
-# Print symbols and profile data
-sub PrintSymbolizedProfile {
-  my $symbols = shift;
-  my $profile = shift;
-  my $prog = shift;
-
-  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
-  my $symbol_marker = $&;
-
-  print '--- ', $symbol_marker, "\n";
-  if (defined($prog)) {
-    print 'binary=', $prog, "\n";
-  }
-  while (my ($pc, $name) = each(%{$symbols})) {
-    my $sep = ' ';
-    print '0x', $pc;
-    # We have a list of function names, which include the inlined
-    # calls.  They are separated (and terminated) by --, which is
-    # illegal in function names.
-    for (my $j = 2; $j <= $#{$name}; $j += 3) {
-      print $sep, $name->[$j];
-      $sep = '--';
-    }
-    print "\n";
-  }
-  print '---', "\n";
-
-  $PROFILE_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
-  my $profile_marker = $&;
-  print '--- ', $profile_marker, "\n";
-  if (defined($main::collected_profile)) {
-    # if used with remote fetch, simply dump the collected profile to output.
-    open(SRC, "<$main::collected_profile");
-    while (<SRC>) {
-      print $_;
-    }
-    close(SRC);
-  } else {
-    # dump a cpu-format profile to standard out
-    PrintProfileData($profile);
-  }
-}
-
-# Print information conditionally filtered out depending on the output
-# format.
-sub Infof {
-  my $format = shift;
-  my @args = @_;
-  return if $main::opt_svg;
-  printf($format, @args);
-}
-
-# Print text output
-sub PrintText {
-  my $symbols = shift;
-  my $flat = shift;
-  my $cumulative = shift;
-  my $total = shift;
-  my $line_limit = shift;
-
-  # Which profile to sort by?
-  my $s = $main::opt_cum ? $cumulative : $flat;
-
-  my $running_sum = 0;
-  my $lines = 0;
-  foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b }
-                 keys(%{$cumulative})) {
-    my $f = GetEntry($flat, $k);
-    my $c = GetEntry($cumulative, $k);
-    $running_sum += $f;
-
-    my $sym = $k;
-    if (exists($symbols->{$k})) {
-      $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1];
-      if ($main::opt_addresses) {
-        $sym = $k . " " . $sym;
-      }
-    }
-
-    if ($f != 0 || $c != 0) {
-      printf("%8s %6s %6s %8s %6s %s\n",
-             Unparse($f),
-             Percent($f, $total),
-             Percent($running_sum, $total),
-             Unparse($c),
-             Percent($c, $total),
-             $sym);
-    }
-    $lines++;
-    last if ($line_limit >= 0 && $lines >= $line_limit);
-  }
-}
-
-# Print the call graph in a way that's suiteable for callgrind.
-sub PrintCallgrind {
-  my $calls = shift;
-  my $filename;
-  if ($main::opt_interactive) {
-    $filename = shift;
-    print STDERR "Writing callgrind file to '$filename'.\n"
-  } else {
-    $filename = "&STDOUT";
-  }
-  open(CG, ">".$filename );
-  printf CG ("events: Hits\n\n");
-  foreach my $call ( map { $_->[0] }
-                     sort { $a->[1] cmp $b ->[1] ||
-                            $a->[2] <=> $b->[2] }
-                     map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/;
-                           [$_, $1, $2] }
-                     keys %$calls ) {
-    my $count = int($calls->{$call});
-    $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/;
-    my ( $caller_file, $caller_line, $caller_function,
-         $callee_file, $callee_line, $callee_function ) =
-       ( $1, $2, $3, $5, $6, $7 );
-
-    printf CG ("fl=$caller_file\nfn=$caller_function\n");
-    if (defined $6) {
-      printf CG ("cfl=$callee_file\n");
-      printf CG ("cfn=$callee_function\n");
-      printf CG ("calls=$count $callee_line\n");
-    }
-    printf CG ("$caller_line $count\n\n");
-  }
-}
-
-# Print disassembly for all all routines that match $main::opt_disasm
-sub PrintDisassembly {
-  my $libs = shift;
-  my $flat = shift;
-  my $cumulative = shift;
-  my $disasm_opts = shift;
-  my $total = shift;
-
-  foreach my $lib (@{$libs}) {
-    my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts);
-    my $offset = AddressSub($lib->[1], $lib->[3]);
-    foreach my $routine (sort ByName keys(%{$symbol_table})) {
-      my $start_addr = $symbol_table->{$routine}->[0];
-      my $end_addr = $symbol_table->{$routine}->[1];
-      # See if there are any samples in this routine
-      my $length = hex(AddressSub($end_addr, $start_addr));
-      my $addr = AddressAdd($start_addr, $offset);
-      for (my $i = 0; $i < $length; $i++) {
-        if (defined($cumulative->{$addr})) {
-          PrintDisassembledFunction($lib->[0], $offset,
-                                    $routine, $flat, $cumulative,
-                                    $start_addr, $end_addr, $total);
-          last;
-        }
-        $addr = AddressInc($addr);
-      }
-    }
-  }
-}
-
-# Return reference to array of tuples of the form:
-#       [start_address, filename, linenumber, instruction, limit_address]
-# E.g.,
-#       ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"]
-sub Disassemble {
-  my $prog = shift;
-  my $offset = shift;
-  my $start_addr = shift;
-  my $end_addr = shift;
-
-  my $objdump = $obj_tool_map{"objdump"};
-  my $cmd = sprintf("$objdump -C -d -l --no-show-raw-insn " .
-                    "--start-address=0x$start_addr " .
-                    "--stop-address=0x$end_addr $prog");
-
-  if (system("$objdump --help >$DEVNULL 2>&1") != 0) {
-    # objdump must not exist.  Fall back to go tool objdump.
-    $objdump = "go tool objdump";
-    $cmd = "$objdump $prog 0x$start_addr 0x$end_addr";
-  }
-
-  open(OBJDUMP, "$cmd |") || error("$objdump: $!\n");
-  my @result = ();
-  my $filename = "";
-  my $linenumber = -1;
-  my $last = ["", "", "", ""];
-  while (<OBJDUMP>) {
-    s/\r//g;         # turn windows-looking lines into unix-looking lines
-    chop;
-    if (m|\s*(.+):(\d+)\s*$|) {
-      # Location line of the form:
-      #   <filename>:<linenumber>
-      $filename = $1;
-      $linenumber = $2;
-    } elsif (m/^ +([0-9a-f]+):\s*(.*)/) {
-      # Disassembly line -- zero-extend address to full length
-      my $addr = HexExtend($1);
-      my $k = AddressAdd($addr, $offset);
-      $last->[4] = $k;   # Store ending address for previous instruction
-      $last = [$k, $filename, $linenumber, $2, $end_addr];
-      push(@result, $last);
-    }
-  }
-  close(OBJDUMP);
-  return @result;
-}
-
-# The input file should contain lines of the form /proc/maps-like
-# output (same format as expected from the profiles) or that looks
-# like hex addresses (like "0xDEADBEEF").  We will parse all
-# /proc/maps output, and for all the hex addresses, we will output
-# "short" symbol names, one per line, in the same order as the input.
-sub PrintSymbols {
-  my $maps_and_symbols_file = shift;
-
-  # ParseLibraries expects pcs to be in a set.  Fine by us...
-  my @pclist = ();   # pcs in sorted order
-  my $pcs = {};
-  my $map = "";
-  foreach my $line (<$maps_and_symbols_file>) {
-    $line =~ s/\r//g;    # turn windows-looking lines into unix-looking lines
-    if ($line =~ /\b(0x[0-9a-f]+)\b/i) {
-      push(@pclist, HexExtend($1));
-      $pcs->{$pclist[-1]} = 1;
-    } else {
-      $map .= $line;
-    }
-  }
-
-  my $libs = ParseLibraries($main::prog, $map, $pcs);
-  my $symbols = ExtractSymbols($libs, $pcs);
-
-  foreach my $pc (@pclist) {
-    # ->[0] is the shortname, ->[2] is the full name
-    print(($symbols->{$pc}->[0] || "??") . "\n");
-  }
-}
-
-
-# For sorting functions by name
-sub ByName {
-  return ShortFunctionName($a) cmp ShortFunctionName($b);
-}
-
-# Print source-listing for all all routines that match $main::opt_list
-sub PrintListing {
-  my $total = shift;
-  my $libs = shift;
-  my $flat = shift;
-  my $cumulative = shift;
-  my $list_opts = shift;
-  my $html = shift;
-
-  my $output = \*STDOUT;
-  my $fname = "";
-
-
-  if ($html) {
-    # Arrange to write the output to a temporary file
-    $fname = TempName($main::next_tmpfile, "html");
-    $main::next_tmpfile++;
-    if (!open(TEMP, ">$fname")) {
-      print STDERR "$fname: $!\n";
-      return;
-    }
-    $output = \*TEMP;
-    print $output HtmlListingHeader();
-    printf $output ("<div class=\"legend\">%s<br>Total: %s %s</div>\n",
-                    $main::prog, Unparse($total), Units());
-  }
-
-  my $listed = 0;
-  foreach my $lib (@{$libs}) {
-    my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts);
-    my $offset = AddressSub($lib->[1], $lib->[3]);
-    foreach my $routine (sort ByName keys(%{$symbol_table})) {
-      # Print if there are any samples in this routine
-      my $start_addr = $symbol_table->{$routine}->[0];
-      my $end_addr = $symbol_table->{$routine}->[1];
-      my $length = hex(AddressSub($end_addr, $start_addr));
-      my $addr = AddressAdd($start_addr, $offset);
-      for (my $i = 0; $i < $length; $i++) {
-        if (defined($cumulative->{$addr})) {
-          $listed += PrintSource(
-            $lib->[0], $offset,
-            $routine, $flat, $cumulative,
-            $start_addr, $end_addr,
-            $html,
-            $output);
-          last;
-        }
-        $addr = AddressInc($addr);
-      }
-    }
-  }
-
-  if ($html) {
-    if ($listed > 0) {
-      print $output HtmlListingFooter();
-      close($output);
-      RunWeb($fname);
-    } else {
-      close($output);
-      unlink($fname);
-    }
-  }
-}
-
-sub HtmlListingHeader {
-  return <<'EOF';
-<!DOCTYPE html>
-<html>
-<head>
-<title>Pprof listing</title>
-<style type="text/css">
-body {
-  font-family: sans-serif;
-}
-h1 {
-  font-size: 1.5em;
-  margin-bottom: 4px;
-}
-.legend {
-  font-size: 1.25em;
-}
-.line {
-  color: #aaaaaa;
-}
-.livesrc {
-  color: #0000ff;
-  cursor: pointer;
-}
-.livesrc:hover {
-  background-color: #cccccc;
-}
-.asm {
-  color: #888888;
-  display: none;
-}
-</style>
-<script type="text/javascript">
-function pprof_toggle_asm(e) {
-  var target;
-  if (!e) e = window.event;
-  if (e.target) target = e.target;
-  else if (e.srcElement) target = e.srcElement;
-
-  if (target && target.className == "livesrc") {
-    var asm = target.nextSibling;
-    if (asm && asm.className == "asm") {
-      asm.style.display = (asm.style.display == "block" ? "none" : "block");
-      e.preventDefault();
-      return false;
-    }
-  }
-}
-</script>
-</head>
-<body>
-EOF
-}
-
-sub HtmlListingFooter {
-  return <<'EOF';
-</body>
-</html>
-EOF
-}
-
-sub HtmlEscape {
-  my $text = shift;
-  $text =~ s/&/&amp;/g;
-  $text =~ s/</&lt;/g;
-  $text =~ s/>/&gt;/g;
-  return $text;
-}
-
-# Returns the indentation of the line, if it has any non-whitespace
-# characters.  Otherwise, returns -1.
-sub Indentation {
-  my $line = shift;
-  if (m/^(\s*)\S/) {
-    return length($1);
-  } else {
-    return -1;
-  }
-}
-
-# Print source-listing for one routine
-sub PrintSource {
-  my $prog = shift;
-  my $offset = shift;
-  my $routine = shift;
-  my $flat = shift;
-  my $cumulative = shift;
-  my $start_addr = shift;
-  my $end_addr = shift;
-  my $html = shift;
-  my $output = shift;
-
-  # Disassemble all instructions (just to get line numbers)
-  my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr);
-
-  # Hack 1: assume that the first source file encountered in the
-  # disassembly contains the routine
-  my $filename = undef;
-  for (my $i = 0; $i <= $#instructions; $i++) {
-    if ($instructions[$i]->[2] >= 0) {
-      $filename = $instructions[$i]->[1];
-      last;
-    }
-  }
-  if (!defined($filename)) {
-    print STDERR "no filename found in $routine\n";
-    return 0;
-  }
-
-  # Hack 2: assume that the largest line number from $filename is the
-  # end of the procedure.  This is typically safe since if P1 contains
-  # an inlined call to P2, then P2 usually occurs earlier in the
-  # source file.  If this does not work, we might have to compute a
-  # density profile or just print all regions we find.
-  my $lastline = 0;
-  for (my $i = 0; $i <= $#instructions; $i++) {
-    my $f = $instructions[$i]->[1];
-    my $l = $instructions[$i]->[2];
-    if (($f eq $filename) && ($l > $lastline)) {
-      $lastline = $l;
-    }
-  }
-
-  # Hack 3: assume the first source location from "filename" is the start of
-  # the source code.
-  my $firstline = 1;
-  for (my $i = 0; $i <= $#instructions; $i++) {
-    if ($instructions[$i]->[1] eq $filename) {
-      $firstline = $instructions[$i]->[2];
-      last;
-    }
-  }
-
-  # Hack 4: Extend last line forward until its indentation is less than
-  # the indentation we saw on $firstline
-  my $oldlastline = $lastline;
-  {
-    if (!open(FILE, "<$filename")) {
-      print STDERR "$filename: $!\n";
-      return 0;
-    }
-    my $l = 0;
-    my $first_indentation = -1;
-    while (<FILE>) {
-      s/\r//g;         # turn windows-looking lines into unix-looking lines
-      $l++;
-      my $indent = Indentation($_);
-      if ($l >= $firstline) {
-        if ($first_indentation < 0 && $indent >= 0) {
-          $first_indentation = $indent;
-          last if ($first_indentation == 0);
-        }
-      }
-      if ($l >= $lastline && $indent >= 0) {
-        if ($indent >= $first_indentation) {
-          $lastline = $l+1;
-        } else {
-          last;
-        }
-      }
-    }
-    close(FILE);
-  }
-
-  # Assign all samples to the range $firstline,$lastline,
-  # Hack 4: If an instruction does not occur in the range, its samples
-  # are moved to the next instruction that occurs in the range.
-  my $samples1 = {};        # Map from line number to flat count
-  my $samples2 = {};        # Map from line number to cumulative count
-  my $running1 = 0;         # Unassigned flat counts
-  my $running2 = 0;         # Unassigned cumulative counts
-  my $total1 = 0;           # Total flat counts
-  my $total2 = 0;           # Total cumulative counts
-  my %disasm = ();          # Map from line number to disassembly
-  my $running_disasm = "";  # Unassigned disassembly
-  my $skip_marker = "---\n";
-  if ($html) {
-    $skip_marker = "";
-    for (my $l = $firstline; $l <= $lastline; $l++) {
-      $disasm{$l} = "";
-    }
-  }
-  foreach my $e (@instructions) {
-    # Add up counts for all address that fall inside this instruction
-    my $c1 = 0;
-    my $c2 = 0;
-    for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) {
-      $c1 += GetEntry($flat, $a);
-      $c2 += GetEntry($cumulative, $a);
-    }
-
-    if ($html) {
-      $running_disasm .= sprintf("      %6s %6s \t\t%8s: %s\n",
-                                 HtmlPrintNumber($c1),
-                                 HtmlPrintNumber($c2),
-                                 $e->[0],
-                                 CleanDisassembly($e->[3]));
-    }
-
-    $running1 += $c1;
-    $running2 += $c2;
-    $total1 += $c1;
-    $total2 += $c2;
-    my $file = $e->[1];
-    my $line = $e->[2];
-    if (($file eq $filename) &&
-        ($line >= $firstline) &&
-        ($line <= $lastline)) {
-      # Assign all accumulated samples to this line
-      AddEntry($samples1, $line, $running1);
-      AddEntry($samples2, $line, $running2);
-      $running1 = 0;
-      $running2 = 0;
-      if ($html) {
-        $disasm{$line} .= $running_disasm;
-        $running_disasm = '';
-      }
-    }
-  }
-
-  # Assign any leftover samples to $lastline
-  AddEntry($samples1, $lastline, $running1);
-  AddEntry($samples2, $lastline, $running2);
-
-  if ($html) {
-    printf $output (
-      "<h1>%s</h1>%s\n<pre onClick=\"pprof_toggle_asm()\">\n" .
-      "Total:%6s %6s (flat / cumulative %s)\n",
-      HtmlEscape(ShortFunctionName($routine)),
-      HtmlEscape($filename),
-      Unparse($total1),
-      Unparse($total2),
-      Units());
-  } else {
-    printf $output (
-      "ROUTINE ====================== %s in %s\n" .
-      "%6s %6s Total %s (flat / cumulative)\n",
-      ShortFunctionName($routine),
-      $filename,
-      Unparse($total1),
-      Unparse($total2),
-      Units());
-  }
-  if (!open(FILE, "<$filename")) {
-    print STDERR "$filename: $!\n";
-    return 0;
-  }
-  my $l = 0;
-  while (<FILE>) {
-    s/\r//g;         # turn windows-looking lines into unix-looking lines
-    $l++;
-    if ($l >= $firstline - 5 &&
-        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
-      chop;
-      my $text = $_;
-      if ($l == $firstline) { print $output $skip_marker; }
-      my $n1 = GetEntry($samples1, $l);
-      my $n2 = GetEntry($samples2, $l);
-      if ($html) {
-        my $dis = $disasm{$l};
-        if (!defined($dis) || $n1 + $n2 == 0) {
-          # No samples/disassembly for this source line
-          printf $output (
-            "<span class=\"line\">%5d</span> " .
-            "<span class=\"deadsrc\">%6s %6s %s</span>\n",
-            $l,
-            HtmlPrintNumber($n1),
-            HtmlPrintNumber($n2),
-            HtmlEscape($text));
-        } else {
-          printf $output (
-            "<span class=\"line\">%5d</span> " .
-            "<span class=\"livesrc\">%6s %6s %s</span>" .
-            "<span class=\"asm\">%s</span>\n",
-            $l,
-            HtmlPrintNumber($n1),
-            HtmlPrintNumber($n2),
-            HtmlEscape($text),
-            HtmlEscape($dis));
-        }
-      } else {
-        printf $output(
-          "%6s %6s %4d: %s\n",
-          UnparseAlt($n1),
-          UnparseAlt($n2),
-          $l,
-          $text);
-      }
-      if ($l == $lastline)  { print $output $skip_marker; }
-    };
-  }
-  close(FILE);
-  if ($html) {
-    print $output "</pre>\n";
-  }
-  return 1;
-}
-
-# Return the source line for the specified file/linenumber.
-# Returns undef if not found.
-sub SourceLine {
-  my $file = shift;
-  my $line = shift;
-
-  # Look in cache
-  if (!defined($main::source_cache{$file})) {
-    if (100 < scalar keys(%main::source_cache)) {
-      # Clear the cache when it gets too big
-      $main::source_cache = ();
-    }
-
-    # Read all lines from the file
-    if (!open(FILE, "<$file")) {
-      print STDERR "$file: $!\n";
-      $main::source_cache{$file} = [];  # Cache the negative result
-      return undef;
-    }
-    my $lines = [];
-    push(@{$lines}, "");        # So we can use 1-based line numbers as indices
-    while (<FILE>) {
-      push(@{$lines}, $_);
-    }
-    close(FILE);
-
-    # Save the lines in the cache
-    $main::source_cache{$file} = $lines;
-  }
-
-  my $lines = $main::source_cache{$file};
-  if (($line < 0) || ($line > $#{$lines})) {
-    return undef;
-  } else {
-    return $lines->[$line];
-  }
-}
-
-# Print disassembly for one routine with interspersed source if available
-sub PrintDisassembledFunction {
-  my $prog = shift;
-  my $offset = shift;
-  my $routine = shift;
-  my $flat = shift;
-  my $cumulative = shift;
-  my $start_addr = shift;
-  my $end_addr = shift;
-  my $total = shift;
-
-  # Disassemble all instructions
-  my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr);
-
-  # Make array of counts per instruction
-  my @flat_count = ();
-  my @cum_count = ();
-  my $flat_total = 0;
-  my $cum_total = 0;
-  foreach my $e (@instructions) {
-    # Add up counts for all address that fall inside this instruction
-    my $c1 = 0;
-    my $c2 = 0;
-    for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) {
-      $c1 += GetEntry($flat, $a);
-      $c2 += GetEntry($cumulative, $a);
-    }
-    push(@flat_count, $c1);
-    push(@cum_count, $c2);
-    $flat_total += $c1;
-    $cum_total += $c2;
-  }
-
-  # Print header with total counts
-  printf("ROUTINE ====================== %s\n" .
-         "%6s %6s %s (flat, cumulative) %.1f%% of total\n",
-         ShortFunctionName($routine),
-         Unparse($flat_total),
-         Unparse($cum_total),
-         Units(),
-         ($cum_total * 100.0) / $total);
-
-  # Process instructions in order
-  my $current_file = "";
-  for (my $i = 0; $i <= $#instructions; ) {
-    my $e = $instructions[$i];
-
-    # Print the new file name whenever we switch files
-    if ($e->[1] ne $current_file) {
-      $current_file = $e->[1];
-      my $fname = $current_file;
-      $fname =~ s|^\./||;   # Trim leading "./"
-
-      # Shorten long file names
-      if (length($fname) >= 58) {
-        $fname = "..." . substr($fname, -55);
-      }
-      printf("-------------------- %s\n", $fname);
-    }
-
-    # TODO: Compute range of lines to print together to deal with
-    # small reorderings.
-    my $first_line = $e->[2];
-    my $last_line = $first_line;
-    my %flat_sum = ();
-    my %cum_sum = ();
-    for (my $l = $first_line; $l <= $last_line; $l++) {
-      $flat_sum{$l} = 0;
-      $cum_sum{$l} = 0;
-    }
-
-    # Find run of instructions for this range of source lines
-    my $first_inst = $i;
-    while (($i <= $#instructions) &&
-           ($instructions[$i]->[2] >= $first_line) &&
-           ($instructions[$i]->[2] <= $last_line)) {
-      $e = $instructions[$i];
-      $flat_sum{$e->[2]} += $flat_count[$i];
-      $cum_sum{$e->[2]} += $cum_count[$i];
-      $i++;
-    }
-    my $last_inst = $i - 1;
-
-    # Print source lines
-    for (my $l = $first_line; $l <= $last_line; $l++) {
-      my $line = SourceLine($current_file, $l);
-      if (!defined($line)) {
-        $line = "?\n";
-        next;
-      } else {
-        $line =~ s/^\s+//;
-      }
-      printf("%6s %6s %5d: %s",
-             UnparseAlt($flat_sum{$l}),
-             UnparseAlt($cum_sum{$l}),
-             $l,
-             $line);
-    }
-
-    # Print disassembly
-    for (my $x = $first_inst; $x <= $last_inst; $x++) {
-      my $e = $instructions[$x];
-      my $address = $e->[0];
-      $address = AddressSub($address, $offset);  # Make relative to section
-      $address =~ s/^0x//;
-      $address =~ s/^0*//;
-
-      printf("%6s %6s    %8s: %6s\n",
-             UnparseAlt($flat_count[$x]),
-             UnparseAlt($cum_count[$x]),
-             $address,
-             CleanDisassembly($e->[3]));
-    }
-  }
-}
-
-# Print DOT graph
-sub PrintDot {
-  my $prog = shift;
-  my $symbols = shift;
-  my $raw = shift;
-  my $flat = shift;
-  my $cumulative = shift;
-  my $overall_total = shift;
-
-  # Get total
-  my $local_total = TotalProfile($flat);
-  my $nodelimit = int($main::opt_nodefraction * $local_total);
-  my $edgelimit = int($main::opt_edgefraction * $local_total);
-  my $nodecount = $main::opt_nodecount;
-
-  # Find nodes to include
-  my @list = (sort { abs(GetEntry($cumulative, $b)) <=>
-                     abs(GetEntry($cumulative, $a))
-                     || $a cmp $b }
-              keys(%{$cumulative}));
-  my $last = $nodecount - 1;
-  if ($last > $#list) {
-    $last = $#list;
-  }
-  while (($last >= 0) &&
-         (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) {
-    $last--;
-  }
-  if ($last < 0) {
-    print STDERR "No nodes to print\n";
-    cleanup();
-    return 0;
-  }
-
-  if ($nodelimit > 0 || $edgelimit > 0) {
-    printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n",
-                   Unparse($nodelimit), Units(),
-                   Unparse($edgelimit), Units());
-  }
-
-  # Open DOT output file
-  my $output;
-  if ($main::opt_gv) {
-    $output = "| $DOT -Tps2 >" . TempName($main::next_tmpfile, "ps");
-  } elsif ($main::opt_ps) {
-    $output = "| $DOT -Tps2";
-  } elsif ($main::opt_pdf) {
-    $output = "| $DOT -Tps2 | $PS2PDF - -";
-  } elsif ($main::opt_web || $main::opt_svg) {
-    # We need to post-process the SVG, so write to a temporary file always.
-    $output = "| $DOT -Tsvg >" . TempName($main::next_tmpfile, "svg");
-  } elsif ($main::opt_gif) {
-    $output = "| $DOT -Tgif";
-  } else {
-    $output = ">&STDOUT";
-  }
-  open(DOT, $output) || error("$output: $!\n");
-
-  # Title
-  printf DOT ("digraph \"%s; %s %s\" {\n",
-              $prog,
-              Unparse($overall_total),
-              Units());
-  if ($main::opt_pdf) {
-    # The output is more printable if we set the page size for dot.
-    printf DOT ("size=\"8,11\"\n");
-  }
-  printf DOT ("node [width=0.375,height=0.25];\n");
-
-  # Print legend
-  printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," .
-              "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n",
-              $prog,
-              sprintf("Total %s: %s", Units(), Unparse($overall_total)),
-              sprintf("Focusing on: %s", Unparse($local_total)),
-              sprintf("Dropped nodes with <= %s abs(%s)",
-                      Unparse($nodelimit), Units()),
-              sprintf("Dropped edges with <= %s %s",
-                      Unparse($edgelimit), Units())
-              );
-
-  # Print nodes
-  my %node = ();
-  my $nextnode = 1;
-  foreach my $a (@list[0..$last]) {
-    # Pick font size
-    my $f = GetEntry($flat, $a);
-    my $c = GetEntry($cumulative, $a);
-
-    my $fs = 8;
-    if ($local_total > 0) {
-      $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total)));
-    }
-
-    $node{$a} = $nextnode++;
-    my $sym = $a;
-    $sym =~ s/\s+/\\n/g;
-    $sym =~ s/::/\\n/g;
-
-    # Extra cumulative info to print for non-leaves
-    my $extra = "";
-    if ($f != $c) {
-      $extra = sprintf("\\rof %s (%s)",
-                       Unparse($c),
-                       Percent($c, $overall_total));
-    }
-    my $style = "";
-    if ($main::opt_heapcheck) {
-      if ($f > 0) {
-        # make leak-causing nodes more visible (add a background)
-        $style = ",style=filled,fillcolor=gray"
-      } elsif ($f < 0) {
-        # make anti-leak-causing nodes (which almost never occur)
-        # stand out as well (triple border)
-        $style = ",peripheries=3"
-      }
-    }
-
-    printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" .
-                "\",shape=box,fontsize=%.1f%s];\n",
-                $node{$a},
-                $sym,
-                Unparse($f),
-                Percent($f, $overall_total),
-                $extra,
-                $fs,
-                $style,
-               );
-  }
-
-  # Get edges and counts per edge
-  my %edge = ();
-  my $n;
-  foreach my $k (keys(%{$raw})) {
-    # TODO: omit low %age edges
-    $n = $raw->{$k};
-    my @translated = TranslateStack($symbols, $k);
-    for (my $i = 1; $i <= $#translated; $i++) {
-      my $src = $translated[$i];
-      my $dst = $translated[$i-1];
-      #next if ($src eq $dst);  # Avoid self-edges?
-      if (exists($node{$src}) && exists($node{$dst})) {
-        my $edge_label = "$src\001$dst";
-        if (!exists($edge{$edge_label})) {
-          $edge{$edge_label} = 0;
-        }
-        $edge{$edge_label} += $n;
-      }
-    }
-  }
-
-  # Print edges
-  foreach my $e (keys(%edge)) {
-    my @x = split(/\001/, $e);
-    $n = $edge{$e};
-
-    if (abs($n) > $edgelimit) {
-      # Compute line width based on edge count
-      my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0);
-      if ($fraction > 1) { $fraction = 1; }
-      my $w = $fraction * 2;
-      if ($w < 1 && ($main::opt_web || $main::opt_svg)) {
-        # SVG output treats line widths < 1 poorly.
-        $w = 1;
-      }
-
-      # Dot sometimes segfaults if given edge weights that are too large, so
-      # we cap the weights at a large value
-      my $edgeweight = abs($n) ** 0.7;
-      if ($edgeweight > 100000) { $edgeweight = 100000; }
-      $edgeweight = int($edgeweight);
-
-      my $style = sprintf("setlinewidth(%f)", $w);
-      if ($x[1] =~ m/\(inline\)/) {
-        $style .= ",dashed";
-      }
-
-      # Use a slightly squashed function of the edge count as the weight
-      printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n",
-                  $node{$x[0]},
-                  $node{$x[1]},
-                  Unparse($n),
-                  $edgeweight,
-                  $style);
-    }
-  }
-
-  print DOT ("}\n");
-  close(DOT);
-
-  if ($main::opt_web || $main::opt_svg) {
-    # Rewrite SVG to be more usable inside web browser.
-    RewriteSvg(TempName($main::next_tmpfile, "svg"));
-  }
-
-  return 1;
-}
-
-sub RewriteSvg {
-  my $svgfile = shift;
-
-  open(SVG, $svgfile) || die "open temp svg: $!";
-  my @svg = <SVG>;
-  close(SVG);
-  unlink $svgfile;
-  my $svg = join('', @svg);
-
-  # Dot's SVG output is
-  #
-  #    <svg width="___" height="___"
-  #     viewBox="___" xmlns=...>
-  #    <g id="graph0" transform="...">
-  #    ...
-  #    </g>
-  #    </svg>
-  #
-  # Change it to
-  #
-  #    <svg width="100%" height="100%"
-  #     xmlns=...>
-  #    $svg_javascript
-  #    <g id="viewport" transform="translate(0,0)">
-  #    <g id="graph0" transform="...">
-  #    ...
-  #    </g>
-  #    </g>
-  #    </svg>
-
-  # Fix width, height; drop viewBox.
-  $svg =~ s/(?s)<svg width="[^"]+" height="[^"]+"(.*?)viewBox="[^"]+"/<svg width="100%" height="100%"$1/;
-
-  # Insert script, viewport <g> above first <g>
-  my $svg_javascript = SvgJavascript();
-  my $viewport = "<g id=\"viewport\" transform=\"translate(0,0)\">\n";
-  $svg =~ s/<g id="graph\d"/$svg_javascript$viewport$&/;
-
-  # Insert final </g> above </svg>.
-  $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/;
-  $svg =~ s/<g id="graph\d"(.*?)/<g id="viewport"$1/;
-
-  if ($main::opt_svg) {
-    # --svg: write to standard output.
-    print $svg;
-  } else {
-    # Write back to temporary file.
-    open(SVG, ">$svgfile") || die "open $svgfile: $!";
-    print SVG $svg;
-    close(SVG);
-  }
-}
-
-sub SvgJavascript {
-  return <<'EOF';
-<script type="text/ecmascript"><![CDATA[
-// SVGPan
-// http://www.cyberz.org/blog/2009/12/08/svgpan-a-javascript-svg-panzoomdrag-library/
-// Local modification: if(true || ...) below to force panning, never moving.
-// Local modification: add clamping to fix bug in handleMouseWheel.
-
-/**
- *  SVGPan library 1.2
- * ====================
- *
- * Given an unique existing element with id "viewport", including the
- * the library into any SVG adds the following capabilities:
- *
- *  - Mouse panning
- *  - Mouse zooming (using the wheel)
- *  - Object dargging
- *
- * Known issues:
- *
- *  - Zooming (while panning) on Safari has still some issues
- *
- * Releases:
- *
- * 1.2, Sat Mar 20 08:42:50 GMT 2010, Zeng Xiaohui
- *	Fixed a bug with browser mouse handler interaction
- *
- * 1.1, Wed Feb  3 17:39:33 GMT 2010, Zeng Xiaohui
- *	Updated the zoom code to support the mouse wheel on Safari/Chrome
- *
- * 1.0, Andrea Leofreddi
- *	First release
- *
- * This code is licensed under the following BSD license:
- *
- * Copyright 2009-2010 Andrea Leofreddi <a.leofreddi@itcharm.com>. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification, are
- * permitted provided that the following conditions are met:
- *
- *    1. Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *
- *    2. Redistributions in binary form must reproduce the above copyright notice, this list
- *       of conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY Andrea Leofreddi ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Andrea Leofreddi OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * The views and conclusions contained in the software and documentation are those of the
- * authors and should not be interpreted as representing official policies, either expressed
- * or implied, of Andrea Leofreddi.
- */
-
-var root = document.documentElement;
-
-var state = 'none', stateTarget, stateOrigin, stateTf;
-
-setupHandlers(root);
-
-/**
- * Register handlers
- */
-function setupHandlers(root){
-	setAttributes(root, {
-		"onmouseup" : "add(evt)",
-		"onmousedown" : "handleMouseDown(evt)",
-		"onmousemove" : "handleMouseMove(evt)",
-		"onmouseup" : "handleMouseUp(evt)",
-		//"onmouseout" : "handleMouseUp(evt)", // Decomment this to stop the pan functionality when dragging out of the SVG element
-	});
-
-	if(navigator.userAgent.toLowerCase().indexOf('webkit') >= 0)
-		window.addEventListener('mousewheel', handleMouseWheel, false); // Chrome/Safari
-	else
-		window.addEventListener('DOMMouseScroll', handleMouseWheel, false); // Others
-
-	var g = svgDoc.getElementById("svg");
-	g.width = "100%";
-	g.height = "100%";
-}
-
-/**
- * Instance an SVGPoint object with given event coordinates.
- */
-function getEventPoint(evt) {
-	var p = root.createSVGPoint();
-
-	p.x = evt.clientX;
-	p.y = evt.clientY;
-
-	return p;
-}
-
-/**
- * Sets the current transform matrix of an element.
- */
-function setCTM(element, matrix) {
-	var s = "matrix(" + matrix.a + "," + matrix.b + "," + matrix.c + "," + matrix.d + "," + matrix.e + "," + matrix.f + ")";
-
-	element.setAttribute("transform", s);
-}
-
-/**
- * Dumps a matrix to a string (useful for debug).
- */
-function dumpMatrix(matrix) {
-	var s = "[ " + matrix.a + ", " + matrix.c + ", " + matrix.e + "\n  " + matrix.b + ", " + matrix.d + ", " + matrix.f + "\n  0, 0, 1 ]";
-
-	return s;
-}
-
-/**
- * Sets attributes of an element.
- */
-function setAttributes(element, attributes){
-	for (i in attributes)
-		element.setAttributeNS(null, i, attributes[i]);
-}
-
-/**
- * Handle mouse move event.
- */
-function handleMouseWheel(evt) {
-	if(evt.preventDefault)
-		evt.preventDefault();
-
-	evt.returnValue = false;
-
-	var svgDoc = evt.target.ownerDocument;
-
-	var delta;
-
-	if(evt.wheelDelta)
-		delta = evt.wheelDelta / 3600; // Chrome/Safari
-	else
-		delta = evt.detail / -90; // Mozilla
-
-	var z = 1 + delta; // Zoom factor: 0.9/1.1
-
-	// Clamp to reasonable values.
-	// The 0.1 check is important because
-	// a very large scroll can turn into a
-	// negative z, which rotates the image 180 degrees.
-	if(z < 0.1)
-		z = 0.1;
-	if(z > 10.0)
-		z = 10.0;
-
-	var g = svgDoc.getElementById("viewport");
-
-	var p = getEventPoint(evt);
-
-	p = p.matrixTransform(g.getCTM().inverse());
-
-	// Compute new scale matrix in current mouse position
-	var k = root.createSVGMatrix().translate(p.x, p.y).scale(z).translate(-p.x, -p.y);
-
-        setCTM(g, g.getCTM().multiply(k));
-
-	stateTf = stateTf.multiply(k.inverse());
-}
-
-/**
- * Handle mouse move event.
- */
-function handleMouseMove(evt) {
-	if(evt.preventDefault)
-		evt.preventDefault();
-
-	evt.returnValue = false;
-
-	var svgDoc = evt.target.ownerDocument;
-
-	var g = svgDoc.getElementById("viewport");
-
-	if(state == 'pan') {
-		// Pan mode
-		var p = getEventPoint(evt).matrixTransform(stateTf);
-
-		setCTM(g, stateTf.inverse().translate(p.x - stateOrigin.x, p.y - stateOrigin.y));
-	} else if(state == 'move') {
-		// Move mode
-		var p = getEventPoint(evt).matrixTransform(g.getCTM().inverse());
-
-		setCTM(stateTarget, root.createSVGMatrix().translate(p.x - stateOrigin.x, p.y - stateOrigin.y).multiply(g.getCTM().inverse()).multiply(stateTarget.getCTM()));
-
-		stateOrigin = p;
-	}
-}
-
-/**
- * Handle click event.
- */
-function handleMouseDown(evt) {
-	if(evt.preventDefault)
-		evt.preventDefault();
-
-	evt.returnValue = false;
-
-	var svgDoc = evt.target.ownerDocument;
-
-	var g = svgDoc.getElementById("viewport");
-
-	if(true || evt.target.tagName == "svg") {
-		// Pan mode
-		state = 'pan';
-
-		stateTf = g.getCTM().inverse();
-
-		stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
-	} else {
-		// Move mode
-		state = 'move';
-
-		stateTarget = evt.target;
-
-		stateTf = g.getCTM().inverse();
-
-		stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
-	}
-}
-
-/**
- * Handle mouse button release event.
- */
-function handleMouseUp(evt) {
-	if(evt.preventDefault)
-		evt.preventDefault();
-
-	evt.returnValue = false;
-
-	var svgDoc = evt.target.ownerDocument;
-
-	if(state == 'pan' || state == 'move') {
-		// Quit pan mode
-		state = '';
-	}
-}
-
-]]></script>
-EOF
-}
-
-# Translate a stack of addresses into a stack of symbols
-sub TranslateStack {
-  my $symbols = shift;
-  my $k = shift;
-
-  my @addrs = split(/\n/, $k);
-  my @result = ();
-  for (my $i = 0; $i <= $#addrs; $i++) {
-    my $a = $addrs[$i];
-
-    # Skip large addresses since they sometimes show up as fake entries on RH9
-    if (length($a) > 8 && $a gt "7fffffffffffffff") {
-      next;
-    }
-
-    if ($main::opt_disasm || $main::opt_list) {
-      # We want just the address for the key
-      push(@result, $a);
-      next;
-    }
-
-    my $symlist = $symbols->{$a};
-    if (!defined($symlist)) {
-      $symlist = [$a, "", $a];
-    }
-
-    # We can have a sequence of symbols for a particular entry
-    # (more than one symbol in the case of inlining).  Callers
-    # come before callees in symlist, so walk backwards since
-    # the translated stack should contain callees before callers.
-    for (my $j = $#{$symlist}; $j >= 2; $j -= 3) {
-      my $func = $symlist->[$j-2];
-      my $fileline = $symlist->[$j-1];
-      my $fullfunc = $symlist->[$j];
-      if ($j > 2) {
-        $func = "$func (inline)";
-      }
-      if ($main::opt_addresses) {
-        push(@result, "$a $func $fileline");
-      } elsif ($main::opt_lines) {
-        if ($func eq '??' && $fileline eq '??:0') {
-          push(@result, "$a");
-        } else {
-          push(@result, "$func $fileline");
-        }
-      } elsif ($main::opt_functions) {
-        if ($func eq '??') {
-          push(@result, "$a");
-        } else {
-          push(@result, $func);
-        }
-      } elsif ($main::opt_files) {
-        if ($fileline eq '??:0' || $fileline eq '') {
-          push(@result, "$a");
-        } else {
-          my $f = $fileline;
-          $f =~ s/:\d+$//;
-          push(@result, $f);
-        }
-      } else {
-        push(@result, $a);
-        last;  # Do not print inlined info
-      }
-    }
-  }
-
-  # print join(",", @addrs), " => ", join(",", @result), "\n";
-  return @result;
-}
-
-# Generate percent string for a number and a total
-sub Percent {
-  my $num = shift;
-  my $tot = shift;
-  if ($tot != 0) {
-    return sprintf("%.1f%%", $num * 100.0 / $tot);
-  } else {
-    return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf");
-  }
-}
-
-# Generate pretty-printed form of number
-sub Unparse {
-  my $num = shift;
-  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
-    if ($main::opt_inuse_objects || $main::opt_alloc_objects) {
-      return sprintf("%d", $num);
-    } else {
-      if ($main::opt_show_bytes) {
-        return sprintf("%d", $num);
-      } else {
-        return sprintf("%.1f", $num / 1048576.0);
-      }
-    }
-  } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
-    return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds
-  } else {
-    return sprintf("%d", $num);
-  }
-}
-
-# Alternate pretty-printed form: 0 maps to "."
-sub UnparseAlt {
-  my $num = shift;
-  if ($num == 0) {
-    return ".";
-  } else {
-    return Unparse($num);
-  }
-}
-
-# Alternate pretty-printed form: 0 maps to ""
-sub HtmlPrintNumber {
-  my $num = shift;
-  if ($num == 0) {
-    return "";
-  } else {
-    return Unparse($num);
-  }
-}
-
-# Return output units
-sub Units {
-  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
-    if ($main::opt_inuse_objects || $main::opt_alloc_objects) {
-      return "objects";
-    } else {
-      if ($main::opt_show_bytes) {
-        return "B";
-      } else {
-        return "MB";
-      }
-    }
-  } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
-    return "seconds";
-  } elsif ($main::profile_type eq 'thread') {
-    return "threads";
-  } else {
-    return "samples";
-  }
-}
-
-##### Profile manipulation code #####
-
-# Generate flattened profile:
-# If count is charged to stack [a,b,c,d], in generated profile,
-# it will be charged to [a]
-sub FlatProfile {
-  my $profile = shift;
-  my $result = {};
-  foreach my $k (keys(%{$profile})) {
-    my $count = $profile->{$k};
-    my @addrs = split(/\n/, $k);
-    if ($#addrs >= 0) {
-      AddEntry($result, $addrs[0], $count);
-    }
-  }
-  return $result;
-}
-
-# Generate cumulative profile:
-# If count is charged to stack [a,b,c,d], in generated profile,
-# it will be charged to [a], [b], [c], [d]
-sub CumulativeProfile {
-  my $profile = shift;
-  my $result = {};
-  foreach my $k (keys(%{$profile})) {
-    my $count = $profile->{$k};
-    my @addrs = split(/\n/, $k);
-    foreach my $a (@addrs) {
-      AddEntry($result, $a, $count);
-    }
-  }
-  return $result;
-}
-
-# If the second-youngest PC on the stack is always the same, returns
-# that pc.  Otherwise, returns undef.
-sub IsSecondPcAlwaysTheSame {
-  my $profile = shift;
-
-  my $second_pc = undef;
-  foreach my $k (keys(%{$profile})) {
-    my @addrs = split(/\n/, $k);
-    if ($#addrs < 1) {
-      return undef;
-    }
-    if (not defined $second_pc) {
-      $second_pc = $addrs[1];
-    } else {
-      if ($second_pc ne $addrs[1]) {
-        return undef;
-      }
-    }
-  }
-  return $second_pc;
-}
-
-sub ExtractSymbolLocation {
-  my $symbols = shift;
-  my $address = shift;
-  # 'addr2line' outputs "??:0" for unknown locations; we do the
-  # same to be consistent.
-  my $location = "??:0:unknown";
-  if (exists $symbols->{$address}) {
-    my $file = $symbols->{$address}->[1];
-    if ($file eq "?") {
-      $file = "??:0"
-    }
-    $location = $file . ":" . $symbols->{$address}->[0];
-  }
-  return $location;
-}
-
-# Extracts a graph of calls.
-sub ExtractCalls {
-  my $symbols = shift;
-  my $profile = shift;
-
-  my $calls = {};
-  while( my ($stack_trace, $count) = each %$profile ) {
-    my @address = split(/\n/, $stack_trace);
-    my $destination = ExtractSymbolLocation($symbols, $address[0]);
-    AddEntry($calls, $destination, $count);
-    for (my $i = 1; $i <= $#address; $i++) {
-      my $source = ExtractSymbolLocation($symbols, $address[$i]);
-      my $call = "$source -> $destination";
-      AddEntry($calls, $call, $count);
-      $destination = $source;
-    }
-  }
-
-  return $calls;
-}
-
-sub RemoveUninterestingFrames {
-  my $symbols = shift;
-  my $profile = shift;
-
-  # List of function names to skip
-  my %skip = ();
-  my $skip_regexp = 'NOMATCH';
-  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
-    foreach my $name ('calloc',
-                      'cfree',
-                      'malloc',
-                      'free',
-                      'memalign',
-                      'posix_memalign',
-                      'pvalloc',
-                      'valloc',
-                      'realloc',
-                      'tc_calloc',
-                      'tc_cfree',
-                      'tc_malloc',
-                      'tc_free',
-                      'tc_memalign',
-                      'tc_posix_memalign',
-                      'tc_pvalloc',
-                      'tc_valloc',
-                      'tc_realloc',
-                      'tc_new',
-                      'tc_delete',
-                      'tc_newarray',
-                      'tc_deletearray',
-                      'tc_new_nothrow',
-                      'tc_newarray_nothrow',
-                      'do_malloc',
-                      '::do_malloc',   # new name -- got moved to an unnamed ns
-                      '::do_malloc_or_cpp_alloc',
-                      'DoSampledAllocation',
-                      'simple_alloc::allocate',
-                      '__malloc_alloc_template::allocate',
-                      '__builtin_delete',
-                      '__builtin_new',
-                      '__builtin_vec_delete',
-                      '__builtin_vec_new',
-                      'operator new',
-                      'operator new[]',
-                      # Go
-                      'catstring',
-                      'cnew',
-                      'copyin',
-                      'gostring',
-                      'gostringsize',
-                      'growslice1',
-                      'appendslice1',
-                      'hash_init',
-                      'hash_subtable_new',
-                      'hash_conv',
-                      'hash_grow',
-                      'hash_insert_internal',
-                      'hash_insert',
-                      'mapassign',
-                      'runtime.mapassign',
-                      'runtime.appendslice',
-                      'runtime.mapassign1',
-                      'makechan',
-                      'makemap',
-                      'mal',
-                      'profilealloc',
-                      'runtime.new',
-                      'makeslice1',
-                      'runtime.malloc',
-                      'unsafe.New',
-                      'runtime.mallocgc',
-                      'runtime.catstring',
-                      'runtime.cnew',
-                      'runtime.cnewarray',
-                      'runtime.growslice',
-                      'runtime.ifaceT2E',
-                      'runtime.ifaceT2I',
-                      'runtime.makechan',
-                      'runtime.makechan_c',
-                      'runtime.makemap',
-                      'runtime.makemap_c',
-                      'runtime.makeslice',
-                      'runtime.mal',
-                      'runtime.settype',
-                      'runtime.settype_flush',
-                      'runtime.slicebytetostring',
-                      'runtime.sliceinttostring',
-                      'runtime.stringtoslicebyte',
-                      'runtime.stringtosliceint',
-                      # These mark the beginning/end of our custom sections
-                      '__start_google_malloc',
-                      '__stop_google_malloc',
-                      '__start_malloc_hook',
-                      '__stop_malloc_hook') {
-      $skip{$name} = 1;
-      $skip{"_" . $name} = 1;   # Mach (OS X) adds a _ prefix to everything
-    }
-    # TODO: Remove TCMalloc once everything has been
-    # moved into the tcmalloc:: namespace and we have flushed
-    # old code out of the system.
-    $skip_regexp = "TCMalloc|^tcmalloc::";
-  } elsif ($main::profile_type eq 'contention') {
-    foreach my $vname ('Mutex::Unlock', 'Mutex::UnlockSlow') {
-      $skip{$vname} = 1;
-    }
-  } elsif ($main::profile_type eq 'cpu') {
-    # Drop signal handlers used for CPU profile collection
-    # TODO(dpeng): this should not be necessary; it's taken
-    # care of by the general 2nd-pc mechanism below.
-    foreach my $name ('ProfileData::Add',           # historical
-                      'ProfileData::prof_handler',  # historical
-                      'CpuProfiler::prof_handler',
-                      '__FRAME_END__',
-                      '__pthread_sighandler',
-                      '__restore') {
-      $skip{$name} = 1;
-    }
-  } else {
-    # Nothing skipped for unknown types
-  }
-
-  # Go doesn't have the problem that this heuristic tries to fix.  Disable.
-  if (0 && $main::profile_type eq 'cpu') {
-    # If all the second-youngest program counters are the same,
-    # this STRONGLY suggests that it is an artifact of measurement,
-    # i.e., stack frames pushed by the CPU profiler signal handler.
-    # Hence, we delete them.
-    # (The topmost PC is read from the signal structure, not from
-    # the stack, so it does not get involved.)
-    while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) {
-      my $result = {};
-      my $func = '';
-      if (exists($symbols->{$second_pc})) {
-        $second_pc = $symbols->{$second_pc}->[0];
-      }
-      print STDERR "Removing $second_pc from all stack traces.\n";
-      foreach my $k (keys(%{$profile})) {
-        my $count = $profile->{$k};
-        my @addrs = split(/\n/, $k);
-        splice @addrs, 1, 1;
-        my $reduced_path = join("\n", @addrs);
-        AddEntry($result, $reduced_path, $count);
-      }
-      $profile = $result;
-    }
-  }
-
-  my $result = {};
-  foreach my $k (keys(%{$profile})) {
-    my $count = $profile->{$k};
-    my @addrs = split(/\n/, $k);
-    my @path = ();
-    foreach my $a (@addrs) {
-      if (exists($symbols->{$a})) {
-        my $func = $symbols->{$a}->[0];
-        if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
-          next;
-        }
-      }
-      push(@path, $a);
-    }
-    my $reduced_path = join("\n", @path);
-    AddEntry($result, $reduced_path, $count);
-  }
-  return $result;
-}
-
-# Reduce profile to granularity given by user
-sub ReduceProfile {
-  my $symbols = shift;
-  my $profile = shift;
-  my $result = {};
-  foreach my $k (keys(%{$profile})) {
-    my $count = $profile->{$k};
-    my @translated = TranslateStack($symbols, $k);
-    my @path = ();
-    my %seen = ();
-    $seen{''} = 1;      # So that empty keys are skipped
-    foreach my $e (@translated) {
-      # To avoid double-counting due to recursion, skip a stack-trace
-      # entry if it has already been seen
-      if (!$seen{$e}) {
-        $seen{$e} = 1;
-        push(@path, $e);
-      }
-    }
-    my $reduced_path = join("\n", @path);
-    AddEntry($result, $reduced_path, $count);
-  }
-  return $result;
-}
-
-# Does the specified symbol array match the regexp?
-sub SymbolMatches {
-  my $sym = shift;
-  my $re = shift;
-  if (defined($sym)) {
-    for (my $i = 0; $i < $#{$sym}; $i += 3) {
-      if ($sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/) {
-        return 1;
-      }
-    }
-  }
-  return 0;
-}
-
-# Focus only on paths involving specified regexps
-sub FocusProfile {
-  my $symbols = shift;
-  my $profile = shift;
-  my $focus = shift;
-  my $result = {};
-  foreach my $k (keys(%{$profile})) {
-    my $count = $profile->{$k};
-    my @addrs = split(/\n/, $k);
-    foreach my $a (@addrs) {
-      # Reply if it matches either the address/shortname/fileline
-      if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) {
-        AddEntry($result, $k, $count);
-        last;
-      }
-    }
-  }
-  return $result;
-}
-
-# Focus only on paths not involving specified regexps
-sub IgnoreProfile {
-  my $symbols = shift;
-  my $profile = shift;
-  my $ignore = shift;
-  my $result = {};
-  foreach my $k (keys(%{$profile})) {
-    my $count = $profile->{$k};
-    my @addrs = split(/\n/, $k);
-    my $matched = 0;
-    foreach my $a (@addrs) {
-      # Reply if it matches either the address/shortname/fileline
-      if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) {
-        $matched = 1;
-        last;
-      }
-    }
-    if (!$matched) {
-      AddEntry($result, $k, $count);
-    }
-  }
-  return $result;
-}
-
-# Get total count in profile
-sub TotalProfile {
-  my $profile = shift;
-  my $result = 0;
-  foreach my $k (keys(%{$profile})) {
-    $result += $profile->{$k};
-  }
-  return $result;
-}
-
-# Add A to B
-sub AddProfile {
-  my $A = shift;
-  my $B = shift;
-
-  my $R = {};
-  # add all keys in A
-  foreach my $k (keys(%{$A})) {
-    my $v = $A->{$k};
-    AddEntry($R, $k, $v);
-  }
-  # add all keys in B
-  foreach my $k (keys(%{$B})) {
-    my $v = $B->{$k};
-    AddEntry($R, $k, $v);
-  }
-  return $R;
-}
-
-# Merges symbol maps
-sub MergeSymbols {
-  my $A = shift;
-  my $B = shift;
-
-  my $R = {};
-  foreach my $k (keys(%{$A})) {
-    $R->{$k} = $A->{$k};
-  }
-  if (defined($B)) {
-    foreach my $k (keys(%{$B})) {
-      $R->{$k} = $B->{$k};
-    }
-  }
-  return $R;
-}
-
-
-# Add A to B
-sub AddPcs {
-  my $A = shift;
-  my $B = shift;
-
-  my $R = {};
-  # add all keys in A
-  foreach my $k (keys(%{$A})) {
-    $R->{$k} = 1
-  }
-  # add all keys in B
-  foreach my $k (keys(%{$B})) {
-    $R->{$k} = 1
-  }
-  return $R;
-}
-
-# Subtract B from A
-sub SubtractProfile {
-  my $A = shift;
-  my $B = shift;
-
-  my $R = {};
-  foreach my $k (keys(%{$A})) {
-    my $v = $A->{$k} - GetEntry($B, $k);
-    if ($v < 0 && $main::opt_drop_negative) {
-      $v = 0;
-    }
-    AddEntry($R, $k, $v);
-  }
-  if (!$main::opt_drop_negative) {
-    # Take care of when subtracted profile has more entries
-    foreach my $k (keys(%{$B})) {
-      if (!exists($A->{$k})) {
-        AddEntry($R, $k, 0 - $B->{$k});
-      }
-    }
-  }
-  return $R;
-}
-
-# Get entry from profile; zero if not present
-sub GetEntry {
-  my $profile = shift;
-  my $k = shift;
-  if (exists($profile->{$k})) {
-    return $profile->{$k};
-  } else {
-    return 0;
-  }
-}
-
-# Add entry to specified profile
-sub AddEntry {
-  my $profile = shift;
-  my $k = shift;
-  my $n = shift;
-  if (!exists($profile->{$k})) {
-    $profile->{$k} = 0;
-  }
-  $profile->{$k} += $n;
-}
-
-# Add a stack of entries to specified profile, and add them to the $pcs
-# list.
-sub AddEntries {
-  my $profile = shift;
-  my $pcs = shift;
-  my $stack = shift;
-  my $count = shift;
-  my @k = ();
-
-  foreach my $e (split(/\s+/, $stack)) {
-    my $pc = HexExtend($e);
-    $pcs->{$pc} = 1;
-    push @k, $pc;
-  }
-  AddEntry($profile, (join "\n", @k), $count);
-}
-
-sub IsSymbolizedProfileFile {
-  my $file_name = shift;
-
-  if (!(-e $file_name) || !(-r $file_name)) {
-    return 0;
-  }
-
-  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
-  my $symbol_marker = $&;
-  # Check if the file contains a symbol-section marker.
-  open(TFILE, "<$file_name");
-  my @lines = <TFILE>;
-  my $result = grep(/^--- *$symbol_marker/, @lines);
-  close(TFILE);
-  return $result > 0;
-}
-
-##### Code to profile a server dynamically #####
-
-sub CheckSymbolPage {
-  my $url = SymbolPageURL();
-print STDERR "Read $url\n";
-
-  my $line = FetchHTTP($url);
-  $line =~ s/\r//g;         # turn windows-looking lines into unix-looking lines
-  unless (defined($line)) {
-    error("$url doesn't exist\n");
-  }
-
-  if ($line =~ /^num_symbols:\s+(\d+)$/) {
-    if ($1 == 0) {
-      error("Stripped binary. No symbols available.\n");
-    }
-  } else {
-    error("Failed to get the number of symbols from $url\n");
-  }
-}
-
-sub IsProfileURL {
-  my $profile_name = shift;
-  my ($scheme, $host, $port, $prefix, $path) = ParseProfileURL($profile_name);
-  return defined($host) and defined($port) and defined($path);
-}
-
-sub ParseProfileURL {
-  my $profile_name = shift;
-  if (defined($profile_name) &&
-      $profile_name =~ m,^(?:(https?)://|)([^/:]+):(\d+)(|\@\d+)(|/|(.*?)($PROFILE_PAGE|$PMUPROFILE_PAGE|$HEAP_PAGE|$GROWTH_PAGE|$THREAD_PAGE|$BLOCK_PAGE|$CONTENTION_PAGE|$WALL_PAGE|$FILTEREDPROFILE_PAGE))$,o) {
-    # $7 is $PROFILE_PAGE/$HEAP_PAGE/etc.  $5 is *everything* after
-    # the hostname, as long as that everything is the empty string,
-    # a slash, or something ending in $PROFILE_PAGE/$HEAP_PAGE/etc.
-    # So "$7 || $5" is $PROFILE_PAGE/etc if there, or else it's "/" or "".
-    return ($1 || "http", $2, $3, $6, $7 || $5);
-  }
-  return ();
-}
-
-# We fetch symbols from the first profile argument.
-sub SymbolPageURL {
-  my ($scheme, $host, $port, $prefix, $path) = ParseProfileURL($main::pfile_args[0]);
-  return "$scheme://$host:$port$prefix$SYMBOL_PAGE";
-}
-
-sub FetchProgramName() {
-  my ($scheme, $host, $port, $prefix, $path) = ParseProfileURL($main::pfile_args[0]);
-  my $url = "$scheme://$host:$port$prefix$PROGRAM_NAME_PAGE";
-  
-  my $cmdline = FetchHTTP($url);
-  $cmdline =~ s/\n.*//s; # first line only
-  $cmdline =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
-  error("Failed to get program name from $url\n") unless defined($cmdline);
-  $cmdline =~ s/\x00.+//;  # Remove argv[1] and latters.
-  $cmdline =~ s!\n!!g;  # Remove LFs.
-  return $cmdline;
-}
-
-# Reads a symbol map from the file handle name given as $1, returning
-# the resulting symbol map.  Also processes variables relating to symbols.
-# Currently, the only variable processed is 'binary=<value>' which updates
-# $main::prog to have the correct program name.
-sub ReadSymbols {
-  my $in = shift;
-  my $map = shift;
-  while (<$in>) {
-    s/\r//g;         # turn windows-looking lines into unix-looking lines
-    # Removes all the leading zeroes from the symbols, see comment below.
-    if (m/^0x0*([0-9a-f]+)\s+(.+)/) {
-      $map->{$1} = $2;
-    } elsif (m/^---/) {
-      last;
-    } elsif (m/^([a-z][^=]*)=(.*)$/ ) {
-      my ($variable, $value) = ($1, $2);
-      for ($variable, $value) {
-        s/^\s+//;
-        s/\s+$//;
-      }
-      if ($variable eq "binary") {
-        if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) {
-          printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n",
-                         $main::prog, $value);
-        }
-        $main::prog = $value;
-      } else {
-        printf STDERR ("Ignoring unknown variable in symbols list: " .
-            "'%s' = '%s'\n", $variable, $value);
-      }
-    }
-  }
-  return $map;
-}
-
-# Fetches and processes symbols to prepare them for use in the profile output
-# code.  If the optional 'symbol_map' arg is not given, fetches symbols from
-# $SYMBOL_PAGE for all PC values found in profile.  Otherwise, the raw symbols
-# are assumed to have already been fetched into 'symbol_map' and are simply
-# extracted and processed.
-sub FetchSymbols {
-  my $pcset = shift;
-  my $symbol_map = shift;
-
-  my %seen = ();
-  my @pcs = grep { !$seen{$_}++ } keys(%$pcset);  # uniq
-
-  if (!defined($symbol_map)) {
-    $symbol_map = {};
-
-    my $post_data = join("+", sort((map {"0x" . "$_"} @pcs)));
-    my $url = SymbolPageURL();
-    my $content = PostHTTP($url, $post_data);
-    
-    my $tmp_symbol = File::Temp->new()->filename;
-    open(SYMBOL, ">$tmp_symbol");
-    print SYMBOL $content;
-    close(SYMBOL);
-    
-    open(SYMBOL, "<$tmp_symbol") || error("$tmp_symbol");
-    ReadSymbols(*SYMBOL{IO}, $symbol_map);
-    close(SYMBOL);
-  }
-
-  my $symbols = {};
-  foreach my $pc (@pcs) {
-    my $fullname;
-    # For 64 bits binaries, symbols are extracted with 8 leading zeroes.
-    # Then /symbol reads the long symbols in as uint64, and outputs
-    # the result with a "0x%08llx" format which get rid of the zeroes.
-    # By removing all the leading zeroes in both $pc and the symbols from
-    # /symbol, the symbols match and are retrievable from the map.
-    my $shortpc = $pc;
-    $shortpc =~ s/^0*//;
-    # Each line may have a list of names, which includes the function
-    # and also other functions it has inlined.  They are separated
-    # (in PrintSymbolizedFile), by --, which is illegal in function names.
-    my $fullnames;
-    if (defined($symbol_map->{$shortpc})) {
-      $fullnames = $symbol_map->{$shortpc};
-    } else {
-      $fullnames = "0x" . $pc;  # Just use addresses
-    }
-    my $sym = [];
-    $symbols->{$pc} = $sym;
-    foreach my $fullname (split("--", $fullnames)) {
-      my $name = ShortFunctionName($fullname);
-      push(@{$sym}, $name, "?", $fullname);
-    }
-  }
-  return $symbols;
-}
-
-sub BaseName {
-  my $file_name = shift;
-  $file_name =~ s!^.*/!!;  # Remove directory name
-  return $file_name;
-}
-
-sub MakeProfileBaseName {
-  my ($binary_name, $profile_name) = @_;
-  my ($scheme, $host, $port, $prefix, $path) = ParseProfileURL($profile_name);
-  my $binary_shortname = BaseName($binary_name);
-  return sprintf("%s.%s.%s-port%s",
-                 $binary_shortname, $main::op_time, $host, $port);
-}
-
-sub FetchDynamicProfile {
-  my $binary_name = shift;
-  my $profile_name = shift;
-  my $fetch_name_only = shift;
-  my $encourage_patience = shift;
-
-  if (!IsProfileURL($profile_name)) {
-    return $profile_name;
-  } else {
-    my ($scheme, $host, $port, $prefix, $path) = ParseProfileURL($profile_name);
-    if ($path eq "" || $path eq "/") {
-      # Missing type specifier defaults to cpu-profile
-      $path = $PROFILE_PAGE;
-    }
-
-    my $profile_file = MakeProfileBaseName($binary_name, $profile_name);
-
-    my $url;
-    my $timeout;
-    if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)) {
-      if ($path =~ m/$PROFILE_PAGE/) {
-        $url = sprintf("$scheme://$host:$port$prefix$path?seconds=%d",
-            $main::opt_seconds);
-      } else {
-        if ($profile_name =~ m/[?]/) {
-          $profile_name .= "&"
-        } else {
-          $profile_name .= "?"
-        }
-        $url = sprintf("$scheme://$profile_name" . "seconds=%d",
-            $main::opt_seconds);
-      }
-      $timeout = int($main::opt_seconds * 1.01 + 60);
-    } else {
-      # For non-CPU profiles, we add a type-extension to
-      # the target profile file name.
-      my $suffix = $path;
-      $suffix =~ s,/,.,g;
-      $profile_file .= "$suffix";
-      $url = "$scheme://$host:$port$prefix$path";
-    }
-
-    my $tmp_profile = File::Temp->new()->filename;
-    my $real_profile = File::Temp->new()->filename;
-
-    if ($fetch_name_only > 0) {
-      return $real_profile;
-    }
-
-    if (($path =~ m/$PROFILE_PAGE/) || ($path =~ m/$PMUPROFILE_PAGE/)){
-      print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n  ${real_profile}\n";
-      if ($encourage_patience) {
-        print STDERR "Be patient...\n";
-      }
-    } else {
-      print STDERR "Fetching $path profile from $host:$port to\n  ${real_profile}\n";
-    }
-
-    my $content = FetchHTTP($url, $timeout);
-    
-    open(OUTFILE, ">$tmp_profile");
-    binmode(OUTFILE);
-    print OUTFILE $content;
-    close(OUTFILE);
-    
-    my $line = $content;
-    $line !~ /^Could not enable CPU profiling/ || error($line);
-    
-    copy($tmp_profile, $real_profile) || error("Unable to copy profile\n");
-    print STDERR "Wrote profile to $real_profile\n";
-    $main::collected_profile = $real_profile;
-    return $main::collected_profile;
-  }
-}
-
-# Collect profiles in parallel
-sub FetchDynamicProfiles {
-  my $items = scalar(@main::pfile_args);
-  my $levels = log($items) / log(2);
-
-  if ($items == 1) {
-    $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1);
-  } else {
-    # math rounding issues
-    if ((2 ** $levels) < $items) {
-     $levels++;
-    }
-    my $count = scalar(@main::pfile_args);
-    for (my $i = 0; $i < $count; $i++) {
-      $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0);
-    }
-    print STDERR "Fetching $count profiles, Be patient...\n";
-    FetchDynamicProfilesRecurse($levels, 0, 0);
-    $main::collected_profile = join(" \\\n    ", @main::profile_files);
-  }
-}
-
-# Recursively fork a process to get enough processes
-# collecting profiles
-sub FetchDynamicProfilesRecurse {
-  my $maxlevel = shift;
-  my $level = shift;
-  my $position = shift;
-
-  if (my $pid = fork()) {
-    $position = 0 | ($position << 1);
-    TryCollectProfile($maxlevel, $level, $position);
-    wait;
-  } else {
-    $position = 1 | ($position << 1);
-    TryCollectProfile($maxlevel, $level, $position);
-    exit(0);
-  }
-}
-
-# Collect a single profile
-sub TryCollectProfile {
-  my $maxlevel = shift;
-  my $level = shift;
-  my $position = shift;
-
-  if ($level >= ($maxlevel - 1)) {
-    if ($position < scalar(@main::pfile_args)) {
-      FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0);
-    }
-  } else {
-    FetchDynamicProfilesRecurse($maxlevel, $level+1, $position);
-  }
-}
-
-##### Parsing code #####
-
-# Provide a small streaming-read module to handle very large
-# cpu-profile files.  Stream in chunks along a sliding window.
-# Provides an interface to get one 'slot', correctly handling
-# endian-ness differences.  A slot is one 32-bit or 64-bit word
-# (depending on the input profile).  We tell endianness and bit-size
-# for the profile by looking at the first 8 bytes: in cpu profiles,
-# the second slot is always 3 (we'll accept anything that's not 0).
-BEGIN {
-  package CpuProfileStream;
-
-  sub new {
-    my ($class, $file, $fname) = @_;
-    my $self = { file        => $file,
-                 base        => 0,
-                 stride      => 512 * 1024,   # must be a multiple of bitsize/8
-                 slots       => [],
-                 unpack_code => "",           # N for big-endian, V for little
-    };
-    bless $self, $class;
-    # Let unittests adjust the stride
-    if ($main::opt_test_stride > 0) {
-      $self->{stride} = $main::opt_test_stride;
-    }
-    # Read the first two slots to figure out bitsize and endianness.
-    my $slots = $self->{slots};
-    my $str;
-    read($self->{file}, $str, 8);
-    # Set the global $address_length based on what we see here.
-    # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars).
-    $address_length = ($str eq (chr(0)x8)) ? 16 : 8;
-    if ($address_length == 8) {
-      if (substr($str, 6, 2) eq chr(0)x2) {
-        $self->{unpack_code} = 'V';  # Little-endian.
-      } elsif (substr($str, 4, 2) eq chr(0)x2) {
-        $self->{unpack_code} = 'N';  # Big-endian
-      } else {
-        ::error("$fname: header size >= 2**16\n");
-      }
-      @$slots = unpack($self->{unpack_code} . "*", $str);
-    } else {
-      # If we're a 64-bit profile, make sure we're a 64-bit-capable
-      # perl.  Otherwise, each slot will be represented as a float
-      # instead of an int64, losing precision and making all the
-      # 64-bit addresses right.  We *could* try to handle this with
-      # software emulation of 64-bit ints, but that's added complexity
-      # for no clear benefit (yet).  We use 'Q' to test for 64-bit-ness;
-      # perl docs say it's only available on 64-bit perl systems.
-      my $has_q = 0;
-      eval { $has_q = pack("Q", "1") ? 1 : 1; };
-      if (!$has_q) {
-        ::error("$fname: need a 64-bit perl to process this 64-bit profile.\n");
-      }
-      read($self->{file}, $str, 8);
-      if (substr($str, 4, 4) eq chr(0)x4) {
-        # We'd love to use 'Q', but it's a) not universal, b) not endian-proof.
-        $self->{unpack_code} = 'V';  # Little-endian.
-      } elsif (substr($str, 0, 4) eq chr(0)x4) {
-        $self->{unpack_code} = 'N';  # Big-endian
-      } else {
-        ::error("$fname: header size >= 2**32\n");
-      }
-      my @pair = unpack($self->{unpack_code} . "*", $str);
-      # Since we know one of the pair is 0, it's fine to just add them.
-      @$slots = (0, $pair[0] + $pair[1]);
-    }
-    return $self;
-  }
-
-  # Load more data when we access slots->get(X) which is not yet in memory.
-  sub overflow {
-    my ($self) = @_;
-    my $slots = $self->{slots};
-    $self->{base} += $#$slots + 1;   # skip over data we're replacing
-    my $str;
-    read($self->{file}, $str, $self->{stride});
-    if ($address_length == 8) {      # the 32-bit case
-      # This is the easy case: unpack provides 32-bit unpacking primitives.
-      @$slots = unpack($self->{unpack_code} . "*", $str);
-    } else {
-      # We need to unpack 32 bits at a time and combine.
-      my @b32_values = unpack($self->{unpack_code} . "*", $str);
-      my @b64_values = ();
-      for (my $i = 0; $i < $#b32_values; $i += 2) {
-        # TODO(csilvers): if this is a 32-bit perl, the math below
-        #    could end up in a too-large int, which perl will promote
-        #    to a double, losing necessary precision.  Deal with that.
-        if ($self->{unpack_code} eq 'V') {    # little-endian
-          push(@b64_values, $b32_values[$i] + $b32_values[$i+1] * (2**32));
-        } else {
-          push(@b64_values, $b32_values[$i] * (2**32) + $b32_values[$i+1]);
-        }
-      }
-      @$slots = @b64_values;
-    }
-  }
-
-  # Access the i-th long in the file (logically), or -1 at EOF.
-  sub get {
-    my ($self, $idx) = @_;
-    my $slots = $self->{slots};
-    while ($#$slots >= 0) {
-      if ($idx < $self->{base}) {
-        # The only time we expect a reference to $slots[$i - something]
-        # after referencing $slots[$i] is reading the very first header.
-        # Since $stride > |header|, that shouldn't cause any lookback
-        # errors.  And everything after the header is sequential.
-        print STDERR "Unexpected look-back reading CPU profile";
-        return -1;   # shrug, don't know what better to return
-      } elsif ($idx > $self->{base} + $#$slots) {
-        $self->overflow();
-      } else {
-        return $slots->[$idx - $self->{base}];
-      }
-    }
-    # If we get here, $slots is [], which means we've reached EOF
-    return -1;  # unique since slots is supposed to hold unsigned numbers
-  }
-}
-
-# Parse profile generated by common/profiler.cc and return a reference
-# to a map:
-#      $result->{version}     Version number of profile file
-#      $result->{period}      Sampling period (in microseconds)
-#      $result->{profile}     Profile object
-#      $result->{map}         Memory map info from profile
-#      $result->{pcs}         Hash of all PC values seen, key is hex address
-sub ReadProfile {
-  my $prog = shift;
-  my $fname = shift;
-
-  if (IsSymbolizedProfileFile($fname) && !$main::use_symbolized_profile) {
-    # we have both a binary and symbolized profiles, abort
-    usage("Symbolized profile '$fname' cannot be used with a binary arg.  " .
-          "Try again without passing '$prog'.");
-  }
-
-  $main::profile_type = '';
-
-  $CONTENTION_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
-  my $contention_marker = $&;
-  $GROWTH_PAGE  =~ m,[^/]+$,;    # matches everything after the last slash
-  my $growth_marker = $&;
-  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
-  my $symbol_marker = $&;
-  $PROFILE_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
-  my $profile_marker = $&;
-
-  # Look at first line to see if it is a heap or a CPU profile.
-  # CPU profile may start with no header at all, and just binary data
-  # (starting with \0\0\0\0) -- in that case, don't try to read the
-  # whole firstline, since it may be gigabytes(!) of data.
-  open(PROFILE, "<$fname") || error("$fname: $!\n");
-  binmode PROFILE;      # New perls do UTF-8 processing
-  my $firstchar = "";
-  my $header = "";
-  read(PROFILE, $firstchar, 1);
-  seek(PROFILE, -1, 1);          # unread the firstchar
-  if ($firstchar ne "\0") {
-    $header = <PROFILE>;
-    if (!defined($header)) {
-      error("Profile is empty.\n");
-    }
-    $header =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
-  }
-
-  my $symbols;
-  if ($header =~ m/^--- *$symbol_marker/o) {
-    # read the symbol section of the symbolized profile file
-    $symbols = ReadSymbols(*PROFILE{IO});
-
-    # read the next line to get the header for the remaining profile
-    $header = "";
-    read(PROFILE, $firstchar, 1);
-    seek(PROFILE, -1, 1);          # unread the firstchar
-    if ($firstchar ne "\0") {
-      $header = <PROFILE>;
-      $header =~ s/\r//g;
-    }
-  }
-
-  my $result;
-
-  if ($header =~ m/^heap profile:.*$growth_marker/o) {
-    $main::profile_type = 'growth';
-    $result =  ReadHeapProfile($prog, $fname, $header);
-  } elsif ($header =~ m/^heap profile:/) {
-    $main::profile_type = 'heap';
-    $result =  ReadHeapProfile($prog, $fname, $header);
-  } elsif ($header =~ m/^--- *$contention_marker/o) {
-    $main::profile_type = 'contention';
-    $result = ReadSynchProfile($prog, $fname);
-  } elsif ($header =~ m/^--- *Stacks:/) {
-    print STDERR
-      "Old format contention profile: mistakenly reports " .
-      "condition variable signals as lock contentions.\n";
-    $main::profile_type = 'contention';
-    $result = ReadSynchProfile($prog, $fname);
-  } elsif ($header =~ m/^thread creation profile:/) {
-    $main::profile_type = 'thread';
-    $result = ReadThreadProfile($prog, $fname);
-  } elsif ($header =~ m/^--- *$profile_marker/) {
-    # the binary cpu profile data starts immediately after this line
-    $main::profile_type = 'cpu';
-    $result = ReadCPUProfile($prog, $fname);
-  } else {
-    if (defined($symbols)) {
-      # a symbolized profile contains a format we don't recognize, bail out
-      error("$fname: Cannot recognize profile section after symbols.\n");
-    }
-    # no ascii header present -- must be a CPU profile
-    $main::profile_type = 'cpu';
-    $result = ReadCPUProfile($prog, $fname);
-  }
-
-  # if we got symbols along with the profile, return those as well
-  if (defined($symbols)) {
-    $result->{symbols} = $symbols;
-  }
-
-  return $result;
-}
-
-# Subtract one from caller pc so we map back to call instr.
-# However, don't do this if we're reading a symbolized profile
-# file, in which case the subtract-one was done when the file
-# was written.
-#
-# We apply the same logic to all readers, though ReadCPUProfile uses an
-# independent implementation.
-sub FixCallerAddresses {
-  my $stack = shift;
-  if ($main::use_symbolized_profile) {
-    return $stack;
-  } else {
-    $stack =~ /(\s)/;
-    my $delimiter = $1;
-    my @addrs = split(' ', $stack);
-    my @fixedaddrs;
-    $#fixedaddrs = $#addrs;
-    if ($#addrs >= 0) {
-      $fixedaddrs[0] = $addrs[0];
-    }
-    for (my $i = 1; $i <= $#addrs; $i++) {
-      $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1");
-    }
-    return join $delimiter, @fixedaddrs;
-  }
-}
-
-# CPU profile reader
-sub ReadCPUProfile {
-  my $prog = shift;
-  my $fname = shift;
-  my $version;
-  my $period;
-  my $i;
-  my $profile = {};
-  my $pcs = {};
-
-  # Parse string into array of slots.
-  my $slots = CpuProfileStream->new(*PROFILE, $fname);
-
-  # Read header.  The current header version is a 5-element structure
-  # containing:
-  #   0: header count (always 0)
-  #   1: header "words" (after this one: 3)
-  #   2: format version (0)
-  #   3: sampling period (usec)
-  #   4: unused padding (always 0)
-  if ($slots->get(0) != 0 ) {
-    error("$fname: not a profile file, or old format profile file\n");
-  }
-  $i = 2 + $slots->get(1);
-  $version = $slots->get(2);
-  $period = $slots->get(3);
-  # Do some sanity checking on these header values.
-  if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) {
-    error("$fname: not a profile file, or corrupted profile file\n");
-  }
-
-  # Parse profile
-  while ($slots->get($i) != -1) {
-    my $n = $slots->get($i++);
-    my $d = $slots->get($i++);
-    if ($d > (2**16)) {  # TODO(csilvers): what's a reasonable max-stack-depth?
-      my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8));
-      print STDERR "At index $i (address $addr):\n";
-      error("$fname: stack trace depth >= 2**32\n");
-    }
-    if ($slots->get($i) == 0) {
-      # End of profile data marker
-      $i += $d;
-      last;
-    }
-
-    # Make key out of the stack entries
-    my @k = ();
-    for (my $j = 0; $j < $d; $j++) {
-      my $pc = $slots->get($i+$j);
-      # Subtract one from caller pc so we map back to call instr.
-      # However, don't do this if we're reading a symbolized profile
-      # file, in which case the subtract-one was done when the file
-      # was written.
-      if ($j > 0 && !$main::use_symbolized_profile) {
-        $pc--;
-      }
-      $pc = sprintf("%0*x", $address_length, $pc);
-      $pcs->{$pc} = 1;
-      push @k, $pc;
-    }
-
-    AddEntry($profile, (join "\n", @k), $n);
-    $i += $d;
-  }
-
-  # Parse map
-  my $map = '';
-  seek(PROFILE, $i * 4, 0);
-  read(PROFILE, $map, (stat PROFILE)[7]);
-  close(PROFILE);
-
-  my $r = {};
-  $r->{version} = $version;
-  $r->{period} = $period;
-  $r->{profile} = $profile;
-  $r->{libs} = ParseLibraries($prog, $map, $pcs);
-  $r->{pcs} = $pcs;
-
-  return $r;
-}
-
-sub ReadHeapProfile {
-  my $prog = shift;
-  my $fname = shift;
-  my $header = shift;
-
-  my $index = 1;
-  if ($main::opt_inuse_space) {
-    $index = 1;
-  } elsif ($main::opt_inuse_objects) {
-    $index = 0;
-  } elsif ($main::opt_alloc_space) {
-    $index = 3;
-  } elsif ($main::opt_alloc_objects) {
-    $index = 2;
-  }
-
-  # Find the type of this profile.  The header line looks like:
-  #    heap profile:   1246:  8800744 [  1246:  8800744] @ <heap-url>/266053
-  # There are two pairs <count: size>, the first inuse objects/space, and the
-  # second allocated objects/space.  This is followed optionally by a profile
-  # type, and if that is present, optionally by a sampling frequency.
-  # For remote heap profiles (v1):
-  # The interpretation of the sampling frequency is that the profiler, for
-  # each sample, calculates a uniformly distributed random integer less than
-  # the given value, and records the next sample after that many bytes have
-  # been allocated.  Therefore, the expected sample interval is half of the
-  # given frequency.  By default, if not specified, the expected sample
-  # interval is 128KB.  Only remote-heap-page profiles are adjusted for
-  # sample size.
-  # For remote heap profiles (v2):
-  # The sampling frequency is the rate of a Poisson process. This means that
-  # the probability of sampling an allocation of size X with sampling rate Y
-  # is 1 - exp(-X/Y)
-  # For version 2, a typical header line might look like this:
-  # heap profile:   1922: 127792360 [  1922: 127792360] @ <heap-url>_v2/524288
-  # the trailing number (524288) is the sampling rate. (Version 1 showed
-  # double the 'rate' here)
-  my $sampling_algorithm = 0;
-  my $sample_adjustment = 0;
-  chomp($header);
-  my $type = "unknown";
-  if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") {
-    if (defined($6) && ($6 ne '')) {
-      $type = $6;
-      my $sample_period = $8;
-      # $type is "heapprofile" for profiles generated by the
-      # heap-profiler, and either "heap" or "heap_v2" for profiles
-      # generated by sampling directly within tcmalloc.  It can also
-      # be "growth" for heap-growth profiles.  The first is typically
-      # found for profiles generated locally, and the others for
-      # remote profiles.
-      if (($type eq "heapprofile") || ($type !~ /heap/) ) {
-        # No need to adjust for the sampling rate with heap-profiler-derived data
-        $sampling_algorithm = 0;
-      } elsif ($type =~ /_v2/) {
-        $sampling_algorithm = 2;     # version 2 sampling
-        if (defined($sample_period) && ($sample_period ne '')) {
-          $sample_adjustment = int($sample_period);
-        }
-      } else {
-        $sampling_algorithm = 1;     # version 1 sampling
-        if (defined($sample_period) && ($sample_period ne '')) {
-          $sample_adjustment = int($sample_period)/2;
-        }
-      }
-    } else {
-      # We detect whether or not this is a remote-heap profile by checking
-      # that the total-allocated stats ($n2,$s2) are exactly the
-      # same as the in-use stats ($n1,$s1).  It is remotely conceivable
-      # that a non-remote-heap profile may pass this check, but it is hard
-      # to imagine how that could happen.
-      # In this case it's so old it's guaranteed to be remote-heap version 1.
-      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
-      if (($n1 == $n2) && ($s1 == $s2)) {
-        # This is likely to be a remote-heap based sample profile
-        $sampling_algorithm = 1;
-      }
-    }
-  }
-
-  if ($sampling_algorithm > 0) {
-    # For remote-heap generated profiles, adjust the counts and sizes to
-    # account for the sample rate (we sample once every 128KB by default).
-    if ($sample_adjustment == 0) {
-      # Turn on profile adjustment.
-      $sample_adjustment = 128*1024;
-      print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n";
-    } else {
-      printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n",
-                     $sample_adjustment);
-    }
-    if ($sampling_algorithm > 1) {
-      # We don't bother printing anything for the original version (version 1)
-      printf STDERR "Heap version $sampling_algorithm\n";
-    }
-  }
-
-  my $profile = {};
-  my $pcs = {};
-  my $map = "";
-
-  while (<PROFILE>) {
-    s/\r//g;         # turn windows-looking lines into unix-looking lines
-    if (/^MAPPED_LIBRARIES:/) {
-      # Read the /proc/self/maps data
-      while (<PROFILE>) {
-        s/\r//g;         # turn windows-looking lines into unix-looking lines
-        $map .= $_;
-      }
-      last;
-    }
-
-    if (/^--- Memory map:/) {
-      # Read /proc/self/maps data as formatted by DumpAddressMap()
-      my $buildvar = "";
-      while (<PROFILE>) {
-        s/\r//g;         # turn windows-looking lines into unix-looking lines
-        # Parse "build=<dir>" specification if supplied
-        if (m/^\s*build=(.*)\n/) {
-          $buildvar = $1;
-        }
-
-        # Expand "$build" variable if available
-        $_ =~ s/\$build\b/$buildvar/g;
-
-        $map .= $_;
-      }
-      last;
-    }
-
-    # Read entry of the form:
-    #  <count1>: <bytes1> [<count2>: <bytes2>] @ a1 a2 a3 ... an
-    s/^\s*//;
-    s/\s*$//;
-    if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) {
-      my $stack = $5;
-      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
-
-      if ($sample_adjustment) {
-        if ($sampling_algorithm == 2) {
-          # Remote-heap version 2
-          # The sampling frequency is the rate of a Poisson process.
-          # This means that the probability of sampling an allocation of
-          # size X with sampling rate Y is 1 - exp(-X/Y)
-          my $ratio;
-          $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
-          my $scale_factor;
-          $scale_factor = 1/(1 - exp(-$ratio));
-          $n1 *= $scale_factor;
-          $s1 *= $scale_factor;
-          $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
-          $scale_factor = 1/(1 - exp(-$ratio));
-          $n2 *= $scale_factor;
-          $s2 *= $scale_factor;
-        } else {
-          # Remote-heap version 1
-          my $ratio;
-          if ($n1 > 0) {
-            $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
-            if ($ratio < 1) {
-                $n1 /= $ratio;
-                $s1 /= $ratio;
-            }
-          }
-          if ($n2 > 0) {
-            $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
-            if ($ratio < 1) {
-                $n2 /= $ratio;
-                $s2 /= $ratio;
-            }
-          }
-        }
-      }
-
-      my @counts = ($n1, $s1, $n2, $s2);
-      AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
-    }
-  }
-
-  my $r = {};
-  $r->{version} = "heap";
-  $r->{period} = 1;
-  $r->{profile} = $profile;
-  $r->{libs} = ParseLibraries($prog, $map, $pcs);
-  $r->{pcs} = $pcs;
-  return $r;
-}
-
-sub ReadThreadProfile {
-  my $prog = shift;
-  my $fname = shift;
-
-  my $profile = {};
-  my $pcs = {};
-  my $map = "";
-
-  while (<PROFILE>) {
-    s/\r//g;         # turn windows-looking lines into unix-looking lines
-    if (/^MAPPED_LIBRARIES:/) {
-      # Read the /proc/self/maps data
-      while (<PROFILE>) {
-        s/\r//g;         # turn windows-looking lines into unix-looking lines
-        $map .= $_;
-      }
-      last;
-    }
-
-    if (/^--- Memory map:/) {
-      # Read /proc/self/maps data as formatted by DumpAddressMap()
-      my $buildvar = "";
-      while (<PROFILE>) {
-        s/\r//g;         # turn windows-looking lines into unix-looking lines
-        # Parse "build=<dir>" specification if supplied
-        if (m/^\s*build=(.*)\n/) {
-          $buildvar = $1;
-        }
-
-        # Expand "$build" variable if available
-        $_ =~ s/\$build\b/$buildvar/g;
-
-        $map .= $_;
-      }
-      last;
-    }
-
-    # Read entry of the form:
-    #  @ a1 a2 a3 ... an
-    s/^\s*//;
-    s/\s*$//;
-    if (m/^@\s+(.*)$/) {
-      AddEntries($profile, $pcs, FixCallerAddresses($1), 1);
-    }
-  }
-
-  my $r = {};
-  $r->{version} = "thread";
-  $r->{period} = 1;
-  $r->{profile} = $profile;
-  $r->{libs} = ParseLibraries($prog, $map, $pcs);
-  $r->{pcs} = $pcs;
-  return $r;
-}
-
-sub ReadSynchProfile {
-  my ($prog, $fname, $header) = @_;
-
-  my $map = '';
-  my $profile = {};
-  my $pcs = {};
-  my $sampling_period = 1;
-  my $cyclespernanosec = 2.8;   # Default assumption for old binaries
-  my $seen_clockrate = 0;
-  my $line;
-
-  my $index = 0;
-  if ($main::opt_total_delay) {
-    $index = 0;
-  } elsif ($main::opt_contentions) {
-    $index = 1;
-  } elsif ($main::opt_mean_delay) {
-    $index = 2;
-  }
-
-  while ( $line = <PROFILE> ) {
-    $line =~ s/\r//g;      # turn windows-looking lines into unix-looking lines
-    if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) {
-      my ($cycles, $count, $stack) = ($1, $2, $3);
-
-      # Convert cycles to nanoseconds
-      $cycles /= $cyclespernanosec;
-
-      # Adjust for sampling done by application
-      $cycles *= $sampling_period;
-      $count *= $sampling_period;
-
-      my @values = ($cycles, $count, $cycles / $count);
-      AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]);
-
-    } elsif ( $line =~ /^(slow release).*thread \d+  \@\s*(.*?)\s*$/ ||
-              $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
-      my ($cycles, $stack) = ($1, $2);
-      if ($cycles !~ /^\d+$/) {
-        next;
-      }
-
-      # Convert cycles to nanoseconds
-      $cycles /= $cyclespernanosec;
-
-      # Adjust for sampling done by application
-      $cycles *= $sampling_period;
-
-      AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles);
-
-    } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) {
-      my ($variable, $value) = ($1,$2);
-      for ($variable, $value) {
-        s/^\s+//;
-        s/\s+$//;
-      }
-      if ($variable eq "cycles/second") {
-        $cyclespernanosec = $value / 1e9;
-        $seen_clockrate = 1;
-      } elsif ($variable eq "sampling period") {
-        $sampling_period = $value;
-      } elsif ($variable eq "ms since reset") {
-        # Currently nothing is done with this value in pprof
-        # So we just silently ignore it for now
-      } elsif ($variable eq "discarded samples") {
-        # Currently nothing is done with this value in pprof
-        # So we just silently ignore it for now
-      } else {
-        printf STDERR ("Ignoring unnknown variable in /contention output: " .
-                       "'%s' = '%s'\n",$variable,$value);
-      }
-    } else {
-      # Memory map entry
-      $map .= $line;
-    }
-  }
-  close PROFILE;
-
-  if (!$seen_clockrate) {
-    printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n",
-                   $cyclespernanosec);
-  }
-
-  my $r = {};
-  $r->{version} = 0;
-  $r->{period} = $sampling_period;
-  $r->{profile} = $profile;
-  $r->{libs} = ParseLibraries($prog, $map, $pcs);
-  $r->{pcs} = $pcs;
-  return $r;
-}
-
-# Given a hex value in the form "0x1abcd" return "0001abcd" or
-# "000000000001abcd", depending on the current address length.
-# There's probably a more idiomatic (or faster) way to do this...
-sub HexExtend {
-  my $addr = shift;
-
-  $addr =~ s/^0x//;
-
-  if (length $addr > $address_length) {
-    printf STDERR "Warning:  address $addr is longer than address length $address_length\n";
-  }
-
-  return substr("000000000000000".$addr, -$address_length);
-}
-
-##### Symbol extraction #####
-
-# Aggressively search the lib_prefix values for the given library
-# If all else fails, just return the name of the library unmodified.
-# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so"
-# it will search the following locations in this order, until it finds a file:
-#   /my/path/lib/dir/mylib.so
-#   /other/path/lib/dir/mylib.so
-#   /my/path/dir/mylib.so
-#   /other/path/dir/mylib.so
-#   /my/path/mylib.so
-#   /other/path/mylib.so
-#   /lib/dir/mylib.so              (returned as last resort)
-sub FindLibrary {
-  my $file = shift;
-  my $suffix = $file;
-
-  # Search for the library as described above
-  do {
-    foreach my $prefix (@prefix_list) {
-      my $fullpath = $prefix . $suffix;
-      if (-e $fullpath) {
-        return $fullpath;
-      }
-    }
-  } while ($suffix =~ s|^/[^/]+/|/|);
-  return $file;
-}
-
-# Return path to library with debugging symbols.
-# For libc libraries, the copy in /usr/lib/debug contains debugging symbols
-sub DebuggingLibrary {
-  my $file = shift;
-  if ($file =~ m|^/| && -f "/usr/lib/debug$file") {
-    return "/usr/lib/debug$file";
-  }
-  return undef;
-}
-
-# Parse text section header of a library using objdump
-sub ParseTextSectionHeaderFromObjdump {
-  my $lib = shift;
-
-  my $size = undef;
-  my $vma;
-  my $file_offset;
-  # Get objdump output from the library file to figure out how to
-  # map between mapped addresses and addresses in the library.
-  my $objdump = $obj_tool_map{"objdump"};
-  open(OBJDUMP, "$objdump -h $lib |")
-                || error("$objdump $lib: $!\n");
-  while (<OBJDUMP>) {
-    s/\r//g;         # turn windows-looking lines into unix-looking lines
-    # Idx Name          Size      VMA       LMA       File off  Algn
-    #  10 .text         00104b2c  420156f0  420156f0  000156f0  2**4
-    # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file
-    # offset may still be 8.  But AddressSub below will still handle that.
-    my @x = split;
-    if (($#x >= 6) && ($x[1] eq '.text')) {
-      $size = $x[2];
-      $vma = $x[3];
-      $file_offset = $x[5];
-      last;
-    }
-  }
-  close(OBJDUMP);
-
-  if (!defined($size)) {
-    return undef;
-  }
-
-  my $r = {};
-  $r->{size} = $size;
-  $r->{vma} = $vma;
-  $r->{file_offset} = $file_offset;
-
-  return $r;
-}
-
-# Parse text section header of a library using otool (on OS X)
-sub ParseTextSectionHeaderFromOtool {
-  my $lib = shift;
-
-  my $size = undef;
-  my $vma = undef;
-  my $file_offset = undef;
-  # Get otool output from the library file to figure out how to
-  # map between mapped addresses and addresses in the library.
-  my $otool = $obj_tool_map{"otool"};
-  open(OTOOL, "$otool -l $lib |")
-                || error("$otool $lib: $!\n");
-  my $cmd = "";
-  my $sectname = "";
-  my $segname = "";
-  foreach my $line (<OTOOL>) {
-    $line =~ s/\r//g;      # turn windows-looking lines into unix-looking lines
-    # Load command <#>
-    #       cmd LC_SEGMENT
-    # [...]
-    # Section
-    #   sectname __text
-    #    segname __TEXT
-    #       addr 0x000009f8
-    #       size 0x00018b9e
-    #     offset 2552
-    #      align 2^2 (4)
-    # We will need to strip off the leading 0x from the hex addresses,
-    # and convert the offset into hex.
-    if ($line =~ /Load command/) {
-      $cmd = "";
-      $sectname = "";
-      $segname = "";
-    } elsif ($line =~ /Section/) {
-      $sectname = "";
-      $segname = "";
-    } elsif ($line =~ /cmd (\w+)/) {
-      $cmd = $1;
-    } elsif ($line =~ /sectname (\w+)/) {
-      $sectname = $1;
-    } elsif ($line =~ /segname (\w+)/) {
-      $segname = $1;
-    } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") &&
-               $sectname eq "__text" &&
-               $segname eq "__TEXT")) {
-      next;
-    } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) {
-      $vma = $1;
-    } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) {
-      $size = $1;
-    } elsif ($line =~ /\boffset ([0-9]+)/) {
-      $file_offset = sprintf("%016x", $1);
-    }
-    if (defined($vma) && defined($size) && defined($file_offset)) {
-      last;
-    }
-  }
-  close(OTOOL);
-
-  if (!defined($vma) || !defined($size) || !defined($file_offset)) {
-     return undef;
-  }
-
-  my $r = {};
-  $r->{size} = $size;
-  $r->{vma} = $vma;
-  $r->{file_offset} = $file_offset;
-
-  return $r;
-}
-
-sub ParseTextSectionHeader {
-  # obj_tool_map("otool") is only defined if we're in a Mach-O environment
-  if (defined($obj_tool_map{"otool"})) {
-    my $r = ParseTextSectionHeaderFromOtool(@_);
-    if (defined($r)){
-      return $r;
-    }
-  }
-  # If otool doesn't work, or we don't have it, fall back to objdump
-  return ParseTextSectionHeaderFromObjdump(@_);
-}
-
-# Split /proc/pid/maps dump into a list of libraries
-sub ParseLibraries {
-  return if $main::use_symbol_page;  # We don't need libraries info.
-  my $prog = shift;
-  my $map = shift;
-  my $pcs = shift;
-
-  my $result = [];
-  my $h = "[a-f0-9]+";
-  my $zero_offset = HexExtend("0");
-
-  my $buildvar = "";
-  foreach my $l (split("\n", $map)) {
-    if ($l =~ m/^\s*build=(.*)$/) {
-      $buildvar = $1;
-    }
-
-    my $start;
-    my $finish;
-    my $offset;
-    my $lib;
-    if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) {
-      # Full line from /proc/self/maps.  Example:
-      #   40000000-40015000 r-xp 00000000 03:01 12845071   /lib/ld-2.3.2.so
-      $start = HexExtend($1);
-      $finish = HexExtend($2);
-      $offset = HexExtend($3);
-      $lib = $4;
-      $lib =~ s|\\|/|g;     # turn windows-style paths into unix-style paths
-    } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) {
-      # Cooked line from DumpAddressMap.  Example:
-      #   40000000-40015000: /lib/ld-2.3.2.so
-      $start = HexExtend($1);
-      $finish = HexExtend($2);
-      $offset = $zero_offset;
-      $lib = $3;
-    } else {
-      next;
-    }
-
-    # Expand "$build" variable if available
-    $lib =~ s/\$build\b/$buildvar/g;
-
-    $lib = FindLibrary($lib);
-
-    # Check for pre-relocated libraries, which use pre-relocated symbol tables
-    # and thus require adjusting the offset that we'll use to translate
-    # VM addresses into symbol table addresses.
-    # Only do this if we're not going to fetch the symbol table from a
-    # debugging copy of the library.
-    if (!DebuggingLibrary($lib)) {
-      my $text = ParseTextSectionHeader($lib);
-      if (defined($text)) {
-         my $vma_offset = AddressSub($text->{vma}, $text->{file_offset});
-         $offset = AddressAdd($offset, $vma_offset);
-      }
-    }
-
-    push(@{$result}, [$lib, $start, $finish, $offset]);
-  }
-
-  # Append special entry for additional library (not relocated)
-  if ($main::opt_lib ne "") {
-    my $text = ParseTextSectionHeader($main::opt_lib);
-    if (defined($text)) {
-       my $start = $text->{vma};
-       my $finish = AddressAdd($start, $text->{size});
-
-       push(@{$result}, [$main::opt_lib, $start, $finish, $start]);
-    }
-  }
-
-  # Append special entry for the main program.  This covers
-  # 0..max_pc_value_seen, so that we assume pc values not found in one
-  # of the library ranges will be treated as coming from the main
-  # program binary.
-  my $min_pc = HexExtend("0");
-  my $max_pc = $min_pc;          # find the maximal PC value in any sample
-  foreach my $pc (keys(%{$pcs})) {
-    if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); }
-  }
-  push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]);
-
-  return $result;
-}
-
-# Add two hex addresses of length $address_length.
-# Run pprof --test for unit test if this is changed.
-sub AddressAdd {
-  my $addr1 = shift;
-  my $addr2 = shift;
-  my $sum;
-
-  if ($address_length == 8) {
-    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
-    $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16);
-    return sprintf("%08x", $sum);
-
-  } else {
-    # Do the addition in 7-nibble chunks to trivialize carry handling.
-
-    if ($main::opt_debug and $main::opt_test) {
-      print STDERR "AddressAdd $addr1 + $addr2 = ";
-    }
-
-    my $a1 = substr($addr1,-7);
-    $addr1 = substr($addr1,0,-7);
-    my $a2 = substr($addr2,-7);
-    $addr2 = substr($addr2,0,-7);
-    $sum = hex($a1) + hex($a2);
-    my $c = 0;
-    if ($sum > 0xfffffff) {
-      $c = 1;
-      $sum -= 0x10000000;
-    }
-    my $r = sprintf("%07x", $sum);
-
-    $a1 = substr($addr1,-7);
-    $addr1 = substr($addr1,0,-7);
-    $a2 = substr($addr2,-7);
-    $addr2 = substr($addr2,0,-7);
-    $sum = hex($a1) + hex($a2) + $c;
-    $c = 0;
-    if ($sum > 0xfffffff) {
-      $c = 1;
-      $sum -= 0x10000000;
-    }
-    $r = sprintf("%07x", $sum) . $r;
-
-    $sum = hex($addr1) + hex($addr2) + $c;
-    if ($sum > 0xff) { $sum -= 0x100; }
-    $r = sprintf("%02x", $sum) . $r;
-
-    if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; }
-
-    return $r;
-  }
-}
-
-
-# Subtract two hex addresses of length $address_length.
-# Run pprof --test for unit test if this is changed.
-sub AddressSub {
-  my $addr1 = shift;
-  my $addr2 = shift;
-  my $diff;
-
-  if ($address_length == 8) {
-    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
-    $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16);
-    return sprintf("%08x", $diff);
-
-  } else {
-    # Do the addition in 7-nibble chunks to trivialize borrow handling.
-    # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; }
-
-    my $a1 = hex(substr($addr1,-7));
-    $addr1 = substr($addr1,0,-7);
-    my $a2 = hex(substr($addr2,-7));
-    $addr2 = substr($addr2,0,-7);
-    my $b = 0;
-    if ($a2 > $a1) {
-      $b = 1;
-      $a1 += 0x10000000;
-    }
-    $diff = $a1 - $a2;
-    my $r = sprintf("%07x", $diff);
-
-    $a1 = hex(substr($addr1,-7));
-    $addr1 = substr($addr1,0,-7);
-    $a2 = hex(substr($addr2,-7)) + $b;
-    $addr2 = substr($addr2,0,-7);
-    $b = 0;
-    if ($a2 > $a1) {
-      $b = 1;
-      $a1 += 0x10000000;
-    }
-    $diff = $a1 - $a2;
-    $r = sprintf("%07x", $diff) . $r;
-
-    $a1 = hex($addr1);
-    $a2 = hex($addr2) + $b;
-    if ($a2 > $a1) { $a1 += 0x100; }
-    $diff = $a1 - $a2;
-    $r = sprintf("%02x", $diff) . $r;
-
-    # if ($main::opt_debug) { print STDERR "$r\n"; }
-
-    return $r;
-  }
-}
-
-# Increment a hex addresses of length $address_length.
-# Run pprof --test for unit test if this is changed.
-sub AddressInc {
-  my $addr = shift;
-  my $sum;
-
-  if ($address_length == 8) {
-    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
-    $sum = (hex($addr)+1) % (0x10000000 * 16);
-    return sprintf("%08x", $sum);
-
-  } else {
-    # Do the addition in 7-nibble chunks to trivialize carry handling.
-    # We are always doing this to step through the addresses in a function,
-    # and will almost never overflow the first chunk, so we check for this
-    # case and exit early.
-
-    # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; }
-
-    my $a1 = substr($addr,-7);
-    $addr = substr($addr,0,-7);
-    $sum = hex($a1) + 1;
-    my $r = sprintf("%07x", $sum);
-    if ($sum <= 0xfffffff) {
-      $r = $addr . $r;
-      # if ($main::opt_debug) { print STDERR "$r\n"; }
-      return HexExtend($r);
-    } else {
-      $r = "0000000";
-    }
-
-    $a1 = substr($addr,-7);
-    $addr = substr($addr,0,-7);
-    $sum = hex($a1) + 1;
-    $r = sprintf("%07x", $sum) . $r;
-    if ($sum <= 0xfffffff) {
-      $r = $addr . $r;
-      # if ($main::opt_debug) { print STDERR "$r\n"; }
-      return HexExtend($r);
-    } else {
-      $r = "00000000000000";
-    }
-
-    $sum = hex($addr) + 1;
-    if ($sum > 0xff) { $sum -= 0x100; }
-    $r = sprintf("%02x", $sum) . $r;
-
-    # if ($main::opt_debug) { print STDERR "$r\n"; }
-    return $r;
-  }
-}
-
-# Extract symbols for all PC values found in profile
-sub ExtractSymbols {
-  my $libs = shift;
-  my $pcset = shift;
-
-  my $symbols = {};
-
-  # Map each PC value to the containing library
-  my %seen = ();
-  foreach my $lib (@{$libs}) {
-    my $libname = $lib->[0];
-    my $start = $lib->[1];
-    my $finish = $lib->[2];
-    my $offset = $lib->[3];
-
-    # Get list of pcs that belong in this library.
-    my $contained = [];
-    foreach my $pc (keys(%{$pcset})) {
-      if (!$seen{$pc} && ($pc ge $start) && ($pc le $finish)) {
-        $seen{$pc} = 1;
-        push(@{$contained}, $pc);
-      }
-    }
-    # Map to symbols
-    MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols);
-  }
-
-  return $symbols;
-}
-
-# Map list of PC values to symbols for a given image
-sub MapToSymbols {
-  my $image = shift;
-  my $offset = shift;
-  my $pclist = shift;
-  my $symbols = shift;
-
-  my $debug = 0;
-
-  # Ignore empty binaries
-  if ($#{$pclist} < 0) { return; }
-
-  # Figure out the addr2line command to use
-  my $addr2line = $obj_tool_map{"addr2line"};
-  my $cmd = "$addr2line -f -C -e $image";
-  if (exists $obj_tool_map{"addr2line_pdb"}) {
-    $addr2line = $obj_tool_map{"addr2line_pdb"};
-    $cmd = "$addr2line --demangle -f -C -e $image";
-  }
-
-  # Use the go version because we know it works on all platforms
-  $addr2line = "go tool addr2line";
-  $cmd = "$addr2line $image";
-
-  # If "addr2line" isn't installed on the system at all, just use
-  # nm to get what info we can (function names, but not line numbers).
-  if (system("$addr2line --help >$DEVNULL 2>&1") != 0) {
-    MapSymbolsWithNM($image, $offset, $pclist, $symbols);
-    return;
-  }
-
-  # "addr2line -i" can produce a variable number of lines per input
-  # address, with no separator that allows us to tell when data for
-  # the next address starts.  So we find the address for a special
-  # symbol (_fini) and interleave this address between all real
-  # addresses passed to addr2line.  The name of this special symbol
-  # can then be used as a separator.
-  $sep_address = undef;  # May be filled in by MapSymbolsWithNM()
-  my $nm_symbols = {};
-  MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols);
-  # TODO(csilvers): only add '-i' if addr2line supports it.
-  if (defined($sep_address)) {
-    # Only add " -i" to addr2line if the binary supports it.
-    # addr2line --help returns 0, but not if it sees an unknown flag first.
-    if (system("$cmd -i --help >$DEVNULL 2>&1") == 0) {
-      $cmd .= " -i";
-    } else {
-      $sep_address = undef;   # no need for sep_address if we don't support -i
-    }
-  }
-
-  # Make file with all PC values with intervening 'sep_address' so
-  # that we can reliably detect the end of inlined function list
-  open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n");
-  if ($debug) { print("---- $image ---\n"); }
-  for (my $i = 0; $i <= $#{$pclist}; $i++) {
-    # addr2line always reads hex addresses, and does not need '0x' prefix.
-    if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); }
-    printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset));
-    if (defined($sep_address)) {
-      printf ADDRESSES ("%s\n", $sep_address);
-    }
-  }
-  close(ADDRESSES);
-  if ($debug) {
-    print("----\n");
-    system("cat $main::tmpfile_sym");
-    print("---- $cmd\n");
-    system("$cmd <$main::tmpfile_sym");
-    print("----\n");
-  }
-
-  open(SYMBOLS, "$cmd <$main::tmpfile_sym |") || error("$cmd: $!\n");
-  my $count = 0;   # Index in pclist
-  while (<SYMBOLS>) {
-    # Read fullfunction and filelineinfo from next pair of lines
-    s/\r?\n$//g;
-    my $fullfunction = $_;
-    $_ = <SYMBOLS>;
-    s/\r?\n$//g;
-    my $filelinenum = $_;
-
-    if (defined($sep_address) && $fullfunction eq $sep_symbol) {
-      # Terminating marker for data for this address
-      $count++;
-      next;
-    }
-
-    $filelinenum =~ s|\\|/|g; # turn windows-style paths into unix-style paths
-
-    my $pcstr = $pclist->[$count];
-    my $function = ShortFunctionName($fullfunction);
-    if ($fullfunction eq '??') {
-      # See if nm found a symbol
-      my $nms = $nm_symbols->{$pcstr};
-      if (defined($nms)) {
-        $function = $nms->[0];
-        $fullfunction = $nms->[2];
-      }
-    }
-
-    # Prepend to accumulated symbols for pcstr
-    # (so that caller comes before callee)
-    my $sym = $symbols->{$pcstr};
-    if (!defined($sym)) {
-      $sym = [];
-      $symbols->{$pcstr} = $sym;
-    }
-    unshift(@{$sym}, $function, $filelinenum, $fullfunction);
-    if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); }
-    if (!defined($sep_address)) {
-      # Inlining is off, se this entry ends immediately
-      $count++;
-    }
-  }
-  close(SYMBOLS);
-}
-
-# Use nm to map the list of referenced PCs to symbols.  Return true iff we
-# are able to read procedure information via nm.
-sub MapSymbolsWithNM {
-  my $image = shift;
-  my $offset = shift;
-  my $pclist = shift;
-  my $symbols = shift;
-
-  # Get nm output sorted by increasing address
-  my $symbol_table = GetProcedureBoundaries($image, ".");
-  if (!%{$symbol_table}) {
-    return 0;
-  }
-  # Start addresses are already the right length (8 or 16 hex digits).
-  my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] }
-    keys(%{$symbol_table});
-
-  if ($#names < 0) {
-    # No symbols: just use addresses
-    foreach my $pc (@{$pclist}) {
-      my $pcstr = "0x" . $pc;
-      $symbols->{$pc} = [$pcstr, "?", $pcstr];
-    }
-    return 0;
-  }
-
-  # Sort addresses so we can do a join against nm output
-  my $index = 0;
-  my $fullname = $names[0];
-  my $name = ShortFunctionName($fullname);
-  foreach my $pc (sort { $a cmp $b } @{$pclist}) {
-    # Adjust for mapped offset
-    my $mpc = AddressSub($pc, $offset);
-    while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){
-      $index++;
-      $fullname = $names[$index];
-      $name = ShortFunctionName($fullname);
-    }
-    if ($mpc lt $symbol_table->{$fullname}->[1]) {
-      $symbols->{$pc} = [$name, "?", $fullname];
-    } else {
-      my $pcstr = "0x" . $pc;
-      $symbols->{$pc} = [$pcstr, "?", $pcstr];
-    }
-  }
-  return 1;
-}
-
-sub ShortFunctionName {
-  my $function = shift;
-  while ($function =~ s/(?<!\.)\([^()]*\)(\s*const)?//g) { }   # Argument types
-  while ($function =~ s/<[^<>]*>//g)  { }    # Remove template arguments
-  $function =~ s/^.*\s+(\w+::)/$1/;          # Remove leading type
-  return $function;
-}
-
-# Trim overly long symbols found in disassembler output
-sub CleanDisassembly {
-  my $d = shift;
-  while ($d =~ s/(?<!\.)\([^()%A-Z]*\)(\s*const)?//g) { } # Argument types, not (%rax)
-  while ($d =~ s/(\w+)<[^<>]*>/$1/g)  { }       # Remove template arguments
-  return $d;
-}
-
-##### Miscellaneous #####
-
-# Find the right versions of the above object tools to use.  The
-# argument is the program file being analyzed, and should be an ELF
-# 32-bit or ELF 64-bit executable file.  The location of the tools
-# is determined by considering the following options in this order:
-#   1) --tools option, if set
-#   2) PPROF_TOOLS environment variable, if set
-#   3) the environment
-sub ConfigureObjTools {
-  my $prog_file = shift;
-
-  # Check for the existence of $prog_file because /usr/bin/file does not
-  # predictably return error status in prod.
-  (-e $prog_file)  || error("$prog_file does not exist.\n");
-
-  # Follow symlinks (at least for systems where "file" supports that)
-  my $file_cmd = "/usr/bin/file -L $prog_file 2>$DEVNULL || /usr/bin/file $prog_file 2>$DEVNULL";
-  if ($^O eq "MSWin32") {
-    $file_cmd = "file -L $prog_file 2>NUL || file $prog_file 2>NUL";
-  }
-  my $file_type = `$file_cmd`;
-
-  if ($file_type =~ /64-bit/) {
-    # Change $address_length to 16 if the program file is ELF 64-bit.
-    # We can't detect this from many (most?) heap or lock contention
-    # profiles, since the actual addresses referenced are generally in low
-    # memory even for 64-bit programs.
-    $address_length = 16;
-  }
-
-  if (($file_type =~ /MS Windows/) || ($OS eq "windows")) {
-    # For windows, we provide a version of nm and addr2line as part of
-    # the opensource release, which is capable of parsing
-    # Windows-style PDB executables.  It should live in the path, or
-    # in the same directory as pprof.
-    $obj_tool_map{"nm_pdb"} = "nm-pdb";
-    $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb";
-    $obj_tool_map{"objdump"} = "false";  # no objdump
-  }
-
-  if ($file_type =~ /Mach-O/) {
-    # OS X uses otool to examine Mach-O files, rather than objdump.
-    $obj_tool_map{"otool"} = "otool";
-    $obj_tool_map{"addr2line"} = "false";  # no addr2line
-    $obj_tool_map{"objdump"} = "false";  # no objdump
-  }
-
-  # Go fill in %obj_tool_map with the pathnames to use:
-  foreach my $tool (keys %obj_tool_map) {
-    $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool});
-  }
-}
-
-# Returns the path of a caller-specified object tool.  If --tools or
-# PPROF_TOOLS are specified, then returns the full path to the tool
-# with that prefix.  Otherwise, returns the path unmodified (which
-# means we will look for it on PATH).
-sub ConfigureTool {
-  my $tool = shift;
-  my $path;
-
-  if ($main::opt_tools ne "") {
-    # Use a prefix specified by the --tools option...
-    $path = $main::opt_tools . $tool;
-    if (!-x $path) {
-      error("No '$tool' found with prefix specified by --tools $main::opt_tools\n");
-    }
-  } elsif (exists $ENV{"PPROF_TOOLS"} &&
-           $ENV{"PPROF_TOOLS"} ne "") {
-    #... or specified with the PPROF_TOOLS environment variable...
-    $path = $ENV{"PPROF_TOOLS"} . $tool;
-    if (!-x $path) {
-      error("No '$tool' found with prefix specified by PPROF_TOOLS=$ENV{PPROF_TOOLS}\n");
-    }
-  } else {
-    # ... otherwise use the version that exists in the same directory as
-    # pprof.  If there's nothing there, use $PATH.
-    $0 =~ m,[^/]*$,;     # this is everything after the last slash
-    my $dirname = $`;    # this is everything up to and including the last slash
-    if (-x "$dirname$tool") {
-      $path = "$dirname$tool";
-    } else {
-      $path = $tool;
-    }
-  }
-  if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; }
-  return $path;
-}
-
-# FetchHTTP retrieves a URL using either curl or LWP::UserAgent.
-# It returns the entire body of the page on success, or exits the program
-# with an error message on any failure.
-sub FetchHTTP {
-  my $url = shift;
-  my $timeout = shift;  # optional, in seconds
-  eval "use LWP::UserAgent ();";
-  if ($@) {
-    my @max;
-    push @max, "--max-time", $timeout if $timeout;
-    open(my $fh, "-|", "curl", @max, "-s", $url) or error("Neither LWP::UserAgent nor curl is installed: $!\n");
-    my $slurp = do { local $/; <$fh> };
-    close($fh);
-    if ($? != 0) {
-      error("Error fetching $url with curl: exit $?")
-    }
-    return $slurp;
-  }
-  my $ua = LWP::UserAgent->new;
-  $ua->timeout($timeout) if $timeout;
-  my $res = $ua->get($url);
-  error("Failed to fetch $url\n") unless $res->is_success();
-  return $res->content();
-}
-
-sub PostHTTP {
-  my ($url, $post_data) = @_;
-  eval "use LWP::UserAgent ();";
-  if ($@) {
-    open(POSTFILE, ">$main::tmpfile_sym");
-    print POSTFILE $post_data;
-    close(POSTFILE);
-
-    open(my $fh, "-|", "curl", "-s", "-d", "\@$main::tmpfile_sym", $url) or error("Neither LWP::UserAgent nor curl is installed: $!\n");
-    my $slurp = do { local $/; <$fh> };
-    close($fh);
-    if ($? != 0) {
-      error("Error fetching $url with curl: exit $?")
-    }
-    return $slurp;
-  }
-  my $req = HTTP::Request->new(POST => $url);
-  $req->content($post_data);
-  my $ua = LWP::UserAgent->new;
-  my $res = $ua->request($req);
-  error("Failed to POST to $url\n") unless $res->is_success();
-  return $res->content();
-}
-
-sub cleanup {
-  unlink($main::tmpfile_sym) if defined $main::tmpfile_sym;
-  unlink(keys %main::tempnames) if %main::tempnames;
-  unlink($main::collected_profile) if defined $main::collected_profile;
-
-  # We leave any collected profiles in $HOME/pprof in case the user wants
-  # to look at them later.  We print a message informing them of this.
-  if ((scalar(@main::profile_files) > 0) &&
-      defined($main::collected_profile)) {
-    if (scalar(@main::profile_files) == 1) {
-      print STDERR "Dynamically gathered profile is in $main::collected_profile\n";
-    }
-    print STDERR "If you want to investigate this profile further, you can do:\n";
-    print STDERR "\n";
-    print STDERR "  pprof \\\n";
-    print STDERR "    $main::prog \\\n";
-    print STDERR "    $main::collected_profile\n";
-    print STDERR "\n";
-  }
-}
-
-sub sighandler {
-  cleanup();
-  exit(1);
-}
-
-sub error {
-  my $msg = shift;
-  print STDERR $msg;
-  cleanup();
-  exit(1);
-}
-
-
-# Run $nm_command and get all the resulting procedure boundaries whose
-# names match "$regexp" and returns them in a hashtable mapping from
-# procedure name to a two-element vector of [start address, end address]
-sub GetProcedureBoundariesViaNm {
-  my $nm_command = shift;
-  my $regexp = shift;
-
-  my $symbol_table = {};
-  open(NM, "$nm_command |") || error("$nm_command: $!\n");
-  my $last_start = "0";
-  my $routine = "";
-  while (<NM>) {
-    s/\r//g;         # turn windows-looking lines into unix-looking lines
-    if (m/^\s*([0-9a-f]+) (.) (..*)/) {
-      my $start_val = $1;
-      my $type = $2;
-      my $this_routine = $3;
-
-      # It's possible for two symbols to share the same address, if
-      # one is a zero-length variable (like __start_google_malloc) or
-      # one symbol is a weak alias to another (like __libc_malloc).
-      # In such cases, we want to ignore all values except for the
-      # actual symbol, which in nm-speak has type "T".  The logic
-      # below does this, though it's a bit tricky: what happens when
-      # we have a series of lines with the same address, is the first
-      # one gets queued up to be processed.  However, it won't
-      # *actually* be processed until later, when we read a line with
-      # a different address.  That means that as long as we're reading
-      # lines with the same address, we have a chance to replace that
-      # item in the queue, which we do whenever we see a 'T' entry --
-      # that is, a line with type 'T'.  If we never see a 'T' entry,
-      # we'll just go ahead and process the first entry (which never
-      # got touched in the queue), and ignore the others.
-      if ($start_val eq $last_start && $type =~ /t/i) {
-        # We are the 'T' symbol at this address, replace previous symbol.
-        $routine = $this_routine;
-        next;
-      } elsif ($start_val eq $last_start) {
-        # We're not the 'T' symbol at this address, so ignore us.
-        next;
-      }
-
-      if ($this_routine eq $sep_symbol) {
-        $sep_address = HexExtend($start_val);
-      }
-
-      # Tag this routine with the starting address in case the image
-      # has multiple occurrences of this routine.  We use a syntax
-      # that resembles template paramters that are automatically
-      # stripped out by ShortFunctionName()
-      $this_routine .= "<$start_val>";
-
-      if (defined($routine) && $routine =~ m/$regexp/) {
-        $symbol_table->{$routine} = [HexExtend($last_start),
-                                     HexExtend($start_val)];
-      }
-      $last_start = $start_val;
-      $routine = $this_routine;
-    } elsif (m/^Loaded image name: (.+)/) {
-      # The win32 nm workalike emits information about the binary it is using.
-      if ($main::opt_debug) { print STDERR "Using Image $1\n"; }
-    } elsif (m/^PDB file name: (.+)/) {
-      # The win32 nm workalike emits information about the pdb it is using.
-      if ($main::opt_debug) { print STDERR "Using PDB $1\n"; }
-    }
-  }
-  close(NM);
-  # Handle the last line in the nm output.  Unfortunately, we don't know
-  # how big this last symbol is, because we don't know how big the file
-  # is.  For now, we just give it a size of 0.
-  # TODO(csilvers): do better here.
-  if (defined($routine) && $routine =~ m/$regexp/) {
-    $symbol_table->{$routine} = [HexExtend($last_start),
-                                 HexExtend($last_start)];
-  }
-  return $symbol_table;
-}
-
-# Gets the procedure boundaries for all routines in "$image" whose names
-# match "$regexp" and returns them in a hashtable mapping from procedure
-# name to a two-element vector of [start address, end address].
-# Will return an empty map if nm is not installed or not working properly.
-sub GetProcedureBoundaries {
-  my $image = shift;
-  my $regexp = shift;
-
-  # For libc libraries, the copy in /usr/lib/debug contains debugging symbols
-  my $debugging = DebuggingLibrary($image);
-  if ($debugging) {
-    $image = $debugging;
-  }
-
-  my $nm = $obj_tool_map{"nm"};
-
-  # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm
-  # binary doesn't support --demangle.  In addition, for OS X we need
-  # to use the -f flag to get 'flat' nm output (otherwise we don't sort
-  # properly and get incorrect results).  Unfortunately, GNU nm uses -f
-  # in an incompatible way.  So first we test whether our nm supports
-  # --demangle and -f.
-  my $demangle_flag = "";
-  if (system("$nm --demangle $image >$DEVNULL 2>&1") == 0) {
-    # In this mode, we do "nm --demangle <foo>"
-    $demangle_flag = "--demangle";
-  }
-  my $flatten_flag = "";
-  if (system("$nm -f $image >$DEVNULL 2>&1") == 0) {
-    $flatten_flag = "-f";
-  }
-
-  # Finally, in the case $image isn't a debug library, we try again with
-  # -D to at least get *exported* symbols.  If we can't use --demangle, too bad.
-  my @nm_commands = ("$nm -n $flatten_flag $demangle_flag" .
-                     " $image 2>$DEVNULL",
-                     "$nm -D -n $flatten_flag $demangle_flag" .
-                     " $image 2>$DEVNULL",
-                     # go tool nm is for Go binaries
-                     "go tool nm $image 2>$DEVNULL | sort");
-
-  foreach my $nm_command (@nm_commands) {
-    my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp);
-    return $symbol_table if (%{$symbol_table});
-  }
-  my $symbol_table = {};
-  return $symbol_table;
-}
-
-
-# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings.
-# To make them more readable, we add underscores at interesting places.
-# This routine removes the underscores, producing the canonical representation
-# used by pprof to represent addresses, particularly in the tested routines.
-sub CanonicalHex {
-  my $arg = shift;
-  return join '', (split '_',$arg);
-}
-
-
-# Unit test for AddressAdd:
-sub AddressAddUnitTest {
-  my $test_data_8 = shift;
-  my $test_data_16 = shift;
-  my $error_count = 0;
-  my $fail_count = 0;
-  my $pass_count = 0;
-  # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n";
-
-  # First a few 8-nibble addresses.  Note that this implementation uses
-  # plain old arithmetic, so a quick sanity check along with verifying what
-  # happens to overflow (we want it to wrap):
-  $address_length = 8;
-  foreach my $row (@{$test_data_8}) {
-    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
-    my $sum = AddressAdd ($row->[0], $row->[1]);
-    if ($sum ne $row->[2]) {
-      printf STDERR "ERROR: %s != %s + %s = %s\n", $sum,
-             $row->[0], $row->[1], $row->[2];
-      ++$fail_count;
-    } else {
-      ++$pass_count;
-    }
-  }
-  printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n",
-         $pass_count, $fail_count;
-  $error_count = $fail_count;
-  $fail_count = 0;
-  $pass_count = 0;
-
-  # Now 16-nibble addresses.
-  $address_length = 16;
-  foreach my $row (@{$test_data_16}) {
-    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
-    my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1]));
-    my $expected = join '', (split '_',$row->[2]);
-    if ($sum ne CanonicalHex($row->[2])) {
-      printf STDERR "ERROR: %s != %s + %s = %s\n", $sum,
-             $row->[0], $row->[1], $row->[2];
-      ++$fail_count;
-    } else {
-      ++$pass_count;
-    }
-  }
-  printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n",
-         $pass_count, $fail_count;
-  $error_count += $fail_count;
-
-  return $error_count;
-}
-
-
-# Unit test for AddressSub:
-sub AddressSubUnitTest {
-  my $test_data_8 = shift;
-  my $test_data_16 = shift;
-  my $error_count = 0;
-  my $fail_count = 0;
-  my $pass_count = 0;
-  # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n";
-
-  # First a few 8-nibble addresses.  Note that this implementation uses
-  # plain old arithmetic, so a quick sanity check along with verifying what
-  # happens to overflow (we want it to wrap):
-  $address_length = 8;
-  foreach my $row (@{$test_data_8}) {
-    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
-    my $sum = AddressSub ($row->[0], $row->[1]);
-    if ($sum ne $row->[3]) {
-      printf STDERR "ERROR: %s != %s - %s = %s\n", $sum,
-             $row->[0], $row->[1], $row->[3];
-      ++$fail_count;
-    } else {
-      ++$pass_count;
-    }
-  }
-  printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n",
-         $pass_count, $fail_count;
-  $error_count = $fail_count;
-  $fail_count = 0;
-  $pass_count = 0;
-
-  # Now 16-nibble addresses.
-  $address_length = 16;
-  foreach my $row (@{$test_data_16}) {
-    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
-    my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1]));
-    if ($sum ne CanonicalHex($row->[3])) {
-      printf STDERR "ERROR: %s != %s - %s = %s\n", $sum,
-             $row->[0], $row->[1], $row->[3];
-      ++$fail_count;
-    } else {
-      ++$pass_count;
-    }
-  }
-  printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n",
-         $pass_count, $fail_count;
-  $error_count += $fail_count;
-
-  return $error_count;
-}
-
-
-# Unit test for AddressInc:
-sub AddressIncUnitTest {
-  my $test_data_8 = shift;
-  my $test_data_16 = shift;
-  my $error_count = 0;
-  my $fail_count = 0;
-  my $pass_count = 0;
-  # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n";
-
-  # First a few 8-nibble addresses.  Note that this implementation uses
-  # plain old arithmetic, so a quick sanity check along with verifying what
-  # happens to overflow (we want it to wrap):
-  $address_length = 8;
-  foreach my $row (@{$test_data_8}) {
-    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
-    my $sum = AddressInc ($row->[0]);
-    if ($sum ne $row->[4]) {
-      printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum,
-             $row->[0], $row->[4];
-      ++$fail_count;
-    } else {
-      ++$pass_count;
-    }
-  }
-  printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n",
-         $pass_count, $fail_count;
-  $error_count = $fail_count;
-  $fail_count = 0;
-  $pass_count = 0;
-
-  # Now 16-nibble addresses.
-  $address_length = 16;
-  foreach my $row (@{$test_data_16}) {
-    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
-    my $sum = AddressInc (CanonicalHex($row->[0]));
-    if ($sum ne CanonicalHex($row->[4])) {
-      printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum,
-             $row->[0], $row->[4];
-      ++$fail_count;
-    } else {
-      ++$pass_count;
-    }
-  }
-  printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n",
-         $pass_count, $fail_count;
-  $error_count += $fail_count;
-
-  return $error_count;
-}
-
-
-# Driver for unit tests.
-# Currently just the address add/subtract/increment routines for 64-bit.
-sub RunUnitTests {
-  my $error_count = 0;
-
-  # This is a list of tuples [a, b, a+b, a-b, a+1]
-  my $unit_test_data_8 = [
-    [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)],
-    [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)],
-    [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)],
-    [qw(00000001 ffffffff 00000000 00000002 00000002)],
-    [qw(00000001 fffffff0 fffffff1 00000011 00000002)],
-  ];
-  my $unit_test_data_16 = [
-    # The implementation handles data in 7-nibble chunks, so those are the
-    # interesting boundaries.
-    [qw(aaaaaaaa 50505050
-        00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)],
-    [qw(50505050 aaaaaaaa
-        00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)],
-    [qw(ffffffff aaaaaaaa
-        00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)],
-    [qw(00000001 ffffffff
-        00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)],
-    [qw(00000001 fffffff0
-        00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)],
-
-    [qw(00_a00000a_aaaaaaa 50505050
-        00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)],
-    [qw(0f_fff0005_0505050 aaaaaaaa
-        0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)],
-    [qw(00_000000f_fffffff 01_800000a_aaaaaaa
-        01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)],
-    [qw(00_0000000_0000001 ff_fffffff_fffffff
-        00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)],
-    [qw(00_0000000_0000001 ff_fffffff_ffffff0
-        ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)],
-  ];
-
-  $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16);
-  $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16);
-  $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16);
-  if ($error_count > 0) {
-    print STDERR $error_count, " errors: FAILED\n";
-  } else {
-    print STDERR "PASS\n";
-  }
-  exit ($error_count);
-}
diff --git a/src/bufio/scan.go b/src/bufio/scan.go
index a41451524..364d15961 100644
--- a/src/bufio/scan.go
+++ b/src/bufio/scan.go
@@ -36,6 +36,7 @@ type Scanner struct {
 	start        int       // First non-processed byte in buf.
 	end          int       // End of data in buf.
 	err          error     // Sticky error.
+	empties      int       // Count of successive empty tokens.
 }
 
 // SplitFunc is the signature of the split function used to tokenize the
@@ -108,6 +109,8 @@ func (s *Scanner) Text() string {
 // After Scan returns false, the Err method will return any error that
 // occurred during scanning, except that if it was io.EOF, Err
 // will return nil.
+// Scan panics if the split function returns 100 empty tokens without
+// advancing the input. This is a common error mode for scanners.
 func (s *Scanner) Scan() bool {
 	// Loop until we have a token.
 	for {
@@ -125,6 +128,15 @@ func (s *Scanner) Scan() bool {
 			}
 			s.token = token
 			if token != nil {
+				if s.err == nil || advance > 0 {
+					s.empties = 0
+				} else {
+					// Returning tokens not advancing input at EOF.
+					s.empties++
+					if s.empties > 100 {
+						panic("bufio.Scan: 100 empty tokens without progressing")
+					}
+				}
 				return true
 			}
 		}
@@ -172,6 +184,7 @@ func (s *Scanner) Scan() bool {
 				break
 			}
 			if n > 0 {
+				s.empties = 0
 				break
 			}
 			loop++
diff --git a/src/bufio/scan_test.go b/src/bufio/scan_test.go
index 1454a8113..eea87cbf7 100644
--- a/src/bufio/scan_test.go
+++ b/src/bufio/scan_test.go
@@ -455,3 +455,70 @@ func TestEmptyTokens(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func loopAtEOFSplit(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	if len(data) > 0 {
+		return 1, data[:1], nil
+	}
+	return 0, data, nil
+}
+
+func TestDontLoopForever(t *testing.T) {
+	s := NewScanner(strings.NewReader("abc"))
+	s.Split(loopAtEOFSplit)
+	// Expect a panic
+	defer func() {
+		err := recover()
+		if err == nil {
+			t.Fatal("should have panicked")
+		}
+		if msg, ok := err.(string); !ok || !strings.Contains(msg, "empty tokens") {
+			panic(err)
+		}
+	}()
+	for count := 0; s.Scan(); count++ {
+		if count > 1000 {
+			t.Fatal("looping")
+		}
+	}
+	if s.Err() != nil {
+		t.Fatal("after scan:", s.Err())
+	}
+}
+
+func TestBlankLines(t *testing.T) {
+	s := NewScanner(strings.NewReader(strings.Repeat("\n", 1000)))
+	for count := 0; s.Scan(); count++ {
+		if count > 2000 {
+			t.Fatal("looping")
+		}
+	}
+	if s.Err() != nil {
+		t.Fatal("after scan:", s.Err())
+	}
+}
+
+type countdown int
+
+func (c *countdown) split(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	if *c > 0 {
+		*c--
+		return 1, data[:1], nil
+	}
+	return 0, nil, nil
+}
+
+// Check that the looping-at-EOF check doesn't trigger for merely empty tokens.
+func TestEmptyLinesOK(t *testing.T) {
+	c := countdown(10000)
+	s := NewScanner(strings.NewReader(strings.Repeat("\n", 10000)))
+	s.Split(c.split)
+	for s.Scan() {
+	}
+	if s.Err() != nil {
+		t.Fatal("after scan:", s.Err())
+	}
+	if c != 0 {
+		t.Fatalf("stopped with %d left to process", c)
+	}
+}
diff --git a/src/cmd/5g/reg.c b/src/cmd/5g/reg.c
index 712841329..441792873 100644
--- a/src/cmd/5g/reg.c
+++ b/src/cmd/5g/reg.c
@@ -199,7 +199,7 @@ regopt(Prog *firstp)
 		proginfo(&info, p);
 
 		// Avoid making variables for direct-called functions.
-		if(p->as == ABL && p->to.type == D_EXTERN)
+		if(p->as == ABL && p->to.name == D_EXTERN)
 			continue;
 
 		bit = mkvar(r, &p->from);
diff --git a/src/cmd/api/goapi.go b/src/cmd/api/goapi.go
index 5a8c87603..e49ba33bb 100644
--- a/src/cmd/api/goapi.go
+++ b/src/cmd/api/goapi.go
@@ -405,6 +405,7 @@ func (w *Walker) parseFile(dir, file string) (*ast.File, error) {
 			" note struct{};" +
 			" p struct{};" +
 			" parfor struct{};" +
+			" slice struct{};" +
 			" slicetype struct{};" +
 			" stkframe struct{};" +
 			" sudog struct{};" +
diff --git a/src/cmd/dist/build.c b/src/cmd/dist/build.c
index 8fd2e998a..9c81dd8b2 100644
--- a/src/cmd/dist/build.c
+++ b/src/cmd/dist/build.c
@@ -691,13 +691,6 @@ install(char *dir)
 	bpathf(&final_path, "%s/src/%s", goroot_final, dir);
 	name = lastelem(dir);
 
-	// For misc/prof, copy into the tool directory and we're done.
-	if(hasprefix(dir, "misc/")) {
-		copyfile(bpathf(&b, "%s/%s", tooldir, name),
-			bpathf(&b1, "%s/misc/%s", goroot, name), 1);
-		goto out;
-	}
-
 	// set up gcc command line on first run.
 	if(gccargs.len == 0) {
 		bprintf(&b, "%s %s", defaultcc, defaultcflags);
@@ -1328,8 +1321,6 @@ static char *buildorder[] = {
 	"libbio",
 	"liblink",
 
-	"misc/pprof",
-
 	"cmd/cc",  // must be before c
 	"cmd/gc",  // must be before g
 	"cmd/%sl",  // must be before a, c, g
diff --git a/src/cmd/gc/builtin.c b/src/cmd/gc/builtin.c
index fbca4ee5f..aeeadedca 100644
--- a/src/cmd/gc/builtin.c
+++ b/src/cmd/gc/builtin.c
@@ -24,6 +24,8 @@ char *runtimeimport =
 	"func @\"\".printslice (? any)\n"
 	"func @\"\".printnl ()\n"
 	"func @\"\".printsp ()\n"
+	"func @\"\".printlock ()\n"
+	"func @\"\".printunlock ()\n"
 	"func @\"\".concatstring2 (? string, ? string) (? string)\n"
 	"func @\"\".concatstring3 (? string, ? string, ? string) (? string)\n"
 	"func @\"\".concatstring4 (? string, ? string, ? string, ? string) (? string)\n"
@@ -86,10 +88,33 @@ char *runtimeimport =
 	"func @\"\".writebarrierstring (@\"\".dst·1 *any, @\"\".src·2 any)\n"
 	"func @\"\".writebarrierslice (@\"\".dst·1 *any, @\"\".src·2 any)\n"
 	"func @\"\".writebarrieriface (@\"\".dst·1 *any, @\"\".src·2 any)\n"
-	"func @\"\".writebarrierfat2 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
-	"func @\"\".writebarrierfat3 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
-	"func @\"\".writebarrierfat4 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat01 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat10 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat11 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat0001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat0010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat0011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat0100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat0101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat0110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat0111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat1000 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat1001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat1010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat1011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat1100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat1101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat1110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
+	"func @\"\".writebarrierfat1111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
 	"func @\"\".writebarrierfat (@\"\".typ·1 *byte, @\"\".dst·2 *any, @\"\".src·3 *any)\n"
+	"func @\"\".writebarriercopy (@\"\".typ·2 *byte, @\"\".dst·3 any, @\"\".src·4 any) (? int)\n"
 	"func @\"\".selectnbsend (@\"\".chanType·2 *byte, @\"\".hchan·3 chan<- any, @\"\".elem·4 *any) (? bool)\n"
 	"func @\"\".selectnbrecv (@\"\".chanType·2 *byte, @\"\".elem·3 *any, @\"\".hchan·4 <-chan any) (? bool)\n"
 	"func @\"\".selectnbrecv2 (@\"\".chanType·2 *byte, @\"\".elem·3 *any, @\"\".received·4 *bool, @\"\".hchan·5 <-chan any) (? bool)\n"
diff --git a/src/cmd/gc/go.h b/src/cmd/gc/go.h
index d3c4193b5..c695c5bf3 100644
--- a/src/cmd/gc/go.h
+++ b/src/cmd/gc/go.h
@@ -1466,6 +1466,7 @@ void	walk(Node *fn);
 void	walkexpr(Node **np, NodeList **init);
 void	walkexprlist(NodeList *l, NodeList **init);
 void	walkexprlistsafe(NodeList *l, NodeList **init);
+void	walkexprlistcheap(NodeList *l, NodeList **init);
 void	walkstmt(Node **np);
 void	walkstmtlist(NodeList *l);
 Node*	conv(Node*, Type*);
diff --git a/src/cmd/gc/lex.c b/src/cmd/gc/lex.c
index 2303b442c..523ba37aa 100644
--- a/src/cmd/gc/lex.c
+++ b/src/cmd/gc/lex.c
@@ -344,8 +344,8 @@ main(int argc, char *argv[])
 					break;
 				}
 			}
-			if(j == nelem(debugtab))
-				fatal("unknown debug information -d '%s'\n", f[i]);
+			if(debugtab[j].name == nil)
+				sysfatal("unknown debug information -d '%s'\n", f[i]);
 		}
 	}
 
diff --git a/src/cmd/gc/plive.c b/src/cmd/gc/plive.c
index 0feb2c710..3bfa69b1f 100644
--- a/src/cmd/gc/plive.c
+++ b/src/cmd/gc/plive.c
@@ -1092,7 +1092,7 @@ twobitwalktype1(Type *t, vlong *xoffset, Bvec *bv)
 	case TCOMPLEX64:
 	case TCOMPLEX128:
 		for(i = 0; i < t->width; i++) {
-			bvset(bv, ((*xoffset + i) / widthptr) * BitsPerPointer); // 1 = live scalar
+			bvset(bv, ((*xoffset + i) / widthptr) * BitsPerPointer); // 1 = live scalar (BitsScalar)
 		}
 		*xoffset += t->width;
 		break;
@@ -1105,7 +1105,7 @@ twobitwalktype1(Type *t, vlong *xoffset, Bvec *bv)
 	case TMAP:
 		if((*xoffset & (widthptr-1)) != 0)
 			fatal("twobitwalktype1: invalid alignment, %T", t);
-		bvset(bv, (*xoffset / widthptr) * BitsPerPointer + 1); // 2 = live ptr
+		bvset(bv, (*xoffset / widthptr) * BitsPerPointer + 1); // 2 = live ptr (BitsPointer)
 		*xoffset += t->width;
 		break;
 
@@ -1113,7 +1113,7 @@ twobitwalktype1(Type *t, vlong *xoffset, Bvec *bv)
 		// struct { byte *str; intgo len; }
 		if((*xoffset & (widthptr-1)) != 0)
 			fatal("twobitwalktype1: invalid alignment, %T", t);
-		bvset(bv, (*xoffset / widthptr) * BitsPerPointer + 1); // 2 = live ptr in first slot
+		bvset(bv, (*xoffset / widthptr) * BitsPerPointer + 1); // 2 = live ptr in first slot (BitsPointer)
 		*xoffset += t->width;
 		break;
 
@@ -1123,15 +1123,8 @@ twobitwalktype1(Type *t, vlong *xoffset, Bvec *bv)
 		// struct { Type *type; union { void *ptr, uintptr val } data; }
 		if((*xoffset & (widthptr-1)) != 0)
 			fatal("twobitwalktype1: invalid alignment, %T", t);
-		bvset(bv, ((*xoffset / widthptr) * BitsPerPointer) + 0);
-		bvset(bv, ((*xoffset / widthptr) * BitsPerPointer) + 1); // 3 = multiword
-		// next word contains 2 = Iface, 3 = Eface
-		if(isnilinter(t)) {
-			bvset(bv, ((*xoffset / widthptr) * BitsPerPointer) + 2);
-			bvset(bv, ((*xoffset / widthptr) * BitsPerPointer) + 3);
-		} else {
-			bvset(bv, ((*xoffset / widthptr) * BitsPerPointer) + 3);
-		}
+		bvset(bv, (*xoffset / widthptr) * BitsPerPointer + 1); // 2 = live ptr in first slot (BitsPointer)
+		bvset(bv, (*xoffset / widthptr) * BitsPerPointer + 3); // 2 = live ptr in second slot (BitsPointer)
 		*xoffset += t->width;
 		break;
 
@@ -1144,7 +1137,7 @@ twobitwalktype1(Type *t, vlong *xoffset, Bvec *bv)
 			// struct { byte *array; uintgo len; uintgo cap; }
 			if((*xoffset & (widthptr-1)) != 0)
 				fatal("twobitwalktype1: invalid TARRAY alignment, %T", t);
-			bvset(bv, (*xoffset / widthptr) * BitsPerPointer + 1); // 2 = live ptr in first slot
+			bvset(bv, (*xoffset / widthptr) * BitsPerPointer + 1); // 2 = live ptr in first slot (BitsPointer)
 			*xoffset += t->width;
 		} else
 			for(i = 0; i < t->bound; i++)
diff --git a/src/cmd/gc/popt.c b/src/cmd/gc/popt.c
index 993bb2482..6e6db88ef 100644
--- a/src/cmd/gc/popt.c
+++ b/src/cmd/gc/popt.c
@@ -847,6 +847,10 @@ nilopt(Prog *firstp)
 	Graph *g;
 	int ncheck, nkill;
 
+	// TODO(minux): nilopt on power64 throws away seemingly random segments of code.
+	if(thechar == '9')
+		return;
+
 	g = flowstart(firstp, sizeof(NilFlow));
 	if(g == nil)
 		return;
diff --git a/src/cmd/gc/reflect.c b/src/cmd/gc/reflect.c
index b2ff2fbc5..0f8802abc 100644
--- a/src/cmd/gc/reflect.c
+++ b/src/cmd/gc/reflect.c
@@ -1525,11 +1525,9 @@ gengcprog1(ProgGen *g, Type *t, vlong *xoffset)
 		*xoffset += t->width;
 		break;
 	case TINTER:
-		proggendata(g, BitsMultiWord);
-		if(isnilinter(t))
-			proggendata(g, BitsEface);
-		else
-			proggendata(g, BitsIface);
+		// Assuming IfacePointerOnly=1.
+		proggendata(g, BitsPointer);
+		proggendata(g, BitsPointer);
 		*xoffset += t->width;
 		break;
 	case TARRAY:
diff --git a/src/cmd/gc/runtime.go b/src/cmd/gc/runtime.go
index 0fb15c265..c6007714c 100644
--- a/src/cmd/gc/runtime.go
+++ b/src/cmd/gc/runtime.go
@@ -36,6 +36,8 @@ func printeface(any)
 func printslice(any)
 func printnl()
 func printsp()
+func printlock()
+func printunlock()
 
 func concatstring2(string, string) string
 func concatstring3(string, string, string) string
@@ -115,10 +117,35 @@ func writebarrieriface(dst *any, src any)
 // The unused *byte argument makes sure that src is 2-pointer-aligned,
 // which is the maximum alignment on NaCl amd64p32
 // (and possibly on 32-bit systems if we start 64-bit aligning uint64s).
-func writebarrierfat2(dst *any, _ *byte, src any)
-func writebarrierfat3(dst *any, _ *byte, src any)
-func writebarrierfat4(dst *any, _ *byte, src any)
+// The bitmap in the name tells which words being copied are pointers.
+func writebarrierfat01(dst *any, _ *byte, src any)
+func writebarrierfat10(dst *any, _ *byte, src any)
+func writebarrierfat11(dst *any, _ *byte, src any)
+func writebarrierfat001(dst *any, _ *byte, src any)
+func writebarrierfat010(dst *any, _ *byte, src any)
+func writebarrierfat011(dst *any, _ *byte, src any)
+func writebarrierfat100(dst *any, _ *byte, src any)
+func writebarrierfat101(dst *any, _ *byte, src any)
+func writebarrierfat110(dst *any, _ *byte, src any)
+func writebarrierfat111(dst *any, _ *byte, src any)
+func writebarrierfat0001(dst *any, _ *byte, src any)
+func writebarrierfat0010(dst *any, _ *byte, src any)
+func writebarrierfat0011(dst *any, _ *byte, src any)
+func writebarrierfat0100(dst *any, _ *byte, src any)
+func writebarrierfat0101(dst *any, _ *byte, src any)
+func writebarrierfat0110(dst *any, _ *byte, src any)
+func writebarrierfat0111(dst *any, _ *byte, src any)
+func writebarrierfat1000(dst *any, _ *byte, src any)
+func writebarrierfat1001(dst *any, _ *byte, src any)
+func writebarrierfat1010(dst *any, _ *byte, src any)
+func writebarrierfat1011(dst *any, _ *byte, src any)
+func writebarrierfat1100(dst *any, _ *byte, src any)
+func writebarrierfat1101(dst *any, _ *byte, src any)
+func writebarrierfat1110(dst *any, _ *byte, src any)
+func writebarrierfat1111(dst *any, _ *byte, src any)
+
 func writebarrierfat(typ *byte, dst *any, src *any)
+func writebarriercopy(typ *byte, dst any, src any) int
 
 func selectnbsend(chanType *byte, hchan chan<- any, elem *any) bool
 func selectnbrecv(chanType *byte, elem *any, hchan <-chan any) bool
diff --git a/src/cmd/gc/typecheck.c b/src/cmd/gc/typecheck.c
index 714c66268..f05d8022d 100644
--- a/src/cmd/gc/typecheck.c
+++ b/src/cmd/gc/typecheck.c
@@ -2891,7 +2891,8 @@ typecheckas(Node *n)
 		case OSLICE3:
 		case OSLICESTR:
 			// For x = x[0:y], x can be updated in place, without touching pointer.
-			if(samesafeexpr(n->left, n->right->left) && (n->right->right->left == N || iszero(n->right->right->left)))
+			// TODO(rsc): Reenable once it is actually updated in place without touching the pointer.
+			if(0 && samesafeexpr(n->left, n->right->left) && (n->right->right->left == N || iszero(n->right->right->left)))
 				n->right->reslice = 1;
 			break;
 		
@@ -2899,7 +2900,8 @@ typecheckas(Node *n)
 			// For x = append(x, ...), x can be updated in place when there is capacity,
 			// without touching the pointer; otherwise the emitted code to growslice
 			// can take care of updating the pointer, and only in that case.
-			if(n->right->list != nil && samesafeexpr(n->left, n->right->list->n))
+			// TODO(rsc): Reenable once the emitted code does update the pointer.
+			if(0 && n->right->list != nil && samesafeexpr(n->left, n->right->list->n))
 				n->right->reslice = 1;
 			break;
 		}
diff --git a/src/cmd/gc/walk.c b/src/cmd/gc/walk.c
index d4d0f449c..37bd62dea 100644
--- a/src/cmd/gc/walk.c
+++ b/src/cmd/gc/walk.c
@@ -6,6 +6,7 @@
 #include	<libc.h>
 #include	"go.h"
 #include	"../ld/textflag.h"
+#include	"../../runtime/mgc0.h"
 
 static	Node*	walkprint(Node*, NodeList**);
 static	Node*	writebarrierfn(char*, Type*, Type*);
@@ -363,6 +364,15 @@ walkexprlistsafe(NodeList *l, NodeList **init)
 }
 
 void
+walkexprlistcheap(NodeList *l, NodeList **init)
+{
+	for(; l; l=l->next) {
+		l->n = cheapexpr(l->n, init);
+		walkexpr(&l->n, init);
+	}
+}
+
+void
 walkexpr(Node **np, NodeList **init)
 {
 	Node *r, *l, *var, *a;
@@ -1772,6 +1782,11 @@ walkprint(Node *nn, NodeList **init)
 	calls = nil;
 	notfirst = 0;
 
+	// Hoist all the argument evaluation up before the lock.
+	walkexprlistcheap(all, init);
+
+	calls = list(calls, mkcall("printlock", T, init));
+
 	for(l=all; l; l=l->next) {
 		if(notfirst) {
 			calls = list(calls, mkcall("printsp", T, init));
@@ -1852,6 +1867,9 @@ walkprint(Node *nn, NodeList **init)
 
 	if(op == OPRINTN)
 		calls = list(calls, mkcall("printnl", T, nil));
+
+	calls = list(calls, mkcall("printunlock", T, init));
+
 	typechecklist(calls, Etop);
 	walkexprlist(calls, init);
 
@@ -1988,6 +2006,9 @@ applywritebarrier(Node *n, NodeList **init)
 {
 	Node *l, *r;
 	Type *t;
+	vlong x;
+	static Bvec *bv;
+	char name[32];
 
 	if(n->left && n->right && needwritebarrier(n->left, n->right)) {
 		t = n->left->type;
@@ -2005,14 +2026,35 @@ applywritebarrier(Node *n, NodeList **init)
 		} else if(isinter(t)) {
 			n = mkcall1(writebarrierfn("writebarrieriface", t, n->right->type), T, init,
 				l, n->right);
-		} else if(t->width == 2*widthptr) {
-			n = mkcall1(writebarrierfn("writebarrierfat2", t, n->right->type), T, init,
-				l, nodnil(), n->right);
-		} else if(t->width == 3*widthptr) {
-			n = mkcall1(writebarrierfn("writebarrierfat3", t, n->right->type), T, init,
-				l, nodnil(), n->right);
-		} else if(t->width == 4*widthptr) {
-			n = mkcall1(writebarrierfn("writebarrierfat4", t, n->right->type), T, init,
+		} else if(t->width <= 4*widthptr) {
+			x = 0;
+			if(bv == nil)
+				bv = bvalloc(BitsPerPointer*4);
+			bvresetall(bv);
+			twobitwalktype1(t, &x, bv);
+			// The bvgets are looking for BitsPointer in successive slots.
+			enum {
+				PtrBit = 1,
+			};
+			if(BitsPointer != (1<<PtrBit))
+				fatal("wrong PtrBit");
+			switch(t->width/widthptr) {
+			default:
+				fatal("found writebarrierfat for %d-byte object of type %T", (int)t->width, t);
+			case 2:
+				snprint(name, sizeof name, "writebarrierfat%d%d",
+					bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit));
+				break;
+			case 3:
+				snprint(name, sizeof name, "writebarrierfat%d%d%d",
+					bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit), bvget(bv, 2*BitsPerPointer+PtrBit));
+				break;
+			case 4:
+				snprint(name, sizeof name, "writebarrierfat%d%d%d%d",
+					bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit), bvget(bv, 2*BitsPerPointer+PtrBit), bvget(bv, 3*BitsPerPointer+PtrBit));
+				break;
+			}
+			n = mkcall1(writebarrierfn(name, t, n->right->type), T, init,
 				l, nodnil(), n->right);
 		} else {
 			r = n->right;
@@ -2874,6 +2916,11 @@ copyany(Node *n, NodeList **init, int runtimecall)
 {
 	Node *nl, *nr, *nfrm, *nto, *nif, *nlen, *nwid, *fn;
 	NodeList *l;
+	
+	if(haspointers(n->left->type->type)) {
+		fn = writebarrierfn("writebarriercopy", n->left->type, n->right->type);
+		return mkcall1(fn, n->type, init, typename(n->left->type->type), n->left, n->right);
+	}
 
 	if(runtimecall) {
 		if(n->right->type->etype == TSTRING)
diff --git a/src/cmd/go/build.go b/src/cmd/go/build.go
index 79a27116a..1dd4314da 100644
--- a/src/cmd/go/build.go
+++ b/src/cmd/go/build.go
@@ -1826,7 +1826,15 @@ func (gcToolchain) ld(b *builder, p *Package, out string, allactions []*action,
 func (gcToolchain) cc(b *builder, p *Package, objdir, ofile, cfile string) error {
 	inc := filepath.Join(goroot, "pkg", fmt.Sprintf("%s_%s", goos, goarch))
 	cfile = mkAbs(p.Dir, cfile)
-	args := stringList(tool(archChar+"c"), "-F", "-V", "-w", "-trimpath", b.work, "-I", objdir, "-I", inc, "-o", ofile, buildCcflags, "-D", "GOOS_"+goos, "-D", "GOARCH_"+goarch, cfile)
+	warn := []string{"-w"}
+	if p.usesSwig() {
+		// When using SWIG, this compiler is only used to
+		// compile the C files generated by SWIG.
+		// We don't want warnings.
+		// See issue 9065 for details.
+		warn = nil
+	}
+	args := stringList(tool(archChar+"c"), "-F", "-V", warn, "-trimpath", b.work, "-I", objdir, "-I", inc, "-o", ofile, buildCcflags, "-D", "GOOS_"+goos, "-D", "GOARCH_"+goarch, cfile)
 	return b.run(p.Dir, p.ImportPath, nil, args)
 }
 
diff --git a/src/cmd/go/doc.go b/src/cmd/go/doc.go
index cf3a54565..43a315944 100644
--- a/src/cmd/go/doc.go
+++ b/src/cmd/go/doc.go
@@ -590,7 +590,7 @@ Usage:
 
 Vet runs the Go vet command on the packages named by the import paths.
 
-For more about vet, see 'godoc code.google.com/p/go.tools/cmd/vet'.
+For more about vet, see 'godoc golang.org/x/tools/cmd/vet'.
 For more about specifying packages, see 'go help packages'.
 
 To run the vet tool with specific options, run 'go tool vet'.
diff --git a/src/cmd/go/pkg.go b/src/cmd/go/pkg.go
index e17326442..b71feb7a6 100644
--- a/src/cmd/go/pkg.go
+++ b/src/cmd/go/pkg.go
@@ -383,9 +383,10 @@ func findInternal(path string) (index int, ok bool) {
 type targetDir int
 
 const (
-	toRoot targetDir = iota // to bin dir inside package root (default)
-	toTool                  // GOROOT/pkg/tool
-	toBin                   // GOROOT/bin
+	toRoot    targetDir = iota // to bin dir inside package root (default)
+	toTool                     // GOROOT/pkg/tool
+	toBin                      // GOROOT/bin
+	stalePath                  // the old import path; fail to build
 )
 
 // goTools is a map of Go program import path to install target directory.
@@ -398,10 +399,14 @@ var goTools = map[string]targetDir{
 	"cmd/nm":                               toTool,
 	"cmd/objdump":                          toTool,
 	"cmd/pack":                             toTool,
+	"cmd/pprof":                            toTool,
 	"cmd/yacc":                             toTool,
-	"code.google.com/p/go.tools/cmd/cover": toTool,
-	"code.google.com/p/go.tools/cmd/godoc": toBin,
-	"code.google.com/p/go.tools/cmd/vet":   toTool,
+	"golang.org/x/tools/cmd/cover":         toTool,
+	"golang.org/x/tools/cmd/godoc":         toBin,
+	"golang.org/x/tools/cmd/vet":           toTool,
+	"code.google.com/p/go.tools/cmd/cover": stalePath,
+	"code.google.com/p/go.tools/cmd/godoc": stalePath,
+	"code.google.com/p/go.tools/cmd/vet":   stalePath,
 }
 
 // expandScanner expands a scanner.List error into all the errors in the list.
@@ -462,6 +467,13 @@ func (p *Package) load(stk *importStack, bp *build.Package, err error) *Package
 	}
 
 	if p.Name == "main" {
+		// Report an error when the old code.google.com/p/go.tools paths are used.
+		if goTools[p.ImportPath] == stalePath {
+			newPath := strings.Replace(p.ImportPath, "code.google.com/p/go.", "golang.org/x/", 1)
+			e := fmt.Sprintf("the %v command has moved; use %v instead.", p.ImportPath, newPath)
+			p.Error = &PackageError{Err: e}
+			return p
+		}
 		_, elem := filepath.Split(p.Dir)
 		full := buildContext.GOOS + "_" + buildContext.GOARCH + "/" + elem
 		if buildContext.GOOS != toolGOOS || buildContext.GOARCH != toolGOARCH {
diff --git a/src/cmd/go/test.bash b/src/cmd/go/test.bash
index 2b5230b1a..e0f066f18 100755
--- a/src/cmd/go/test.bash
+++ b/src/cmd/go/test.bash
@@ -433,20 +433,20 @@ TEST godoc installs into GOBIN
 d=$(mktemp -d -t testgoXXX)
 export GOPATH=$d
 mkdir $d/gobin
-GOBIN=$d/gobin ./testgo get code.google.com/p/go.tools/cmd/godoc || ok=false
+GOBIN=$d/gobin ./testgo get golang.org/x/tools/cmd/godoc || ok=false
 if [ ! -x $d/gobin/godoc ]; then
 	echo did not install godoc to '$GOBIN'
-	GOBIN=$d/gobin ./testgo list -f 'Target: {{.Target}}' code.google.com/p/go.tools/cmd/godoc || true
+	GOBIN=$d/gobin ./testgo list -f 'Target: {{.Target}}' golang.org/x/tools/cmd/godoc || true
 	ok=false
 fi
 
 TEST godoc installs into GOROOT
 GOROOT=$(./testgo env GOROOT)
 rm -f $GOROOT/bin/godoc
-./testgo install code.google.com/p/go.tools/cmd/godoc || ok=false
+./testgo install golang.org/x/tools/cmd/godoc || ok=false
 if [ ! -x $GOROOT/bin/godoc ]; then
 	echo did not install godoc to '$GOROOT/bin'
-	./testgo list -f 'Target: {{.Target}}' code.google.com/p/go.tools/cmd/godoc || true
+	./testgo list -f 'Target: {{.Target}}' golang.org/x/tools/cmd/godoc || true
 	ok=false
 fi
 
@@ -561,8 +561,8 @@ fi
 TEST without GOPATH, go get fails
 d=$(mktemp -d -t testgoXXX)
 mkdir -p $d/src
-if GOPATH= GOROOT=$d ./testgo get -d code.google.com/p/go.codereview/cmd/hgpatch ; then 
-	echo 'go get code.google.com/p/go.codereview/cmd/hgpatch should not succeed with $GOPATH unset'
+if GOPATH= GOROOT=$d ./testgo get -d golang.org/x/codereview/cmd/hgpatch ; then 
+	echo 'go get golang.org/x/codereview/cmd/hgpatch should not succeed with $GOPATH unset'
 	ok=false
 fi	
 rm -rf $d
@@ -571,8 +571,8 @@ rm -rf $d
 TEST with GOPATH=GOROOT, go get fails
 d=$(mktemp -d -t testgoXXX)
 mkdir -p $d/src
-if GOPATH=$d GOROOT=$d ./testgo get -d code.google.com/p/go.codereview/cmd/hgpatch ; then
-        echo 'go get code.google.com/p/go.codereview/cmd/hgpatch should not succeed with GOPATH=$GOROOT'
+if GOPATH=$d GOROOT=$d ./testgo get -d golang.org/x/codereview/cmd/hgpatch ; then
+        echo 'go get golang.org/x/codereview/cmd/hgpatch should not succeed with GOPATH=$GOROOT'
         ok=false
 fi
 rm -rf $d
@@ -728,7 +728,7 @@ elif ! grep "case-insensitive file name collision" $d/out >/dev/null; then
 fi
 
 TEST go get cover
-./testgo get code.google.com/p/go.tools/cmd/cover || ok=false
+./testgo get golang.org/x/tools/cmd/cover || ok=false
 
 unset GOPATH
 rm -rf $d
diff --git a/src/cmd/go/tool.go b/src/cmd/go/tool.go
index 6d26f7a4b..c96161e0f 100644
--- a/src/cmd/go/tool.go
+++ b/src/cmd/go/tool.go
@@ -53,7 +53,7 @@ func tool(toolName string) string {
 	// Give a nice message if there is no tool with that name.
 	if _, err := os.Stat(toolPath); err != nil {
 		if isInGoToolsRepo(toolName) {
-			fmt.Fprintf(os.Stderr, "go tool: no such tool %q; to install:\n\tgo get code.google.com/p/go.tools/cmd/%s\n", toolName, toolName)
+			fmt.Fprintf(os.Stderr, "go tool: no such tool %q; to install:\n\tgo get golang.org/x/tools/cmd/%s\n", toolName, toolName)
 		} else {
 			fmt.Fprintf(os.Stderr, "go tool: no such tool %q\n", toolName)
 		}
diff --git a/src/cmd/go/vet.go b/src/cmd/go/vet.go
index de7befc61..02ff54b2a 100644
--- a/src/cmd/go/vet.go
+++ b/src/cmd/go/vet.go
@@ -17,7 +17,7 @@ var cmdVet = &Command{
 	Long: `
 Vet runs the Go vet command on the packages named by the import paths.
 
-For more about vet, see 'godoc code.google.com/p/go.tools/cmd/vet'.
+For more about vet, see 'godoc golang.org/x/tools/cmd/vet'.
 For more about specifying packages, see 'go help packages'.
 
 To run the vet tool with specific options, run 'go tool vet'.
diff --git a/src/cmd/internal/objfile/disasm.go b/src/cmd/internal/objfile/disasm.go
new file mode 100644
index 000000000..1a339c321
--- /dev/null
+++ b/src/cmd/internal/objfile/disasm.go
@@ -0,0 +1,248 @@
+// Copyright 2014 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package objfile
+
+import (
+	"bufio"
+	"debug/gosym"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"regexp"
+	"sort"
+	"strings"
+	"text/tabwriter"
+
+	"cmd/internal/rsc.io/arm/armasm"
+	"cmd/internal/rsc.io/x86/x86asm"
+)
+
+// Disasm is a disassembler for a given File.
+type Disasm struct {
+	syms      []Sym            // symbols in file, sorted by address
+	pcln      *gosym.Table     // pcln table
+	text      []byte           // bytes of text segment (actual instructions)
+	textStart uint64           // start PC of text
+	textEnd   uint64           // end PC of text
+	goarch    string           // GOARCH string
+	disasm    disasmFunc       // disassembler function for goarch
+	byteOrder binary.ByteOrder // byte order for goarch
+}
+
+// Disasm returns a disassembler for the file f.
+func (f *File) Disasm() (*Disasm, error) {
+	syms, err := f.Symbols()
+	if err != nil {
+		return nil, err
+	}
+
+	pcln, err := f.PCLineTable()
+	if err != nil {
+		return nil, err
+	}
+
+	textStart, textBytes, err := f.Text()
+	if err != nil {
+		return nil, err
+	}
+
+	goarch := f.GOARCH()
+	disasm := disasms[goarch]
+	byteOrder := byteOrders[goarch]
+	if disasm == nil || byteOrder == nil {
+		return nil, fmt.Errorf("unsupported architecture")
+	}
+
+	// Filter out section symbols, overwriting syms in place.
+	keep := syms[:0]
+	for _, sym := range syms {
+		switch sym.Name {
+		case "runtime.text", "text", "_text", "runtime.etext", "etext", "_etext":
+			// drop
+		default:
+			keep = append(keep, sym)
+		}
+	}
+	syms = keep
+	d := &Disasm{
+		syms:      syms,
+		pcln:      pcln,
+		text:      textBytes,
+		textStart: textStart,
+		textEnd:   textStart + uint64(len(textBytes)),
+		goarch:    goarch,
+		disasm:    disasm,
+		byteOrder: byteOrder,
+	}
+
+	return d, nil
+}
+
+// lookup finds the symbol name containing addr.
+func (d *Disasm) lookup(addr uint64) (name string, base uint64) {
+	i := sort.Search(len(d.syms), func(i int) bool { return addr < d.syms[i].Addr })
+	if i > 0 {
+		s := d.syms[i-1]
+		if s.Addr != 0 && s.Addr <= addr && addr < s.Addr+uint64(s.Size) {
+			return s.Name, s.Addr
+		}
+	}
+	return "", 0
+}
+
+// base returns the final element in the path.
+// It works on both Windows and Unix paths,
+// regardless of host operating system.
+func base(path string) string {
+	path = path[strings.LastIndex(path, "/")+1:]
+	path = path[strings.LastIndex(path, `\`)+1:]
+	return path
+}
+
+// Print prints a disassembly of the file to w.
+// If filter is non-nil, the disassembly only includes functions with names matching filter.
+// The disassembly only includes functions that overlap the range [start, end).
+func (d *Disasm) Print(w io.Writer, filter *regexp.Regexp, start, end uint64) {
+	if start < d.textStart {
+		start = d.textStart
+	}
+	if end > d.textEnd {
+		end = d.textEnd
+	}
+	printed := false
+	bw := bufio.NewWriter(w)
+	for _, sym := range d.syms {
+		symStart := sym.Addr
+		symEnd := sym.Addr + uint64(sym.Size)
+		if sym.Code != 'T' && sym.Code != 't' ||
+			symStart < d.textStart ||
+			symEnd <= start || end <= symStart ||
+			filter != nil && !filter.MatchString(sym.Name) {
+			continue
+		}
+		if printed {
+			fmt.Fprintf(bw, "\n")
+		}
+		printed = true
+
+		file, _, _ := d.pcln.PCToLine(sym.Addr)
+		fmt.Fprintf(bw, "TEXT %s(SB) %s\n", sym.Name, file)
+
+		tw := tabwriter.NewWriter(bw, 1, 8, 1, '\t', 0)
+		if symEnd > end {
+			symEnd = end
+		}
+		code := d.text[:end-d.textStart]
+		d.Decode(symStart, symEnd, func(pc, size uint64, file string, line int, text string) {
+			i := pc - d.textStart
+			fmt.Fprintf(tw, "\t%s:%d\t%#x\t", base(file), line, pc)
+			if size%4 != 0 || d.goarch == "386" || d.goarch == "amd64" {
+				// Print instruction as bytes.
+				fmt.Fprintf(tw, "%x", code[i:i+size])
+			} else {
+				// Print instruction as 32-bit words.
+				for j := uint64(0); j < size; j += 4 {
+					if j > 0 {
+						fmt.Fprintf(tw, " ")
+					}
+					fmt.Fprintf(tw, "%08x", d.byteOrder.Uint32(code[i+j:]))
+				}
+			}
+			fmt.Fprintf(tw, "\t%s\n", text)
+		})
+		tw.Flush()
+	}
+	bw.Flush()
+}
+
+// Decode disassembles the text segment range [start, end), calling f for each instruction.
+// The range is clamped to the text segment; an empty range results in no calls to f.
+func (d *Disasm) Decode(start, end uint64, f func(pc, size uint64, file string, line int, text string)) {
+	if start < d.textStart {
+		start = d.textStart
+	}
+	if end > d.textEnd {
+		end = d.textEnd
+	}
+	lookup := d.lookup
+	for pc := start; pc < end; {
+		// Slice inside the loop: here textStart <= pc < end, so the offsets cannot underflow.
+		text, size := d.disasm(d.text[pc-d.textStart:end-d.textStart], pc, lookup)
+		file, line, _ := d.pcln.PCToLine(pc)
+		f(pc, uint64(size), file, line, text)
+		pc += uint64(size)
+	}
+}
+
+type lookupFunc func(addr uint64) (sym string, base uint64)
+type disasmFunc func(code []byte, pc uint64, lookup lookupFunc) (text string, size int)
+
+func disasm_386(code []byte, pc uint64, lookup lookupFunc) (string, int) {
+	return disasm_x86(code, pc, lookup, 32)
+}
+
+func disasm_amd64(code []byte, pc uint64, lookup lookupFunc) (string, int) {
+	return disasm_x86(code, pc, lookup, 64)
+}
+
+func disasm_x86(code []byte, pc uint64, lookup lookupFunc, arch int) (string, int) {
+	inst, err := x86asm.Decode(code, arch) // arch is the decoding mode passed by the caller (32 or 64)
+	var text string
+	size := inst.Len
+	if err != nil || size == 0 || inst.Op == 0 {
+		size = 1 // undecodable: report "?" and consume a single byte
+		text = "?"
+	} else {
+		text = x86asm.Plan9Syntax(inst, pc, lookup)
+	}
+	return text, size
+}
+
+type textReader struct {
+	code []byte
+	pc   uint64
+}
+
+func (r textReader) ReadAt(data []byte, off int64) (n int, err error) {
+	if off < 0 || uint64(off) < r.pc {
+		return 0, io.EOF
+	}
+	d := uint64(off) - r.pc
+	if d >= uint64(len(r.code)) {
+		return 0, io.EOF
+	}
+	n = copy(data, r.code[d:])
+	if n < len(data) {
+		err = io.ErrUnexpectedEOF
+	}
+	return
+}
+
+func disasm_arm(code []byte, pc uint64, lookup lookupFunc) (string, int) {
+	inst, err := armasm.Decode(code, armasm.ModeARM)
+	var text string
+	size := inst.Len
+	if err != nil || size == 0 || inst.Op == 0 {
+		size = 4
+		text = "?"
+	} else {
+		text = armasm.Plan9Syntax(inst, pc, lookup, textReader{code, pc})
+	}
+	return text, size
+}
+
+var disasms = map[string]disasmFunc{
+	"386":   disasm_386,
+	"amd64": disasm_amd64,
+	"arm":   disasm_arm,
+}
+
+var byteOrders = map[string]binary.ByteOrder{
+	"386":       binary.LittleEndian,
+	"amd64":     binary.LittleEndian,
+	"arm":       binary.LittleEndian,
+	"power64":   binary.BigEndian,
+	"power64le": binary.LittleEndian,
+}
diff --git a/src/cmd/internal/objfile/objfile.go b/src/cmd/internal/objfile/objfile.go
index 3d4a5d27c..9227ef387 100644
--- a/src/cmd/internal/objfile/objfile.go
+++ b/src/cmd/internal/objfile/objfile.go
@@ -9,6 +9,7 @@ import (
 	"debug/gosym"
 	"fmt"
 	"os"
+	"sort"
 )
 
 type rawFile interface {
@@ -62,9 +63,20 @@ func (f *File) Close() error {
 }
 
 func (f *File) Symbols() ([]Sym, error) {
-	return f.raw.symbols()
+	syms, err := f.raw.symbols()
+	if err != nil {
+		return nil, err
+	}
+	sort.Sort(byAddr(syms))
+	return syms, nil
 }
 
+type byAddr []Sym
+
+func (x byAddr) Less(i, j int) bool { return x[i].Addr < x[j].Addr }
+func (x byAddr) Len() int           { return len(x) }
+func (x byAddr) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }
+
 func (f *File) PCLineTable() (*gosym.Table, error) {
 	textStart, symtab, pclntab, err := f.raw.pcln()
 	if err != nil {
diff --git a/src/cmd/objdump/main.go b/src/cmd/objdump/main.go
index 0f125c98b..708a85370 100644
--- a/src/cmd/objdump/main.go
+++ b/src/cmd/objdump/main.go
@@ -32,24 +32,15 @@
 package main
 
 import (
-	"bufio"
-	"debug/gosym"
-	"encoding/binary"
 	"flag"
 	"fmt"
-	"io"
 	"log"
 	"os"
 	"regexp"
-	"sort"
 	"strconv"
 	"strings"
-	"text/tabwriter"
 
 	"cmd/internal/objfile"
-
-	"cmd/internal/rsc.io/arm/armasm"
-	"cmd/internal/rsc.io/x86/x86asm"
 )
 
 var symregexp = flag.String("s", "", "only dump symbols matching this regexp")
@@ -87,227 +78,30 @@ func main() {
 		log.Fatal(err)
 	}
 
-	syms, err := f.Symbols()
+	dis, err := f.Disasm()
 	if err != nil {
-		log.Fatalf("reading %s: %v", flag.Arg(0), err)
+		log.Fatalf("disassemble %s: %v", flag.Arg(0), err)
 	}
 
-	tab, err := f.PCLineTable()
-	if err != nil {
-		log.Fatalf("reading %s: %v", flag.Arg(0), err)
-	}
-
-	textStart, textBytes, err := f.Text()
-	if err != nil {
-		log.Fatalf("reading %s: %v", flag.Arg(0), err)
-	}
-
-	goarch := f.GOARCH()
-
-	disasm := disasms[goarch]
-	if disasm == nil {
-		log.Fatalf("reading %s: unknown architecture", flag.Arg(0))
-	}
-
-	// Filter out section symbols, overwriting syms in place.
-	keep := syms[:0]
-	for _, sym := range syms {
-		switch sym.Name {
-		case "runtime.text", "text", "_text", "runtime.etext", "etext", "_etext":
-			// drop
-		default:
-			keep = append(keep, sym)
-		}
-	}
-	syms = keep
-
-	sort.Sort(ByAddr(syms))
-	lookup := func(addr uint64) (string, uint64) {
-		i := sort.Search(len(syms), func(i int) bool { return addr < syms[i].Addr })
-		if i > 0 {
-			s := syms[i-1]
-			if s.Addr != 0 && s.Addr <= addr && addr < s.Addr+uint64(s.Size) {
-				return s.Name, s.Addr
-			}
-		}
-		return "", 0
-	}
-
-	if flag.NArg() == 1 {
-		// disassembly of entire object - our format
-		dump(tab, lookup, disasm, goarch, syms, textBytes, textStart)
+	switch flag.NArg() {
+	default:
+		usage()
+	case 1:
+		// disassembly of entire object
+		dis.Print(os.Stdout, symRE, 0, ^uint64(0))
 		os.Exit(0)
-	}
-
-	// disassembly of specific piece of object - gnu objdump format for pprof
-	gnuDump(tab, lookup, disasm, textBytes, textStart)
-	os.Exit(0)
-}
-
-// base returns the final element in the path.
-// It works on both Windows and Unix paths.
-func base(path string) string {
-	path = path[strings.LastIndex(path, "/")+1:]
-	path = path[strings.LastIndex(path, `\`)+1:]
-	return path
-}
-
-func dump(tab *gosym.Table, lookup lookupFunc, disasm disasmFunc, goarch string, syms []objfile.Sym, textData []byte, textStart uint64) {
-	stdout := bufio.NewWriter(os.Stdout)
-	defer stdout.Flush()
-
-	printed := false
-	for _, sym := range syms {
-		if (sym.Code != 'T' && sym.Code != 't') || sym.Size == 0 || sym.Addr < textStart || symRE != nil && !symRE.MatchString(sym.Name) {
-			continue
-		}
-		if sym.Addr >= textStart+uint64(len(textData)) || sym.Addr+uint64(sym.Size) > textStart+uint64(len(textData)) {
-			break
-		}
-		if printed {
-			fmt.Fprintf(stdout, "\n")
-		} else {
-			printed = true
-		}
-		file, _, _ := tab.PCToLine(sym.Addr)
-		fmt.Fprintf(stdout, "TEXT %s(SB) %s\n", sym.Name, file)
-		tw := tabwriter.NewWriter(stdout, 1, 8, 1, '\t', 0)
-		start := sym.Addr
-		end := sym.Addr + uint64(sym.Size)
-		for pc := start; pc < end; {
-			i := pc - textStart
-			text, size := disasm(textData[i:end-textStart], pc, lookup)
-			file, line, _ := tab.PCToLine(pc)
-
-			// ARM is word-based, so show actual word hex, not byte hex.
-			// Since ARM is little endian, they're different.
-			if goarch == "arm" && size == 4 {
-				fmt.Fprintf(tw, "\t%s:%d\t%#x\t%08x\t%s\n", base(file), line, pc, binary.LittleEndian.Uint32(textData[i:i+uint64(size)]), text)
-			} else {
-				fmt.Fprintf(tw, "\t%s:%d\t%#x\t%x\t%s\n", base(file), line, pc, textData[i:i+uint64(size)], text)
-			}
-			pc += uint64(size)
-		}
-		tw.Flush()
-	}
-}
-
-func disasm_386(code []byte, pc uint64, lookup lookupFunc) (string, int) {
-	return disasm_x86(code, pc, lookup, 32)
-}
-
-func disasm_amd64(code []byte, pc uint64, lookup lookupFunc) (string, int) {
-	return disasm_x86(code, pc, lookup, 64)
-}
-
-func disasm_x86(code []byte, pc uint64, lookup lookupFunc, arch int) (string, int) {
-	inst, err := x86asm.Decode(code, 64)
-	var text string
-	size := inst.Len
-	if err != nil || size == 0 || inst.Op == 0 {
-		size = 1
-		text = "?"
-	} else {
-		text = x86asm.Plan9Syntax(inst, pc, lookup)
-	}
-	return text, size
-}
-
-type textReader struct {
-	code []byte
-	pc   uint64
-}
-
-func (r textReader) ReadAt(data []byte, off int64) (n int, err error) {
-	if off < 0 || uint64(off) < r.pc {
-		return 0, io.EOF
-	}
-	d := uint64(off) - r.pc
-	if d >= uint64(len(r.code)) {
-		return 0, io.EOF
-	}
-	n = copy(data, r.code[d:])
-	if n < len(data) {
-		err = io.ErrUnexpectedEOF
-	}
-	return
-}
-
-func disasm_arm(code []byte, pc uint64, lookup lookupFunc) (string, int) {
-	inst, err := armasm.Decode(code, armasm.ModeARM)
-	var text string
-	size := inst.Len
-	if err != nil || size == 0 || inst.Op == 0 {
-		size = 4
-		text = "?"
-	} else {
-		text = armasm.Plan9Syntax(inst, pc, lookup, textReader{code, pc})
-	}
-	return text, size
-}
-
-var disasms = map[string]disasmFunc{
-	"386":   disasm_386,
-	"amd64": disasm_amd64,
-	"arm":   disasm_arm,
-}
-
-func gnuDump(tab *gosym.Table, lookup lookupFunc, disasm disasmFunc, textData []byte, textStart uint64) {
-	start, err := strconv.ParseUint(strings.TrimPrefix(flag.Arg(1), "0x"), 16, 64)
-	if err != nil {
-		log.Fatalf("invalid start PC: %v", err)
-	}
-	end, err := strconv.ParseUint(strings.TrimPrefix(flag.Arg(2), "0x"), 16, 64)
-	if err != nil {
-		log.Fatalf("invalid end PC: %v", err)
-	}
-	if start < textStart {
-		start = textStart
-	}
-	if end < start {
-		end = start
-	}
-	if end > textStart+uint64(len(textData)) {
-		end = textStart + uint64(len(textData))
-	}
-
-	stdout := bufio.NewWriter(os.Stdout)
-	defer stdout.Flush()
-
-	// For now, find spans of same PC/line/fn and
-	// emit them as having dummy instructions.
-	var (
-		spanPC   uint64
-		spanFile string
-		spanLine int
-		spanFn   *gosym.Func
-	)
 
-	flush := func(endPC uint64) {
-		if spanPC == 0 {
-			return
-		}
-		fmt.Fprintf(stdout, "%s:%d\n", spanFile, spanLine)
-		for pc := spanPC; pc < endPC; {
-			text, size := disasm(textData[pc-textStart:], pc, lookup)
-			fmt.Fprintf(stdout, " %x: %s\n", pc, text)
-			pc += uint64(size)
+	case 3:
+		// disassembly of PC range
+		start, err := strconv.ParseUint(strings.TrimPrefix(flag.Arg(1), "0x"), 16, 64)
+		if err != nil {
+			log.Fatalf("invalid start PC: %v", err)
 		}
-		spanPC = 0
-	}
-
-	for pc := start; pc < end; pc++ {
-		file, line, fn := tab.PCToLine(pc)
-		if file != spanFile || line != spanLine || fn != spanFn {
-			flush(pc)
-			spanPC, spanFile, spanLine, spanFn = pc, file, line, fn
+		end, err := strconv.ParseUint(strings.TrimPrefix(flag.Arg(2), "0x"), 16, 64)
+		if err != nil {
+			log.Fatalf("invalid end PC: %v", err)
 		}
+		dis.Print(os.Stdout, symRE, start, end)
+		os.Exit(0)
 	}
-	flush(end)
 }
-
-type ByAddr []objfile.Sym
-
-func (x ByAddr) Less(i, j int) bool { return x[i].Addr < x[j].Addr }
-func (x ByAddr) Len() int           { return len(x) }
-func (x ByAddr) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }
diff --git a/src/cmd/objdump/objdump_test.go b/src/cmd/objdump/objdump_test.go
index 5047f9aa8..bd09ae9f9 100644
--- a/src/cmd/objdump/objdump_test.go
+++ b/src/cmd/objdump/objdump_test.go
@@ -5,117 +5,15 @@
 package main
 
 import (
-	"bufio"
-	"bytes"
-	"fmt"
 	"io/ioutil"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"runtime"
-	"strconv"
 	"strings"
 	"testing"
 )
 
-func loadSyms(t *testing.T) map[string]string {
-	switch runtime.GOOS {
-	case "android", "nacl":
-		t.Skipf("skipping on %s", runtime.GOOS)
-	}
-
-	cmd := exec.Command("go", "tool", "nm", os.Args[0])
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		t.Fatalf("go tool nm %v: %v\n%s", os.Args[0], err, string(out))
-	}
-	syms := make(map[string]string)
-	scanner := bufio.NewScanner(bytes.NewReader(out))
-	for scanner.Scan() {
-		f := strings.Fields(scanner.Text())
-		if len(f) < 3 {
-			continue
-		}
-		syms[f[2]] = f[0]
-	}
-	if err := scanner.Err(); err != nil {
-		t.Fatalf("error reading symbols: %v", err)
-	}
-	return syms
-}
-
-func runObjDump(t *testing.T, exe, startaddr, endaddr string) (path, lineno string) {
-	switch runtime.GOOS {
-	case "android", "nacl":
-		t.Skipf("skipping on %s", runtime.GOOS)
-	}
-	switch runtime.GOARCH {
-	case "power64", "power64le":
-		t.Skipf("skipping on %s, issue 9039", runtime.GOARCH)
-	}
-
-	cmd := exec.Command(exe, os.Args[0], startaddr, endaddr)
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		t.Fatalf("go tool objdump %v: %v\n%s", os.Args[0], err, string(out))
-	}
-	f := strings.Split(string(out), "\n")
-	if len(f) < 1 {
-		t.Fatal("objdump output must have at least one line")
-	}
-	pathAndLineNo := f[0]
-	f = strings.Split(pathAndLineNo, ":")
-	if runtime.GOOS == "windows" {
-		switch len(f) {
-		case 2:
-			return f[0], f[1]
-		case 3:
-			return f[0] + ":" + f[1], f[2]
-		default:
-			t.Fatalf("no line number found in %q", pathAndLineNo)
-		}
-	}
-	if len(f) != 2 {
-		t.Fatalf("no line number found in %q", pathAndLineNo)
-	}
-	return f[0], f[1]
-}
-
-func testObjDump(t *testing.T, exe, startaddr, endaddr string, line int) {
-	srcPath, srcLineNo := runObjDump(t, exe, startaddr, endaddr)
-	fi1, err := os.Stat("objdump_test.go")
-	if err != nil {
-		t.Fatalf("Stat failed: %v", err)
-	}
-	fi2, err := os.Stat(srcPath)
-	if err != nil {
-		t.Fatalf("Stat failed: %v", err)
-	}
-	if !os.SameFile(fi1, fi2) {
-		t.Fatalf("objdump_test.go and %s are not same file", srcPath)
-	}
-	if srcLineNo != fmt.Sprint(line) {
-		t.Fatalf("line number = %v; want %d", srcLineNo, line)
-	}
-}
-
-func TestObjDump(t *testing.T) {
-	_, _, line, _ := runtime.Caller(0)
-	syms := loadSyms(t)
-
-	tmp, exe := buildObjdump(t)
-	defer os.RemoveAll(tmp)
-
-	startaddr := syms["cmd/objdump.TestObjDump"]
-	addr, err := strconv.ParseUint(startaddr, 16, 64)
-	if err != nil {
-		t.Fatalf("invalid start address %v: %v", startaddr, err)
-	}
-	endaddr := fmt.Sprintf("%x", addr+10)
-	testObjDump(t, exe, startaddr, endaddr, line-1)
-	testObjDump(t, exe, "0x"+startaddr, "0x"+endaddr, line-1)
-}
-
 func buildObjdump(t *testing.T) (tmp, exe string) {
 	switch runtime.GOOS {
 	case "android", "nacl":
diff --git a/src/cmd/pprof/README b/src/cmd/pprof/README
new file mode 100644
index 000000000..a728ef235
--- /dev/null
+++ b/src/cmd/pprof/README
@@ -0,0 +1,8 @@
+The pprof in this directory is adapted from the pprof used inside Google
+for C++, Java, and Go programs. Because it was developed for that broader
+context, it is overgeneralized when used here for the specific use case
+of profiling standard Go programs. However, we've left the abstractions
+intact in order to share updates between this copy and Google's internal one.
+
+Please do not take the level of abstraction in this program as an example
+to follow in your own.
diff --git a/src/cmd/pprof/doc.go b/src/cmd/pprof/doc.go
new file mode 100644
index 000000000..c6ff11d10
--- /dev/null
+++ b/src/cmd/pprof/doc.go
@@ -0,0 +1,12 @@
+// Copyright 2014 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Pprof interprets and displays profiles of Go programs.
+//
+// Usage:
+//
+//	go tool pprof binary profile
+//
+// For more information, see http://blog.golang.org/profiling-go-programs.
+package main
diff --git a/src/cmd/pprof/pprof.go b/src/cmd/pprof/pprof.go
index 89a5bb7d2..44f4f6cb7 100644
--- a/src/cmd/pprof/pprof.go
+++ b/src/cmd/pprof/pprof.go
@@ -11,6 +11,7 @@ import (
 	"os"
 	"regexp"
 	"strings"
+	"sync"
 
 	"cmd/internal/objfile"
 	"cmd/pprof/internal/commands"
@@ -100,7 +101,10 @@ func (flags) ExtraUsage() string {
 
 // objTool implements plugin.ObjTool using Go libraries
 // (instead of invoking GNU binutils).
-type objTool struct{}
+type objTool struct {
+	mu          sync.Mutex
+	disasmCache map[string]*objfile.Disasm
+}
 
 func (*objTool) Open(name string, start uint64) (plugin.ObjFile, error) {
 	of, err := objfile.Open(name)
@@ -119,8 +123,39 @@ func (*objTool) Demangle(names []string) (map[string]string, error) {
 	return make(map[string]string), nil
 }
 
-func (*objTool) Disasm(file string, start, end uint64) ([]plugin.Inst, error) {
-	return nil, fmt.Errorf("disassembly not supported")
+func (t *objTool) Disasm(file string, start, end uint64) ([]plugin.Inst, error) {
+	d, err := t.cachedDisasm(file)
+	if err != nil {
+		return nil, err
+	}
+	var asm []plugin.Inst
+	d.Decode(start, end, func(pc, size uint64, file string, line int, text string) {
+		asm = append(asm, plugin.Inst{Addr: pc, File: file, Line: line, Text: text})
+	})
+	return asm, nil
+}
+
+func (t *objTool) cachedDisasm(file string) (*objfile.Disasm, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.disasmCache == nil {
+		t.disasmCache = make(map[string]*objfile.Disasm)
+	}
+	d := t.disasmCache[file]
+	if d != nil {
+		return d, nil
+	}
+	f, err := objfile.Open(file)
+	if err != nil {
+		return nil, err
+	}
+	d, err = f.Disasm()
+	f.Close()
+	if err != nil {
+		return nil, err
+	}
+	t.disasmCache[file] = d
+	return d, nil
 }
 
 func (*objTool) SetConfig(config string) {
diff --git a/src/compress/lzw/reader.go b/src/compress/lzw/reader.go
index 0835bd8b9..526620c82 100644
--- a/src/compress/lzw/reader.go
+++ b/src/compress/lzw/reader.go
@@ -11,7 +11,7 @@
 // two non-literal codes are a clear code and an EOF code.
 //
 // The TIFF file format uses a similar but incompatible version of the LZW
-// algorithm. See the code.google.com/p/go.image/tiff/lzw package for an
+// algorithm. See the golang.org/x/image/tiff/lzw package for an
 // implementation.
 package lzw
 
diff --git a/src/crypto/crypto.go b/src/crypto/crypto.go
index 5a91baca0..59b23e93f 100644
--- a/src/crypto/crypto.go
+++ b/src/crypto/crypto.go
@@ -21,7 +21,7 @@ func (h Hash) HashFunc() Hash {
 }
 
 const (
-	MD4       Hash = 1 + iota // import code.google.com/p/go.crypto/md4
+	MD4       Hash = 1 + iota // import golang.org/x/crypto/md4
 	MD5                       // import crypto/md5
 	SHA1                      // import crypto/sha1
 	SHA224                    // import crypto/sha256
@@ -29,11 +29,11 @@ const (
 	SHA384                    // import crypto/sha512
 	SHA512                    // import crypto/sha512
 	MD5SHA1                   // no implementation; MD5+SHA1 used for TLS RSA
-	RIPEMD160                 // import code.google.com/p/go.crypto/ripemd160
-	SHA3_224                  // import code.google.com/p/go.crypto/sha3
-	SHA3_256                  // import code.google.com/p/go.crypto/sha3
-	SHA3_384                  // import code.google.com/p/go.crypto/sha3
-	SHA3_512                  // import code.google.com/p/go.crypto/sha3
+	RIPEMD160                 // import golang.org/x/crypto/ripemd160
+	SHA3_224                  // import golang.org/x/crypto/sha3
+	SHA3_256                  // import golang.org/x/crypto/sha3
+	SHA3_384                  // import golang.org/x/crypto/sha3
+	SHA3_512                  // import golang.org/x/crypto/sha3
 	maxHash
 )
 
diff --git a/src/debug/goobj/read_test.go b/src/debug/goobj/read_test.go
index dee140533..cc991e5d9 100644
--- a/src/debug/goobj/read_test.go
+++ b/src/debug/goobj/read_test.go
@@ -12,7 +12,7 @@ var importPathToPrefixTests = []struct {
 }{
 	{"runtime", "runtime"},
 	{"sync/atomic", "sync/atomic"},
-	{"code.google.com/p/go.tools/godoc", "code.google.com/p/go.tools/godoc"},
+	{"golang.org/x/tools/godoc", "golang.org/x/tools/godoc"},
 	{"foo.bar/baz.quux", "foo.bar/baz%2equux"},
 	{"", ""},
 	{"%foo%bar", "%25foo%25bar"},
diff --git a/src/net/http/cookiejar/jar.go b/src/net/http/cookiejar/jar.go
index 389ab58e4..0e0fac928 100644
--- a/src/net/http/cookiejar/jar.go
+++ b/src/net/http/cookiejar/jar.go
@@ -30,7 +30,7 @@ import (
 // set a cookie for bar.com.
 //
 // A public suffix list implementation is in the package
-// code.google.com/p/go.net/publicsuffix.
+// golang.org/x/net/publicsuffix.
 type PublicSuffixList interface {
 	// PublicSuffix returns the public suffix of domain.
 	//
diff --git a/src/net/http/serve_test.go b/src/net/http/serve_test.go
index bb44ac853..5e0a0053c 100644
--- a/src/net/http/serve_test.go
+++ b/src/net/http/serve_test.go
@@ -2819,6 +2819,7 @@ func benchmarkClientServerParallel(b *testing.B, parallelism int, useTLS bool) {
 				InsecureSkipVerify: true,
 			},
 		}
+		defer noVerifyTransport.CloseIdleConnections()
 		client := &Client{Transport: noVerifyTransport}
 		for pb.Next() {
 			res, err := client.Get(ts.URL)
diff --git a/src/os/exec/exec_test.go b/src/os/exec/exec_test.go
index bc9c00eff..197d3e8b4 100644
--- a/src/os/exec/exec_test.go
+++ b/src/os/exec/exec_test.go
@@ -246,7 +246,7 @@ func TestPipeLookPathLeak(t *testing.T) {
 }
 
 func numOpenFDS(t *testing.T) (n int, lsof []byte) {
-	lsof, err := exec.Command("lsof", "-n", "-p", strconv.Itoa(os.Getpid())).Output()
+	lsof, err := exec.Command("lsof", "-b", "-n", "-p", strconv.Itoa(os.Getpid())).Output()
 	if err != nil {
 		t.Skip("skipping test; error finding or running lsof")
 	}
diff --git a/src/os/file_plan9.go b/src/os/file_plan9.go
index 5efc2a4f1..132594eed 100644
--- a/src/os/file_plan9.go
+++ b/src/os/file_plan9.go
@@ -25,7 +25,8 @@ type file struct {
 	dirinfo *dirInfo // nil unless directory being read
 }
 
-// Fd returns the integer Unix file descriptor referencing the open file.
+// Fd returns the integer Plan 9 file descriptor referencing the open file.
+// The file descriptor is valid only until f.Close is called or f is garbage collected.
 func (f *File) Fd() uintptr {
 	if f == nil {
 		return ^(uintptr(0))
diff --git a/src/os/file_unix.go b/src/os/file_unix.go
index f59d563e6..ff4fc7d12 100644
--- a/src/os/file_unix.go
+++ b/src/os/file_unix.go
@@ -29,6 +29,7 @@ type file struct {
 }
 
 // Fd returns the integer Unix file descriptor referencing the open file.
+// The file descriptor is valid only until f.Close is called or f is garbage collected.
 func (f *File) Fd() uintptr {
 	if f == nil {
 		return ^(uintptr(0))
diff --git a/src/os/file_windows.go b/src/os/file_windows.go
index 3b5519390..2a90a5055 100644
--- a/src/os/file_windows.go
+++ b/src/os/file_windows.go
@@ -36,6 +36,7 @@ type file struct {
 }
 
 // Fd returns the Windows handle referencing the open file.
+// The handle is valid only until f.Close is called or f is garbage collected.
 func (file *File) Fd() uintptr {
 	if file == nil {
 		return uintptr(syscall.InvalidHandle)
diff --git a/src/reflect/type.go b/src/reflect/type.go
index 572e611fa..2064922f6 100644
--- a/src/reflect/type.go
+++ b/src/reflect/type.go
@@ -1533,12 +1533,8 @@ func (gc *gcProg) appendProg(t *rtype) {
 			gc.appendProg(e)
 		}
 	case Interface:
-		gc.appendWord(bitsMultiWord)
-		if t.NumMethod() == 0 {
-			gc.appendWord(bitsEface)
-		} else {
-			gc.appendWord(bitsIface)
-		}
+		gc.appendWord(bitsPointer)
+		gc.appendWord(bitsPointer)
 	case Struct:
 		c := t.NumField()
 		for i := 0; i < c; i++ {
diff --git a/src/run.bash b/src/run.bash
index 3c9430c87..91f12a174 100755
--- a/src/run.bash
+++ b/src/run.bash
@@ -66,7 +66,8 @@ go test sync -short -timeout=$(expr 120 \* $timeout_scale)s -cpu=10
 
 # Race detector only supported on Linux, FreeBSD and OS X,
 # and only on amd64, and only when cgo is enabled.
-case "$GOHOSTOS-$GOOS-$GOARCH-$CGO_ENABLED" in
+# DISABLED until we get garbage collection working.
+case "$GOHOSTOS-$GOOS-$GOARCH-$CGO_ENABLED-XXX-DISABLED" in
 linux-linux-amd64-1 | freebsd-freebsd-amd64-1 | darwin-darwin-amd64-1)
 	echo
 	echo '# Testing race detector.'
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index 8cbabfed2..501e64b09 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -502,7 +502,7 @@ fail:
 //		return 1;
 //	}else
 //		return 0;
-TEXT runtime·casp(SB), NOSPLIT, $0-13
+TEXT runtime·casp1(SB), NOSPLIT, $0-13
 	MOVL	ptr+0(FP), BX
 	MOVL	old+4(FP), AX
 	MOVL	new+8(FP), CX
@@ -537,7 +537,7 @@ TEXT runtime·xchg(SB), NOSPLIT, $0-12
 	MOVL	AX, ret+8(FP)
 	RET
 
-TEXT runtime·xchgp(SB), NOSPLIT, $0-12
+TEXT runtime·xchgp1(SB), NOSPLIT, $0-12
 	MOVL	ptr+0(FP), BX
 	MOVL	new+4(FP), AX
 	XCHGL	AX, 0(BX)
@@ -555,7 +555,7 @@ again:
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8
+TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
 	MOVL	ptr+0(FP), BX
 	MOVL	val+4(FP), AX
 	XCHGL	AX, 0(BX)
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 2871a172a..1aa2d71a8 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -489,7 +489,7 @@ TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
 //		return 1;
 //	} else
 //		return 0;
-TEXT runtime·casp(SB), NOSPLIT, $0-25
+TEXT runtime·casp1(SB), NOSPLIT, $0-25
 	MOVQ	ptr+0(FP), BX
 	MOVQ	old+8(FP), AX
 	MOVQ	new+16(FP), CX
@@ -541,7 +541,7 @@ TEXT runtime·xchg64(SB), NOSPLIT, $0-24
 	MOVQ	AX, ret+16(FP)
 	RET
 
-TEXT runtime·xchgp(SB), NOSPLIT, $0-24
+TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
 	MOVQ	ptr+0(FP), BX
 	MOVQ	new+8(FP), AX
 	XCHGQ	AX, 0(BX)
@@ -559,7 +559,7 @@ again:
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep(SB), NOSPLIT, $0-16
+TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
 	MOVQ	ptr+0(FP), BX
 	MOVQ	val+8(FP), AX
 	XCHGQ	AX, 0(BX)
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
index 0d62320de..153564b14 100644
--- a/src/runtime/asm_amd64p32.s
+++ b/src/runtime/asm_amd64p32.s
@@ -460,7 +460,7 @@ fail:
 //		return 1;
 //	} else
 //		return 0;
-TEXT runtime·casp(SB), NOSPLIT, $0-17
+TEXT runtime·casp1(SB), NOSPLIT, $0-17
 	MOVL	ptr+0(FP), BX
 	MOVL	old+4(FP), AX
 	MOVL	new+8(FP), CX
@@ -512,7 +512,7 @@ TEXT runtime·xchg64(SB), NOSPLIT, $0-24
 	MOVQ	AX, ret+16(FP)
 	RET
 
-TEXT runtime·xchgp(SB), NOSPLIT, $0-12
+TEXT runtime·xchgp1(SB), NOSPLIT, $0-12
 	MOVL	ptr+0(FP), BX
 	MOVL	new+4(FP), AX
 	XCHGL	AX, 0(BX)
@@ -530,7 +530,7 @@ again:
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8
+TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
 	MOVL	ptr+0(FP), BX
 	MOVL	val+4(FP), AX
 	XCHGL	AX, 0(BX)
diff --git a/src/runtime/asm_power64x.s b/src/runtime/asm_power64x.s
index a75bb8ce1..ba900c2b3 100644
--- a/src/runtime/asm_power64x.s
+++ b/src/runtime/asm_power64x.s
@@ -472,7 +472,7 @@ TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
 //		return 1;
 //	} else
 //		return 0;
-TEXT runtime·casp(SB), NOSPLIT, $0-25
+TEXT runtime·casp1(SB), NOSPLIT, $0-25
 	BR runtime·cas64(SB)
 
 // uint32 xadd(uint32 volatile *ptr, int32 delta)
@@ -529,7 +529,7 @@ TEXT runtime·xchg64(SB), NOSPLIT, $0-24
 	MOVD	R3, ret+16(FP)
 	RETURN
 
-TEXT runtime·xchgp(SB), NOSPLIT, $0-24
+TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
 	BR	runtime·xchg64(SB)
 
 TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
@@ -538,7 +538,7 @@ TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	RETURN
 
-TEXT runtime·atomicstorep(SB), NOSPLIT, $0-16
+TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
 	BR	runtime·atomicstore64(SB)
 
 TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
diff --git a/src/runtime/atomic.go b/src/runtime/atomic.go
index 7e9d9b3aa..a0e4d84e9 100644
--- a/src/runtime/atomic.go
+++ b/src/runtime/atomic.go
@@ -20,8 +20,16 @@ func xchg(ptr *uint32, new uint32) uint32
 //go:noescape
 func xchg64(ptr *uint64, new uint64) uint64
 
-//go:noescape
-func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer
+// Cannot use noescape here: ptr does not but new does escape.
+// Instead use noescape(ptr) in wrapper below.
+func xchgp1(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer
+
+//go:nosplit
+func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer {
+	old := xchgp1(noescape(ptr), new)
+	writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
+	return old
+}
 
 //go:noescape
 func xchguintptr(ptr *uintptr, new uintptr) uintptr
@@ -47,5 +55,27 @@ func atomicstore(ptr *uint32, val uint32)
 //go:noescape
 func atomicstore64(ptr *uint64, val uint64)
 
-//go:noescape
-func atomicstorep(ptr unsafe.Pointer, val unsafe.Pointer)
+// Cannot use noescape here: ptr does not but val does escape.
+// Instead use noescape(ptr) in wrapper below.
+func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
+
+//go:nosplit
+func atomicstorep(ptr unsafe.Pointer, val unsafe.Pointer) {
+	atomicstorep1(noescape(ptr), val)
+	// TODO(rsc): Why does the compiler think writebarrierptr_nostore's dst argument escapes?
+	writebarrierptr_nostore((*uintptr)(noescape(ptr)), uintptr(val))
+}
+
+// Cannot use noescape here: ptr does not but new does escape.
+// Instead use noescape(ptr) in wrapper below.
+func casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
+
+//go:nosplit
+func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool {
+	ok := casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), old, new)
+	if !ok {
+		return false
+	}
+	writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
+	return true
+}
diff --git a/src/runtime/cgo/dragonfly.c b/src/runtime/cgo/dragonfly.c
index 3c95ff354..c233c8ba9 100644
--- a/src/runtime/cgo/dragonfly.c
+++ b/src/runtime/cgo/dragonfly.c
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build dragonfly
+
 #include "textflag.h"
 
 // Supply environ and __progname, because we don't
diff --git a/src/runtime/cgo/freebsd.c b/src/runtime/cgo/freebsd.c
index aefc481e6..4876b2abe 100644
--- a/src/runtime/cgo/freebsd.c
+++ b/src/runtime/cgo/freebsd.c
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build freebsd
+
 #include "textflag.h"
 
 // Supply environ and __progname, because we don't
diff --git a/src/runtime/cgo/netbsd.c b/src/runtime/cgo/netbsd.c
index de38bb770..076cc87f1 100644
--- a/src/runtime/cgo/netbsd.c
+++ b/src/runtime/cgo/netbsd.c
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build netbsd
+
 #include "textflag.h"
 
 // Supply environ and __progname, because we don't
diff --git a/src/runtime/cgo/openbsd.c b/src/runtime/cgo/openbsd.c
index 7c2b6c173..476649544 100644
--- a/src/runtime/cgo/openbsd.c
+++ b/src/runtime/cgo/openbsd.c
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build openbsd
+
 #include "textflag.h"
 
 // Supply environ, __progname and __guard_local, because
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index be352557f..65e918e84 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -26,7 +26,7 @@ var Exitsyscall = exitsyscall
 var LockedOSThread = lockedOSThread
 
 type LFNode struct {
-	Next    *LFNode
+	Next    uint64
 	Pushcnt uintptr
 }
 
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index 2c6d4d662..662b7546d 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -188,6 +188,6 @@ var (
 
 	infoString = []byte{BitsPointer, BitsDead}
 	infoSlice  = []byte{BitsPointer, BitsDead, BitsDead}
-	infoEface  = []byte{BitsMultiWord, BitsEface}
-	infoIface  = []byte{BitsMultiWord, BitsIface}
+	infoEface  = []byte{BitsPointer, BitsPointer}
+	infoIface  = []byte{BitsPointer, BitsPointer}
 )
diff --git a/src/runtime/heapdump.c b/src/runtime/heapdump.c
index 94a4bd2be..da14f2d24 100644
--- a/src/runtime/heapdump.c
+++ b/src/runtime/heapdump.c
@@ -251,7 +251,9 @@ dumpbv(BitVector *bv, uintptr offset)
 	for(i = 0; i < bv->n; i += BitsPerPointer) {
 		switch(bv->bytedata[i/8] >> i%8 & 3) {
 		case BitsDead:
-			return;
+			// BitsDead has already been processed in makeheapobjbv.
+			// We should only see it in stack maps, in which case we should continue processing.
+			break;
 		case BitsScalar:
 			break;
 		case BitsPointer:
@@ -259,20 +261,7 @@ dumpbv(BitVector *bv, uintptr offset)
 			dumpint(offset + i / BitsPerPointer * PtrSize);
 			break;
 		case BitsMultiWord:
-			switch(bv->bytedata[(i+BitsPerPointer)/8] >> (i+BitsPerPointer)%8 & 3) {
-			default:
-				runtime·throw("unexpected garbage collection bits");
-			case BitsIface:
-				dumpint(FieldKindIface);
-				dumpint(offset + i / BitsPerPointer * PtrSize);
-				i += BitsPerPointer;
-				break;
-			case BitsEface:
-				dumpint(FieldKindEface);
-				dumpint(offset + i / BitsPerPointer * PtrSize);
-				i += BitsPerPointer;
-				break;
-			}
+			runtime·throw("dumpbv unexpected garbage collection bits");
 		}
 	}
 }
diff --git a/src/runtime/lfstack.c b/src/runtime/lfstack.c
index 57e0af282..0ced839c2 100644
--- a/src/runtime/lfstack.c
+++ b/src/runtime/lfstack.c
@@ -46,7 +46,7 @@ runtime·lfstackpush(uint64 *head, LFNode *node)
 	new = (uint64)(uintptr)node|(((uint64)node->pushcnt&CNT_MASK)<<PTR_BITS);
 	for(;;) {
 		old = runtime·atomicload64(head);
-		node->next = (LFNode*)(uintptr)(old&PTR_MASK);
+		node->next = old;
 		if(runtime·cas64(head, old, new))
 			break;
 	}
@@ -55,19 +55,17 @@ runtime·lfstackpush(uint64 *head, LFNode *node)
 LFNode*
 runtime·lfstackpop(uint64 *head)
 {
-	LFNode *node, *node2;
-	uint64 old, new;
+	LFNode *node;
+	uint64 old, next;
 
 	for(;;) {
 		old = runtime·atomicload64(head);
 		if(old == 0)
 			return nil;
 		node = (LFNode*)(uintptr)(old&PTR_MASK);
-		node2 = runtime·atomicloadp(&node->next);
-		new = 0;
-		if(node2 != nil)
-			new = (uint64)(uintptr)node2|(((uint64)node2->pushcnt&CNT_MASK)<<PTR_BITS);
-		if(runtime·cas64(head, old, new))
+		next = runtime·atomicload64(&node->next);
+
+		if(runtime·cas64(head, old, next))
 			return node;
 	}
 }
diff --git a/src/runtime/lfstack_test.go b/src/runtime/lfstack_test.go
index e51877704..68f221d6e 100644
--- a/src/runtime/lfstack_test.go
+++ b/src/runtime/lfstack_test.go
@@ -121,7 +121,7 @@ func TestLFStackStress(t *testing.T) {
 			}
 			cnt++
 			sum2 += node.data
-			node.Next = nil
+			node.Next = 0
 		}
 	}
 	if cnt != K {
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 294bc4870..fab8cf269 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -41,7 +41,7 @@ var zerobase uintptr
 // Allocate an object of size bytes.
 // Small objects are allocated from the per-P cache's free lists.
 // Large objects (> 32 kB) are allocated straight from the heap.
-func mallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
+func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 	if size == 0 {
 		return unsafe.Pointer(&zerobase)
 	}
@@ -245,6 +245,8 @@ func mallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 			masksize = masksize * pointersPerByte / 8 // 4 bits per word
 			masksize++                                // unroll flag in the beginning
 			if masksize > maxGCMask && typ.gc[1] != 0 {
+				// write barriers have not been updated to deal with this case yet.
+				gothrow("maxGCMask too small for now")
 				// If the mask is too large, unroll the program directly
 				// into the GC bitmap. It's 7 times slower than copying
 				// from the pre-unrolled mask, but saves 1/16 of type size
@@ -304,6 +306,18 @@ func mallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 		}
 	}
 marked:
+
+	// During GCmarktermination we allocate black.
+	// All slots hold nil so no scanning is needed.
+	// This may be racing with GC so do it atomically if there can be
+	// a race marking the bit.
+	if gcphase == _GCmarktermination {
+		mp := acquirem()
+		mp.ptrarg[0] = x
+		onM(gcmarknewobject_m)
+		releasem(mp)
+	}
+
 	if raceenabled {
 		racemalloc(x, size)
 	}
@@ -344,9 +358,40 @@ marked:
 	return x
 }
 
+func loadPtrMask(typ *_type) []uint8 {
+	var ptrmask *uint8
+	nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+	if typ.kind&kindGCProg != 0 {
+		masksize := nptr
+		if masksize%2 != 0 {
+			masksize *= 2 // repeated
+		}
+		masksize = masksize * pointersPerByte / 8 // 4 bits per word
+		masksize++                                // unroll flag in the beginning
+		if masksize > maxGCMask && typ.gc[1] != 0 {
+			// write barriers have not been updated to deal with this case yet.
+			gothrow("maxGCMask too small for now")
+		}
+		ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+		// Check whether the program is already unrolled
+		// by checking if the unroll flag byte is set
+		maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
+		if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
+			mp := acquirem()
+			mp.ptrarg[0] = unsafe.Pointer(typ)
+			onM(unrollgcprog_m)
+			releasem(mp)
+		}
+		ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+	} else {
+		ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
+	}
+	return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+1)/2]
+}
+
 // implementation of new builtin
 func newobject(typ *_type) unsafe.Pointer {
-	flags := 0
+	flags := uint32(0)
 	if typ.kind&kindNoPointers != 0 {
 		flags |= flagNoScan
 	}
@@ -355,7 +400,7 @@ func newobject(typ *_type) unsafe.Pointer {
 
 // implementation of make builtin for slices
 func newarray(typ *_type, n uintptr) unsafe.Pointer {
-	flags := 0
+	flags := uint32(0)
 	if typ.kind&kindNoPointers != 0 {
 		flags |= flagNoScan
 	}
@@ -438,7 +483,20 @@ func gogc(force int32) {
 	mp = acquirem()
 	mp.gcing = 1
 	releasem(mp)
+
 	onM(stoptheworld)
+	onM(finishsweep_m) // finish sweep before we start concurrent scan.
+	if false {         // To turn on concurrent scan and mark set to true...
+		onM(starttheworld)
+		// Do a concurrent heap scan before we stop the world.
+		onM(gcscan_m)
+		onM(stoptheworld)
+		onM(gcinstallmarkwb_m)
+		onM(starttheworld)
+		onM(gcmark_m)
+		onM(stoptheworld)
+		onM(gcinstalloffwb_m)
+	}
 	if mp != acquirem() {
 		gothrow("gogc: rescheduled")
 	}
@@ -469,6 +527,8 @@ func gogc(force int32) {
 		onM(gc_m)
 	}
 
+	onM(gccheckmark_m)
+
 	// all done
 	mp.gcing = 0
 	semrelease(&worldsema)
@@ -483,6 +543,14 @@ func gogc(force int32) {
 	}
 }
 
+func GCcheckmarkenable() {
+	onM(gccheckmarkenable_m)
+}
+
+func GCcheckmarkdisable() {
+	onM(gccheckmarkdisable_m)
+}
+
 // GC runs a garbage collection.
 func GC() {
 	gogc(2)
diff --git a/src/runtime/malloc.h b/src/runtime/malloc.h
index adb8d3d67..522b11bba 100644
--- a/src/runtime/malloc.h
+++ b/src/runtime/malloc.h
@@ -86,6 +86,7 @@ typedef struct MSpan	MSpan;
 typedef struct MStats	MStats;
 typedef struct MLink	MLink;
 typedef struct GCStats	GCStats;
+typedef struct Workbuf  Workbuf;
 
 enum
 {
@@ -344,8 +345,6 @@ struct MCache
 
 	SudoG*	sudogcache;
 
-	void*	gcworkbuf;
-
 	// Local allocator stats, flushed during GC.
 	uintptr local_nlookup;		// number of pointer lookups
 	uintptr local_largefree;	// bytes freed for large objects (>MaxSmallSize)
@@ -356,7 +355,7 @@ struct MCache
 MSpan*	runtime·MCache_Refill(MCache *c, int32 sizeclass);
 void	runtime·MCache_ReleaseAll(MCache *c);
 void	runtime·stackcache_clear(MCache *c);
-void	runtime·gcworkbuffree(void *b);
+void	runtime·gcworkbuffree(Workbuf *b);
 
 enum
 {
diff --git a/src/runtime/mcache.c b/src/runtime/mcache.c
index 5fdbe3266..95ddced3e 100644
--- a/src/runtime/mcache.c
+++ b/src/runtime/mcache.c
@@ -39,12 +39,12 @@ runtime·allocmcache(void)
 	return c;
 }
 
+// mheap.lock needs to be held to release the gcworkbuf.
 static void
 freemcache(MCache *c)
 {
 	runtime·MCache_ReleaseAll(c);
 	runtime·stackcache_clear(c);
-	runtime·gcworkbuffree(c->gcworkbuf);
 	runtime·lock(&runtime·mheap.lock);
 	runtime·purgecachedstats(c);
 	runtime·FixAlloc_Free(&runtime·mheap.cachealloc, c);
diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c
index 897dc1415..f37c01af0 100644
--- a/src/runtime/mgc0.c
+++ b/src/runtime/mgc0.c
@@ -4,22 +4,72 @@
 
 // Garbage collector (GC).
 //
-// GC is:
-// - mark&sweep
-// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
-// - parallel (up to MaxGcproc threads)
-// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
-// - non-moving/non-compacting
-// - full (non-partial)
+// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple GC 
+// threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
+// non-generational and non-compacting. Allocation is done using size segregated per P allocation 
+// areas to minimize fragmentation while eliminating locks in the common case. 
 //
-// GC rate.
-// Next GC is after we've allocated an extra amount of memory proportional to
-// the amount already in use. The proportion is controlled by GOGC environment variable
-// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
-// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
-// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
-// (and also the amount of extra memory used).
+// The algorithm decomposes into several steps.
+// This is a high level description of the algorithm being used. For an overview of GC a good
+// place to start is Richard Jones' gchandbook.org.
+// 
+// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
+// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978. 
+// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978), 966-975.
+// For journal quality proofs that these steps are complete, correct, and terminate see
+// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world. 
+// Concurrency and Computation: Practice and Experience 15(3-5), 2003. 
 //
+//  0. Set phase = GCscan from GCoff.
+//  1. Wait for all P's to acknowledge phase change.
+//         At this point all goroutines have passed through a GC safepoint and
+//         know we are in the GCscan phase.
+//  2. GC scans all goroutine stacks, marks and enqueues all encountered pointers
+//       (marking avoids most duplicate enqueuing but races may produce duplication which is benign).
+//       Preempted goroutines are scanned before P schedules next goroutine.
+//  3. Set phase = GCmark.
+//  4. Wait for all P's to acknowledge phase change.
+//  5. Now write barrier marks and enqueues black, grey, or white to white pointers.
+//       Malloc still allocates white (non-marked) objects.
+//  6. Meanwhile GC transitively walks the heap marking reachable objects.
+//  7. When GC finishes marking heap, it preempts P's one-by-one and
+//       retakes partial wbufs (filled by write barrier or during a stack scan of the goroutine
+//       currently scheduled on the P).
+//  8. Once the GC has exhausted all available marking work it sets phase = GCmarktermination.
+//  9. Wait for all P's to acknowledge phase change.
+// 10. Malloc now allocates black objects, so number of unmarked reachable objects
+//        monotonically decreases.
+// 11. GC preempts P's one-by-one taking partial wbufs and marks all unmarked yet reachable objects.
+// 12. When GC completes a full cycle over P's and discovers no new grey
+//         objects, (which means all reachable objects are marked) set phase = GCsweep.
+// 13. Wait for all P's to acknowledge phase change.
+// 14. Now malloc allocates white (but sweeps spans before use).
+//         Write barrier becomes nop.
+// 15. GC does background sweeping, see description below.
+// 16. When sweeping is complete set phase to GCoff.
+// 17. When sufficient allocation has taken place replay the sequence starting at 0 above, 
+//         see discussion of GC rate below.
+
+// Changing phases.
+// Phases are changed by setting the gcphase to the next phase and possibly calling ackgcphase.
+// All phase action must be benign in the presence of a change.
+// Starting with GCoff
+// GCoff to GCscan
+//     GCscan scans stacks and globals greying them and never marks an object black.
+//     Once all the P's are aware of the new phase they will scan gs on preemption.
+//     This means that the scanning of preempted gs can't start until all the Ps
+//     have acknowledged.
+// GCscan to GCmark
+//     GCmark turns on the write barrier which also only greys objects. No scanning
+//     of objects (making them black) can happen until all the Ps have acknowledged 
+//     the phase change.
+// GCmark to GCmarktermination
+//     The only change here is that we start allocating black so the Ps must acknowledge
+//     the change before we begin the termination algorithm
+// GCmarktermination to GCsweep
+//     Objects currently on the freelist must be marked black for this to work.
+//     Are things on the free lists black or white? How does the sweep phase work?
+
 // Concurrent sweep.
 // The sweep phase proceeds concurrently with normal program execution.
 // The heap is swept span-by-span both lazily (when a goroutine needs another span)
@@ -50,6 +100,14 @@
 // The finalizer goroutine is kicked off only when all spans are swept.
 // When the next GC starts, it sweeps all not-yet-swept spans (if any).
 
+// GC rate.
+// Next GC is after we've allocated an extra amount of memory proportional to
+// the amount already in use. The proportion is controlled by GOGC environment variable
+// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
+// (this mark is tracked in next_gc variable). This keeps the GC cost in linear 
+// proportion to the allocation cost. Adjusting GOGC just changes the linear constant	
+// (and also the amount of extra memory used).
+
 #include "runtime.h"
 #include "arch_GOARCH.h"
 #include "malloc.h"
@@ -65,9 +123,8 @@
 enum {
 	Debug		= 0,
 	DebugPtrs	= 0, // if 1, print trace of every pointer load during GC
-	ConcurrentSweep	= 0,
+	ConcurrentSweep	= 1,
 
-	WorkbufSize	= 4*1024,
 	FinBlockSize	= 4*1024,
 	RootData	= 0,
 	RootBss		= 1,
@@ -80,7 +137,7 @@ enum {
 // ptrmask for an allocation containing a single pointer.
 static byte oneptr[] = {BitsPointer};
 
-// Initialized from $GOGC.  GOGC=off means no gc.
+// Initialized from $GOGC.  GOGC=off means no GC.
 extern int32 runtime·gcpercent;
 
 // Holding worldsema grants an M the right to try to stop the world.
@@ -98,12 +155,16 @@ extern int32 runtime·gcpercent;
 //
 uint32 runtime·worldsema = 1;
 
-typedef struct Workbuf Workbuf;
-struct Workbuf
-{
-	LFNode	node; // must be first
-	uintptr	nobj;
-	byte*	obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize];
+// It is a bug if bits does not have bitBoundary set but
+// there are still some cases where this happens related
+// to stack spans.
+typedef struct Markbits Markbits;
+struct Markbits {
+	byte *bitp; // pointer to the byte holding xbits
+ 	byte shift; // bits xbits needs to be shifted to get bits
+	byte xbits; // byte holding all the bits from *bitp
+	byte bits;  // mark and boundary bits relevant to corresponding slot.
+	byte tbits; // pointer||scalar bits relevant to corresponding slot.
 };
 
 extern byte runtime·data[];
@@ -128,26 +189,40 @@ BitVector	runtime·gcbssmask;
 
 Mutex	runtime·gclock;
 
-static	uintptr	badblock[1024];
-static	int32	nbadblock;
-
+static Workbuf* getpartialorempty(void);
+static void	putpartial(Workbuf*);
 static Workbuf* getempty(Workbuf*);
 static Workbuf* getfull(Workbuf*);
 static void	putempty(Workbuf*);
+static void	putfull(Workbuf*);
 static Workbuf* handoff(Workbuf*);
 static void	gchelperstart(void);
 static void	flushallmcaches(void);
-static bool	scanframe(Stkframe *frame, void *unused);
-static void	scanstack(G *gp);
-static BitVector	unrollglobgcprog(byte *prog, uintptr size);
+static bool	scanframe(Stkframe*, void*);
+static void	scanstack(G*);
+static BitVector	unrollglobgcprog(byte*, uintptr);
+static void     scanblock(byte*, uintptr, byte*);
+static byte*    objectstart(byte*, Markbits*);
+static Workbuf*	greyobject(byte*, Markbits*, Workbuf*);
+static bool     inheap(byte*);
+static bool     shaded(byte*);
+static void     shade(byte*);
+static void	slottombits(byte*, Markbits*);
+static void     atomicxor8(byte*, byte);
+static bool     ischeckmarked(Markbits*);
+static bool     ismarked(Markbits*);
+static void     clearcheckmarkbits(void);
+static void     clearcheckmarkbitsspan(MSpan*);
 
 void runtime·bgsweep(void);
+void runtime·finishsweep_m(void);
 static FuncVal bgsweepv = {runtime·bgsweep};
 
 typedef struct WorkData WorkData;
 struct WorkData {
-	uint64	full;  // lock-free list of full blocks
-	uint64	empty; // lock-free list of empty blocks
+	uint64	full;    // lock-free list of full blocks
+	uint64	empty;   // lock-free list of empty blocks
+	uint64  partial; // lock-free list of partially filled blocks
 	byte	pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
 	uint32	nproc;
 	int64	tstart;
@@ -162,315 +237,422 @@ struct WorkData {
 };
 WorkData runtime·work;
 
-// Is _cgo_allocate linked into the binary?
+// To help debug the concurrent GC we remark with the world
+// stopped ensuring that any object encountered has their normal
+// mark bit set. To do this we use an orthogonal bit
+// pattern to indicate the object is marked. The following pattern
+// uses the upper two bits in the object's boundary nibble.
+// 01: scalar  not marked
+// 10: pointer not marked
+// 11: pointer     marked
+// 00: scalar      marked
+// Xoring with 01 will flip the pattern from marked to unmarked and vice versa.
+// The higher bit is 1 for pointers and 0 for scalars, whether the object
+// is marked or not.
+// The first nibble no longer holds the bitsDead pattern indicating that
+// there are no more pointers in the object. This information is held
+// in the second nibble.
+
+// When marking an object if the bool checkmark is true one uses the above 
+// encoding, otherwise one uses the bitMarked bit in the lower two bits 
+// of the nibble.
+static bool checkmark = false;
+static bool gccheckmarkenable = true;
+
+// Is address b in the known heap. If it doesn't have a valid gcmap
+// returns false. For example pointers into stacks will return false.
 static bool
-have_cgo_allocate(void)
+inheap(byte *b)
 {
-	extern	byte	go·weak·runtime·_cgo_allocate_internal[1];
-	return go·weak·runtime·_cgo_allocate_internal != nil;
+	MSpan *s;
+	pageID k;
+	uintptr x;
+
+	if(b == nil || b < runtime·mheap.arena_start || b >= runtime·mheap.arena_used)
+		return false;
+	// Not a beginning of a block, consult span table to find the block beginning.
+	k = (uintptr)b>>PageShift;
+	x = k;
+	x -= (uintptr)runtime·mheap.arena_start>>PageShift;
+	s = runtime·mheap.spans[x];
+	if(s == nil || k < s->start || b >= s->limit || s->state != MSpanInUse)
+		return false;
+	return true;
 }
 
-// scanblock scans a block of n bytes starting at pointer b for references
-// to other objects, scanning any it finds recursively until there are no
-// unscanned objects left.  Instead of using an explicit recursion, it keeps
-// a work list in the Workbuf* structures and loops in the main function
-// body.  Keeping an explicit work list is easier on the stack allocator and
-// more efficient.
+// Given an address in the heap return the relevant byte from the gcmap. This routine
+// can be used on addresses to the start of an object or to the interior of an object.
 static void
-scanblock(byte *b, uintptr n, byte *ptrmask)
+slottombits(byte *obj, Markbits *mbits)
 {
-	byte *obj, *obj0, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp;
-	uintptr i, j, nobj, size, idx, x, off, scanbufpos, bits, xbits, shift;
-	Workbuf *wbuf;
-	Iface *iface;
-	Eface *eface;
-	Type *typ;
+	uintptr off;
+
+	off = (uintptr*)((uintptr)obj&~(PtrSize-1)) - (uintptr*)runtime·mheap.arena_start;
+	mbits->bitp = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+	mbits->shift = (off % wordsPerBitmapByte) * gcBits;
+	mbits->xbits = *mbits->bitp;
+	mbits->bits = (mbits->xbits >> mbits->shift) & bitMask;
+	mbits->tbits = ((mbits->xbits >> mbits->shift) & bitPtrMask) >> 2;
+}
+
+// b is a pointer into the heap.
+// Find the start of the object referred to by b.
+// Set mbits to the associated bits from the bit map.
+// If b is not a valid heap object return nil and
+// undefined values in mbits.
+static byte*
+objectstart(byte *b, Markbits *mbits)
+{
+	byte *obj, *p;
 	MSpan *s;
 	pageID k;
-	bool keepworking;
+	uintptr x, size, idx;
 
-	// Cache memory arena parameters in local vars.
-	arena_start = runtime·mheap.arena_start;
-	arena_used = runtime·mheap.arena_used;
+	obj = (byte*)((uintptr)b&~(PtrSize-1));
+	for(;;) {
+		slottombits(obj, mbits);
+		if((mbits->bits&bitBoundary) == bitBoundary)
+			break;
 
-	wbuf = getempty(nil);
-	nobj = wbuf->nobj;
-	wp = &wbuf->obj[nobj];
-	keepworking = b == nil;
-	scanbufpos = 0;
-	for(i = 0; i < nelem(scanbuf); i++)
-		scanbuf[i] = nil;
+		// Not a beginning of a block, consult span table to find the block beginning.
+		k = (uintptr)obj>>PageShift;
+		x = k;
+		x -= (uintptr)runtime·mheap.arena_start>>PageShift;
+		s = runtime·mheap.spans[x];
+		if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse){
+			if(s != nil && s->state == MSpanStack) {
+				return nil; // This is legit.
+			}
 
+			// The following ensures that we are rigorous about what data 
+			// structures hold valid pointers
+			if(0) {
+				// Still happens sometimes. We don't know why.
+				runtime·printf("runtime:objectstart Span weird: obj=%p, k=%p", obj, k);
+				if (s == nil)
+					runtime·printf(" s=nil\n");
+				else
+					runtime·printf(" s->start=%p s->limit=%p, s->state=%d\n", s->start*PageSize, s->limit, s->state);
+				runtime·throw("objectstart: bad pointer in unexpected span");
+			}
+			return nil;
+		}
+		p = (byte*)((uintptr)s->start<<PageShift);
+		if(s->sizeclass != 0) {
+			size = s->elemsize;
+			idx = ((byte*)obj - p)/size;
+			p = p+idx*size;
+		}
+		if(p == obj) {
+			runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n",
+				       p, s->start*PageSize, s->limit);
+			runtime·throw("failed to find block beginning");
+		}
+		obj = p;
+	}
+	// if size(obj.firstfield) < PtrSize, the &obj.secondfield could map to the boundary bit
+	// Clear any low bits to get to the start of the object.
+	// greyobject depends on this.
+	return obj;
+}
+
+// atomicand8 performs *src &= val while holding andlock, so its callers are serialized (slow, but this is a debug path).
+// NOTE(review): andlock does not exclude runtime·atomicor8 on the same byte (see docheckmark) - confirm they cannot race.
+static Mutex andlock;
+static void
+atomicand8(byte *src, byte val)
+{
+	runtime·lock(&andlock);
+	*src = *src&val;
+	runtime·unlock(&andlock);
+}
+
+// docheckmark sets the checkmark for the slot described by mbits, then refreshes mbits from the bitmap byte.
+void
+docheckmark(Markbits *mbits)
+{
+	// Same effect as xor with BitsCheckMarkXor: clearing it moves 01 (scalar unmarked) to 00 (scalar marked),
+	// setting it moves 10 (pointer unmarked) to 11 (pointer marked). Any other tbits value is left untouched.
+	if(mbits->tbits == BitsScalar)
+		atomicand8(mbits->bitp, ~(byte)(BitsCheckMarkXor<<mbits->shift<<2));
+	else if(mbits->tbits == BitsPointer)
+		runtime·atomicor8(mbits->bitp, BitsCheckMarkXor<<mbits->shift<<2);
+
+	// Reload the cached fields from the bitmap byte so a following ischeckmarked sees the update.
+	mbits->xbits = *mbits->bitp;
+	mbits->bits = (mbits->xbits >> mbits->shift) & bitMask;
+	mbits->tbits = ((mbits->xbits >> mbits->shift) & bitPtrMask) >> 2;
+
+	return;
+}
+
+// ismarked reports whether mbits has bitMarked set (the default marking scheme); it throws if bitBoundary is not set.
+static bool
+ismarked(Markbits *mbits)
+{
+	if((mbits->bits&bitBoundary) != bitBoundary)
+		runtime·throw("ismarked: bits should have boundary bit set");
+	return (mbits->bits&bitMarked) == bitMarked;
+}
+
+// ischeckmarked reports whether mbits->tbits is BitsScalarMarked or BitsPointerMarked (checkmark scheme); a missing bitBoundary is only printed.
+static bool
+ischeckmarked(Markbits *mbits)
+{
+	if((mbits->bits&bitBoundary) != bitBoundary)
+		runtime·printf("runtime:ischeckmarked: bits should have boundary bit set\n");
+	return mbits->tbits==BitsScalarMarked || mbits->tbits==BitsPointerMarked;
+}
+
+// gcmarknewobject_m marks the object passed in g->m->ptrarg[0]; in the GCmarktermination phase we allocate black.
+void
+runtime·gcmarknewobject_m(void)
+{
+	Markbits mbits;
+	byte *obj;
+
+	if(runtime·gcphase != GCmarktermination)
+		runtime·throw("marking new object while not in mark termination phase");
+	if(checkmark) // The world should be stopped so this should not happen.
+		runtime·throw("gcmarknewobject called while doing checkmark");
+	// Round obj down to a pointer boundary (clear the low bits, do not keep them) to find its bitmap slot.
+	obj = g->m->ptrarg[0];
+	slottombits((byte*)((uintptr)obj & ~(PtrSize-1)), &mbits);
+	// Already marked: nothing to do.
+	if((mbits.bits&bitMarked) != 0)
+		return;
+
+	// Each byte of GC bitmap holds info for two words.
+	// If the current object is larger than two words, or if the object is one word
+	// but the object it shares the byte with is already marked,
+	// then all the possible concurrent updates are trying to set the same bit,
+	// so we can use a non-atomic update.
+	if((mbits.xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) || runtime·work.nproc == 1)
+		*mbits.bitp = mbits.xbits | (bitMarked<<mbits.shift);
+	else
+		runtime·atomicor8(mbits.bitp, bitMarked<<mbits.shift);
+	return;
+}
+
+// obj is the start of an object with mark mbits.
+// If it isn't already marked, mark it and enqueue into workbuf.
+// Return possibly new workbuf to use.
+static Workbuf*
+greyobject(byte *obj, Markbits *mbits, Workbuf *wbuf) 
+{
+	// obj should be start of allocation, and so must be at least pointer-aligned.
+	if(((uintptr)obj & (PtrSize-1)) != 0)
+		runtime·throw("greyobject: obj not pointer-aligned");
+
+	if(checkmark) {
+		if(!ismarked(mbits)) {
+			MSpan *s;
+			pageID k;
+			uintptr x, i;
+
+			runtime·printf("runtime:greyobject: checkmarks finds unexpected unmarked object obj=%p, mbits->bits=%x, *mbits->bitp=%x\n", obj, mbits->bits, *mbits->bitp);
+
+			k = (uintptr)obj>>PageShift;
+			x = k;
+			x -= (uintptr)runtime·mheap.arena_start>>PageShift;
+			s = runtime·mheap.spans[x];
+			runtime·printf("runtime:greyobject Span: obj=%p, k=%p", obj, k);
+			if (s == nil) {
+				runtime·printf(" s=nil\n");
+			} else {
+				runtime·printf(" s->start=%p s->limit=%p, s->state=%d, s->sizeclass=%d, s->elemsize=%D \n", s->start*PageSize, s->limit, s->state, s->sizeclass, s->elemsize);
+				for(i=0; i<s->sizeclass; i++) {
+					runtime·printf(" ((uintptr*)obj)[%D]=%p\n", i, ((uintptr*)obj)[i]);
+				}
+			}
+			runtime·throw("checkmark found unmarked object");
+		}
+		if(ischeckmarked(mbits))
+			return wbuf;
+		docheckmark(mbits);
+		if(!ischeckmarked(mbits)) {
+			runtime·printf("mbits xbits=%x bits=%x tbits=%x shift=%d\n", mbits->xbits, mbits->bits, mbits->tbits, mbits->shift);
+			runtime·throw("docheckmark and ischeckmarked disagree");
+		}
+	} else {
+		// If marked we have nothing to do.
+		if((mbits->bits&bitMarked) != 0)
+			return wbuf;
+
+		// Each byte of GC bitmap holds info for two words.
+		// If the current object is larger than two words, or if the object is one word
+		// but the object it shares the byte with is already marked,
+		// then all the possible concurrent updates are trying to set the same bit,
+		// so we can use a non-atomic update.
+		if((mbits->xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) || runtime·work.nproc == 1)
+			*mbits->bitp = mbits->xbits | (bitMarked<<mbits->shift);
+		else
+			runtime·atomicor8(mbits->bitp, bitMarked<<mbits->shift);
+	}
+
+	if (!checkmark && (((mbits->xbits>>(mbits->shift+2))&BitsMask) == BitsDead))
+		return wbuf;  // noscan object
+
+	// Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
+	// seems like a nice optimization that can be added back in.
+	// There needs to be time between the PREFETCH and the use.
+	// Previously we put the obj in an 8 element buffer that is drained at a rate
+	// to give the PREFETCH time to do its work.
+	// Use of PREFETCHNTA might be more appropriate than PREFETCH
+
+	// If workbuf is full, obtain an empty one.
+	if(wbuf->nobj >= nelem(wbuf->obj)) {
+		wbuf = getempty(wbuf);
+	}
+
+	wbuf->obj[wbuf->nobj] = obj;
+	wbuf->nobj++;
+	return wbuf;                    
+}
+
+// Scan the object b of size n, adding pointers to wbuf.
+// Return possibly new wbuf to use.
+// If ptrmask != nil, it specifies where pointers are in b.
+// If ptrmask == nil, the GC bitmap should be consulted.
+// In this case, n may be an overestimate of the size; the GC bitmap
+// must also be used to make sure the scan stops at the end of b.
+static Workbuf*
+scanobject(byte *b, uintptr n, byte *ptrmask, Workbuf *wbuf)
+{
+	byte *obj, *arena_start, *arena_used, *ptrbitp;
+	uintptr i, j;
+	int32 bits;
+	Markbits mbits;
+
+	arena_start = (byte*)runtime·mheap.arena_start;
+	arena_used = runtime·mheap.arena_used;
 	ptrbitp = nil;
 
+	// Find bits of the beginning of the object.
+	if(ptrmask == nil) {
+		b = objectstart(b, &mbits);
+		if(b == nil)
+			return wbuf;
+		ptrbitp = mbits.bitp; //arena_start - off/wordsPerBitmapByte - 1;
+	}
+	for(i = 0; i < n; i += PtrSize) {
+		// Find bits for this word.
+		if(ptrmask != nil) {
+			// dense mask (stack or data)
+			bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask;
+		} else {
+			// Check if we have reached end of span.
+			// n is an overestimate of the size of the object.
+			if((((uintptr)b+i)%PageSize) == 0 &&
+				runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift])
+				break;
+			// Consult GC bitmap.
+			bits = *ptrbitp;
+			if(wordsPerBitmapByte != 2)
+				runtime·throw("alg doesn't work for wordsPerBitmapByte != 2");
+			j = ((uintptr)b+i)/PtrSize & 1; // j indicates upper nibble or lower nibble
+			bits >>= gcBits*j;
+			if(i == 0)
+				bits &= ~bitBoundary;
+			ptrbitp -= j;
+		
+			if((bits&bitBoundary) != 0 && i != 0)
+				break; // reached beginning of the next object
+			bits = (bits&bitPtrMask)>>2; // bits refer to the type bits.
+			
+			if(i != 0 && bits == BitsDead) // BitsDead in first nibble not valid during checkmark
+				break; // reached no-scan part of the object
+		}
+
+		if(bits <= BitsScalar) // Bits Scalar ||
+			               // BitsDead    ||       // default encoding 
+			               // BitsScalarMarked     // checkmark encoding
+				continue;
+
+		if((bits&BitsPointer) != BitsPointer) {
+			runtime·printf("gc checkmark=%d, b=%p ptrmask=%p, mbits.bitp=%p, mbits.xbits=%x, bits=%x\n", checkmark, b, ptrmask, mbits.bitp, mbits.xbits, bits);
+			runtime·throw("unexpected garbage collection bits");
+		}
+
+		obj = *(byte**)(b+i);
+		// At this point we have extracted the next potential pointer.
+		// Check if it points into heap.
+		if(obj == nil || obj < arena_start || obj >= arena_used)
+			continue;
+		// Mark the object. return some important bits.
+		// If we combine the following two routines we don't have to pass mbits or obj around.
+		obj = objectstart(obj, &mbits);
+		// In the case of the span being MSpan_Stack mbits is useless and will not have 
+		// the boundary bit set. It does not need to be greyed since it will be
+		// scanned using the scan stack mechanism.
+		if(obj == nil)
+			continue;
+		wbuf = greyobject(obj, &mbits, wbuf);
+	}
+	return wbuf;
+}
+
+// scanblock starts by scanning b as scanobject would.
+// If the gcphase is GCscan, that's all scanblock does.
+// Otherwise it traverses some fraction of the pointers it found in b, recursively.
+// As a special case, scanblock(nil, 0, nil) means to scan previously queued work,
+// stopping only when no work is left in the system.
+static void
+scanblock(byte *b, uintptr n, byte *ptrmask)
+{
+	Workbuf *wbuf;
+	bool keepworking;
+
+	wbuf = getpartialorempty();
+	if(b != nil) {
+		wbuf = scanobject(b, n, ptrmask, wbuf);
+		if(runtime·gcphase == GCscan) {
+			if(inheap(b) && !ptrmask)
+				// b is in heap, we are in GCscan so there should be a ptrmask.
+				runtime·throw("scanblock: In GCscan phase and inheap is true.");
+			// GCscan only goes one level deep since mark wb not turned on.
+			putpartial(wbuf);
+			return;
+		}
+	}
+	if(runtime·gcphase == GCscan) {
+		runtime·throw("scanblock: In GCscan phase but no b passed in.");
+	}
+	
+	keepworking = b == nil;
+
 	// ptrmask can have 2 possible values:
 	// 1. nil - obtain pointer mask from GC bitmap.
 	// 2. pointer to a compact mask (for stacks and data).
-	if(b != nil)
-		goto scanobj;
 	for(;;) {
-		if(nobj == 0) {
-			// Out of work in workbuf.
-			// First, see is there is any work in scanbuf.
-			for(i = 0; i < nelem(scanbuf); i++) {
-				b = scanbuf[scanbufpos];
-				scanbuf[scanbufpos++] = nil;
-				scanbufpos %= nelem(scanbuf);
-				if(b != nil) {
-					n = arena_used - b; // scan until bitBoundary or BitsDead
-					ptrmask = nil; // use GC bitmap for pointer info
-					goto scanobj;
-				}
-			}
+		if(wbuf->nobj == 0) {
 			if(!keepworking) {
 				putempty(wbuf);
 				return;
 			}
 			// Refill workbuf from global queue.
 			wbuf = getfull(wbuf);
-			if(wbuf == nil)
+			if(wbuf == nil) // nil means out of work barrier reached
 				return;
-			nobj = wbuf->nobj;
-			wp = &wbuf->obj[nobj];
+
+			if(wbuf->nobj<=0) {
+				runtime·throw("runtime:scanblock getfull returns empty buffer");
+			}
+
 		}
 
 		// If another proc wants a pointer, give it some.
-		if(runtime·work.nwait > 0 && nobj > 4 && runtime·work.full == 0) {
-			wbuf->nobj = nobj;
+		if(runtime·work.nwait > 0 && wbuf->nobj > 4 && runtime·work.full == 0) {
 			wbuf = handoff(wbuf);
-			nobj = wbuf->nobj;
-			wp = &wbuf->obj[nobj];
-		}
-
-		wp--;
-		nobj--;
-		b = *wp;
-		n = arena_used - b; // scan until next bitBoundary or BitsDead
-		ptrmask = nil; // use GC bitmap for pointer info
-
-	scanobj:
-		if(DebugPtrs)
-			runtime·printf("scanblock %p +%p %p\n", b, n, ptrmask);
-		// Find bits of the beginning of the object.
-		if(ptrmask == nil) {
-			off = (uintptr*)b - (uintptr*)arena_start;
-			ptrbitp = arena_start - off/wordsPerBitmapByte - 1;
 		}
-		for(i = 0; i < n; i += PtrSize) {
-			obj = nil;
-			// Find bits for this word.
-			if(ptrmask == nil) {
-				// Check is we have reached end of span.
-				if((((uintptr)b+i)%PageSize) == 0 &&
-					runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift])
-					break;
-				// Consult GC bitmap.
-				bits = *ptrbitp;
-
-				if(wordsPerBitmapByte != 2)
-					runtime·throw("alg doesn't work for wordsPerBitmapByte != 2");
-				j = ((uintptr)b+i)/PtrSize & 1;
-				ptrbitp -= j;
-				bits >>= gcBits*j;
-
-				if((bits&bitBoundary) != 0 && i != 0)
-					break; // reached beginning of the next object
-				bits = (bits>>2)&BitsMask;
-				if(bits == BitsDead)
-					break; // reached no-scan part of the object
-			} else // dense mask (stack or data)
-				bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask;
-
-			if(bits <= BitsScalar) // BitsScalar || BitsDead
-				continue;
-			if(bits == BitsPointer) {
-				obj = *(byte**)(b+i);
-				obj0 = obj;
-				goto markobj;
-			}
-
-			// With those three out of the way, must be multi-word.
-			if(Debug && bits != BitsMultiWord)
-				runtime·throw("unexpected garbage collection bits");
-			// Find the next pair of bits.
-			if(ptrmask == nil) {
-				bits = *ptrbitp;
-				j = ((uintptr)b+i+PtrSize)/PtrSize & 1;
-				ptrbitp -= j;
-				bits >>= gcBits*j;
-				bits = (bits>>2)&BitsMask;
-			} else
-				bits = (ptrmask[((i+PtrSize)/PtrSize)/4]>>((((i+PtrSize)/PtrSize)%4)*BitsPerPointer))&BitsMask;
-
-			if(Debug && bits != BitsIface && bits != BitsEface)
-				runtime·throw("unexpected garbage collection bits");
-
-			if(bits == BitsIface) {
-				iface = (Iface*)(b+i);
-				if(iface->tab != nil) {
-					typ = iface->tab->type;
-					if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers))
-						obj = iface->data;
-				}
-			} else {
-				eface = (Eface*)(b+i);
-				typ = eface->type;
-				if(typ != nil) {
-					if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers))
-						obj = eface->data;
-				}
-			}
 
-			i += PtrSize;
-
-			obj0 = obj;
-		markobj:
-			// At this point we have extracted the next potential pointer.
-			// Check if it points into heap.
-			if(obj == nil)
-				continue;
-			if(obj < arena_start || obj >= arena_used) {
-				if((uintptr)obj < PhysPageSize && runtime·invalidptr) {
-					s = nil;
-					goto badobj;
-				}
-				continue;
-			}
-			// Mark the object.
-			obj = (byte*)((uintptr)obj & ~(PtrSize-1));
-			off = (uintptr*)obj - (uintptr*)arena_start;
-			bitp = arena_start - off/wordsPerBitmapByte - 1;
-			shift = (off % wordsPerBitmapByte) * gcBits;
-			xbits = *bitp;
-			bits = (xbits >> shift) & bitMask;
-			if((bits&bitBoundary) == 0) {
-				// Not a beginning of a block, consult span table to find the block beginning.
-				k = (uintptr)obj>>PageShift;
-				x = k;
-				x -= (uintptr)arena_start>>PageShift;
-				s = runtime·mheap.spans[x];
-				if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse) {
-					// Stack pointers lie within the arena bounds but are not part of the GC heap.
-					// Ignore them.
-					if(s != nil && s->state == MSpanStack)
-						continue;
-				
-				badobj:
-					// If cgo_allocate is linked into the binary, it can allocate
-					// memory as []unsafe.Pointer that may not contain actual
-					// pointers and must be scanned conservatively.
-					// In this case alone, allow the bad pointer.
-					if(have_cgo_allocate() && ptrmask == nil)
-						continue;
-
-					// Anything else indicates a bug somewhere.
-					// If we're in the middle of chasing down a different bad pointer,
-					// don't confuse the trace by printing about this one.
-					if(nbadblock > 0)
-						continue;
-
-					runtime·printf("runtime: garbage collector found invalid heap pointer *(%p+%p)=%p", b, i, obj);
-					if(s == nil)
-						runtime·printf(" s=nil\n");
-					else
-						runtime·printf(" span=%p-%p-%p state=%d\n", (uintptr)s->start<<PageShift, s->limit, (uintptr)(s->start+s->npages)<<PageShift, s->state);
-					if(ptrmask != nil)
-						runtime·throw("invalid heap pointer");
-					// Add to badblock list, which will cause the garbage collection
-					// to keep repeating until it has traced the chain of pointers
-					// leading to obj all the way back to a root.
-					if(nbadblock == 0)
-						badblock[nbadblock++] = (uintptr)b;
-					continue;
-				}
-				p = (byte*)((uintptr)s->start<<PageShift);
-				if(s->sizeclass != 0) {
-					size = s->elemsize;
-					idx = ((byte*)obj - p)/size;
-					p = p+idx*size;
-				}
-				if(p == obj) {
-					runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n",
-						p, s->start*PageSize, s->limit);
-					runtime·throw("failed to find block beginning");
-				}
-				obj = p;
-				goto markobj;
-			}
-			if(DebugPtrs)
-				runtime·printf("scan *%p = %p => base %p\n", b+i, obj0, obj);
-
-			if(nbadblock > 0 && (uintptr)obj == badblock[nbadblock-1]) {
-				// Running garbage collection again because
-				// we want to find the path from a root to a bad pointer.
-				// Found possible next step; extend or finish path.
-				for(j=0; j<nbadblock; j++)
-					if(badblock[j] == (uintptr)b)
-						goto AlreadyBad;
-				runtime·printf("runtime: found *(%p+%p) = %p+%p\n", b, i, obj0, (uintptr)(obj-obj0));
-				if(ptrmask != nil)
-					runtime·throw("bad pointer");
-				if(nbadblock >= nelem(badblock))
-					runtime·throw("badblock trace too long");
-				badblock[nbadblock++] = (uintptr)b;
-			AlreadyBad:;
-			}
-
-			// Now we have bits, bitp, and shift correct for
-			// obj pointing at the base of the object.
-			// Only care about not marked objects.
-			if((bits&bitMarked) != 0)
-				continue;
-			// If obj size is greater than 8, then each byte of GC bitmap
-			// contains info for at most one object. In such case we use
-			// non-atomic byte store to mark the object. This can lead
-			// to double enqueue of the object for scanning, but scanning
-			// is an idempotent operation, so it is OK. This cannot lead
-			// to bitmap corruption because the single marked bit is the
-			// only thing that can change in the byte.
-			// For 8-byte objects we use non-atomic store, if the other
-			// quadruple is already marked. Otherwise we resort to CAS
-			// loop for marking.
-			if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) ||
-				runtime·work.nproc == 1)
-				*bitp = xbits | (bitMarked<<shift);
-			else
-				runtime·atomicor8(bitp, bitMarked<<shift);
-
-			if(((xbits>>(shift+2))&BitsMask) == BitsDead)
-				continue;  // noscan object
-
-			// Queue the obj for scanning.
-			PREFETCH(obj);
-			p = scanbuf[scanbufpos];
-			scanbuf[scanbufpos++] = obj;
-			scanbufpos %= nelem(scanbuf);
-			if(p == nil)
-				continue;
-
-			// If workbuf is full, obtain an empty one.
-			if(nobj >= nelem(wbuf->obj)) {
-				wbuf->nobj = nobj;
-				wbuf = getempty(wbuf);
-				nobj = wbuf->nobj;
-				wp = &wbuf->obj[nobj];
-			}
-			*wp = p;
-			wp++;
-			nobj++;
-		}
-		if(DebugPtrs)
-			runtime·printf("end scanblock %p +%p %p\n", b, n, ptrmask);
-
-		if(Debug && ptrmask == nil) {
-			// For heap objects ensure that we did not overscan.
-			n = 0;
-			p = nil;
-			if(!runtime·mlookup(b, &p, &n, nil) || b != p || i > n) {
-				runtime·printf("runtime: scanned (%p,%p), heap object (%p,%p)\n", b, i, p, n);
-				runtime·throw("scanblock: scanned invalid object");
-			}
-		}
+		// This might be a good place to add prefetch code...
+		// if(wbuf->nobj > 4) {
+		//         PREFETCH(wbuf->obj[wbuf->nobj - 3]);
+		//  }
+		--wbuf->nobj;
+		b = wbuf->obj[wbuf->nobj];
+		wbuf = scanobject(b, runtime·mheap.arena_used - b, nil, wbuf);
 	}
 }
 
@@ -484,7 +666,7 @@ markroot(ParFor *desc, uint32 i)
 	void *p;
 	uint32 status;
 	bool restart;
-
+ 
 	USED(&desc);
 	// Note: if you add a case here, please also update heapdump.c:dumproots.
 	switch(i) {
@@ -511,7 +693,8 @@ markroot(ParFor *desc, uint32 i)
 			s = runtime·work.spans[spanidx];
 			if(s->state != MSpanInUse)
 				continue;
-			if(s->sweepgen != sg) {
+			if(!checkmark && s->sweepgen != sg) { 
+				// sweepgen was updated (+2) during non-checkmark GC pass
 				runtime·printf("sweep %d %d\n", s->sweepgen, sg);
 				runtime·throw("gc: unswept span");
 			}
@@ -523,14 +706,16 @@ markroot(ParFor *desc, uint32 i)
 				spf = (SpecialFinalizer*)sp;
 				// A finalizer can be set for an inner byte of an object, find object beginning.
 				p = (void*)((s->start << PageShift) + spf->special.offset/s->elemsize*s->elemsize);
-				scanblock(p, s->elemsize, nil);
+				if(runtime·gcphase != GCscan)
+					scanblock(p, s->elemsize, nil); // Scanned during mark phase
 				scanblock((void*)&spf->fn, PtrSize, oneptr);
 			}
 		}
 		break;
 
 	case RootFlushCaches:
-		flushallmcaches();
+		if (runtime·gcphase != GCscan) // Do not flush mcaches during GCscan phase.
+			flushallmcaches();
 		break;
 
 	default:
@@ -540,17 +725,37 @@ markroot(ParFor *desc, uint32 i)
 		gp = runtime·allg[i - RootCount];
 		// remember when we've first observed the G blocked
 		// needed only to output in traceback
-		status = runtime·readgstatus(gp);
+		status = runtime·readgstatus(gp); // We are not in a scan state
 		if((status == Gwaiting || status == Gsyscall) && gp->waitsince == 0)
 			gp->waitsince = runtime·work.tstart;
-		// Shrink a stack if not much of it is being used.
-		runtime·shrinkstack(gp);
-		if(runtime·readgstatus(gp) == Gdead) 
+		// Shrink a stack if not much of it is being used but not in the scan phase.
+		if (runtime·gcphase != GCscan) // Do not shrink during GCscan phase.
+			runtime·shrinkstack(gp);
+		if(runtime·readgstatus(gp) == Gdead)
 			gp->gcworkdone = true;
 		else 
 			gp->gcworkdone = false; 
 		restart = runtime·stopg(gp);
-		scanstack(gp);
+
+		// goroutine will scan its own stack when it stops running.
+		// Wait until it has.
+		while(runtime·readgstatus(gp) == Grunning && !gp->gcworkdone) {
+		}
+
+		// scanstack(gp) is done as part of gcphasework
+		// But to make sure we finished we need to make sure that
+		// the stack traps have all responded so drop into
+		// this while loop until they respond.
+		while(!gp->gcworkdone){
+			status = runtime·readgstatus(gp);
+			if(status == Gdead) {
+				gp->gcworkdone = true; // scan is a noop
+				break;
+				//do nothing, scan not needed. 
+			}
+			if(status == Gwaiting || status == Grunnable)
+				restart = runtime·stopg(gp);
+		}
 		if(restart)
 			runtime·restartg(gp);
 		break;
@@ -562,53 +767,95 @@ markroot(ParFor *desc, uint32 i)
 static Workbuf*
 getempty(Workbuf *b)
 {
-	MCache *c;
-
-	if(b != nil)
-		runtime·lfstackpush(&runtime·work.full, &b->node);
-	b = nil;
-	c = g->m->mcache;
-	if(c->gcworkbuf != nil) {
-		b = c->gcworkbuf;
-		c->gcworkbuf = nil;
+	if(b != nil) {
+		putfull(b);
+		b = nil;
 	}
-	if(b == nil)
+	if(runtime·work.empty)
 		b = (Workbuf*)runtime·lfstackpop(&runtime·work.empty);
-	if(b == nil)
+
+	if(b && b->nobj != 0) {
+		runtime·printf("m%d: getempty: popped b=%p with non-zero b->nobj=%d\n", g->m->id, b, (uint32)b->nobj);
+		runtime·throw("getempty: workbuffer not empty, b->nobj not 0");
+	}
+	if(b == nil) {
 		b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys);
-	b->nobj = 0;
+		b->nobj = 0;
+	}
 	return b;
 }
 
 static void
 putempty(Workbuf *b)
 {
-	MCache *c;
-
-	c = g->m->mcache;
-	if(c->gcworkbuf == nil) {
-		c->gcworkbuf = b;
-		return;
+	if(b->nobj != 0) {
+		runtime·throw("putempty: b->nobj not 0\n");
 	}
 	runtime·lfstackpush(&runtime·work.empty, &b->node);
 }
 
-void
-runtime·gcworkbuffree(void *b)
+// Put a full or partially full workbuf on the full list.
+static void
+putfull(Workbuf *b)
 {
-	if(b != nil)
-		putempty(b);
+	if(b->nobj <= 0) {
+		runtime·throw("putfull: b->nobj <= 0\n");
+	}
+	runtime·lfstackpush(&runtime·work.full, &b->node);
 }
 
-// Get a full work buffer off the work.full list, or return nil.
+// Get a partially empty work buffer;
+// if none are available get an empty one.
+static Workbuf*
+getpartialorempty(void)
+{
+	Workbuf *b;
+
+	b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial);
+	if(b == nil)
+		b = getempty(nil);
+	return b;
+}
+
+static void
+putpartial(Workbuf *b)
+{
+
+	if(b->nobj == 0)
+		runtime·lfstackpush(&runtime·work.empty, &b->node);
+	else if (b->nobj < nelem(b->obj))
+		runtime·lfstackpush(&runtime·work.partial, &b->node);
+	else if (b->nobj == nelem(b->obj))
+		runtime·lfstackpush(&runtime·work.full, &b->node);
+	else {
+		runtime·printf("b=%p, b->nobj=%d, nelem(b->obj)=%d\n", b, (uint32)b->nobj, (uint32)nelem(b->obj));
+		runtime·throw("putpartial: bad Workbuf b->nobj");
+	}
+}
+
+// Get a full work buffer off the work.full or a partially
+// filled one off the work.partial list. If nothing is available
+// wait until all the other gc helpers have finished and then
+// return nil.
+// getfull acts as a barrier for work.nproc helpers. As long as one
+// gchelper is actively marking objects it
+// may create a workbuffer that the other helpers can work on.
+// The for loop either exits when a work buffer is found
+// or when _all_ of the work.nproc GC helpers are in the loop 
+// looking for work and thus not capable of creating new work.
+// This is in fact the termination condition for the STW mark 
+// phase.
 static Workbuf*
 getfull(Workbuf *b)
 {
 	int32 i;
 
 	if(b != nil)
-		runtime·lfstackpush(&runtime·work.empty, &b->node);
+		putempty(b);
+
 	b = (Workbuf*)runtime·lfstackpop(&runtime·work.full);
+	if(b==nil)
+		b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial);
 	if(b != nil || runtime·work.nproc == 1)
 		return b;
 
@@ -617,7 +864,9 @@ getfull(Workbuf *b)
 		if(runtime·work.full != 0) {
 			runtime·xadd(&runtime·work.nwait, -1);
 			b = (Workbuf*)runtime·lfstackpop(&runtime·work.full);
-			if(b != nil)
+			if(b==nil)
+				b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial);
+			if(b != nil) 
 				return b;
 			runtime·xadd(&runtime·work.nwait, +1);
 		}
@@ -737,7 +986,7 @@ scanframe(Stkframe *frame, void *unused)
 			}
  			bv = runtime·stackmapdata(stackmap, pcdata);
 		}
- 		scanblock((byte*)frame->argp, bv.n/BitsPerPointer*PtrSize, bv.bytedata);
+		scanblock((byte*)frame->argp, bv.n/BitsPerPointer*PtrSize, bv.bytedata);
  	}
  	return true;
 }
@@ -760,8 +1009,7 @@ scanstack(G *gp)
 	case Gdead:
 		return;
 	case Grunning:
-		runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp));
-		runtime·throw("mark - world not stopped");
+		runtime·throw("scanstack: - goroutine not stopped");
 	case Grunnable:
 	case Gsyscall:
 	case Gwaiting:
@@ -778,8 +1026,117 @@ scanstack(G *gp)
 	runtime·tracebackdefers(gp, &fn, nil);
 }
 
-// The gp has been moved to a gc safepoint. If there is gcphase specific
-// work it is done here. 
+// If the slot is grey or black return true, if white return false.
+// If the slot is not in the known heap and thus does not have a valid GC bitmap then
+// it is considered grey. Globals and stacks can hold such slots.
+// The slot is grey if its mark bit is set and it is enqueued to be scanned.
+// The slot is black if it has already been scanned.
+// It is white if it has a valid mark bit and the bit is not set. 
+static bool
+shaded(byte *slot)
+{
+	Markbits mbits;
+	byte *valid;
+
+	if(!inheap(slot)) // non-heap slots considered grey
+		return true;
+
+	valid = objectstart(slot, &mbits);
+	if(valid == nil)
+		return true;
+
+	if(checkmark)
+		return ischeckmarked(&mbits);
+
+	return (mbits.bits&bitMarked) != 0;
+}
+
+// Shade the object if it isn't already.
+// The object is not nil and known to be in the heap.
+static void
+shade(byte *b)
+{
+	byte *obj;
+	Workbuf *wbuf;
+	Markbits mbits;
+	
+	if(!inheap(b))
+		runtime·throw("shade: passed an address not in the heap");
+	
+	wbuf = getpartialorempty();
+	// Mark the object, return some important bits.
+	// If we combine the following two routines we don't have to pass mbits or obj around.
+	obj = objectstart(b, &mbits);
+	if(obj != nil)
+		wbuf = greyobject(obj, &mbits, wbuf); // augments the wbuf
+
+	putpartial(wbuf);
+	return;
+}
+
+// This is the Dijkstra barrier coarsened to always shade the ptr (dst) object.
+// The original Dijkstra barrier only shaded ptrs being placed in black slots.
+//
+// Shade indicates that it has seen a white pointer by adding the referent
+// to wbuf as well as marking it.
+//
+// slot is the destination (dst) in go code
+// ptr is the value that goes into the slot (src) in the go code
+//
+// Dijkstra pointed out that maintaining the no black to white
+// pointers invariant means that white to white pointers need not
+// be noted by the write barrier. Furthermore if either
+// white object dies before it is reached by the
+// GC then the object can be collected during this GC cycle
+// instead of waiting for the next cycle. Unfortunately the cost of
+// ensuring that the object holding the slot doesn't concurrently
+// change to black without the mutator noticing seems prohibitive.
+//
+// Consider the following example where the mutator writes into 
+// a slot and then loads the slot's mark bit while the GC thread 
+// writes to the slot's mark bit and then as part of scanning reads 
+// the slot.
+// 
+// Initially both [slot] and [slotmark] are 0 (nil)
+// Mutator thread          GC thread
+// st [slot], ptr          st [slotmark], 1
+// 
+// ld r1, [slotmark]       ld r2, [slot]
+//
+// This is a classic example of independent reads of independent writes,
+// aka IRIW. The question is if r1==r2==0 is allowed and for most HW the
+// answer is yes without inserting memory barriers between the st and the ld.
+// These barriers are expensive so we have decided that we will
+// always grey the ptr object regardless of the slot's color.
+// 
+void
+runtime·gcmarkwb_m()
+{
+	byte *ptr;
+	ptr = (byte*)g->m->scalararg[1];
+
+	switch(runtime·gcphase) {
+	default:
+		runtime·throw("gcphasework in bad gcphase");
+	case GCoff:
+	case GCquiesce:
+	case GCstw:
+	case GCsweep:
+	case GCscan:
+		break;
+	case GCmark:
+		if(ptr != nil && inheap(ptr))
+			shade(ptr);
+		break;
+	case GCmarktermination:
+		if(ptr != nil && inheap(ptr))
+			shade(ptr);
+		break;
+	}
+}
+
+// The gp has been moved to a GC safepoint. GC phase specific
+// work is done here. 
 void
 runtime·gcphasework(G *gp)
 {
@@ -790,12 +1147,18 @@ runtime·gcphasework(G *gp)
 	case GCquiesce:
 	case GCstw:
 	case GCsweep:
-		// No work for now.
+		// No work.
+		break;
+	case GCscan:
+		// scan the stack, mark the objects, put pointers in work buffers
+		// hanging off the P where this is being run.
+		scanstack(gp);
 		break;
 	case GCmark:
-		// Disabled until concurrent GC is implemented
-		// but indicate the scan has been done. 
-		// scanstack(gp);
+		break;
+	case GCmarktermination:
+		scanstack(gp);
+		// All available mark work will be emptied before returning.
 		break;
 	}
 	gp->gcworkdone = true;
@@ -885,6 +1248,7 @@ runtime·iterate_finq(void (*callback)(FuncVal*, byte*, uintptr, Type*, PtrType*
 	}
 }
 
+// Returns only when span s has been swept.
 void
 runtime·MSpan_EnsureSwept(MSpan *s)
 {
@@ -899,6 +1263,7 @@ runtime·MSpan_EnsureSwept(MSpan *s)
 	sg = runtime·mheap.sweepgen;
 	if(runtime·atomicload(&s->sweepgen) == sg)
 		return;
+	// The caller must be sure that the span is a MSpanInUse span.
 	if(runtime·cas(&s->sweepgen, sg-2, sg-1)) {
 		runtime·MSpan_Sweep(s, false);
 		return;
@@ -926,6 +1291,9 @@ runtime·MSpan_Sweep(MSpan *s, bool preserve)
 	Special *special, **specialp, *y;
 	bool res, sweepgenset;
 
+	if(checkmark)
+		runtime·throw("MSpan_Sweep: checkmark only runs in STW and after the sweep.");
+
 	// It's critical that we enter this function with preemption disabled,
 	// GC must not start while we are in the middle of this function.
 	if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0)
@@ -1173,6 +1541,7 @@ runtime·gosweepdone(void)
 	return runtime·mheap.sweepdone;
 }
 
+
 void
 runtime·gchelper(void)
 {
@@ -1181,13 +1550,11 @@ runtime·gchelper(void)
 	g->m->traceback = 2;
 	gchelperstart();
 
-	// parallel mark for over gc roots
+	// parallel mark for over GC roots
 	runtime·parfordo(runtime·work.markfor);
-
-	// help other threads scan secondary blocks
-	scanblock(nil, 0, nil);
-
-	nproc = runtime·work.nproc;  // runtime·work.nproc can change right after we increment runtime·work.ndone
+	if(runtime·gcphase != GCscan) 
+		scanblock(nil, 0, nil); // blocks in getfull
+	nproc = runtime·work.nproc;  // work.nproc can change right after we increment work.ndone
 	if(runtime·xadd(&runtime·work.ndone, +1) == nproc-1)
 		runtime·notewakeup(&runtime·work.alldone);
 	g->m->traceback = 0;
@@ -1353,6 +1720,7 @@ runtime·gcinit(void)
 	runtime·gcbssmask = unrollglobgcprog(runtime·gcbss, runtime·ebss - runtime·bss);
 }
 
+// Called from malloc.go using onM, stopping and starting the world handled in caller.
 void
 runtime·gc_m(void)
 {
@@ -1366,17 +1734,296 @@ runtime·gc_m(void)
 	a.start_time = (uint64)(g->m->scalararg[0]) | ((uint64)(g->m->scalararg[1]) << 32);
 	a.eagersweep = g->m->scalararg[2];
 	gc(&a);
+	runtime·casgstatus(gp, Gwaiting, Grunning);
+}
+
+// Similar to clearcheckmarkbits but works on a single span. 
+// It performs two tasks.
+// 1. When used before the checkmark phase it converts BitsDead (00) to BitsScalar (01)
+//    for nibbles with the BoundaryBit set.
+// 2. When used after the checkmark phase it converts BitsPointerMarked (11) to BitsPointer (10) and
+//    BitsScalarMarked (00) to BitsScalar (01), thus clearing the checkmark mark encoding.
+// For the second case it is possible to restore the BitsDead pattern but since
+// clearmark is a debug tool performance has a lower priority than simplicity.
+// The span is MSpanInUse and the world is stopped.
+static void
+clearcheckmarkbitsspan(MSpan *s)
+{
+	int32 cl, n, npages, i;
+	uintptr size, off, step;
+	byte *p, *bitp, *arena_start, b;
+
+	if(s->state != MSpanInUse) {
+		runtime·printf("runtime:clearcheckmarkbitsspan: state=%d\n",
+			s->state);
+		runtime·throw("clearcheckmarkbitsspan: bad span state");
+	}
+	arena_start = runtime·mheap.arena_start;
+	cl = s->sizeclass;
+	size = s->elemsize;
+	if(cl == 0) {
+		n = 1;
+	} else {
+		// Chunk full of small blocks.
+		npages = runtime·class_to_allocnpages[cl];
+		n = (npages << PageShift) / size;
+	}
+
+	// MSpan_Sweep has similar code but instead of overloading and 
+	// complicating that routine we do a simpler walk here.
+	// Sweep through n objects of given size starting at p.
+	// This thread owns the span now, so it can manipulate
+	// the block bitmap without atomic operations.
+	p = (byte*)(s->start << PageShift);
+	// Find bits for the beginning of the span.
+	off = (uintptr*)p - (uintptr*)arena_start;
+	bitp = arena_start - off/wordsPerBitmapByte - 1;
+	step = size/(PtrSize*wordsPerBitmapByte);
+
+	// The type bit values are:
+	//	00 - BitsDead, for us BitsScalarMarked
+	//	01 - BitsScalar
+	//	10 - BitsPointer
+	//	11 - unused, for us BitsPointerMarked
+	//
+	// When called to prepare for the checkmark phase (checkmark==1),
+	// we change BitsDead to BitsScalar, so that there are no BitsScalarMarked
+	// type bits anywhere.
+	//
+	// The checkmark phase marks by changing BitsScalar to BitsScalarMarked
+	// and BitsPointer to BitsPointerMarked.
+	//
+	// When called to clean up after the checkmark phase (checkmark==0),
+	// we unmark by changing BitsScalarMarked back to BitsScalar and
+	// BitsPointerMarked back to BitsPointer.
+	//
+	// There are two problems with the scheme as just described.
+	// First, the setup rewrites BitsDead to BitsScalar, but the type bits
+	// following a BitsDead are uninitialized and must not be used.
+	// Second, objects that are free are expected to have their type
+	// bits zeroed (BitsDead), so in the cleanup we need to restore
+	// any BitsDeads that were there originally.
+	//
+	// In a one-word object (8-byte allocation on 64-bit system),
+	// there is no difference between BitsScalar and BitsDead, because
+	// neither is a pointer and there are no more words in the object,
+	// so using BitsScalar during the checkmark is safe and mapping
+	// both back to BitsDead during cleanup is also safe.
+	//
+	// In a larger object, we need to be more careful. During setup,
+	// if the type of the first word is BitsDead, we change it to BitsScalar
+	// (as we must) but also initialize the type of the second
+	// word to BitsDead, so that a scan during the checkmark phase
+	// will still stop before seeing the uninitialized type bits in the
+	// rest of the object. The sequence 'BitsScalar BitsDead' never
+	// happens in real type bitmaps - BitsDead is always as early
+	// as possible, so immediately after the last BitsPointer.
+	// During cleanup, if we see a BitsScalar, we can check to see if it
+	// is followed by BitsDead. If so, it was originally BitsDead and
+	// we can change it back.
 
-	if(nbadblock > 0) {
-		// Work out path from root to bad block.
-		for(;;) {
-			gc(&a);
-			if(nbadblock >= nelem(badblock))
-				runtime·throw("cannot find path to bad pointer");
+	if(step == 0) {
+		// updating top and bottom nibbles, all boundaries
+		for(i=0; i<n/2; i++, bitp--) {
+			if((*bitp & bitBoundary) != bitBoundary)
+				runtime·throw("missing bitBoundary");      
+			b = (*bitp & bitPtrMask)>>2;
+			if(!checkmark && (b == BitsScalar || b == BitsScalarMarked))
+				*bitp &= ~0x0c; // convert to BitsDead
+			else if(b == BitsScalarMarked || b == BitsPointerMarked)
+				*bitp ^= BitsCheckMarkXor<<2;
+ 			
+			if(((*bitp>>gcBits) & bitBoundary) != bitBoundary)
+				runtime·throw("missing bitBoundary");            
+			b = ((*bitp>>gcBits) & bitPtrMask)>>2;
+			if(!checkmark && (b == BitsScalar || b == BitsScalarMarked))
+				*bitp &= ~0xc0; // convert to BitsDead
+			else if(b == BitsScalarMarked || b == BitsPointerMarked)
+				*bitp ^= BitsCheckMarkXor<<(2+gcBits);
+		}
+	} else {
+		// updating bottom nibble for first word of each object
+		for(i=0; i<n; i++, bitp -= step) {
+			if((*bitp & bitBoundary) != bitBoundary)
+				runtime·throw("missing bitBoundary");            
+			b = (*bitp & bitPtrMask)>>2;
+			
+			if(checkmark && b == BitsDead) {
+				// move BitsDead into second word.
+				// set bits to BitsScalar in preparation for checkmark phase.
+				*bitp &= ~0xc0;
+				*bitp |= BitsScalar<<2;
+			} else if(!checkmark && (b == BitsScalar || b == BitsScalarMarked) && (*bitp & 0xc0) == 0) {
+				// Cleaning up after checkmark phase.
+				// First word is scalar or dead (we forgot)
+				// and second word is dead.
+				// First word might as well be dead too.
+				*bitp &= ~0x0c;
+			} else if(b == BitsScalarMarked || b == BitsPointerMarked)
+				*bitp ^= BitsCheckMarkXor<<2;
 		}
 	}
+}
 
-	runtime·casgstatus(gp, Gwaiting, Grunning);
+// clearcheckmarkbits performs two tasks.
+// 1. When used before the checkmark phase it converts BitsDead (00) to BitsScalar (01)
+//    for nibbles with the BoundaryBit set.
+// 2. When used after the checkmark phase it converts BitsPointerMarked (11) to BitsPointer (10) and
+//    BitsScalarMarked (00) to BitsScalar (01), thus clearing the checkmark mark encoding.
+// This is a bit expensive but preserves the BitsDead encoding during the normal marking.
+// BitsDead remains valid for every nibble except the ones with BitsBoundary set.
+static void
+clearcheckmarkbits(void)
+{
+	uint32 idx;
+	MSpan *s;
+	for(idx=0; idx<runtime·work.nspan; idx++) {
+		s = runtime·work.spans[idx];
+		if(s->state == MSpanInUse) {
+			clearcheckmarkbitsspan(s);
+		}
+	}
+}
+
+// Called from malloc.go using onM. 
+// The world is stopped. Rerun the scan and mark phases
+// using the bitMarkedCheck bit instead of the
+// bitMarked bit. If the marking encounters a
+// bitMarked bit that is not set then we throw.
+void
+runtime·gccheckmark_m(void)
+{
+	if(!gccheckmarkenable)
+		return;
+
+	if(checkmark)
+		runtime·throw("gccheckmark_m, entered with checkmark already true.");
+
+	checkmark = true;
+	clearcheckmarkbits(); // Converts BitsDead to BitsScalar.
+	runtime·gc_m(); // turns off checkmark
+	// Work done, fixed up the GC bitmap to remove the checkmark bits.
+	clearcheckmarkbits();
+}
+
+// gccheckmarkenable is initially false
+void
+runtime·gccheckmarkenable_m(void)
+{
+	gccheckmarkenable = true;
+}
+
+void
+runtime·gccheckmarkdisable_m(void)
+{
+	gccheckmarkenable = false;
+}
+
+void
+runtime·finishsweep_m(void)
+{
+	uint32 i, sg;
+	MSpan *s;
+
+	// The world is stopped so we should be able to complete the sweeps 
+	// quickly. 
+	while(runtime·sweepone() != -1)
+		runtime·sweep.npausesweep++;
+
+	// There may be some other spans being swept concurrently that 
+	// we need to wait for. If finishsweep_m is done with the world stopped
+	// this code is not required.
+	sg = runtime·mheap.sweepgen;
+	for(i=0; i<runtime·work.nspan; i++) {
+		s = runtime·work.spans[i];
+		if(s->sweepgen == sg) {
+			continue;
+		}
+		if(s->state != MSpanInUse) // Span is not part of the GCed heap so no need to ensure it is swept.
+			continue;
+		runtime·MSpan_EnsureSwept(s);
+	}	
+}
+
+// Scan all of the stacks, greying (or graying if in America) the referents
+// but not blackening them since the mark write barrier isn't installed.
+void
+runtime·gcscan_m(void)
+{
+	uint32 i, allglen, oldphase;
+	G *gp, *mastergp, **allg;
+
+	// Grab the g that called us and potentially allow rescheduling.
+	// This allows it to be scanned like other goroutines.
+	mastergp = g->m->curg;
+
+	runtime·casgstatus(mastergp, Grunning, Gwaiting);
+	mastergp->waitreason = runtime·gostringnocopy((byte*)"garbage collection scan");
+
+	// Span sweeping has been done by finishsweep_m.
+	// Long term we will want to make this goroutine runnable 
+	// by placing it onto a scanenqueue state and then calling 
+	// runtime·restartg(mastergp) to make it Grunnable.  
+	// At the bottom we will want to return this p back to the scheduler.
+
+	oldphase = runtime·gcphase;
+
+	runtime·lock(&runtime·allglock);
+	allglen = runtime·allglen;
+	allg = runtime·allg;
+	// Prepare flag indicating that the scan has not been completed.
+	for(i = 0; i < allglen; i++) {
+		gp = allg[i];
+		gp->gcworkdone = false;  // set to true in gcphasework
+	}
+	runtime·unlock(&runtime·allglock);
+
+	runtime·work.nwait = 0;
+	runtime·work.ndone = 0;
+	runtime·work.nproc = 1; // For now do not do this in parallel.
+	runtime·gcphase = GCscan;
+	//	ackgcphase is not needed since we are not scanning running goroutines.
+	runtime·parforsetup(runtime·work.markfor, runtime·work.nproc, RootCount + allglen, nil, false, markroot);
+	runtime·parfordo(runtime·work.markfor);
+	
+	runtime·lock(&runtime·allglock);	
+
+	allg = runtime·allg;
+	// Check that gc work is done. 
+	for(i = 0; i < allglen; i++) {
+		gp = allg[i];
+		if(!gp->gcworkdone) {
+			runtime·throw("scan missed a g");
+		}
+	}
+	runtime·unlock(&runtime·allglock);
+
+	runtime·gcphase = oldphase;
+	runtime·casgstatus(mastergp, Gwaiting, Grunning);
+	// Let the g that called us continue to run.
+}
+
+// Mark all objects that are known about.
+void
+runtime·gcmark_m(void)
+{
+	scanblock(nil, 0, nil);
+}
+
+// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
+// all goroutines see the new barrier.
+void
+runtime·gcinstallmarkwb_m(void)
+{
+	runtime·gcphase = GCmark;
+}
+
+// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
+// all goroutines see the new barrier.
+void
+runtime·gcinstalloffwb_m(void)
+{
+	runtime·gcphase = GCoff;
 }
 
 static void
@@ -1385,9 +2032,9 @@ gc(struct gc_args *args)
 	int64 t0, t1, t2, t3, t4;
 	uint64 heap0, heap1, obj;
 	GCStats stats;
-
-	if(DebugPtrs)
-		runtime·printf("GC start\n");
+	uint32 oldphase;
+	uint32 i;
+	G *gp;
 
 	if(runtime·debug.allocfreetrace)
 		runtime·tracegc();
@@ -1400,11 +2047,10 @@ gc(struct gc_args *args)
 	if(runtime·debug.gctrace)
 		t1 = runtime·nanotime();
 
-	// Sweep what is not sweeped by bgsweep.
-	while(runtime·sweepone() != -1)
-		runtime·sweep.npausesweep++;
+	if(!checkmark)
+		runtime·finishsweep_m(); // skip during checkmark debug phase.
 
-	// Cache runtime.mheap.allspans in work.spans to avoid conflicts with
+	// Cache runtime·mheap.allspans in work.spans to avoid conflicts with
 	// resizing/freeing allspans.
 	// New spans can be created while GC progresses, but they are not garbage for
 	// this round:
@@ -1421,10 +2067,19 @@ gc(struct gc_args *args)
 	runtime·work.spans = runtime·mheap.allspans;
 	runtime·work.nspan = runtime·mheap.nspan;
 	runtime·unlock(&runtime·mheap.lock);
+	oldphase = runtime·gcphase;
 
 	runtime·work.nwait = 0;
 	runtime·work.ndone = 0;
-	runtime·work.nproc = runtime·gcprocs();
+	runtime·work.nproc = runtime·gcprocs(); 
+	runtime·gcphase = GCmarktermination;
+
+	// World is stopped so allglen will not change.
+	for(i = 0; i < runtime·allglen; i++) {
+		gp = runtime·allg[i];
+		gp->gcworkdone = false;  // set to true in gcphasework
+	}
+
 	runtime·parforsetup(runtime·work.markfor, runtime·work.nproc, RootCount + runtime·allglen, nil, false, markroot);
 	if(runtime·work.nproc > 1) {
 		runtime·noteclear(&runtime·work.alldone);
@@ -1437,8 +2092,15 @@ gc(struct gc_args *args)
 
 	gchelperstart();
 	runtime·parfordo(runtime·work.markfor);
+
 	scanblock(nil, 0, nil);
 
+	if(runtime·work.full)
+		runtime·throw("runtime·work.full != nil");
+	if(runtime·work.partial)
+		runtime·throw("runtime·work.partial != nil");
+
+	runtime·gcphase = oldphase;
 	t3 = 0;
 	if(runtime·debug.gctrace)
 		t3 = runtime·nanotime();
@@ -1499,6 +2161,16 @@ gc(struct gc_args *args)
 	// Free the old cached mark array if necessary.
 	if(runtime·work.spans != nil && runtime·work.spans != runtime·mheap.allspans)
 		runtime·SysFree(runtime·work.spans, runtime·work.nspan*sizeof(runtime·work.spans[0]), &mstats.other_sys);
+	
+	if(gccheckmarkenable) {
+		if(!checkmark) {
+			// first half of two-pass; don't set up sweep
+			runtime·unlock(&runtime·mheap.lock);
+			return;
+		}
+		checkmark = false; // done checking marks
+	}
+
 	// Cache the current array for sweeping.
 	runtime·mheap.gcspans = runtime·mheap.allspans;
 	runtime·mheap.sweepgen += 2;
@@ -1508,6 +2180,7 @@ gc(struct gc_args *args)
 	runtime·sweep.spanidx = 0;
 	runtime·unlock(&runtime·mheap.lock);
 
+
 	if(ConcurrentSweep && !args->eagersweep) {
 		runtime·lock(&runtime·gclock);
 		if(runtime·sweep.g == nil)
@@ -1527,9 +2200,6 @@ gc(struct gc_args *args)
 
 	runtime·mProf_GC();
 	g->m->traceback = 0;
-
-	if(DebugPtrs)
-		runtime·printf("GC end\n");
 }
 
 extern uintptr runtime·sizeof_C_MStats;
@@ -1802,7 +2472,7 @@ runtime·unrollgcprog_m(void)
 			prog = (byte*)typ->gc[1];
 			unrollgcprog1(mask, prog, &pos, false, true);
 		}
-		
+
 		// atomic way to say mask[0] = 1
 		x = *(uintptr*)mask;
 		((byte*)&x)[0] = 1;
diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go
index 3a7204b54..dc4eec519 100644
--- a/src/runtime/mgc0.go
+++ b/src/runtime/mgc0.go
@@ -83,54 +83,139 @@ func bgsweep() {
 	}
 }
 
+const (
+	_PoisonGC    = 0xf969696969696969 & (1<<(8*ptrSize) - 1)
+	_PoisonStack = 0x6868686868686868 & (1<<(8*ptrSize) - 1)
+)
+
 // NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer,
 // but if we do that, Go inserts a write barrier on *dst = src.
 //go:nosplit
 func writebarrierptr(dst *uintptr, src uintptr) {
 	*dst = src
+	writebarrierptr_nostore(dst, src)
+}
+
+// Like writebarrierptr, but the store has already been applied.
+// Do not reapply.
+//go:nosplit
+func writebarrierptr_nostore(dst *uintptr, src uintptr) {
+	if getg() == nil { // very low-level startup
+		return
+	}
+
+	if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) {
+		onM(func() { gothrow("bad pointer in write barrier") })
+	}
+
+	mp := acquirem()
+	if mp.inwb || mp.dying > 0 {
+		releasem(mp)
+		return
+	}
+	mp.inwb = true
+	oldscalar0 := mp.scalararg[0]
+	oldscalar1 := mp.scalararg[1]
+	mp.scalararg[0] = uintptr(unsafe.Pointer(dst))
+	mp.scalararg[1] = src
+	onM_signalok(gcmarkwb_m)
+	mp.scalararg[0] = oldscalar0
+	mp.scalararg[1] = oldscalar1
+	mp.inwb = false
+	releasem(mp)
 }
 
 //go:nosplit
 func writebarrierstring(dst *[2]uintptr, src [2]uintptr) {
-	dst[0] = src[0]
+	writebarrierptr(&dst[0], src[0])
 	dst[1] = src[1]
 }
 
 //go:nosplit
 func writebarrierslice(dst *[3]uintptr, src [3]uintptr) {
-	dst[0] = src[0]
+	writebarrierptr(&dst[0], src[0])
 	dst[1] = src[1]
 	dst[2] = src[2]
 }
 
 //go:nosplit
 func writebarrieriface(dst *[2]uintptr, src [2]uintptr) {
-	dst[0] = src[0]
-	dst[1] = src[1]
-}
-
-//go:nosplit
-func writebarrierfat2(dst *[2]uintptr, _ *byte, src [2]uintptr) {
-	dst[0] = src[0]
-	dst[1] = src[1]
+	writebarrierptr(&dst[0], src[0])
+	writebarrierptr(&dst[1], src[1])
 }
 
-//go:nosplit
-func writebarrierfat3(dst *[3]uintptr, _ *byte, src [3]uintptr) {
-	dst[0] = src[0]
-	dst[1] = src[1]
-	dst[2] = src[2]
-}
+//go:generate go run wbfat_gen.go -- wbfat.go
+//
+// The above line generates multiword write barriers for
+// all the combinations of ptr+scalar up to four words.
+// The implementations are written to wbfat.go.
 
 //go:nosplit
-func writebarrierfat4(dst *[4]uintptr, _ *byte, src [4]uintptr) {
-	dst[0] = src[0]
-	dst[1] = src[1]
-	dst[2] = src[2]
-	dst[3] = src[3]
+func writebarrierfat(typ *_type, dst, src unsafe.Pointer) {
+	mask := loadPtrMask(typ)
+	nptr := typ.size / ptrSize
+	for i := uintptr(0); i < nptr; i += 2 {
+		bits := mask[i/2]
+		if (bits>>2)&_BitsMask == _BitsPointer {
+			writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
+		} else {
+			*(*uintptr)(dst) = *(*uintptr)(src)
+		}
+		dst = add(dst, ptrSize)
+		src = add(src, ptrSize)
+		if i+1 == nptr {
+			break
+		}
+		bits >>= 4
+		if (bits>>2)&_BitsMask == _BitsPointer {
+			writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
+		} else {
+			*(*uintptr)(dst) = *(*uintptr)(src)
+		}
+		dst = add(dst, ptrSize)
+		src = add(src, ptrSize)
+	}
 }
 
 //go:nosplit
-func writebarrierfat(typ *_type, dst, src unsafe.Pointer) {
-	memmove(dst, src, typ.size)
+func writebarriercopy(typ *_type, dst, src slice) int {
+	n := dst.len
+	if n > src.len {
+		n = src.len
+	}
+	if n == 0 {
+		return 0
+	}
+	dstp := unsafe.Pointer(dst.array)
+	srcp := unsafe.Pointer(src.array)
+
+	if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) {
+		// Overlap with src before dst.
+		// Copy backward, being careful not to move dstp/srcp
+		// out of the array they point into.
+		dstp = add(dstp, uintptr(n-1)*typ.size)
+		srcp = add(srcp, uintptr(n-1)*typ.size)
+		i := uint(0)
+		for {
+			writebarrierfat(typ, dstp, srcp)
+			if i++; i >= n {
+				break
+			}
+			dstp = add(dstp, -typ.size)
+			srcp = add(srcp, -typ.size)
+		}
+	} else {
+		// Copy forward, being careful not to move dstp/srcp
+		// out of the array they point into.
+		i := uint(0)
+		for {
+			writebarrierfat(typ, dstp, srcp)
+			if i++; i >= n {
+				break
+			}
+			dstp = add(dstp, typ.size)
+			srcp = add(srcp, typ.size)
+		}
+	}
+	return int(n)
 }
diff --git a/src/runtime/mgc0.h b/src/runtime/mgc0.h
index 64f818914..519d7206e 100644
--- a/src/runtime/mgc0.h
+++ b/src/runtime/mgc0.h
@@ -45,8 +45,12 @@ enum {
 	// If you change these, also change scanblock.
 	// scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)".
 	BitsDead	= 0,
-	BitsScalar	= 1,
-	BitsPointer	= 2,
+	BitsScalar	= 1,                                // 01
+	BitsPointer	= 2,                                // 10
+	BitsCheckMarkXor = 1,                               // 01
+	BitsScalarMarked = BitsScalar ^ BitsCheckMarkXor,   // 00
+	BitsPointerMarked = BitsPointer ^ BitsCheckMarkXor, // 11
+
 	BitsMultiWord	= 3,
 	// BitsMultiWord will be set for the first word of a multi-word item.
 	// When it is set, one of the following will be set for the second word.
@@ -56,7 +60,7 @@ enum {
 	BitsEface	= 3,
 
 	// 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively.
-	MaxGCMask	= 64,
+	MaxGCMask	= 65536, // TODO(rsc): change back to 64
 };
 
 // Bits in per-word bitmap.
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index d64e3be69..d409c6c30 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -528,8 +528,6 @@ var allgs []*g // proc.c
 // Most clients should use the runtime/pprof package instead
 // of calling GoroutineProfile directly.
 func GoroutineProfile(p []StackRecord) (n int, ok bool) {
-	sp := getcallersp(unsafe.Pointer(&p))
-	pc := getcallerpc(unsafe.Pointer(&p))
 
 	n = NumGoroutine()
 	if n <= len(p) {
@@ -542,7 +540,11 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) {
 		if n <= len(p) {
 			ok = true
 			r := p
-			saveg(pc, sp, gp, &r[0])
+			sp := getcallersp(unsafe.Pointer(&p))
+			pc := getcallerpc(unsafe.Pointer(&p))
+			onM(func() {
+				saveg(pc, sp, gp, &r[0])
+			})
 			r = r[1:]
 			for _, gp1 := range allgs {
 				if gp1 == gp || readgstatus(gp1) == _Gdead {
@@ -573,8 +575,6 @@ func saveg(pc, sp uintptr, gp *g, r *StackRecord) {
 // If all is true, Stack formats stack traces of all other goroutines
 // into buf after the trace for the current goroutine.
 func Stack(buf []byte, all bool) int {
-	sp := getcallersp(unsafe.Pointer(&buf))
-	pc := getcallerpc(unsafe.Pointer(&buf))
 	mp := acquirem()
 	gp := mp.curg
 	if all {
@@ -589,14 +589,19 @@ func Stack(buf []byte, all bool) int {
 
 	n := 0
 	if len(buf) > 0 {
-		gp.writebuf = buf[0:0:len(buf)]
-		goroutineheader(gp)
-		traceback(pc, sp, 0, gp)
-		if all {
-			tracebackothers(gp)
-		}
-		n = len(gp.writebuf)
-		gp.writebuf = nil
+		sp := getcallersp(unsafe.Pointer(&buf))
+		pc := getcallerpc(unsafe.Pointer(&buf))
+		onM(func() {
+			g0 := getg()
+			g0.writebuf = buf[0:0:len(buf)]
+			goroutineheader(gp)
+			traceback(pc, sp, 0, gp)
+			if all {
+				tracebackothers(gp)
+			}
+			n = len(g0.writebuf)
+			g0.writebuf = nil
+		})
 	}
 
 	if all {
@@ -623,7 +628,11 @@ func tracealloc(p unsafe.Pointer, size uintptr, typ *_type) {
 	}
 	if gp.m.curg == nil || gp == gp.m.curg {
 		goroutineheader(gp)
-		traceback(getcallerpc(unsafe.Pointer(&p)), getcallersp(unsafe.Pointer(&p)), 0, gp)
+		pc := getcallerpc(unsafe.Pointer(&p))
+		sp := getcallersp(unsafe.Pointer(&p))
+		onM(func() {
+			traceback(pc, sp, 0, gp)
+		})
 	} else {
 		goroutineheader(gp.m.curg)
 		traceback(^uintptr(0), ^uintptr(0), 0, gp.m.curg)
@@ -639,7 +648,11 @@ func tracefree(p unsafe.Pointer, size uintptr) {
 	gp.m.traceback = 2
 	print("tracefree(", p, ", ", hex(size), ")\n")
 	goroutineheader(gp)
-	traceback(getcallerpc(unsafe.Pointer(&p)), getcallersp(unsafe.Pointer(&p)), 0, gp)
+	pc := getcallerpc(unsafe.Pointer(&p))
+	sp := getcallersp(unsafe.Pointer(&p))
+	onM(func() {
+		traceback(pc, sp, 0, gp)
+	})
 	print("\n")
 	gp.m.traceback = 0
 	unlock(&tracelock)
diff --git a/src/runtime/os_android.c b/src/runtime/os_android.c
index 58e0dac93..5805f6871 100644
--- a/src/runtime/os_android.c
+++ b/src/runtime/os_android.c
@@ -9,7 +9,7 @@
 // Export the runtime entry point symbol.
 //
 // Used by the app package to start the Go runtime after loading
-// a shared library via JNI. See code.google.com/p/go.mobile/app.
+// a shared library via JNI. See golang.org/x/mobile/app.
 
 void _rt0_arm_linux1();
 #pragma cgo_export_static _rt0_arm_linux1
diff --git a/src/runtime/os_darwin.c b/src/runtime/os_darwin.c
index bbd29282b..b866863d0 100644
--- a/src/runtime/os_darwin.c
+++ b/src/runtime/os_darwin.c
@@ -135,7 +135,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);	// OS X wants >=8K, Linux >=2K
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_dragonfly.c b/src/runtime/os_dragonfly.c
index e372205ec..051192ad3 100644
--- a/src/runtime/os_dragonfly.c
+++ b/src/runtime/os_dragonfly.c
@@ -195,7 +195,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_freebsd.c b/src/runtime/os_freebsd.c
index a513cb604..1c126547a 100644
--- a/src/runtime/os_freebsd.c
+++ b/src/runtime/os_freebsd.c
@@ -203,7 +203,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_linux.c b/src/runtime/os_linux.c
index 9bd123d59..cc23774e3 100644
--- a/src/runtime/os_linux.c
+++ b/src/runtime/os_linux.c
@@ -233,7 +233,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);	// OS X wants >=8K, Linux >=2K
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_nacl.c b/src/runtime/os_nacl.c
index 14b558303..ad72cc7c6 100644
--- a/src/runtime/os_nacl.c
+++ b/src/runtime/os_nacl.c
@@ -20,7 +20,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);	// OS X wants >=8K, Linux >=2K
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_netbsd.c b/src/runtime/os_netbsd.c
index 58e5bedf2..28929ea57 100644
--- a/src/runtime/os_netbsd.c
+++ b/src/runtime/os_netbsd.c
@@ -271,7 +271,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_openbsd.c b/src/runtime/os_openbsd.c
index eebaa13ee..960aaffff 100644
--- a/src/runtime/os_openbsd.c
+++ b/src/runtime/os_openbsd.c
@@ -217,7 +217,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_plan9.c b/src/runtime/os_plan9.c
index f8c543f6f..18460fc12 100644
--- a/src/runtime/os_plan9.c
+++ b/src/runtime/os_plan9.c
@@ -20,12 +20,18 @@ runtime·mpreinit(M *mp)
 {
 	// Initialize stack and goroutine for note handling.
 	mp->gsignal = runtime·malg(32*1024);
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
+
 	mp->notesig = (int8*)runtime·mallocgc(ERRMAX*sizeof(int8), nil, FlagNoScan);
+	runtime·writebarrierptr_nostore(&mp->notesig, mp->notesig);
 
 	// Initialize stack for handling strings from the
 	// errstr system call, as used in package syscall.
 	mp->errstr = (byte*)runtime·mallocgc(ERRMAX*sizeof(byte), nil, FlagNoScan);
+	runtime·writebarrierptr_nostore(&mp->errstr, mp->errstr);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_solaris.c b/src/runtime/os_solaris.c
index e16b8e637..bee91d8e6 100644
--- a/src/runtime/os_solaris.c
+++ b/src/runtime/os_solaris.c
@@ -176,7 +176,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/print1.go b/src/runtime/print1.go
index 8f8268873..3d812bd04 100644
--- a/src/runtime/print1.go
+++ b/src/runtime/print1.go
@@ -41,7 +41,31 @@ func snprintf(dst *byte, n int32, s *byte) {
 	gp.writebuf = nil
 }
 
-//var debuglock mutex
+var debuglock mutex
+
+// The compiler emits calls to printlock and printunlock around
+// the multiple calls that implement a single Go print or println
+// statement. Some of the print helpers (printsp, for example)
+// call print recursively. There is also the problem of a crash
+// happening during the print routines and needing to acquire
+// the print lock to print information about the crash.
+// For both these reasons, let a thread acquire the printlock 'recursively'.
+
+func printlock() {
+	mp := getg().m
+	mp.printlock++
+	if mp.printlock == 1 {
+		lock(&debuglock)
+	}
+}
+
+func printunlock() {
+	mp := getg().m
+	mp.printlock--
+	if mp.printlock == 0 {
+		unlock(&debuglock)
+	}
+}
 
 // write to goroutine-local buffer if diverting output,
 // or else standard error.
@@ -80,7 +104,7 @@ func printnl() {
 // Very simple printf.  Only for debugging prints.
 // Do not add to this without checking with Rob.
 func vprintf(str string, arg unsafe.Pointer) {
-	//lock(&debuglock);
+	printlock()
 
 	s := bytes(str)
 	start := 0
@@ -160,7 +184,7 @@ func vprintf(str string, arg unsafe.Pointer) {
 		gwrite(s[start:i])
 	}
 
-	//unlock(&debuglock);
+	printunlock()
 }
 
 func printpc(p unsafe.Pointer) {
diff --git a/src/runtime/proc.c b/src/runtime/proc.c
index feee8ea19..ce39db4ab 100644
--- a/src/runtime/proc.c
+++ b/src/runtime/proc.c
@@ -423,13 +423,7 @@ runtime·casgstatus(G *gp, uint32 oldval, uint32 newval)
 	// loop if gp->atomicstatus is in a scan state giving
 	// GC time to finish and change the state to oldval.
 	while(!runtime·cas(&gp->atomicstatus, oldval, newval)) {
-		// Help GC if needed. 
-		if(gp->preemptscan && !gp->gcworkdone && (oldval == Grunning || oldval == Gsyscall)) {
-			gp->preemptscan = false;
-			g->m->ptrarg[0] = gp;
-			fn = helpcasgstatus;
-			runtime·onM(&fn);
-		}
+
 	}	
 }
 
@@ -504,6 +498,13 @@ runtime·stopg(G *gp)
 			return false;
 
 		case Grunning:
+			if(runtime·gcphase == GCscan) {
+				gp->gcworkdone = true;
+				return false;
+				// Running routines not scanned during
+				// GCscan phase, we only scan non-running routines.
+			}
+				
 			// Claim goroutine, so we aren't racing with a status
 			// transition away from Grunning.
 			if(!runtime·castogscanstatus(gp, Grunning, Gscanrunning))
@@ -581,9 +582,10 @@ mquiesce(G *gpmaster)
 	uint32 status;
 	uint32 activeglen;
 
-	activeglen = runtime·allglen;
 	// enqueue the calling goroutine.
 	runtime·restartg(gpmaster);
+
+	activeglen = runtime·allglen;
 	for(i = 0; i < activeglen; i++) {
 		gp = runtime·allg[i];
 		if(runtime·readgstatus(gp) == Gdead) 
@@ -874,7 +876,9 @@ runtime·allocm(P *p)
 		mp->g0 = runtime·malg(-1);
 	else
 		mp->g0 = runtime·malg(8192);
+	runtime·writebarrierptr_nostore(&mp->g0, mp->g0);
 	mp->g0->m = mp;
+	runtime·writebarrierptr_nostore(&mp->g0->m, mp->g0->m);
 
 	if(p == g->m->p)
 		releasep();
@@ -990,7 +994,7 @@ runtime·newextram(void)
 	// the goroutine stack ends.
 	mp = runtime·allocm(nil);
 	gp = runtime·malg(4096);
-	gp->sched.pc = (uintptr)runtime·goexit;
+	gp->sched.pc = (uintptr)runtime·goexit + PCQuantum;
 	gp->sched.sp = gp->stack.hi;
 	gp->sched.sp -= 4*sizeof(uintreg); // extra space in case of reads slightly beyond frame
 	gp->sched.lr = 0;
@@ -1058,7 +1062,7 @@ runtime·dropm(void)
 	unlockextra(mp);
 }
 
-#define MLOCKED ((M*)1)
+#define MLOCKED 1
 
 // lockextra locks the extra list and returns the list head.
 // The caller must unlock the list by storing a new list head
@@ -1069,28 +1073,28 @@ runtime·dropm(void)
 static M*
 lockextra(bool nilokay)
 {
-	M *mp;
+	uintptr mpx;
 	void (*yield)(void);
 
 	for(;;) {
-		mp = runtime·atomicloadp(&runtime·extram);
-		if(mp == MLOCKED) {
+		mpx = runtime·atomicloaduintptr((uintptr*)&runtime·extram);
+		if(mpx == MLOCKED) {
 			yield = runtime·osyield;
 			yield();
 			continue;
 		}
-		if(mp == nil && !nilokay) {
+		if(mpx == 0 && !nilokay) {
 			runtime·usleep(1);
 			continue;
 		}
-		if(!runtime·casp(&runtime·extram, mp, MLOCKED)) {
+		if(!runtime·casuintptr((uintptr*)&runtime·extram, mpx, MLOCKED)) {
 			yield = runtime·osyield;
 			yield();
 			continue;
 		}
 		break;
 	}
-	return mp;
+	return (M*)mpx;
 }
 
 #pragma textflag NOSPLIT
@@ -1915,6 +1919,7 @@ exitsyscallfast(void)
 
 	// Freezetheworld sets stopwait but does not retake P's.
 	if(runtime·sched.stopwait) {
+		g->m->mcache = nil; 
 		g->m->p = nil;
 		return false;
 	}
@@ -1927,6 +1932,7 @@ exitsyscallfast(void)
 		return true;
 	}
 	// Try to get any other idle P.
+	g->m->mcache = nil;
 	g->m->p = nil;
 	if(runtime·sched.pidle) {
 		fn = exitsyscallfast_pidle;
@@ -2424,9 +2430,10 @@ static struct ProfState {
 	int32 hz;
 } prof;
 
-static void System(void) {}
-static void ExternalCode(void) {}
-static void GC(void) {}
+static void System(void) { System(); }
+static void ExternalCode(void) { ExternalCode(); }
+static void GC(void) { GC(); }
+
 extern void runtime·cpuproftick(uintptr*, int32);
 extern byte runtime·etext[];
 
@@ -2614,6 +2621,8 @@ runtime·setcpuprofilerate_m(void)
 P *runtime·newP(void);
 
 // Change number of processors.  The world is stopped, sched is locked.
+// gcworkbufs are not being modified by either the GC or 
+// the write barrier code.
 static void
 procresize(int32 new)
 {
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 5b8c7d8ae..f41ffbff3 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -165,6 +165,9 @@ func acquireSudog() *sudog {
 	// which keeps the garbage collector from being invoked.
 	mp := acquirem()
 	p := new(sudog)
+	if p.elem != nil {
+		gothrow("acquireSudog: found p.elem != nil after new")
+	}
 	releasem(mp)
 	return p
 }
diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h
index 977c4547d..330ed429b 100644
--- a/src/runtime/runtime.h
+++ b/src/runtime/runtime.h
@@ -94,6 +94,7 @@ typedef	struct	PollDesc	PollDesc;
 typedef	struct	DebugVars	DebugVars;
 typedef	struct	ForceGCState	ForceGCState;
 typedef	struct	Stack		Stack;
+typedef struct  Workbuf         Workbuf;
 
 /*
  * Per-CPU declaration.
@@ -304,7 +305,7 @@ struct	G
 	bool	paniconfault;	// panic (instead of crash) on unexpected fault address
 	bool	preemptscan;    // preempted g does scan for GC
 	bool	gcworkdone;     // debug: cleared at begining of gc work phase cycle, set by gcphasework, tested at end of cycle
-	bool	throwsplit; // must not split stack
+	bool	throwsplit;     // must not split stack
 	int8	raceignore;	// ignore race detection events
 	M*	m;		// for debuggers, but offset not hard-coded
 	M*	lockedm;
@@ -344,6 +345,8 @@ struct	M
 	int32	helpgc;
 	bool	spinning;	// M is out of work and is actively looking for work
 	bool	blocked;	// M is blocked on a Note
+	bool    inwb;           // M is executing a write barrier
+	int8	printlock;
 	uint32	fastrand;
 	uint64	ncgocall;	// number of cgo calls in total
 	int32	ncgo;		// number of cgo calls currently in progress
@@ -570,9 +573,10 @@ enum {
 #endif
 
 // Lock-free stack node.
+// Also known to export_test.go.
 struct LFNode
 {
-	LFNode	*next;
+	uint64	next;
 	uintptr	pushcnt;
 };
 
@@ -598,6 +602,16 @@ struct ParFor
 	uint64 nsleep;
 };
 
+enum {
+	WorkbufSize	= 4*1024,
+};
+struct Workbuf
+{
+	LFNode	node; // must be first
+	uintptr	nobj;
+	byte*	obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize];
+};
+
 // Track memory allocated by code not written in Go during a cgo call,
 // so that the garbage collector can see them.
 struct CgoMal
@@ -620,12 +634,14 @@ struct DebugVars
 
 // Indicates to write barrier and sychronization task to preform.
 enum
-{                   // Synchronization            Write barrier
-	GCoff,      // stop and start             nop
-	GCquiesce,  // stop and start             nop
-	GCstw,      // stop the ps                nop
-	GCmark,     // scan the stacks and start  no white to black
-	GCsweep,    // stop and start             nop
+{                               // Action               WB installation
+	GCoff = 0,		// stop and start	no wb
+	GCquiesce, 		// stop and start	no wb
+	GCstw, 			// stop the ps		nop
+	GCscan,			// scan the stacks prior to marking
+	GCmark,			// mark using wbufs from GCscan and globals, scan the stacks, then go to GCmarktermination
+	GCmarktermination,	// mark termination detection. Allocate black, Ps help out GC
+	GCsweep,		// stop and start	nop
 };
 
 struct ForceGCState
@@ -636,6 +652,7 @@ struct ForceGCState
 };
 
 extern uint32 runtime·gcphase;
+extern Mutex runtime·allglock;
 
 /*
  * defined macros
@@ -666,6 +683,7 @@ enum {
 
 uint32  runtime·readgstatus(G*);
 void    runtime·casgstatus(G*, uint32, uint32);
+bool    runtime·castogscanstatus(G*, uint32, uint32);
 void    runtime·quiesce(G*);
 bool    runtime·stopg(G*);
 void    runtime·restartg(G*);
@@ -882,6 +900,7 @@ int32	runtime·round2(int32 x); // round x up to a power of 2.
 bool	runtime·cas(uint32*, uint32, uint32);
 bool	runtime·cas64(uint64*, uint64, uint64);
 bool	runtime·casp(void**, void*, void*);
+bool	runtime·casuintptr(uintptr*, uintptr, uintptr);
 // Don't confuse with XADD x86 instruction,
 // this one is actually 'addx', that is, add-and-fetch.
 uint32	runtime·xadd(uint32 volatile*, int32);
@@ -1108,6 +1127,8 @@ void	runtime·osyield(void);
 void	runtime·lockOSThread(void);
 void	runtime·unlockOSThread(void);
 
+void	runtime·writebarrierptr_nostore(void*, void*);
+
 bool	runtime·showframe(Func*, G*);
 void	runtime·printcreatedby(G*);
 
diff --git a/src/runtime/select.go b/src/runtime/select.go
index efe68c1f5..d703e1d79 100644
--- a/src/runtime/select.go
+++ b/src/runtime/select.go
@@ -377,12 +377,7 @@ loop:
 	// iterating through the linked list they are in reverse order.
 	cas = nil
 	sglist = gp.waiting
-	// Clear all selectdone and elem before unlinking from gp.waiting.
-	// They must be cleared before being put back into the sudog cache.
-	// Clear before unlinking, because if a stack copy happens after the unlink,
-	// they will not be updated, they will be left pointing to the old stack,
-	// which creates dangling pointers, which may be detected by the
-	// garbage collector.
+	// Clear all elem before unlinking from gp.waiting.
 	for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink {
 		sg1.selectdone = nil
 		sg1.elem = nil
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index a42a29988..d2a028c01 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -259,6 +259,7 @@ func syncsemrelease(s *syncSema, n uint32) {
 		}
 		s.tail = w
 		goparkunlock(&s.lock, "semarelease")
+		releaseSudog(w)
 	} else {
 		unlock(&s.lock)
 	}
diff --git a/src/runtime/stack.c b/src/runtime/stack.c
index 072bc242b..ffae73a2a 100644
--- a/src/runtime/stack.c
+++ b/src/runtime/stack.c
@@ -382,8 +382,6 @@ adjustpointers(byte **scanp, BitVector *bv, AdjustInfo *adjinfo, Func *f)
 	uintptr delta;
 	int32 num, i;
 	byte *p, *minp, *maxp;
-	Type *t;
-	Itab *tab;
 	
 	minp = (byte*)adjinfo->old.lo;
 	maxp = (byte*)adjinfo->old.hi;
@@ -415,43 +413,7 @@ adjustpointers(byte **scanp, BitVector *bv, AdjustInfo *adjinfo, Func *f)
 			}
 			break;
 		case BitsMultiWord:
-			switch(bv->bytedata[(i+1) / (8 / BitsPerPointer)] >> ((i+1) * BitsPerPointer & 7) & 3) {
-			default:
-				runtime·throw("unexpected garbage collection bits");
-			case BitsEface:
-				t = (Type*)scanp[i];
-				if(t != nil && ((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0)) {
-					p = scanp[i+1];
-					if(minp <= p && p < maxp) {
-						if(StackDebug >= 3)
-							runtime·printf("adjust eface %p\n", p);
-						if(t->size > PtrSize) // currently we always allocate such objects on the heap
-							runtime·throw("large interface value found on stack");
-						scanp[i+1] = p + delta;
-					}
-				}
-				i++;
-				break;
-			case BitsIface:
-				tab = (Itab*)scanp[i];
-				if(tab != nil) {
-					t = tab->type;
-					//runtime·printf("          type=%p\n", t);
-					if((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0) {
-						p = scanp[i+1];
-						if(minp <= p && p < maxp) {
-							if(StackDebug >= 3)
-								runtime·printf("adjust iface %p\n", p);
-							if(t->size > PtrSize) // currently we always allocate such objects on the heap
-								runtime·throw("large interface value found on stack");
-							scanp[i+1] = p + delta;
-						}
-					}
-				}
-				i++;
-				break;
-			}
-			break;
+			runtime·throw("adjustpointers: unexpected garbage collection bits");
 		}
 	}
 }
@@ -587,13 +549,13 @@ adjustsudogs(G *gp, AdjustInfo *adjinfo)
 }
 
 // Copies gp's stack to a new stack of a different size.
+// Caller must have changed gp status to Gcopystack.
 static void
 copystack(G *gp, uintptr newsize)
 {
 	Stack old, new;
 	uintptr used;
 	AdjustInfo adjinfo;
-	uint32 oldstatus;
 	bool (*cb)(Stkframe*, void*);
 	byte *p, *ep;
 
@@ -637,20 +599,11 @@ copystack(G *gp, uintptr newsize)
 	}
 	runtime·memmove((byte*)new.hi - used, (byte*)old.hi - used, used);
 
-	oldstatus = runtime·readgstatus(gp);
-	oldstatus &= ~Gscan;
-	if(oldstatus == Gwaiting || oldstatus == Grunnable)
-		runtime·casgstatus(gp, oldstatus, Gcopystack); // oldstatus is Gwaiting or Grunnable
-	else
-		runtime·throw("copystack: bad status, not Gwaiting or Grunnable");
-
 	// Swap out old stack for new one
 	gp->stack = new;
 	gp->stackguard0 = new.lo + StackGuard; // NOTE: might clobber a preempt request
 	gp->sched.sp = new.hi - used;
 
-	runtime·casgstatus(gp, Gcopystack, oldstatus); // oldstatus is Gwaiting or Grunnable
-
 	// free old stack
 	if(StackPoisonCopy) {
 		p = (byte*)old.lo;
@@ -700,6 +653,7 @@ void
 runtime·newstack(void)
 {
 	int32 oldsize, newsize;
+	uint32 oldstatus;
 	uintptr sp;
 	G *gp;
 	Gobuf morebuf;
@@ -752,6 +706,14 @@ runtime·newstack(void)
 		runtime·printf("runtime: split stack overflow: %p < %p\n", sp, gp->stack.lo);
 		runtime·throw("runtime: split stack overflow");
 	}
+	
+	if(gp->sched.ctxt != nil) {
+		// morestack wrote sched.ctxt on its way in here,
+		// without a write barrier. Run the write barrier now.
+		// It is not possible to be preempted between then
+		// and now, so it's okay.
+		runtime·writebarrierptr_nostore(&gp->sched.ctxt, gp->sched.ctxt);
+	}
 
 	if(gp->stackguard0 == (uintptr)StackPreempt) {
 		if(gp == g->m->g0)
@@ -789,12 +751,15 @@ runtime·newstack(void)
 		runtime·throw("stack overflow");
 	}
 
-	// Note that the concurrent GC might be scanning the stack as we try to replace it.
-	// copystack takes care of the appropriate coordination with the stack scanner.
+	oldstatus = runtime·readgstatus(gp);
+	oldstatus &= ~Gscan;
+	runtime·casgstatus(gp, oldstatus, Gcopystack); // oldstatus is Gwaiting or Grunnable
+	// The concurrent GC will not scan the stack while we are doing the copy since
+	// the gp is in a Gcopystack status.
 	copystack(gp, newsize);
 	if(StackDebug >= 1)
 		runtime·printf("stack grow done\n");
-	runtime·casgstatus(gp, Gwaiting, Grunning);
+	runtime·casgstatus(gp, Gcopystack, Grunning);
 	runtime·gogo(&gp->sched);
 }
 
@@ -825,6 +790,7 @@ void
 runtime·shrinkstack(G *gp)
 {
 	uintptr used, oldsize, newsize;
+	uint32 oldstatus;
 
 	if(runtime·readgstatus(gp) == Gdead) {
 		if(gp->stack.lo != 0) {
@@ -858,8 +824,19 @@ runtime·shrinkstack(G *gp)
 #endif
 	if(StackDebug > 0)
 		runtime·printf("shrinking stack %D->%D\n", (uint64)oldsize, (uint64)newsize);
+	// This is being done in a Gscan state and was initiated by the GC so no need to move to
+	// the Gcopystate.
+	// The world is stopped, so the goroutine must be Gwaiting or Grunnable,
+	// and what it is is not changing underfoot.
+
+	oldstatus = runtime·readgstatus(gp);
+	oldstatus &= ~Gscan;
+	if(oldstatus != Gwaiting && oldstatus != Grunnable)
+		runtime·throw("status is not Gwaiting or Grunnable");
+	runtime·casgstatus(gp, oldstatus, Gcopystack);
 	copystack(gp, newsize);
-}
+	runtime·casgstatus(gp, Gcopystack, oldstatus);
+ }
 
 // Do any delayed stack freeing that was queued up during GC.
 void
diff --git a/src/runtime/string.c b/src/runtime/string.c
index ed5debc33..475ea2de6 100644
--- a/src/runtime/string.c
+++ b/src/runtime/string.c
@@ -48,7 +48,7 @@ runtime·gostringnocopy(byte *str)
 	s.len = runtime·findnull(str);
 	while(true) {
 		ms = runtime·maxstring;
-		if(s.len <= ms || runtime·casp((void**)&runtime·maxstring, (void*)ms, (void*)s.len))
+		if(s.len <= ms || runtime·casuintptr(&runtime·maxstring, ms, s.len))
 			return s;
 	}
 }
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 341904719..9889567d6 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -106,6 +106,16 @@ func recovery_m(*g)
 func mcacheRefill_m()
 func largeAlloc_m()
 func gc_m()
+func gcscan_m()
+func gcmark_m()
+func gccheckmark_m()
+func gccheckmarkenable_m()
+func gccheckmarkdisable_m()
+func gcinstallmarkwb_m()
+func gcinstalloffwb_m()
+func gcmarknewobject_m()
+func gcmarkwb_m()
+func finishsweep_m()
 func scavenge_m()
 func setFinalizer_m()
 func removeFinalizer_m()
@@ -204,9 +214,6 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32
 func cas(ptr *uint32, old, new uint32) bool
 
 //go:noescape
-func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
-
-//go:noescape
 func casuintptr(ptr *uintptr, old, new uintptr) bool
 
 //go:noescape
@@ -221,6 +228,34 @@ func atomicloaduint(ptr *uint) uint
 //go:noescape
 func setcallerpc(argp unsafe.Pointer, pc uintptr)
 
+// getcallerpc returns the program counter (PC) of its caller's caller.
+// getcallersp returns the stack pointer (SP) of its caller's caller.
+// For both, the argp must be a pointer to the caller's first function argument.
+// The implementation may or may not use argp, depending on
+// the architecture.
+//
+// For example:
+//
+//	func f(arg1, arg2, arg3 int) {
+//		pc := getcallerpc(unsafe.Pointer(&arg1))
+//		sp := getcallersp(unsafe.Pointer(&arg1))
+//	}
+//
+// These two lines find the PC and SP immediately following
+// the call to f (where f will return).
+//
+// The call to getcallerpc and getcallersp must be done in the
+// frame being asked about. It would not be correct for f to pass &arg1
+// to another function g and let g call getcallerpc/getcallersp.
+// The call inside g might return information about g's caller or
+// information about f's caller or complete garbage.
+//
+// The result of getcallersp is correct at the time of the return,
+// but it may be invalidated by any subsequent call to a function
+// that might relocate the stack in order to grow or shrink it.
+// A general rule is that the result of getcallersp should be used
+// immediately and can only be passed to nosplit functions.
+
 //go:noescape
 func getcallerpc(argp unsafe.Pointer) uintptr
 
diff --git a/src/runtime/sys_x86.c b/src/runtime/sys_x86.c
index a450b3e58..edbe47ff4 100644
--- a/src/runtime/sys_x86.c
+++ b/src/runtime/sys_x86.c
@@ -20,6 +20,7 @@ runtime·gostartcall(Gobuf *gobuf, void (*fn)(void), void *ctxt)
 	gobuf->sp = (uintptr)sp;
 	gobuf->pc = (uintptr)fn;
 	gobuf->ctxt = ctxt;
+	runtime·writebarrierptr_nostore(&gobuf->ctxt, ctxt);
 }
 
 // Called to rewind context saved during morestack back to beginning of function.
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 834435b40..1c6ce6e64 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -101,6 +101,22 @@ func gentraceback(pc0 uintptr, sp0 uintptr, lr0 uintptr, gp *g, skip int, pcbuf
 		gothrow("gentraceback before goexitPC initialization")
 	}
 	g := getg()
+	if g == gp && g == g.m.curg {
+		// The starting sp has been passed in as a uintptr, and the caller may
+		// have other uintptr-typed stack references as well.
+		// If during one of the calls that got us here or during one of the
+		// callbacks below the stack must be grown, all these uintptr references
+		// to the stack will not be updated, and gentraceback will continue
+		// to inspect the old stack memory, which may no longer be valid.
+		// Even if all the variables were updated correctly, it is not clear that
+		// we want to expose a traceback that begins on one stack and ends
+		// on another stack. That could confuse callers quite a bit.
+		// Instead, we require that gentraceback and any other function that
+		// accepts an sp for the current goroutine (typically obtained by
+		// calling getcallersp) must not run on that goroutine's stack but
+		// instead on the g0 stack.
+		gothrow("gentraceback cannot trace user goroutine on its own stack")
+	}
 	gotraceback := gotraceback(nil)
 	if pc0 == ^uintptr(0) && sp0 == ^uintptr(0) { // Signal to fetch saved values from gp.
 		if gp.syscallsp != 0 {
@@ -511,7 +527,11 @@ func traceback1(pc uintptr, sp uintptr, lr uintptr, gp *g, flags uint) {
 func callers(skip int, pcbuf *uintptr, m int) int {
 	sp := getcallersp(unsafe.Pointer(&skip))
 	pc := uintptr(getcallerpc(unsafe.Pointer(&skip)))
-	return gentraceback(pc, sp, 0, getg(), skip, pcbuf, m, nil, nil, 0)
+	var n int
+	onM(func() {
+		n = gentraceback(pc, sp, 0, getg(), skip, pcbuf, m, nil, nil, 0)
+	})
+	return n
 }
 
 func gcallers(gp *g, skip int, pcbuf *uintptr, m int) int {
diff --git a/src/runtime/wbfat.go b/src/runtime/wbfat.go
new file mode 100644
index 000000000..75c58b26b
--- /dev/null
+++ b/src/runtime/wbfat.go
@@ -0,0 +1,190 @@
+// generated by wbfat_gen.go; use go generate
+
+package runtime
+
+//go:nosplit
+func writebarrierfat01(dst *[2]uintptr, _ *byte, src [2]uintptr) {
+	dst[0] = src[0]
+	writebarrierptr(&dst[1], src[1])
+}
+
+//go:nosplit
+func writebarrierfat10(dst *[2]uintptr, _ *byte, src [2]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	dst[1] = src[1]
+}
+
+//go:nosplit
+func writebarrierfat11(dst *[2]uintptr, _ *byte, src [2]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	writebarrierptr(&dst[1], src[1])
+}
+
+//go:nosplit
+func writebarrierfat001(dst *[3]uintptr, _ *byte, src [3]uintptr) {
+	dst[0] = src[0]
+	dst[1] = src[1]
+	writebarrierptr(&dst[2], src[2])
+}
+
+//go:nosplit
+func writebarrierfat010(dst *[3]uintptr, _ *byte, src [3]uintptr) {
+	dst[0] = src[0]
+	writebarrierptr(&dst[1], src[1])
+	dst[2] = src[2]
+}
+
+//go:nosplit
+func writebarrierfat011(dst *[3]uintptr, _ *byte, src [3]uintptr) {
+	dst[0] = src[0]
+	writebarrierptr(&dst[1], src[1])
+	writebarrierptr(&dst[2], src[2])
+}
+
+//go:nosplit
+func writebarrierfat100(dst *[3]uintptr, _ *byte, src [3]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	dst[1] = src[1]
+	dst[2] = src[2]
+}
+
+//go:nosplit
+func writebarrierfat101(dst *[3]uintptr, _ *byte, src [3]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	dst[1] = src[1]
+	writebarrierptr(&dst[2], src[2])
+}
+
+//go:nosplit
+func writebarrierfat110(dst *[3]uintptr, _ *byte, src [3]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	writebarrierptr(&dst[1], src[1])
+	dst[2] = src[2]
+}
+
+//go:nosplit
+func writebarrierfat111(dst *[3]uintptr, _ *byte, src [3]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	writebarrierptr(&dst[1], src[1])
+	writebarrierptr(&dst[2], src[2])
+}
+
+//go:nosplit
+func writebarrierfat0001(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	dst[0] = src[0]
+	dst[1] = src[1]
+	dst[2] = src[2]
+	writebarrierptr(&dst[3], src[3])
+}
+
+//go:nosplit
+func writebarrierfat0010(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	dst[0] = src[0]
+	dst[1] = src[1]
+	writebarrierptr(&dst[2], src[2])
+	dst[3] = src[3]
+}
+
+//go:nosplit
+func writebarrierfat0011(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	dst[0] = src[0]
+	dst[1] = src[1]
+	writebarrierptr(&dst[2], src[2])
+	writebarrierptr(&dst[3], src[3])
+}
+
+//go:nosplit
+func writebarrierfat0100(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	dst[0] = src[0]
+	writebarrierptr(&dst[1], src[1])
+	dst[2] = src[2]
+	dst[3] = src[3]
+}
+
+//go:nosplit
+func writebarrierfat0101(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	dst[0] = src[0]
+	writebarrierptr(&dst[1], src[1])
+	dst[2] = src[2]
+	writebarrierptr(&dst[3], src[3])
+}
+
+//go:nosplit
+func writebarrierfat0110(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	dst[0] = src[0]
+	writebarrierptr(&dst[1], src[1])
+	writebarrierptr(&dst[2], src[2])
+	dst[3] = src[3]
+}
+
+//go:nosplit
+func writebarrierfat0111(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	dst[0] = src[0]
+	writebarrierptr(&dst[1], src[1])
+	writebarrierptr(&dst[2], src[2])
+	writebarrierptr(&dst[3], src[3])
+}
+
+//go:nosplit
+func writebarrierfat1000(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	dst[1] = src[1]
+	dst[2] = src[2]
+	dst[3] = src[3]
+}
+
+//go:nosplit
+func writebarrierfat1001(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	dst[1] = src[1]
+	dst[2] = src[2]
+	writebarrierptr(&dst[3], src[3])
+}
+
+//go:nosplit
+func writebarrierfat1010(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	dst[1] = src[1]
+	writebarrierptr(&dst[2], src[2])
+	dst[3] = src[3]
+}
+
+//go:nosplit
+func writebarrierfat1011(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	dst[1] = src[1]
+	writebarrierptr(&dst[2], src[2])
+	writebarrierptr(&dst[3], src[3])
+}
+
+//go:nosplit
+func writebarrierfat1100(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	writebarrierptr(&dst[1], src[1])
+	dst[2] = src[2]
+	dst[3] = src[3]
+}
+
+//go:nosplit
+func writebarrierfat1101(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	writebarrierptr(&dst[1], src[1])
+	dst[2] = src[2]
+	writebarrierptr(&dst[3], src[3])
+}
+
+//go:nosplit
+func writebarrierfat1110(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	writebarrierptr(&dst[1], src[1])
+	writebarrierptr(&dst[2], src[2])
+	dst[3] = src[3]
+}
+
+//go:nosplit
+func writebarrierfat1111(dst *[4]uintptr, _ *byte, src [4]uintptr) {
+	writebarrierptr(&dst[0], src[0])
+	writebarrierptr(&dst[1], src[1])
+	writebarrierptr(&dst[2], src[2])
+	writebarrierptr(&dst[3], src[3])
+}
diff --git a/src/runtime/wbfat_gen.go b/src/runtime/wbfat_gen.go
new file mode 100644
index 000000000..78d5b6271
--- /dev/null
+++ b/src/runtime/wbfat_gen.go
@@ -0,0 +1,41 @@
+// Copyright 2014 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+)
+
+func main() {
+	flag.Parse()
+	if flag.NArg() > 0 {
+		f, err := os.Create(flag.Arg(0))
+		if err != nil {
+			log.Fatal(err)
+		}
+		os.Stdout = f
+	}
+	fmt.Printf("// generated by wbfat_gen.go; use go generate\n\n")
+	fmt.Printf("package runtime\n")
+	for i := uint(2); i <= 4; i++ {
+		for j := 1; j < 1<<i; j++ {
+			fmt.Printf("\n//go:nosplit\n")
+			fmt.Printf("func writebarrierfat%0*b(dst *[%d]uintptr, _ *byte, src [%d]uintptr) {\n", int(i), j, i, i)
+			for k := uint(0); k < i; k++ {
+				if j&(1<<(i-1-k)) != 0 {
+					fmt.Printf("\twritebarrierptr(&dst[%d], src[%d])\n", k, k)
+				} else {
+					fmt.Printf("\tdst[%d] = src[%d]\n", k, k)
+				}
+			}
+			fmt.Printf("}\n")
+		}
+	}
+}
diff --git a/src/sync/atomic/atomic_test.go b/src/sync/atomic/atomic_test.go
index 9f13af48b..ec573aa8c 100644
--- a/src/sync/atomic/atomic_test.go
+++ b/src/sync/atomic/atomic_test.go
@@ -164,7 +164,7 @@ func TestSwapPointer(t *testing.T) {
 	x.before = magicptr
 	x.after = magicptr
 	var j uintptr
-	for delta := uintptr(1); delta+delta > delta; delta += delta {
+	for delta := uintptr(1 << 16); delta+delta > delta; delta += delta {
 		k := SwapPointer(&x.i, unsafe.Pointer(delta))
 		if uintptr(x.i) != delta || uintptr(k) != j {
 			t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
@@ -456,7 +456,7 @@ func TestCompareAndSwapPointer(t *testing.T) {
 	magicptr := uintptr(m)
 	x.before = magicptr
 	x.after = magicptr
-	for val := uintptr(1); val+val > val; val += val {
+	for val := uintptr(1 << 16); val+val > val; val += val {
 		x.i = unsafe.Pointer(val)
 		if !CompareAndSwapPointer(&x.i, unsafe.Pointer(val), unsafe.Pointer(val+1)) {
 			t.Fatalf("should have swapped %#x %#x", val, val+1)
@@ -595,7 +595,7 @@ func TestLoadPointer(t *testing.T) {
 	magicptr := uintptr(m)
 	x.before = magicptr
 	x.after = magicptr
-	for delta := uintptr(1); delta+delta > delta; delta += delta {
+	for delta := uintptr(1 << 16); delta+delta > delta; delta += delta {
 		k := LoadPointer(&x.i)
 		if k != x.i {
 			t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k)
@@ -731,7 +731,7 @@ func TestStorePointer(t *testing.T) {
 	x.before = magicptr
 	x.after = magicptr
 	v := unsafe.Pointer(uintptr(0))
-	for delta := uintptr(1); delta+delta > delta; delta += delta {
+	for delta := uintptr(1 << 16); delta+delta > delta; delta += delta {
 		StorePointer(&x.i, unsafe.Pointer(v))
 		if x.i != v {
 			t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v)
diff --git a/test/linkx.go b/test/linkx.go
index 06888a229..151b6db1e 100644
--- a/test/linkx.go
+++ b/test/linkx.go
@@ -1,13 +1,11 @@
-// $G $D/$F.go && $L -X main.tbd hello -X main.overwrite trumped -X main.nosuchsymbol neverseen $F.$A && ./$A.out
-
-// NOTE: This test is not run by 'run.go' and so not run by all.bash.
-// To run this test you must use the ./run shell script.
+// skip
 
 // Copyright 2012 The Go Authors.  All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 // Test the -X facility of the gc linker (6l etc.).
+// This test is run by linkx_run.go.
 
 package main
 
@@ -15,10 +13,6 @@ var tbd string
 var overwrite string = "dibs"
 
 func main() {
-	if tbd != "hello" {
-		println("BUG: test/linkx tbd", len(tbd), tbd)
-	}
-	if overwrite != "trumped" {
-		println("BUG: test/linkx overwrite", len(overwrite), overwrite)
-	}
+	println(tbd)
+	println(overwrite)
 }
diff --git a/test/linkx_run.go b/test/linkx_run.go
new file mode 100644
index 000000000..5b67ce7d3
--- /dev/null
+++ b/test/linkx_run.go
@@ -0,0 +1,33 @@
+// +build !nacl
+// run
+
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Run the linkx test.
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+)
+
+func main() {
+	cmd := exec.Command("go", "run", "-ldflags=-X main.tbd hello -X main.overwrite trumped", "linkx.go")
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		fmt.Println(string(out))
+		fmt.Println(err)
+		os.Exit(1)
+	}
+
+	want := "hello\ntrumped\n"
+	got := string(out)
+	if got != want {
+		fmt.Printf("got %q want %q\n", got, want)
+		os.Exit(1)
+	}
+}
diff --git a/test/live.go b/test/live.go
index f69d0a4c1..62c6a0b0e 100644
--- a/test/live.go
+++ b/test/live.go
@@ -9,20 +9,39 @@
 
 package main
 
+func printnl()
+
+//go:noescape
+func printpointer(**int)
+
+//go:noescape
+func printintpointer(*int)
+
+//go:noescape
+func printstringpointer(*string)
+
+//go:noescape
+func printstring(string)
+
+//go:noescape
+func printbytepointer(*byte)
+
+func printint(int)
+
 func f1() {
 	var x *int
-	print(&x) // ERROR "live at call to printpointer: x$"
-	print(&x) // ERROR "live at call to printpointer: x$"
+	printpointer(&x) // ERROR "live at call to printpointer: x$"
+	printpointer(&x) // ERROR "live at call to printpointer: x$"
 }
 
 func f2(b bool) {
 	if b {
-		print(0) // nothing live here
+		printint(0) // nothing live here
 		return
 	}
 	var x *int
-	print(&x) // ERROR "live at call to printpointer: x$"
-	print(&x) // ERROR "live at call to printpointer: x$"
+	printpointer(&x) // ERROR "live at call to printpointer: x$"
+	printpointer(&x) // ERROR "live at call to printpointer: x$"
 }
 
 func f3(b bool) {
@@ -30,22 +49,22 @@ func f3(b bool) {
 	// live throughout the function, to avoid being poisoned
 	// in GODEBUG=gcdead=1 mode.
 
-	print(0) // ERROR "live at call to printint: x y$"
+	printint(0) // ERROR "live at call to printint: x y$"
 	if b == false {
-		print(0) // ERROR "live at call to printint: x y$"
+		printint(0) // ERROR "live at call to printint: x y$"
 		return
 	}
 
 	if b {
 		var x *int
-		print(&x) // ERROR "live at call to printpointer: x y$"
-		print(&x) // ERROR "live at call to printpointer: x y$"
+		printpointer(&x) // ERROR "live at call to printpointer: x y$"
+		printpointer(&x) // ERROR "live at call to printpointer: x y$"
 	} else {
 		var y *int
-		print(&y) // ERROR "live at call to printpointer: x y$"
-		print(&y) // ERROR "live at call to printpointer: x y$"
+		printpointer(&y) // ERROR "live at call to printpointer: x y$"
+		printpointer(&y) // ERROR "live at call to printpointer: x y$"
 	}
-	print(0) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live"
+	printint(0) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live"
 }
 
 // The old algorithm treated x as live on all code that
@@ -56,20 +75,20 @@ func f3(b bool) {
 
 func f4(b1, b2 bool) { // x not live here
 	if b2 {
-		print(0) // x not live here
+		printint(0) // x not live here
 		return
 	}
 	var z **int
 	x := new(int)
 	*x = 42
 	z = &x
-	print(**z) // ERROR "live at call to printint: x z$"
+	printint(**z) // ERROR "live at call to printint: x z$"
 	if b2 {
-		print(1) // ERROR "live at call to printint: x$"
+		printint(1) // ERROR "live at call to printint: x$"
 		return
 	}
 	for {
-		print(**z) // ERROR "live at call to printint: x z$"
+		printint(**z) // ERROR "live at call to printint: x z$"
 	}
 }
 
@@ -84,7 +103,7 @@ func f5(b1 bool) {
 		*y = 54
 		z = &y
 	}
-	print(**z) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live"
+	printint(**z) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live"
 }
 
 // confusion about the _ result used to cause spurious "live at entry to f6: _".
@@ -155,8 +174,8 @@ func f11b() *int {
 		// At this point p is dead: the code here cannot
 		// get to the bottom of the function.
 		// This used to have a spurious "live at call to printint: p".
-		print(1) // nothing live here!
-		select { // ERROR "live at call to newselect: autotmp" "live at call to selectgo: autotmp"
+		printint(1) // nothing live here!
+		select {    // ERROR "live at call to newselect: autotmp" "live at call to selectgo: autotmp"
 		case <-c: // ERROR "live at call to selectrecv: autotmp"
 			return nil
 		case <-c: // ERROR "live at call to selectrecv: autotmp"
@@ -172,8 +191,8 @@ func f11c() *int {
 	if b {
 		// Unlike previous, the cases in this select fall through,
 		// so we can get to the println, so p is not dead.
-		print(1) // ERROR "live at call to printint: p"
-		select { // ERROR "live at call to newselect: autotmp.* p" "live at call to selectgo: autotmp.* p"
+		printint(1) // ERROR "live at call to printint: p"
+		select {    // ERROR "live at call to newselect: autotmp.* p" "live at call to selectgo: autotmp.* p"
 		case <-c: // ERROR "live at call to selectrecv: autotmp.* p"
 		case <-c: // ERROR "live at call to selectrecv: autotmp.* p"
 		}
@@ -209,7 +228,7 @@ func h13(string, string) string
 
 func f14() {
 	x := g14()
-	print(&x) // ERROR "live at call to printpointer: x"
+	printstringpointer(&x) // ERROR "live at call to printstringpointer: x"
 }
 
 func g14() string
@@ -217,8 +236,8 @@ func g14() string
 func f15() {
 	var x string
 	_ = &x
-	x = g15() // ERROR "live at call to g15: x"
-	print(x)  // ERROR "live at call to printstring: x"
+	x = g15()      // ERROR "live at call to g15: x"
+	printstring(x) // ERROR "live at call to printstring: x"
 }
 
 func g15() string
@@ -282,7 +301,7 @@ func f18() {
 	}
 	z = m2[g18()] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
 	z = m2[g18()] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
-	print(z)
+	printbytepointer(z)
 }
 
 var ch chan *byte
@@ -296,7 +315,7 @@ func f19() {
 	}
 	z = <-ch // ERROR "live at call to chanrecv1: autotmp_[0-9]+$"
 	z = <-ch // ERROR "live at call to chanrecv1: autotmp_[0-9]+$"
-	print(z)
+	printbytepointer(z)
 }
 
 func f20() {
@@ -316,7 +335,7 @@ func f21() {
 	}
 	z = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
 	z = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
-	print(z)
+	printbytepointer(z)
 }
 
 func f23() {
@@ -328,7 +347,8 @@ func f23() {
 	}
 	z, ok = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess2: autotmp_[0-9]+$"
 	z, ok = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess2: autotmp_[0-9]+$"
-	print(z, ok)
+	printbytepointer(z)
+	print(ok)
 }
 
 func f24() {
@@ -350,8 +370,8 @@ func f25(b bool) {
 	}
 	var x string
 	_ = &x
-	x = g15() // ERROR "live at call to g15: x"
-	print(x)  // ERROR "live at call to printstring: x"
+	x = g15()      // ERROR "live at call to g15: x"
+	printstring(x) // ERROR "live at call to printstring: x"
 } // ERROR "live at call to deferreturn: x"
 
 func g25()
@@ -366,7 +386,7 @@ func f26(b bool) {
 	}
 	print26((*int)(nil), (*int)(nil), (*int)(nil)) // ERROR "live at call to print26: autotmp_[0-9]+$"
 	print26((*int)(nil), (*int)(nil), (*int)(nil)) // ERROR "live at call to print26: autotmp_[0-9]+$"
-	println()
+	printnl()
 }
 
 //go:noescape
@@ -381,7 +401,7 @@ func f27(b bool) {
 	}
 	call27(func() { x++ }) // ERROR "live at call to call27: autotmp_[0-9]+$"
 	call27(func() { x++ }) // ERROR "live at call to call27: autotmp_[0-9]+$"
-	println()
+	printnl()
 }
 
 // but defer does escape to later execution in the function
@@ -392,7 +412,7 @@ func f27defer(b bool) {
 		defer call27(func() { x++ }) // ERROR "live at call to deferproc: autotmp_[0-9]+$" "live at call to deferreturn: autotmp_[0-9]+$"
 	}
 	defer call27(func() { x++ }) // ERROR "live at call to deferproc: autotmp_[0-9]+ autotmp_[0-9]+$" "live at call to deferreturn: autotmp_[0-9]+ autotmp_[0-9]+$" "ambiguously live"
-	println()                    // ERROR "live at call to printnl: autotmp_[0-9]+ autotmp_[0-9]+$"
+	printnl()                    // ERROR "live at call to printnl: autotmp_[0-9]+ autotmp_[0-9]+$"
 } // ERROR "live at call to deferreturn: autotmp_[0-9]+ autotmp_[0-9]+$"
 
 // and newproc (go) escapes to the heap
@@ -403,7 +423,7 @@ func f27go(b bool) {
 		go call27(func() { x++ }) // ERROR "live at call to newobject: &x" "live at call to newproc: &x$"
 	}
 	go call27(func() { x++ }) // ERROR "live at call to newobject: &x"
-	println()
+	printnl()
 }
 
 //go:noescape
@@ -415,10 +435,10 @@ var s1, s2, s3, s4, s5, s6, s7, s8, s9, s10 string
 
 func f28(b bool) {
 	if b {
-		print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
+		printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
 	}
-	print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
-	print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
+	printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
+	printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
 }
 
 // map iterator should die on end of range loop
@@ -426,14 +446,14 @@ func f28(b bool) {
 func f29(b bool) {
 	if b {
 		for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$"
-			print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
+			printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
 		}
 	}
 	for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$"
-		print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
+		printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
 	}
 	for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$"
-		print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
+		printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
 	}
 }
 
@@ -446,14 +466,14 @@ func f30(b bool) {
 	// the copy of ptrarr and the internal iterator pointer.
 	if b {
 		for _, p := range ptrarr {
-			print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
+			printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
 		}
 	}
 	for _, p := range ptrarr {
-		print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
+		printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
 	}
 	for _, p := range ptrarr {
-		print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
+		printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
 	}
 }
 
@@ -503,44 +523,44 @@ var m33 map[interface{}]int
 
 func f33() {
 	if m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
-		println()
+		printnl()
 		return
 	} else {
-		println()
+		printnl()
 	}
-	println()
+	printnl()
 }
 
 func f34() {
 	if m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
-		println()
+		printnl()
 		return
 	}
-	println()
+	printnl()
 }
 
 func f35() {
 	if m33[nil] == 0 && m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
-		println()
+		printnl()
 		return
 	}
-	println()
+	printnl()
 }
 
 func f36() {
 	if m33[nil] == 0 || m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
-		println()
+		printnl()
 		return
 	}
-	println()
+	printnl()
 }
 
 func f37() {
 	if (m33[nil] == 0 || m33[nil] == 0) && m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
-		println()
+		printnl()
 		return
 	}
-	println()
+	printnl()
 }
 
 // select temps should disappear in the case bodies
@@ -558,44 +578,44 @@ func f38(b bool) {
 	if b {
 		select { // ERROR "live at call"
 		case <-fc38(): // ERROR "live at call"
-			println()
+			printnl()
 		case fc38() <- *fi38(1): // ERROR "live at call"
-			println()
+			printnl()
 		case *fi38(2) = <-fc38(): // ERROR "live at call"
-			println()
+			printnl()
 		case *fi38(3), *fb38() = <-fc38(): // ERROR "live at call"
-			println()
+			printnl()
 		}
-		println()
+		printnl()
 	}
-	println()
+	printnl()
 }
 
 // issue 8097: mishandling of x = x during return.
 
 func f39() (x []int) {
 	x = []int{1}
-	println() // ERROR "live at call to printnl: x"
+	printnl() // ERROR "live at call to printnl: x"
 	return x
 }
 
 func f39a() (x []int) {
 	x = []int{1}
-	println() // ERROR "live at call to printnl: x"
+	printnl() // ERROR "live at call to printnl: x"
 	return
 }
 
 func f39b() (x [10]*int) {
 	x = [10]*int{}
 	x[0] = new(int) // ERROR "live at call to newobject: x"
-	println()       // ERROR "live at call to printnl: x"
+	printnl()       // ERROR "live at call to printnl: x"
 	return x
 }
 
 func f39c() (x [10]*int) {
 	x = [10]*int{}
 	x[0] = new(int) // ERROR "live at call to newobject: x"
-	println()       // ERROR "live at call to printnl: x"
+	printnl()       // ERROR "live at call to printnl: x"
 	return
 }
 
@@ -615,13 +635,13 @@ func newT40() *T40 {
 func bad40() {
 	t := newT40()
 	_ = t
-	println()
+	printnl()
 }
 
 func good40() {
 	ret := T40{}
 	ret.m = make(map[int]int) // ERROR "live at call to makemap: ret"
 	t := &ret
-	println() // ERROR "live at call to printnl: ret"
+	printnl() // ERROR "live at call to printnl: ret"
 	_ = t
 }
diff --git a/test/live2.go b/test/live2.go
index ef6ad994c..1bd0af2cc 100644
--- a/test/live2.go
+++ b/test/live2.go
@@ -12,6 +12,8 @@ package main
 // issue 8142: lost 'addrtaken' bit on inlined variables.
 // no inlining in this test, so just checking that non-inlined works.
 
+func printnl()
+
 type T40 struct {
 	m map[int]int
 }
@@ -24,7 +26,7 @@ func newT40() *T40 {
 
 func bad40() {
 	t := newT40() // ERROR "live at call to makemap: ret"
-	println()     // ERROR "live at call to printnl: ret"
+	printnl()     // ERROR "live at call to printnl: ret"
 	_ = t
 }
 
@@ -32,6 +34,6 @@ func good40() {
 	ret := T40{}
 	ret.m = make(map[int]int) // ERROR "live at call to makemap: ret"
 	t := &ret
-	println() // ERROR "live at call to printnl: ret"
+	printnl() // ERROR "live at call to printnl: ret"
 	_ = t
 }
diff --git a/test/run.go b/test/run.go
index 28882cf54..e8ec2df9c 100644
--- a/test/run.go
+++ b/test/run.go
@@ -907,8 +907,6 @@ func (t *test) wantedErrors(file, short string) (errs []wantedError) {
 }
 
 var skipOkay = map[string]bool{
-	"linkx.go":            true, // like "run" but wants linker flags
-	"sinit.go":            true,
 	"fixedbugs/bug248.go": true, // combines errorcheckdir and rundir in the same dir.
 	"fixedbugs/bug302.go": true, // tests both .$O and .a imports.
 	"fixedbugs/bug345.go": true, // needs the appropriate flags in gc invocation.
diff --git a/test/sinit.go b/test/sinit.go
index 5e50e1100..df1a4cc93 100644
--- a/test/sinit.go
+++ b/test/sinit.go
@@ -1,7 +1,4 @@
-// $G -S $D/$F.go | egrep initdone >/dev/null && echo BUG sinit || true
-
-// NOTE: This test is not run by 'run.go' and so not run by all.bash.
-// To run this test you must use the ./run shell script.
+// skip
 
 // Copyright 2010 The Go Authors.  All rights reserved.
 // Use of this source code is governed by a BSD-style
@@ -9,6 +6,7 @@
 
 // Test that many initializations can be done at link time and
 // generate no executable init functions.
+// This test is run by sinit_run.go.
 
 package p
 
@@ -106,20 +104,27 @@ var answers = [...]int{
 }
 
 var (
-	copy_zero = zero
-	copy_one = one
-	copy_pi = pi
-	copy_slice = slice
+	copy_zero     = zero
+	copy_one      = one
+	copy_pi       = pi
+	copy_slice    = slice
 	copy_sliceInt = sliceInt
-	copy_hello = hello
-	copy_bytes = bytes
+	copy_hello    = hello
+
+	// Could be handled without an initialization function, but
+	// requires special handling for "a = []byte("..."); b = a"
+	// which is not a likely case.
+	// copy_bytes = bytes
+	// https://codereview.appspot.com/171840043 is one approach to
+	// make this special case work.
+
 	copy_four, copy_five = four, five
-	copy_x, copy_y = x, y
-	copy_nilslice = nilslice
-	copy_nilmap = nilmap
-	copy_nilfunc = nilfunc
-	copy_nilchan = nilchan
-	copy_nilptr = nilptr
+	copy_x, copy_y       = x, y
+	copy_nilslice        = nilslice
+	copy_nilmap          = nilmap
+	copy_nilfunc         = nilfunc
+	copy_nilchan         = nilchan
+	copy_nilptr          = nilptr
 )
 
 var copy_a = a
@@ -172,7 +177,7 @@ var sx []int
 var s0 = []int{0, 0, 0}
 var s1 = []int{1, 2, 3}
 
-func fi() int
+func fi() int { return 1 }
 
 var ax [10]int
 var a0 = [10]int{0, 0, 0}
@@ -202,58 +207,66 @@ var pt0b = &T{X: 0}
 var pt1 = &T{X: 1, Y: 2}
 var pt1a = &T{3, 4}
 
-var copy_bx = bx
+// The checks similar to
+// var copy_bx = bx
+// are commented out.  The compiler no longer statically initializes them.
+// See issue 7665 and https://codereview.appspot.com/93200044.
+// If https://codereview.appspot.com/169040043 is submitted, and this
+// test is changed to pass -complete to the compiler, then we can
+// uncomment the copy lines again.
+
+// var copy_bx = bx
 var copy_b0 = b0
 var copy_b1 = b1
 
-var copy_fx = fx
+// var copy_fx = fx
 var copy_f0 = f0
 var copy_f1 = f1
 
-var copy_gx = gx
+// var copy_gx = gx
 var copy_g0 = g0
 var copy_g1 = g1
 
-var copy_ix = ix
+// var copy_ix = ix
 var copy_i0 = i0
 var copy_i1 = i1
 
-var copy_jx = jx
+// var copy_jx = jx
 var copy_j0 = j0
 var copy_j1 = j1
 
-var copy_cx = cx
+// var copy_cx = cx
 var copy_c0 = c0
 var copy_c1 = c1
 
-var copy_dx = dx
+// var copy_dx = dx
 var copy_d0 = d0
 var copy_d1 = d1
 
-var copy_sx = sx
+// var copy_sx = sx
 var copy_s0 = s0
 var copy_s1 = s1
 
-var copy_ax = ax
+// var copy_ax = ax
 var copy_a0 = a0
 var copy_a1 = a1
 
-var copy_tx = tx
+// var copy_tx = tx
 var copy_t0 = t0
 var copy_t0a = t0a
 var copy_t0b = t0b
 var copy_t1 = t1
 var copy_t1a = t1a
 
-var copy_psx = psx
+// var copy_psx = psx
 var copy_ps0 = ps0
 var copy_ps1 = ps1
 
-var copy_pax = pax
+// var copy_pax = pax
 var copy_pa0 = pa0
 var copy_pa1 = pa1
 
-var copy_ptx = ptx
+// var copy_ptx = ptx
 var copy_pt0 = pt0
 var copy_pt0a = pt0a
 var copy_pt0b = pt0b
@@ -266,6 +279,8 @@ type T1 int
 
 func (t *T1) M() {}
 
-type Mer interface { M() }
+type Mer interface {
+	M()
+}
 
 var _ Mer = (*T1)(nil)
diff --git a/test/sinit_run.go b/test/sinit_run.go
new file mode 100644
index 000000000..b0a91ce5b
--- /dev/null
+++ b/test/sinit_run.go
@@ -0,0 +1,40 @@
+// +build !nacl
+// run
+
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Run the sinit test.
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"go/build"
+	"os"
+	"os/exec"
+)
+
+func main() {
+	letter, err := build.ArchChar(build.Default.GOARCH)
+	if err != nil {
+		fmt.Println(err)
+		os.Exit(1)
+	}
+
+	cmd := exec.Command("go", "tool", letter+"g", "-S", "sinit.go")
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		fmt.Println(string(out))
+		fmt.Println(err)
+		os.Exit(1)
+	}
+	os.Remove("sinit." + letter)
+
+	if bytes.Contains(out, []byte("initdone")) {
+		fmt.Println("sinit generated an init function")
+		os.Exit(1)
+	}
+}