30 files changed, 1750 insertions, 1581 deletions
diff --git a/ChangeLog b/ChangeLog
index b8391f8..ea3e0a0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -32,3 +32,23 @@ Fri Jun 24 18:02:26 2005  Google Inc. <opensource@google.com>
 	* Add support for mallopt() and mallinfo (sanjay)
 	* Improve stacktrace's performance on some 64-bit systems (etune)
 	* Improve the stacktrace unittest (etune)
+
+Wed Oct 26 15:19:16 2005  Google Inc. <opensource@google.com>
+
+	* Decrease fragmentation in tcmalloc (lefevere)
+	* Support for ARM in some of the thread-specific code (markus)
+	* Turn off heap-checker for statically-linked binaries, which
+	  cause error leak reports now (etune)
+	* Many pprof improvements, including a command-line interface (jeff)
+	* CPU profiling now automatically affects all threads in Linux 2.6.
+	  (Kernel bugs partly break CPU profiling with threads in Linux 2.4.)
+	  ProfilerEnable() and ProfilerDisable() are deprecated.  (sanjay)
+	* tcmalloc now correctly intercepts memalign (m3b, maxim)
+	* Syntax fix: added missing va_end()s.  Helps non-gcc compiling (etune)
+	* Fixed a few coredumper bugs: race condition after PTRACE_DETACH,
+	  ignore non-aligned stackframe pointers (markus, menage)
+	* 64-bit cleanup, especially for spinlock code (etune) and mmap (sanjay)
+	* Better support for finding threads in Linux (markus)
+	* tcmalloc now tracks those stack traces that allocate memory (sanjay)
+	* Work around a weird setspecific problem (sanjay)
+	* Fix tcmalloc overflow problems when an alloc is close to 2G/4G (sanjay)
diff --git a/Makefile.am b/Makefile.am
index 21ab0af..cab257c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,4 @@
-## Process this file with automake to produce Makefile.in       
+## Process this file with automake to produce Makefile.in
 
 # Note: for every library we create, we're explicit about what symbols
 # we export.  In order to avoid complications with C++ mangling, we always
@@ -26,7 +26,7 @@ docdir = $(prefix)/doc/$(PACKAGE)-$(VERSION)
 # Add your documentation files (in doc/) in addition to these
 # top-level boilerplate files.  Also add a TODO file if you have one.
 # We'll add to this later, on a library-by-library basis
-dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README TODO 
+dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README TODO
 
 # The libraries (.so's) you want to install
 # We'll add to this later, on a library-by-library basis
@@ -75,7 +75,7 @@ stacktrace_unittest_SOURCES = src/tests/stacktrace_unittest.cc \
 stacktrace_unittest_LDADD = libstacktrace.la
 
 ### Documentation
-dist_doc_DATA += 
+dist_doc_DATA +=
 
 ### ------- tcmalloc_minimal (thread-caching malloc)
 
@@ -90,7 +90,7 @@ S_TCMALLOC_MINIMAL_INCLUDES = src/config.h \
                               src/maybe_threads.h
 SG_TCMALLOC_MINIMAL_INCLUDES = src/google/malloc_hook.h \
                                src/google/malloc_extension.h \
-                               src/google/stacktrace.h 
+                               src/google/stacktrace.h
 SGP_TCMALLOC_MINIMAL_INCLUDES = src/google/perftools/hash_set.h
 TCMALLOC_MINIMAL_INCLUDES = $(S_TCMALLOC_MINIMAL_INCLUDES) $(SG_TCMALLOC_MINIMAL_INCLUDES) $(SGP_TCMALLOC_MINIMAL_INCLUDES)
 googleinclude_HEADERS += $(SG_TCMALLOC_MINIMAL_INCLUDES)
@@ -135,6 +135,12 @@ tcmalloc_unittest_CXXFLAGS = $(PTHREAD_CFLAGS)
 tcmalloc_unittest_LDFLAGS = $(PTHREAD_CFLAGS)
 tcmalloc_unittest_LDADD = libtcmalloc.la $(PTHREAD_LIBS)
 
+TESTS += tcmalloc_large_unittest
+tcmalloc_large_unittest_SOURCES = src/tests/tcmalloc_large_unittest.cc
+tcmalloc_large_unittest_CXXFLAGS = $(PTHREAD_CFLAGS)
+tcmalloc_large_unittest_LDFLAGS = $(PTHREAD_CFLAGS)
+tcmalloc_large_unittest_LDADD = libtcmalloc.la $(PTHREAD_LIBS)
+
 # performance/unittests originally from ptmalloc2
 TESTS += ptmalloc_unittest1 ptmalloc_unittest2
 PTMALLOC_UNITTEST_INCLUDES = src/tests/ptmalloc/t-test.h \
@@ -144,7 +150,7 @@ PTMALLOC_UNITTEST_INCLUDES = src/tests/ptmalloc/t-test.h \
                              src/tests/ptmalloc/malloc-machine.h
 ptmalloc_unittest1_SOURCES = src/tests/ptmalloc/t-test1.c \
                              $(PTMALLOC_UNITTEST_INCLUDES)
-ptmalloc_unittest1_CFLAGS = $(PTHREAD_CFLAGS) -DUSE_PTHREADS 
+ptmalloc_unittest1_CFLAGS = $(PTHREAD_CFLAGS) -DUSE_PTHREADS
 ptmalloc_unittest1_LDFLAGS = $(PTHREAD_CFLAGS)
 ptmalloc_unittest1_LDADD = $(PTHREAD_LIBS)
 ptmalloc_unittest2_SOURCES = src/tests/ptmalloc/t-test2.c \
@@ -180,7 +186,7 @@ dist_doc_DATA += doc/tcmalloc.html \
                  doc/tcmalloc-opspersec.vs.size.3.threads.png 		\
                  doc/tcmalloc-opspersec.vs.size.4.threads.png 		\
                  doc/tcmalloc-opspersec.vs.size.5.threads.png 		\
-                 doc/tcmalloc-opspersec.vs.size.8.threads.png 
+                 doc/tcmalloc-opspersec.vs.size.8.threads.png
 
 # I don't know how to say "distribute the .dot files but don't install them";
 # noinst doesn't seem to work with data.  I separate them out anyway, in case
@@ -206,6 +212,7 @@ S_TCMALLOC_INCLUDES = src/config.h \
                       src/base/logging.h \
                       src/base/googleinit.h \
                       src/base/elfcore.h \
+                      src/base/linux_syscall_support.h \
                       src/base/linuxthreads.h \
                       src/base/thread_lister.h \
                       src/maybe_threads.h
@@ -213,7 +220,7 @@ SG_TCMALLOC_INCLUDES = src/google/malloc_hook.h \
                        src/google/malloc_extension.h \
                        src/google/heap-profiler.h \
                        src/google/heap-checker.h \
-                       src/google/stacktrace.h 
+                       src/google/stacktrace.h
 SGP_TCMALLOC_INCLUDES = src/google/perftools/hash_set.h
 TCMALLOC_INCLUDES = $(S_TCMALLOC_INCLUDES) $(SG_TCMALLOC_INCLUDES) $(SGP_TCMALLOC_INCLUDES)
 googleinclude_HEADERS += $(SG_TCMALLOC_INCLUDES)
@@ -230,7 +237,6 @@ libtcmalloc_la_SOURCES = src/internal_logging.cc \
                          src/heap-profiler.cc \
                          src/heap-checker.cc \
                          src/heap-checker-bcad.cc \
-                         src/base/elfcore.c \
                          src/base/linuxthreads.c \
                          src/base/thread_lister.c \
                          $(TCMALLOC_INCLUDES)
@@ -304,7 +310,7 @@ S_CPU_PROFILER_INCLUDES = src/config.h \
                           src/base/logging.h
 SG_CPU_PROFILER_INCLUDES = src/google/profiler.h \
                            src/google/stacktrace.h
-SGP_CPU_PROFILER_INCLUDES = 
+SGP_CPU_PROFILER_INCLUDES =
 CPU_PROFILER_INCLUDES = $(S_CPU_PROFILER_INCLUDES) $(SG_CPU_PROFILER_INCLUDES) $(SGP_CPU_PROFILER_INCLUDES)
 googleinclude_HEADERS += $(SG_CPU_PROFILER_INCLUDES)
 perftoolsinclude_HEADERS += $(SGP_CPU_PROFILER_INCLUDES)
diff --git a/aclocal.m4 b/aclocal.m4
index ae8a9c0..0b68740 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -6969,21 +6969,31 @@ AC_DEFUN([AC_CXX_STL_NAMESPACE],
    fi
 ])
 
-# Checks whether the compiler implements namespaces
+dnl @synopsis AC_CXX_NAMESPACES
+dnl
+dnl If the compiler can prevent name clashes using namespaces, define
+dnl HAVE_NAMESPACES.
+dnl
+dnl @category Cxx
+dnl @author Todd Veldhuizen
+dnl @author Luc Maisonobe <luc@spaceroots.org>
+dnl @version 2004-02-04
+dnl @license AllPermissive
+
 AC_DEFUN([AC_CXX_NAMESPACES],
- [AC_CACHE_CHECK(whether the compiler implements namespaces,
-                 ac_cv_cxx_namespaces,
-                 [AC_LANG_SAVE
-                  AC_LANG_CPLUSPLUS
-                  AC_TRY_COMPILE([namespace Outer {
-                                    namespace Inner { int i = 0; }}],
-                                 [using namespace Outer::Inner; return i;],
-                                 ac_cv_cxx_namespaces=yes,
-                                 ac_cv_cxx_namespaces=no)
-                  AC_LANG_RESTORE])
-  if test "$ac_cv_cxx_namespaces" = yes; then
-    AC_DEFINE(HAVE_NAMESPACES, 1, [define if the compiler implements namespaces])
-  fi])
+[AC_CACHE_CHECK(whether the compiler implements namespaces,
+ac_cv_cxx_namespaces,
+[AC_LANG_SAVE
+ AC_LANG_CPLUSPLUS
+ AC_TRY_COMPILE([namespace Outer { namespace Inner { int i = 0; }}],
+                [using namespace Outer::Inner; return i;],
+ ac_cv_cxx_namespaces=yes, ac_cv_cxx_namespaces=no)
+ AC_LANG_RESTORE
+])
+if test "$ac_cv_cxx_namespaces" = yes; then
+  AC_DEFINE(HAVE_NAMESPACES,,[define if the compiler implements namespaces])
+fi
+])
 
 # Figures out where hash_set is defined, and then writes out the
 # location to the file specified in $1.  The output file also
diff --git a/configure b/configure
index 421ada1..339ad8f 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.57 for google-perftools 0.3.
+# Generated by GNU Autoconf 2.57 for google-perftools 0.4.
 #
 # Report bugs to <opensource@google.com>.
 #
@@ -422,8 +422,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='google-perftools'
 PACKAGE_TARNAME='google-perftools'
-PACKAGE_VERSION='0.3'
-PACKAGE_STRING='google-perftools 0.3'
+PACKAGE_VERSION='0.4'
+PACKAGE_STRING='google-perftools 0.4'
 PACKAGE_BUGREPORT='opensource@google.com'
 
 ac_unique_file="README"
@@ -953,7 +953,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures google-perftools 0.3 to adapt to many kinds of systems.
+\`configure' configures google-perftools 0.4 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1019,7 +1019,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of google-perftools 0.3:";;
+     short | recursive ) echo "Configuration of google-perftools 0.4:";;
    esac
   cat <<\_ACEOF
 
@@ -1125,7 +1125,7 @@ fi
 test -n "$ac_init_help" && exit 0
 if $ac_init_version; then
   cat <<\_ACEOF
-google-perftools configure 0.3
+google-perftools configure 0.4
 generated by GNU Autoconf 2.57
 
 Copyright 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, 2002
@@ -1140,7 +1140,7 @@ cat >&5 <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by google-perftools $as_me 0.3, which was
+It was created by google-perftools $as_me 0.4, which was
 generated by GNU Autoconf 2.57.  Invocation command line was
 
   $ $0 $@
@@ -1733,7 +1733,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE=google-perftools
- VERSION=0.3
+ VERSION=0.4
 
 
 cat >>confdefs.h <<_ACEOF
@@ -21090,21 +21090,20 @@ if test "${ac_cv_cxx_namespaces+set}" = set; then
 else
 
 
-                  ac_ext=cc
+ ac_ext=cc
 ac_cpp='$CXXCPP $CPPFLAGS'
 ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
 
-                  cat >conftest.$ac_ext <<_ACEOF
+ cat >conftest.$ac_ext <<_ACEOF
 #line $LINENO "configure"
 /* confdefs.h.  */
 _ACEOF
 cat confdefs.h >>conftest.$ac_ext
 cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
-namespace Outer {
-                                    namespace Inner { int i = 0; }}
+namespace Outer { namespace Inner { int i = 0; }}
 int
 main ()
 {
@@ -21133,22 +21132,24 @@ sed 's/^/| /' conftest.$ac_ext >&5
 ac_cv_cxx_namespaces=no
 fi
 rm -f conftest.$ac_objext conftest.$ac_ext
-                  ac_ext=c
+ ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
+
 fi
 echo "$as_me:$LINENO: result: $ac_cv_cxx_namespaces" >&5
 echo "${ECHO_T}$ac_cv_cxx_namespaces" >&6
-  if test "$ac_cv_cxx_namespaces" = yes; then
+if test "$ac_cv_cxx_namespaces" = yes; then
 
 cat >>confdefs.h <<\_ACEOF
-#define HAVE_NAMESPACES 1
+#define HAVE_NAMESPACES
 _ACEOF
 
-  fi
+fi
+
 echo "$as_me:$LINENO: checking what namespace STL code is in" >&5
 echo $ECHO_N "checking what namespace STL code is in... $ECHO_C" >&6
 if test "${ac_cv_cxx_stl_namespace+set}" = set; then
@@ -22015,7 +22016,7 @@ _ASBOX
 } >&5
 cat >&5 <<_CSEOF
 
-This file was extended by google-perftools $as_me 0.3, which was
+This file was extended by google-perftools $as_me 0.4, which was
 generated by GNU Autoconf 2.57.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22078,7 +22079,7 @@ _ACEOF
 
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-google-perftools config.status 0.3
+google-perftools config.status 0.4
 configured by $0, generated by GNU Autoconf 2.57,
   with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/configure.ac b/configure.ac
index 245d166..7da5e86 100644
--- a/configure.ac
+++ b/configure.ac
@@ -5,7 +5,7 @@
 # make sure we're interpreted by some minimal autoconf
 AC_PREREQ(2.57)
 
-AC_INIT(google-perftools, 0.3, opensource@google.com)
+AC_INIT(google-perftools, 0.4, opensource@google.com)
 # The argument here is just something that should be in the current directory
 # (for sanity checking)
 AC_CONFIG_SRCDIR(README)
diff --git a/doc/cpu_profiler.html b/doc/cpu_profiler.html
index ad0e9fd..bc18940 100644
--- a/doc/cpu_profiler.html
+++ b/doc/cpu_profiler.html
@@ -45,17 +45,13 @@ given run of an executable:</p>
      profile-filename as an argument.
 </ol>
 
-<p>Profiling works correctly with threads.  To use, just call
-ProfilerRegisterThread() at the beginning of the routine the thread
-runs.  Profiling also works correctly with sub-processes: each child
+<p>In Linux 2.6 and above, profiling works correctly with threads:
+all threads are profiled automatically.  In Linux 2.4, only the main
+thread is profiled (due to a kernel bug involving itimers and
+threads).  Profiling works correctly with sub-processes: each child
 process gets its own profile with its own name (generated by combining
 CPUPROFILE with the child's process id).</p>
 
-<p>You can also turn profiling on and off throughout the code, and do
-other tweaks.  This functionality will not frequently be needed.  See
-/usr/local/include/google/profiler.h (or src/google/profiler.h in this
-directory) for more details.</p>
-
 <p>For security reasons, CPU profiling will not write to a file -- and
 is thus not usable -- for setuid programs.</p>
 
@@ -68,12 +64,6 @@ profile.</p>
 
 <table frame=box rules=sides cellpadding=5 width=100%>
 <tr>
-<td><code>PROFILESELECTED=1</code></td>
-    <td>If set, cpu-profiler will only profile regions of code
-        surrounded with
-        <code>ProfilerEnable()</code>/<code>ProfilerDisable()</code>.
-    </td>
-</tr><tr>
 <td><code>PROFILEFREQUENCY=<i>x</i></code></td>
     <td>How many interrupts/second the cpu-profiler samples.
     </td>
diff --git a/src/base/elfcore.c b/src/base/elfcore.c
deleted file mode 100644
index d7bce9a..0000000
--- a/src/base/elfcore.c
+++ /dev/null
@@ -1,1046 +0,0 @@
-/* Copyright (c) 2005, Google Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *     * Neither the name of Google Inc. nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ---
- * Author: Markus Gutschke
- */
-
-#include "base/elfcore.h"
-#if defined DUMPER
-
-#include <elf.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <linux/unistd.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/prctl.h>
-#include <sys/ptrace.h>
-#include <sys/resource.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/sysctl.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/uio.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-#include "base/thread_lister.h"
-
-/* Definitions missing from the standard header files                        */
-#ifndef NT_PRFPXREG
-#define NT_PRFPXREG       20
-#endif
-#ifndef PTRACE_GETFPXREGS
-#define PTRACE_GETFPXREGS ((enum __ptrace_request)18)
-#endif
-#ifndef PR_GET_DUMPABLE
-#define PR_GET_DUMPABLE   3
-#endif
-#ifndef PR_SET_DUMPABLE
-#define PR_SET_DUMPABLE   4
-#endif
-
-
-/* Data structures found in x86-32/64 core dumps on Linux; similar data
- * structures are defined in /usr/include/{linux,asm}/... but those
- * headers conflict with the rest of the libc headers. So we cannot
- * include them here.
- */
-
-typedef struct i386_fpxregs {   /* SSE registers                             */
-  uint16_t  cwd;
-  uint16_t  swd;
-  uint16_t  twd;
-  uint16_t  fop;
-  uint32_t  fip;
-  uint32_t  fcs;
-  uint32_t  foo;
-  uint32_t  fos;
-  uint32_t  mxcsr;
-  uint32_t  mxcsr_mask;
-  uint32_t  st_space[32];       /*  8*16 bytes for each FP-reg  = 128 bytes  */
-  uint32_t  xmm_space[64];      /* 16*16 bytes for each XMM-reg = 128 bytes  */
-  uint32_t  padding[24];
-} i386_fpxregs;
-
-
-#ifdef __x86_64__
-/* Linux on x86-64 stores all FPU registers in the SSE structure             */
-typedef  i386_fpxregs i386_fpregs;
-#else
-typedef struct i386_fpregs {    /* FPU registers                             */
-  uint32_t  cwd;
-  uint32_t  swd;
-  uint32_t  twd;
-  uint32_t  fip;
-  uint32_t  fcs;
-  uint32_t  foo;
-  uint32_t  fos;
-  uint32_t  st_space[20];       /* 8*10 bytes for each FP-reg = 80 bytes     */
-} i386_fpregs;
-#endif
-
-
-typedef struct i386_timeval {   /* Time value with microsecond resolution    */
-  long tv_sec;                  /* Seconds                                   */
-  long tv_usec;                 /* Microseconds                              */
-} i386_timeval;
-
-
-typedef struct i386_siginfo {   /* Information about signal (unused)         */
-  int32_t si_signo;             /* Signal number                             */
-  int32_t si_code;              /* Extra code                                */
-  int32_t si_errno;             /* Errno                                     */
-} i386_siginfo;
-
-
-typedef struct i386_prstatus {  /* Information about thread; includes CPU reg*/
-  struct i386_siginfo pr_info;  /* Info associated with signal               */
-  uint16_t       pr_cursig;     /* Current signal                            */
-  unsigned long  pr_sigpend;    /* Set of pending signals                    */
-  unsigned long  pr_sighold;    /* Set of held signals                       */
-  pid_t          pr_pid;        /* Process ID                                */
-  pid_t          pr_ppid;       /* Parent's process ID                       */
-  pid_t          pr_pgrp;       /* Group ID                                  */
-  pid_t          pr_sid;        /* Session ID                                */
-  i386_timeval   pr_utime;      /* User time                                 */
-  i386_timeval   pr_stime;      /* System time                               */
-  i386_timeval   pr_cutime;     /* Cumulative user time                      */
-  i386_timeval   pr_cstime;     /* Cumulative system time                    */
-  i386_regs      pr_reg;        /* CPU registers                             */
-  uint32_t       pr_fpvalid;    /* True if math co-processor being used      */
-} i386_prstatus;
-
-
-typedef struct i386_prpsinfo {  /* Information about process                 */
-  unsigned char  pr_state;      /* Numeric process state                     */
-  char           pr_sname;      /* Char for pr_state                         */
-  unsigned char  pr_zomb;       /* Zombie                                    */
-  signed char    pr_nice;       /* Nice val                                  */
-  unsigned long  pr_flag;       /* Flags                                     */
-#ifdef __x86_64__
-  uint32_t       pr_uid;        /* User ID                                   */
-  uint32_t       pr_gid;        /* Group ID                                  */
-#else
-  uint16_t       pr_uid;        /* User ID                                   */
-  uint16_t       pr_gid;        /* Group ID                                  */
-#endif
-  pid_t          pr_pid;        /* Process ID                                */
-  pid_t          pr_ppid;       /* Parent's process ID                       */
-  pid_t          pr_pgrp;       /* Group ID                                  */
-  pid_t          pr_sid;        /* Session ID                                */
-  char           pr_fname[16];  /* Filename of executable                    */
-  char           pr_psargs[80]; /* Initial part of arg list                  */
-} i386_prpsinfo;
-
-
-typedef struct i386_user {      /* Ptrace returns this data for thread state */
-  i386_regs      regs;          /* CPU registers                             */
-  unsigned long  fpvalid;       /* True if math co-processor being used      */
-  i386_fpregs    fpregs;        /* FPU registers                             */
-  unsigned long  tsize;         /* Text segment size in pages                */
-  unsigned long  dsize;         /* Data segment size in pages                */
-  unsigned long  ssize;         /* Stack segment size in pages               */
-  unsigned long  start_code;    /* Starting virtual address of text          */
-  unsigned long  start_stack;   /* Starting virtual address of stack area    */
-  unsigned long  signal;        /* Signal that caused the core dump          */
-  unsigned long  reserved;      /* No longer used                            */
-  i386_regs      *regs_ptr;     /* Used by gdb to help find the CPU registers*/
-  i386_fpregs    *fpregs_ptr;   /* Pointer to FPU registers                  */
-  unsigned long  magic;         /* Magic for old A.OUT core files            */
-  char           comm[32];      /* User command that was responsible         */
-  unsigned long  debugreg[8];
-  unsigned long  error_code;    /* CPU error code or 0                       */
-  unsigned long  fault_address; /* CR3 or 0                                  */
-} i386_user;
-
-
-#ifdef __x86_64__
-  #define ELF_CLASS ELFCLASS64
-  #define ELF_ARCH  EM_X86_64
-  #define Ehdr      Elf64_Ehdr
-  #define Phdr      Elf64_Phdr
-  #define Shdr      Elf64_Shdr
-  #define Nhdr      Elf64_Nhdr
-#else
-  #define ELF_CLASS ELFCLASS32
-  #define ELF_ARCH  EM_386
-  #define Ehdr      Elf32_Ehdr
-  #define Phdr      Elf32_Phdr
-  #define Shdr      Elf32_Shdr
-  #define Nhdr      Elf32_Nhdr
-#endif
-
-
-/* After forking, we must make sure to only call system calls.               */
-#if __BOUNDED_POINTERS__
-  #error "Need to port invocations of syscalls for bounded ptrs"
-#else
-  /* The code in this file gets executed after threads have been suspended.
-   * As a consequence, we cannot call any functions that acquire locks.
-   * Unfortunately, libc wraps most system calls (e.g. in order to implement
-   * pthread_atfork, and to make calls cancellable), which means we cannot
-   * call these functions. Instead, we have to call syscall() directly.
-   */
-  #include <stdarg.h>
-  #include <syscall.h>
-  #ifdef __x86_64__
-    #define sys_recvmsg(s,m,f)      syscall(SYS_recvmsg,    (s), (m), (f))
-    #define sys_sendmsg(s,m,f)      syscall(SYS_sendmsg,    (s), (m), (f))
-    #define sys_shutdown(s,h)       syscall(SYS_shutdown,   (s), (h))
-    #define sys_sigaction(s,a,o)    syscall(SYS_rt_sigaction,    (s), (a),(o),\
-                                                                       _NSIG/8)
-    #define sys_sigprocmask(h,s,o)  syscall(SYS_rt_sigprocmask,  (h), (s),(o),\
-                                                                       _NSIG/8)
-    #define sys_socketpair(d,t,p,s) syscall(SYS_socketpair, (d), (t), (p),(s))
-    #define sys_waitpid(p,s,o)      syscall(SYS_wait4, (p), (s), (o),(void *)0)
-  #else
-    static int sys_socketcall(int op, ...) {
-      int rc;
-      va_list ap;
-      va_start(ap, op);
-      rc = syscall(SYS_socketcall, op, ap);
-      va_end(ap);
-      return rc;
-    }
-    #define sys_recvmsg(s,m,f)      sys_socketcall(17,      (s), (m), (f))
-    #define sys_sendmsg(s,m,f)      sys_socketcall(16,      (s), (m), (f))
-    #define sys_shutdown(s,h)       sys_socketcall(13,      (s), (h))
-    #define sys_sigaction(s,a,o)    syscall(SYS_sigaction,  (s), (a), (o))
-    #define sys_sigprocmask(h,s,o)  syscall(SYS_sigprocmask,(h), (s), (o))
-    #define sys_socketpair(d,t,p,s) sys_socketcall(8,       (d), (t), (p),(s))
-    #define sys_waitpid(p,s,o)      syscall(SYS_waitpid,    (p), (s), (o))
-  #endif
-  #define sys_close(f)              syscall(SYS_close,      (f))
-  #define sys_exit(r)               syscall(SYS_exit,       (r))
-  #define sys_fork()                syscall(SYS_fork)
-  #define sys_getegid()             syscall(SYS_getegid)
-  #define sys_geteuid()             syscall(SYS_geteuid)
-  #define sys_getpgrp()             syscall(SYS_getpgrp)
-  #define sys_getpid()              syscall(SYS_getpid)
-  #define sys_getppid()             syscall(SYS_getppid)
-  #define sys_getpriority(a,b)      syscall(SYS_getpriority)
-  #define sys_getrlimit(r,l)        syscall(SYS_getrlimit,  (r), (l))
-  #define sys_getsid(p)             syscall(SYS_getsid,     (p))
-  #define sys_open(f,p,m)           syscall(SYS_open,       (f), (p), (m))
-  #define sys_pipe(f)               syscall(SYS_pipe,       (f))
-  #define sys_prctl(o,a)            syscall(SYS_prctl,      (o), (a))
-  #define sys_ptrace(r,p,a,d)       syscall(SYS_ptrace,     (r), (p), (a),(d))
-  #define sys_read(f,b,c)           syscall(SYS_read,       (f), (b), (c))
-  #define sys_readlink(p,b,s)       syscall(SYS_readlink,   (p), (b), (s))
-  #define sys_write(f,b,c)          syscall(SYS_write,      (f), (b), (c))
-
-  static int sys_sysconf(int name) {
-    extern int __getpagesize(void);
-    switch (name) {
-      case _SC_OPEN_MAX: {
-        struct rlimit ru;
-        return sys_getrlimit(RLIMIT_NOFILE, &ru) < 0 ? 8192 : ru.rlim_cur;
-      }
-      case _SC_PAGESIZE:
-        return __getpagesize();
-      default:
-        errno = ENOSYS;
-        return -1;
-    }
-  }
-
-  static pid_t sys_gettid() {
-    #ifndef SYS_gettid
-      #define SYS_gettid 224
-    #endif
-    pid_t tid = syscall(SYS_gettid);
-    if (tid != -1) {
-      return tid;
-    }
-    return sys_getpid();
-  }
-#endif
-
-
-/* Re-runs fn until it doesn't cause EINTR
- */
-
-#define NO_INTR(fn)   do {} while ((fn) < 0 && errno == EINTR)
-
-/* Wrapper for read() which is guaranteed to never return EINTR.
- */
-static ssize_t c_read(int f, const void *buf, size_t bytes) {
-  if (bytes > 0) {
-    ssize_t rc;
-    NO_INTR(rc = sys_read(f, buf, bytes));
-    return rc;
-  }
-  return 0;
-}
-
-/* Wrapper for write() which is guaranteed to never return EINTR nor
- * short writes.
- */
-static ssize_t c_write(int f, const void *void_buf, size_t bytes) {
-  const unsigned char *buf = (const unsigned char*)void_buf;
-  size_t len = bytes;
-  while (len > 0) {
-    ssize_t rc;
-    NO_INTR(rc = sys_write(f, buf, len));
-    if (rc < 0)
-      return rc;
-    else if (rc == 0)
-      break;
-    buf += rc;
-    len -= rc;
-  }
-  return bytes;
-}
-
-
-struct io {
-  int fd;
-  unsigned char *data, *end;
-  unsigned char buf[4096];
-};
-
-
-/* Reads one character from the "io" file. This function has the same
- * semantics as fgetc(), but we cannot call any library functions at this
- * time.
- */
-static int GetChar(struct io *io) {
-  unsigned char *ptr = io->data;
-  if (ptr == io->end) {
-    /* Even though we are parsing one character at a time, read in larger
-     * chunks.
-     */
-    ssize_t n = c_read(io->fd, io->buf, sizeof(io->buf));
-    if (n <= 0) {
-      if (n == 0)
-        errno = 0;
-      return -1;
-    }
-    ptr = &io->buf[0];
-    io->end = &io->buf[n];
-  }
-  io->data = ptr+1;
-  return *ptr;
-}
-
-
-/* Place the hex number read from "io" into "*hex".  The first non-hex
- * character is returned (or -1 in the case of end-of-file).
- */
-static int GetHex(struct io *io, size_t *hex) {
-  int ch;
-  *hex = 0;
-  while (((ch = GetChar(io)) >= '0' && ch <= '9') ||
-         (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'))
-    *hex = (*hex << 4) | (ch < 'A' ? ch - '0' : (ch & 0xF) + 9);
-  return ch;
-}
-
-
-/* Computes the amount of leading zeros in a memory region.
- */
-static size_t LeadingZeros(int *loopback, void *mem, size_t len,
-                           size_t pagesize) {
-  char   buf[pagesize];
-  size_t count;
-  char   *ptr = 0;
-  for (count = 0; count < len; ) {
-    /* Read a page by going through the pipe. Assume that we can write at
-     * least one page without blocking.
-     *
-     * "Normal" kernels do not require this hack. But some of the security
-     * patches (e.g. grsec) can be configured to disallow read access of
-     * executable pages. So, directly scanning the memory range would
-     * result in a segmentation fault.
-     *
-     * If we cannot access a page, we assume that it was all zeros.
-     */
-    if ((count % pagesize) == 0) {
-      if (c_write(loopback[1], (char *)mem + count, pagesize) < 0 ||
-          c_read(loopback[0],  buf,                 pagesize) < 0) {
-        count += pagesize;
-        continue;
-      } else
-        ptr = buf;
-    }
-    if (*ptr++)
-      break;
-    count++;
-  }
-  return count & ~(pagesize-1);
-}
-
-
-/* This function is invoked from a seperate process. It has access to a
- * copy-on-write copy of the parents address space, and all crucial
- * information about the parent has been computed by the caller.
- */
-static void CreateElfCore(int fd, i386_prpsinfo *prpsinfo, i386_user *user,
-                          i386_prstatus *prstatus, int num_threads,
-                          pid_t *pids, i386_regs *regs, i386_fpregs *fpregs,
-                          i386_fpxregs *fpxregs, size_t pagesize) {
-  /* Count the number of mappings in "/proc/self/maps". We are guaranteed
-   * that this number is not going to change while this function executes.
-   */
-  int       num_mappings = 0;
-  struct io io;
-  int       loopback[2] = { -1, -1 };
-
-  if (sys_pipe(loopback) < 0)
-    goto done;
-
-  io.data = io.end = 0;
-  NO_INTR(io.fd = sys_open("/proc/self/maps", O_RDONLY, 0));
-  if (io.fd >= 0) {
-    int i, ch;
-    while ((ch = GetChar(&io)) >= 0) {
-      num_mappings += (ch == '\n');
-    }
-    if (errno != 0) {
-   read_error:
-      NO_INTR(sys_close(io.fd));
-      goto done;
-    }
-    NO_INTR(sys_close(io.fd));
-
-    /* Read all mappings. This requires re-opening "/proc/self/maps"         */
-    /* scope */ {
-      struct {
-        size_t start_address, end_address, offset;
-        int   flags;
-      } mappings[num_mappings];
-      io.data = io.end = 0;
-      NO_INTR(io.fd = sys_open("/proc/self/maps", O_RDONLY, 0));
-      if (io.fd >= 0) {
-        size_t note_align;
-        /* Parse entries of the form:
-         * "^[0-9A-F]*-[0-9A-F]* [r-][w-][x-][p-] [0-9A-F]*.*$"
-         */
-        for (i = 0; i < num_mappings;) {
-          static const char * const dev_zero = "/dev/zero";
-          const char *dev = dev_zero;
-          int    j, is_device;
-          size_t zeros;
-
-          memset(&mappings[i], 0, sizeof(mappings[i]));
-
-          /* Read start and end addresses                                    */
-          if (GetHex(&io, &mappings[i].start_address) != '-' ||
-              GetHex(&io, &mappings[i].end_address)   != ' ')
-            goto read_error;
-
-          /* Read flags                                                      */
-          while ((ch = GetChar(&io)) != ' ') {
-            if (ch < 0)
-              goto read_error;
-            mappings[i].flags = (mappings[i].flags << 1) | (ch != '-');
-          }
-          /* Drop the private/shared bit. This makes the flags compatible with
-           * the ELF access bits
-           */
-          mappings[i].flags >>= 1;
-
-          /* Read offset                                                     */
-          if ((ch = GetHex(&io, &mappings[i].offset)) != ' ')
-            goto read_error;
-
-          /* Skip over device numbers, and inode number                      */
-          for (j = 0; j < 2; j++) {
-            while (ch == ' ') {
-              ch = GetChar(&io);
-            }
-            while (ch != ' ' && ch != '\n') {
-              if (ch < 0)
-                goto read_error;
-              ch = GetChar(&io);
-            }
-            while (ch == ' ') {
-              ch = GetChar(&io);
-            }
-            if (ch < 0)
-              goto read_error;
-          }
-
-          /* Check whether this is a mapping for a device                    */
-          while (*dev && ch == *dev) {
-            ch = GetChar(&io);
-            dev++;
-          }
-          is_device = dev >= dev_zero + 5 &&
-                      ((ch != '\n' && ch != ' ') || *dev != '\000');
-
-          /* Skip until end of line                                          */
-          while (ch != '\n') {
-            if (ch < 0)
-              goto read_error;
-            ch = GetChar(&io);
-          }
-
-          /* Skip leading zeroed pages (as found in the stack segment)       */
-          if ((mappings[i].flags & PF_R) && !is_device) {
-            zeros = LeadingZeros(loopback, (void *)mappings[i].start_address,
-                         mappings[i].end_address - mappings[i].start_address,
-                         pagesize);
-            mappings[i].start_address += zeros;
-          }
-
-          /* Remove mapping, if it was not readable, or completely zero
-           * anyway. The former is usually the case of stack guard pages, and
-           * the latter occasionally happens for unused memory.
-           * Also, be careful not to touch mapped devices.
-           */
-          if ((mappings[i].flags & PF_R) == 0 ||
-              mappings[i].start_address == mappings[i].end_address ||
-              is_device) {
-            num_mappings--;
-          } else {
-            i++;
-          }
-        }
-        NO_INTR(sys_close(io.fd));
-
-        /* Write out the ELF header                                          */
-        /* scope */ {
-          Ehdr ehdr;
-          memset(&ehdr, 0, sizeof(ehdr));
-          ehdr.e_ident[0] = ELFMAG0;
-          ehdr.e_ident[1] = ELFMAG1;
-          ehdr.e_ident[2] = ELFMAG2;
-          ehdr.e_ident[3] = ELFMAG3;
-          ehdr.e_ident[4] = ELF_CLASS;
-          ehdr.e_ident[5] = ELFDATA2LSB;
-          ehdr.e_ident[6] = EV_CURRENT;
-          ehdr.e_type     = ET_CORE;
-          ehdr.e_machine  = ELF_ARCH;
-          ehdr.e_version  = EV_CURRENT;
-          ehdr.e_phoff    = sizeof(ehdr);
-          ehdr.e_ehsize   = sizeof(ehdr);
-          ehdr.e_phentsize= sizeof(Phdr);
-          ehdr.e_phnum    = num_mappings + 1;
-          ehdr.e_shentsize= sizeof(Shdr);
-          if (c_write(fd, &ehdr, sizeof(ehdr)) != sizeof(ehdr)) {
-            goto done;
-          }
-        }
-
-        /* Write program headers, starting with the PT_NOTE entry            */
-        /* scope */ {
-          Phdr   phdr;
-          size_t offset   = sizeof(Ehdr) + (num_mappings + 1)*sizeof(Phdr);
-          size_t filesz   = sizeof(Nhdr) + 4 + sizeof(i386_prpsinfo) +
-                            sizeof(Nhdr) + 4 + sizeof(i386_user) +
-                            num_threads*(
-                            + sizeof(Nhdr) + 4 + sizeof(i386_prstatus)
-                            + sizeof(Nhdr) + 4 + sizeof(i386_fpregs));
-          #ifndef __x86_64__
-          if (fpxregs) {
-            filesz       += num_threads*(
-                              sizeof(Nhdr) + 4 + sizeof(i386_fpxregs));
-          }
-          #endif
-          memset(&phdr, 0, sizeof(phdr));
-          phdr.p_type     = PT_NOTE;
-          phdr.p_offset   = offset;
-          phdr.p_filesz   = filesz;
-          if (c_write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) {
-            goto done;
-          }
-
-          /* Now follow with program headers for each of the memory segments */
-          phdr.p_type     = PT_LOAD;
-          phdr.p_align    = pagesize;
-          phdr.p_paddr    = 0;
-          note_align      = phdr.p_align - ((offset+filesz) % phdr.p_align);
-          if (note_align == phdr.p_align)
-            note_align    = 0;
-          offset         += note_align;
-          for (i = 0; i < num_mappings; i++) {
-            offset       += filesz;
-            filesz        = mappings[i].end_address -mappings[i].start_address;
-            phdr.p_offset = offset;
-            phdr.p_vaddr  = mappings[i].start_address;
-            phdr.p_memsz  = filesz;
-
-            /* Do not write contents for memory segments that are read-only  */
-            if ((mappings[i].flags & PF_W) == 0)
-              filesz      = 0;
-            phdr.p_filesz = filesz;
-            phdr.p_flags  = mappings[i].flags;
-            if (c_write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) {
-              goto done;
-            }
-          }
-        }
-
-        /* Write note section                                                */
-        /* scope */ {
-          Nhdr nhdr;
-          memset(&nhdr, 0, sizeof(nhdr));
-          nhdr.n_namesz   = 4;
-          nhdr.n_descsz   = sizeof(i386_prpsinfo);
-          nhdr.n_type     = NT_PRPSINFO;
-          if (c_write(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr) ||
-              c_write(fd, "CORE", 4) != 4 ||
-              c_write(fd, prpsinfo, sizeof(i386_prpsinfo)) !=
-              sizeof(i386_prpsinfo)) {
-            goto done;
-          }
-          nhdr.n_descsz   = sizeof(i386_user);
-          nhdr.n_type     = NT_PRXREG;
-          if (c_write(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr) ||
-              c_write(fd, "CORE", 4) != 4 ||
-              c_write(fd, user, sizeof(i386_user)) != sizeof(i386_user)) {
-            goto done;
-          }
-
-          for (i = num_threads; i-- > 0; ) {
-            /* Process status and integer registers                          */
-            nhdr.n_descsz = sizeof(i386_prstatus);
-            nhdr.n_type   = NT_PRSTATUS;
-            prstatus->pr_pid = pids[i];
-            prstatus->pr_reg = regs[i];
-            if (c_write(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr) ||
-                c_write(fd, "CORE", 4) != 4 ||
-                c_write(fd, prstatus, sizeof(i386_prstatus)) !=
-                sizeof(i386_prstatus)) {
-              goto done;
-            }
-
-            /* FPU registers                                                 */
-            nhdr.n_descsz = sizeof(i386_fpregs);
-            nhdr.n_type   = NT_FPREGSET;
-            if (c_write(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr) ||
-                c_write(fd, "CORE", 4) != 4 ||
-                c_write(fd, fpregs+1, sizeof(i386_fpregs)) !=
-                sizeof(i386_fpregs)) {
-              goto done;
-            }
-
-            /* SSE registers                                                 */
-            #ifndef __x86_64__
-            /* Linux on x86-64 stores all FPU registers in the SSE structure */
-            if (fpxregs) {
-              nhdr.n_descsz = sizeof(i386_fpxregs);
-              nhdr.n_type   = NT_PRFPXREG;
-              if (c_write(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr) ||
-                  c_write(fd, "CORE", 4) != 4 ||
-                  c_write(fd, fpxregs+1, sizeof(i386_fpxregs)) !=
-                  sizeof(i386_fpxregs)) {
-                goto done;
-              }
-            }
-            #endif
-          }
-        }
-
-        /* Align all following segments to multiples of page size            */
-        if (note_align) {
-          char scratch[note_align];
-          memset(scratch, 0, sizeof(scratch));
-          if (c_write(fd, scratch, sizeof(scratch)) != sizeof(scratch)) {
-            goto done;
-          }
-        }
-
-        /* Write all memory segments                                         */
-        for (i = 0; i < num_mappings; i++) {
-          if (mappings[i].flags & PF_W &&
-              c_write(fd, (void *)mappings[i].start_address,
-                      mappings[i].end_address - mappings[i].start_address) !=
-                      mappings[i].end_address - mappings[i].start_address) {
-            goto done;
-          }
-        }
-      }
-    }
-  }
-
-done:
-  if (loopback[0] >= 0)
-    NO_INTR(sys_close(loopback[0]));
-  if (loopback[1] >= 0)
-    NO_INTR(sys_close(loopback[1]));
-  NO_INTR(sys_close(fd));
-  return;
-}
-
-
-/* Internal function for generating a core file. This function works for
- * both single- and multi-threaded core files. It assumes that all threads
- * are already suspended, and will resume them before returning.
- *
- * The caller must make sure that prctl(PR_SET_DUMPABLE, 1) has been called,
- * or this function might fail.
- */
-int InternalGetCoreDump(void *frame, int num_threads, pid_t *thread_pids) {
-  long          i;
-  int           rc = -1, fd = -1, threads = num_threads, hasSSE = 0;
-  i386_prpsinfo prpsinfo;
-  i386_prstatus prstatus;
-  pid_t         pids[threads           + 1];
-  i386_regs     thread_regs[threads    + 1];
-  i386_fpregs   thread_fpregs[threads  + 1];
-  i386_fpxregs  thread_fpxregs[threads + 1];
-  int           pair[2];
-  int           main_pid = sys_gettid();
-
-  /* Get thread status                                                       */
-  if (threads)
-    memcpy(pids, thread_pids, threads * sizeof(pid_t));
-  memset(thread_regs,    0, (threads + 1) * sizeof(i386_regs));
-  memset(thread_fpregs,  0, (threads + 1) * sizeof(i386_fpregs));
-  memset(thread_fpxregs, 0, (threads + 1) * sizeof(i386_fpxregs));
-
-  /* Threads are already attached, read their registers now                  */
-  for (i = 0; i < threads; i++) {
-    char scratch[4096];
-    memset(scratch, 0xFF, sizeof(scratch));
-    if (sys_ptrace(PTRACE_GETREGS, pids[i], scratch, scratch) == 0) {
-      memcpy(thread_regs + i, scratch, sizeof(i386_regs));
-      memset(scratch, 0xFF, sizeof(scratch));
-      if (sys_ptrace(PTRACE_GETFPREGS, pids[i], scratch, scratch) == 0) {
-        memcpy(thread_fpregs + i, scratch, sizeof(i386_fpregs));
-        memset(scratch, 0xFF, sizeof(scratch));
-        #ifndef __x86_64__
-        /* Linux on x86-64 stores all FPU registers in the SSE structure     */
-        if (sys_ptrace(PTRACE_GETFPXREGS, pids[i], scratch, scratch) == 0) {
-          memcpy(thread_fpxregs + i, scratch, sizeof(i386_fpxregs));
-        } else {
-          hasSSE = 0;
-        }
-        #endif
-      } else
-        goto ptrace;
-    } else {
-   ptrace: /* Oh, well, undo everything and get out of here                  */
-      ResumeAllProcessThreads(threads, pids);
-      goto error;
-    }
-  }
-
-  /* Build the PRPSINFO data structure                                       */
-  memset(&prpsinfo, 0, sizeof(prpsinfo));
-  prpsinfo.pr_sname = 'R';
-  prpsinfo.pr_nice  = sys_getpriority(PRIO_PROCESS, 0);
-  prpsinfo.pr_uid   = sys_geteuid();
-  prpsinfo.pr_gid   = sys_getegid();
-  prpsinfo.pr_pid   = main_pid;
-  prpsinfo.pr_ppid  = sys_getppid();
-  prpsinfo.pr_pgrp  = sys_getpgrp();
-  prpsinfo.pr_sid   = sys_getsid(0);
-  /* scope */ {
-    char scratch[4096], *cmd = scratch, *ptr;
-    ssize_t size, len;
-    int cmd_fd;
-    memset(&scratch, 0, sizeof(scratch));
-    size = sys_readlink("/proc/self/exe", scratch, sizeof(scratch));
-    len = 0;
-    for (ptr = cmd; *ptr != '\000' && size-- > 0; ptr++) {
-      if (*ptr == '/') {
-        cmd = ptr+1;
-        len = 0;
-      } else
-        len++;
-    }
-    memcpy(prpsinfo.pr_fname, cmd,
-           len > sizeof(prpsinfo.pr_fname) ? sizeof(prpsinfo.pr_fname) : len);
-    NO_INTR(cmd_fd = sys_open("/proc/self/cmdline", O_RDONLY, 0));
-    if (cmd_fd >= 0) {
-      char *ptr;
-      ssize_t size = c_read(cmd_fd, &prpsinfo.pr_psargs,
-                            sizeof(prpsinfo.pr_psargs));
-      for (ptr = prpsinfo.pr_psargs; size-- > 0; ptr++)
-        if (*ptr == '\000')
-          *ptr = ' ';
-      NO_INTR(sys_close(cmd_fd));
-    }
-  }
-
-  /* Build the PRSTATUS data structure                                       */
-  /* scope */ {
-    int stat_fd;
-    memset(&prstatus, 0, sizeof(prstatus));
-    prstatus.pr_pid     = prpsinfo.pr_pid;
-    prstatus.pr_ppid    = prpsinfo.pr_ppid;
-    prstatus.pr_pgrp    = prpsinfo.pr_pgrp;
-    prstatus.pr_sid     = prpsinfo.pr_sid;
-    prstatus.pr_fpvalid = 1;
-    NO_INTR(stat_fd = sys_open("/proc/self/stat", O_RDONLY, 0));
-    if (stat_fd >= 0) {
-      char scratch[4096];
-      ssize_t size = c_read(stat_fd, scratch, sizeof(scratch) - 1);
-      if (size >= 0) {
-        unsigned long tms;
-        char *ptr = scratch;
-        scratch[size] = '\000';
-
-        /* User time                                                         */
-        for (i = 13; i && *ptr; ptr++) if (*ptr == ' ') i--;
-        tms = 0;
-        while (*ptr && *ptr != ' ') tms = 10*tms + *ptr++ - '0';
-        prstatus.pr_utime.tv_sec  = tms / 1000;
-        prstatus.pr_utime.tv_usec = (tms % 1000) * 1000;
-
-        /* System time                                                       */
-        if (*ptr) ptr++;
-        tms = 0;
-        while (*ptr && *ptr != ' ') tms = 10*tms + *ptr++ - '0';
-        prstatus.pr_stime.tv_sec  = tms / 1000;
-        prstatus.pr_stime.tv_usec = (tms % 1000) * 1000;
-
-        /* Cumulative user time                                              */
-        if (*ptr) ptr++;
-        tms = 0;
-        while (*ptr && *ptr != ' ') tms = 10*tms + *ptr++ - '0';
-        prstatus.pr_cutime.tv_sec  = tms / 1000;
-        prstatus.pr_cutime.tv_usec = (tms % 1000) * 1000;
-
-        /* Cumulative system time                                            */
-        if (*ptr) ptr++;
-        tms = 0;
-        while (*ptr && *ptr != ' ') tms = 10*tms + *ptr++ - '0';
-        prstatus.pr_cstime.tv_sec  = tms / 1000;
-        prstatus.pr_cstime.tv_usec = (tms % 1000) * 1000;
-
-        /* Pending signals                                                   */
-        for (i = 14; i && *ptr; ptr++) if (*ptr == ' ') i--;
-        while (*ptr && *ptr != ' ')
-          prstatus.pr_sigpend = 10*prstatus.pr_sigpend + *ptr++ - '0';
-
-        /* Held signals                                                      */
-        if (*ptr) ptr++;
-        while (*ptr && *ptr != ' ')
-          prstatus.pr_sigpend = 10*prstatus.pr_sigpend + *ptr++ - '0';
-      }
-      NO_INTR(sys_close(stat_fd));
-    }
-  }
-
-  /* Create a file descriptor that can be used for reading data from
-   * our child process. This is a little complicated because we need
-   * to make sure there is no race condition with other threads
-   * calling fork() at the same time (this is somewhat mitigated,
-   * because our threads are supposedly suspended at this time). We
-   * have to avoid other processes holding our file handles open. We
-   * can do this by creating the pipe in the child and passing the
-   * file handle back to the parent.
-   */
-  if (sys_socketpair(AF_UNIX, SOCK_STREAM, 0, pair) >= 0) {
-    int openmax  = sys_sysconf(_SC_OPEN_MAX);
-    int pagesize = sys_sysconf(_SC_PAGESIZE);
-
-    /* Block signals prior to forking. Technically, POSIX requires us to call
-     * pthread_sigmask(), if this is a threaded application. When using
-     * glibc, we are OK calling sigprocmask(), though. We will end up
-     * blocking additional signals that libpthread uses internally, but that
-     * is actually exactly what we want.
-     *
-     * Also, POSIX claims that this should not actually be necessarily, but
-     * reality says otherwise.
-     */
-    sigset_t old_signals, blocked_signals;
-    sigfillset(&blocked_signals);
-    sys_sigprocmask(SIG_BLOCK, &blocked_signals, &old_signals);
-
-    /* Create a new core dump in child process; call sys_fork() in order to
-     * avoid complications with pthread_atfork() handlers. In the child
-     * process, we should only ever call system calls.
-     */
-    if ((rc = sys_fork()) == 0) {
-      i386_user user;
-      int       fds[2];
-
-      /* All signals are blocked at this time, but we could still end up
-       * executing synchronous signals (such as SIGILL, SIGFPE, SIGSEGV,
-       * SIGBUS, or SIGTRAP). Reset them to SIG_DFL.
-       */
-      static const int signals[] = { SIGABRT, SIGILL, SIGFPE, SIGSEGV, SIGBUS};
-      for (i = 0; i < sizeof(signals)/sizeof(*signals); i++) {
-        struct sigaction act;
-        memset(&act, 0, sizeof(act));
-        act.sa_handler = SIG_DFL;
-        act.sa_flags   = SA_RESTART;
-        sys_sigaction(signals[i], &act, NULL);
-      }
-
-      /* Get parent's CPU registers, and user data structure                 */
-      if (sys_ptrace(PTRACE_ATTACH, main_pid, (void *)0, (void *)0) >= 0) {
-        char scratch[4096];
-        while (sys_waitpid(main_pid, (void *)0, __WALL) < 0) {
-          if (errno != EINTR)
-            sys_exit(1);
-        }
-        for (i = 0; i < sizeof(user); i += sizeof(int))
-          ((int *)&user)[i/sizeof(int)] = sys_ptrace(PTRACE_PEEKUSER,
-                                              main_pid, (void *)i, (void *) i);
-        memset(scratch, 0xFF, sizeof(scratch));
-        if (sys_ptrace(PTRACE_GETREGS, main_pid, scratch, scratch) == 0) {
-          memcpy(thread_regs + threads, scratch, sizeof(i386_regs));
-          memset(scratch, 0xFF, sizeof(scratch));
-          if (sys_ptrace(PTRACE_GETFPREGS, main_pid, scratch, scratch) == 0) {
-            memcpy(thread_fpregs + threads, scratch, sizeof(i386_fpregs));
-            memset(scratch, 0xFF, sizeof(scratch));
-            #ifndef __x86_64__
-            /* Linux on x86-64 stores all FPU regs in the SSE structure      */
-            if (sys_ptrace(PTRACE_GETFPXREGS,main_pid,scratch,scratch) == 0) {
-              memcpy(thread_fpxregs +threads,scratch,sizeof(i386_fpxregs));
-            } else {
-              hasSSE = 0;
-            }
-            #endif
-          } else
-            sys_exit(1);
-        } else
-          sys_exit(1);
-      } else
-        sys_exit(1);
-      sys_ptrace(PTRACE_DETACH, main_pid, (void *)0, (void *)0);
-
-      /* Fake a somewhat reasonable looking stack frame for the
-       * getCoreDump() function.
-       */
-      SET_FRAME(*(Frame *)frame, thread_regs[threads]);
-      memcpy(&user.regs, thread_regs + threads, sizeof(i386_regs));
-      pids[threads++] = main_pid;
-
-      /* Create a pipe for communicating with parent                         */
-      if (sys_pipe(fds) < 0)
-        sys_exit(1);
-
-      /* Pass file handle to parent                                          */
-      /* scope */ {
-        char cmsg_buf[CMSG_SPACE(sizeof(int))];
-        struct iovec  iov;
-        struct msghdr msg;
-        struct cmsghdr *cmsg;
-        memset(&iov, 0, sizeof(iov));
-        memset(&msg, 0, sizeof(msg));
-        iov.iov_base            = (void *)"";
-        iov.iov_len             = 1;
-        msg.msg_iov             = &iov;
-        msg.msg_iovlen          = 1;
-        msg.msg_control         = &cmsg_buf;
-        msg.msg_controllen      = sizeof(cmsg_buf);
-        cmsg                    = CMSG_FIRSTHDR(&msg);
-        cmsg->cmsg_level        = SOL_SOCKET;
-        cmsg->cmsg_type         = SCM_RIGHTS;
-        cmsg->cmsg_len          = CMSG_LEN(sizeof(int));
-        *(int *)CMSG_DATA(cmsg) = fds[0];
-        while (sys_sendmsg(pair[1], &msg, 0) < 0) {
-          if (errno != EINTR)
-            sys_exit(1);
-        }
-        while (sys_shutdown(pair[1], SHUT_RDWR) < 0) {
-          if (errno != EINTR)
-            sys_exit(1);
-        }
-      }
-
-      /* Close all file handles other than the write end of our pipe         */
-      for (i = 0; i < openmax; i++)
-        if (i != fds[1])
-          NO_INTR(sys_close(i));
-
-      /* Turn into a daemon process, so that "init" can reap us              */
-      if ((rc = sys_fork()) == 0) {
-        CreateElfCore(fds[1], &prpsinfo, &user, &prstatus, threads,
-                      pids, thread_regs, thread_fpregs,
-                      hasSSE ? thread_fpxregs : NULL, pagesize);
-        sys_exit(0);
-      } else {
-        sys_exit(rc < 0 ? 1 : 0);
-      }
-
-      /* Make the compiler happy. We never actually get here.                */
-      return 0;
-    }
-
-    /* In the parent                                                         */
-    sys_sigprocmask(SIG_SETMASK, &old_signals, (void *)0);
-    NO_INTR(sys_close(pair[1]));
-
-    /* Get pipe file handle from child                                       */
-    /* scope */ {
-      char buffer[1], cmsg_buf[CMSG_SPACE(sizeof(int))];
-      struct iovec  iov;
-      struct msghdr msg;
-      for (;;) {
-        int nbytes;
-        memset(&iov, 0, sizeof(iov));
-        memset(&msg, 0, sizeof(msg));
-        iov.iov_base       = buffer;
-        iov.iov_len        = 1;
-        msg.msg_iov        = &iov;
-        msg.msg_iovlen     = 1;
-        msg.msg_control    = &cmsg_buf;
-        msg.msg_controllen = sizeof(cmsg_buf);
-        if ((nbytes = sys_recvmsg(pair[0], &msg, 0)) > 0) {
-          struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-          if (cmsg != NULL && cmsg->cmsg_level == SOL_SOCKET &&
-              cmsg->cmsg_type == SCM_RIGHTS)
-            fd = *(int *)CMSG_DATA(cmsg);
-          break;
-        } else if (nbytes == 0 || errno != EINTR) {
-          break;
-        }
-      }
-    }
-    sys_shutdown(pair[0], SHUT_RDWR);
-    NO_INTR(sys_close(pair[0]));
-  }
-
-  ResumeAllProcessThreads(threads, pids);
-
-  /* Wait for child to detach itself                                         */
-  if (rc > 0) {
-    int status;
-    while (sys_waitpid(rc, &status, 0) < 0) {
-      if (errno != EINTR)
-        goto error;
-    }
-    rc = WEXITSTATUS(status) ? -1 : 0;
-  }
-
-  /* Check if child process ran successfully                                 */
-  if (rc >= 0) {
-    return fd;
-  }
-
-error:
-  if (fd > 0)
-    NO_INTR(sys_close(fd));
-  return -1;
-}
-#endif
diff --git a/src/base/elfcore.h b/src/base/elfcore.h
index 90dea58..34e40a7 100644
--- a/src/base/elfcore.h
+++ b/src/base/elfcore.h
@@ -37,8 +37,10 @@
 /* We currently only support x86-32 and x86-64 on Linux. Porting to
  * other related platforms should not be difficult.
  */
-#if (defined(__i386__) || defined(__x86_64__)) && defined(__linux)
+#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__)) && \
+    defined(__linux)
 
+#include <stdarg.h>
 #include <stdint.h>
 #include <sys/types.h>
 #include "config.h"
@@ -57,30 +59,40 @@
  * core file.
  */
 
-typedef struct i386_regs {      /* Normal (non-FPU) CPU registers            */
-#ifdef __x86_64__
-  #define BP rbp
-  #define SP rsp
-  #define IP rip
-  uint64_t  r15,r14,r13,r12,rbp,rbx,r11,r10;
-  uint64_t  r9,r8,rax,rcx,rdx,rsi,rdi,orig_rax;
-  uint64_t  rip,cs,eflags;
-  uint64_t  rsp,ss;
-  uint64_t  fs_base, gs_base;
-  uint64_t  ds,es,fs,gs;
-#else
-  #define BP ebp
-  #define SP esp
-  #define IP eip
-  uint32_t  ebx, ecx, edx, esi, edi, ebp, eax;
-  uint16_t  ds, __ds, es, __es;
-  uint16_t  fs, __fs, gs, __gs;
-  uint32_t  orig_eax, eip;
-  uint16_t  cs, __cs;
-  uint32_t  eflags, esp;
-  uint16_t  ss, __ss;
+#if defined(__i386__) || defined(__x86_64__)
+  typedef struct i386_regs {    /* Normal (non-FPU) CPU registers            */
+  #ifdef __x86_64__
+    #define BP rbp
+    #define SP rsp
+    #define IP rip
+    uint64_t  r15,r14,r13,r12,rbp,rbx,r11,r10;
+    uint64_t  r9,r8,rax,rcx,rdx,rsi,rdi,orig_rax;
+    uint64_t  rip,cs,eflags;
+    uint64_t  rsp,ss;
+    uint64_t  fs_base, gs_base;
+    uint64_t  ds,es,fs,gs;
+  #else
+    #define BP ebp
+    #define SP esp
+    #define IP eip
+    uint32_t  ebx, ecx, edx, esi, edi, ebp, eax;
+    uint16_t  ds, __ds, es, __es;
+    uint16_t  fs, __fs, gs, __gs;
+    uint32_t  orig_eax, eip;
+    uint16_t  cs, __cs;
+    uint32_t  eflags, esp;
+    uint16_t  ss, __ss;
+  #endif
+  } i386_regs;
+#elif defined(__ARM_ARCH_3__)
+  typedef struct arm_regs {     /* General purpose registers                 */
+    #define BP uregs[11]        /* Frame pointer                             */
+    #define SP uregs[13]        /* Stack pointer                             */
+    #define IP uregs[15]        /* Program counter                           */
+    #define LR uregs[14]        /* Link register                             */
+    long uregs[18];
+  } arm_regs;
 #endif
-} i386_regs;
 
 #if defined(__i386__) && defined(__GNUC__)
   /* On x86 we provide an optimized version of the FRAME() macro, if the
@@ -88,7 +100,7 @@ typedef struct i386_regs {      /* Normal (non-FPU) CPU registers            */
    * more accurate values for CPU registers.
    */
   typedef struct Frame {
-    struct i386_regs regs;
+    struct i386_regs uregs;
     int              errno_;
   } Frame;
   #define FRAME(f) Frame f;                                           \
@@ -135,7 +147,39 @@ typedef struct i386_regs {      /* Normal (non-FPU) CPU registers            */
   #define SET_FRAME(f,r)                                              \
                      do {                                             \
                        errno = (f).errno_;                            \
-                       (r)   = (f).regs;                              \
+                       (r)   = (f).uregs;                             \
+                     } while (0)
+#elif defined(__ARM_ARCH_3__) && defined(__GNUC__)
+  /* ARM calling conventions are a little more tricky. A little assembly
+   * helps in obtaining an accurate snapshot of all registers.
+   */
+  typedef struct Frame {
+    struct arm_regs arm;
+    int             errno_;
+  } Frame;
+  #define FRAME(f) Frame f;                                           \
+                   do {                                               \
+                     long cpsr;                                       \
+                     f.errno_ = errno;                                \
+                     __asm__ volatile(                                \
+                       "stmia %0, {r0-r15}\n" /* All integer regs   */\
+                       : : "r"(&f.arm) : "memory");                   \
+                     f.arm.uregs[16] = 0;                             \
+                     __asm__ volatile(                                \
+                       "mrs %0, cpsr\n"       /* Condition code reg */\
+                       : "=r"(cpsr));                                 \
+                     f.arm.uregs[17] = cpsr;                          \
+                   } while (0)
+  #define SET_FRAME(f,r)                                              \
+                     do {                                             \
+                       /* Don't override the FPU status register.   */\
+                       /* Use the value obtained from ptrace(). This*/\
+                       /* works, because our code does not perform  */\
+                       /* any FPU operations, itself.               */\
+                       long fps      = (r).uregs[16];                 \
+                       errno         = (f).errno_;                    \
+                       (r)           = (f).arm;                       \
+                       (r).uregs[16] = fps;                           \
                      } while (0)
 #else
   /* If we do not have a hand-optimized assembly version of the FRAME()
@@ -179,7 +223,7 @@ typedef struct i386_regs {      /* Normal (non-FPU) CPU registers            */
  * dumps. If called as
  *
  *   FRAME(frame);
- *   InternalGetCoreDump(&frame, 0, NULL);
+ *   InternalGetCoreDump(&frame, 0, NULL, ap);
  *
  * it creates a core file that only contains information about the
  * calling thread.
@@ -205,7 +249,11 @@ typedef struct i386_regs {      /* Normal (non-FPU) CPU registers            */
  * threaded environment, but it is ultimately the caller's responsibility
  * to provide locking.
  */
-int InternalGetCoreDump(void *frame, int num_threads, pid_t *thread_pids);
+int InternalGetCoreDump(void *frame, int num_threads, pid_t *thread_pids,
+                        va_list ap
+                     /* const char *PATH,
+                        const struct CoredumperCompressor *compressors,
+                        const struct CoredumperCompressor **selected_comp */);
 
 #endif
 
diff --git a/src/base/linux_syscall_support.h b/src/base/linux_syscall_support.h
new file mode 100644
index 0000000..1fa081c
--- /dev/null
+++ b/src/base/linux_syscall_support.h
@@ -0,0 +1,381 @@
+/* Copyright (c) 2005, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Markus Gutschke
+ */
+
+/* This file includes Linux-specific support functions common to the
+ * coredumper and the thread lister; primarily, this is a collection
+ * of direct system calls, and a couple of symbols missing from
+ * standard header files.
+ */
+#ifndef _LINUX_CORE_SUPPORT_H
+#define _LINUX_CORE_SUPPORT_H
+
+/* We currently only support x86-32, x86-64, and ARM on Linux. Porting to
+ * other related platforms should not be difficult.
+ */
+#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__)) && \
+    defined(__linux)
+
+#include <asm/posix_types.h>
+#include <asm/stat.h>
+#include <asm/types.h>
+#include <errno.h>
+#include <linux/dirent.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <linux/unistd.h>
+
+/* Definitions missing from the standard header files                        */
+#ifndef O_DIRECTORY
+#if defined(__ARM_ARCH_3__)
+#define O_DIRECTORY 0040000
+#else
+#define O_DIRECTORY 0200000
+#endif
+#endif
+#ifndef NT_PRFPXREG
+#define NT_PRFPXREG       20
+#endif
+#ifndef PTRACE_GETFPXREGS
+#define PTRACE_GETFPXREGS ((enum __ptrace_request)18)
+#endif
+#ifndef PR_GET_DUMPABLE
+#define PR_GET_DUMPABLE   3
+#endif
+#ifndef PR_SET_DUMPABLE
+#define PR_SET_DUMPABLE   4
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid       224
+#endif
+
+
+/* After forking, we must make sure to only call system calls.               */
+#if __BOUNDED_POINTERS__
+  #error "Need to port invocations of syscalls for bounded ptrs"
+#else
+  /* The core dumper and the thread lister get executed after threads
+   * have been suspended. As a consequence, we cannot call any functions
+   * that acquire locks. Unfortunately, libc wraps most system calls
+   * (e.g. in order to implement pthread_atfork, and to make calls
+   * cancellable), which means we cannot call these functions. Instead,
+   * we have to call syscall() directly.
+   */
+  #if defined(__i386__)
+    /* In PIC mode (e.g. when building shared libraries), gcc for i386
+     * reserves ebx. Unfortunately, most distributions ship with implementations
+     * of _syscallX() which clobber ebx.
+     * Also, most definitions of _syscallX() neglect to mark "memory" as being
+     * clobbered. This causes problems with compilers that do a better job
+     * at optimizing across __asm__ calls.
+     * So, we just have to redefine all of the _syscallX() macros.
+     */
+    #define BODY(type,args...)                                                \
+        long __res;                                                           \
+        __asm__ __volatile__("push %%ebx\n"                                   \
+                             "movl %2,%%ebx\n"                                \
+                             "int $0x80\n"                                    \
+                             "pop %%ebx"                                      \
+                             args                                             \
+                             : "memory");                                     \
+        __syscall_return(type,__res)
+    #undef  _syscall0
+    #define _syscall0(type,name)                                              \
+      type name(void) {                                                       \
+      long __res;                                                             \
+      __asm__ volatile("int $0x80"                                            \
+                       : "=a" (__res)                                         \
+                       : "0" (__NR_##name)                                    \
+                       : "memory");                                           \
+      __syscall_return(type,__res);                                           \
+      }
+    #undef  _syscall1
+    #define _syscall1(type,name,type1,arg1)                                   \
+    type name(type1 arg1) {                                                   \
+      BODY(type,                                                              \
+           : "=a" (__res)                                                     \
+           : "0" (__NR_##name),"r" ((long)(arg1)));                           \
+      }
+    #undef  _syscall2
+    #define _syscall2(type,name,type1,arg1,type2,arg2)                        \
+    type name(type1 arg1,type2 arg2) {                                        \
+      BODY(type,                                                              \
+           : "=a" (__res)                                                     \
+           : "0" (__NR_##name),"r" ((long)(arg1)), "c" ((long)(arg2)));       \
+      }
+    #undef  _syscall3
+    #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)             \
+    type name(type1 arg1,type2 arg2,type3 arg3) {                             \
+      BODY(type,                                                              \
+           : "=a" (__res)                                                     \
+           : "0" (__NR_##name),"r" ((long)(arg1)),"c" ((long)(arg2)),         \
+             "d" ((long)(arg3)));                                             \
+    }
+    #undef  _syscall4
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
+    type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) {              \
+      BODY(type,                                                              \
+           : "=a" (__res)                                                     \
+           : "0" (__NR_##name),"r" ((long)(arg1)),"c" ((long)(arg2)),         \
+             "d" ((long)(arg3)),"S" ((long)(arg4)));                          \
+    }
+  #elif defined(__ARM_ARCH_3__)
+    /* Most definitions of _syscallX() neglect to mark "memory" as being
+     * clobbered. This causes problems with compilers that do a better job
+     * at optimizing across __asm__ calls.
+     * So, we just have to redefine all of the _syscallX() macros.
+     */
+    #define REG(r,a) register long __r##r __asm__("r"#r) = (long)a
+    #define BODY(type,name,args...)                                           \
+          register long __res_r0 __asm__("r0");                               \
+          long __res;                                                         \
+          __asm__ __volatile__ (__syscall(name)                               \
+                                : "=r"(__res_r0) : args : "lr", "memory");    \
+          __res = __res_r0;                                                   \
+          __syscall_return(type, __res)
+    #undef _syscall0
+    #define _syscall0(type, name)                                             \
+        type name() {                                                         \
+          BODY(type, name);                                                   \
+        }
+    #undef _syscall1
+    #define _syscall1(type, name, type1, arg1)                                \
+        type name(type1 arg1) {                                               \
+          REG(0, arg1); BODY(type, name, "r"(__r0));                          \
+        }
+    #undef _syscall2
+    #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
+        type name(type1 arg1, type2 arg2) {                                   \
+          REG(0, arg1); REG(1, arg2); BODY(type, name, "r"(__r0), "r"(__r1)); \
+        }
+    #undef _syscall3
+    #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
+        type name(type1 arg1, type2 arg2, type3 arg3) {                       \
+          REG(0, arg1); REG(1, arg2); REG(2, arg3);                           \
+          BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2));                  \
+        }
+    #undef _syscall4
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
+        type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {           \
+          REG(0, arg1); REG(1, arg2); REG(2, arg3); REG(3, arg4);             \
+          BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3));       \
+        }
+  #endif
+  #if defined(__x86_64__)
+    #define __NR_sys_recvmsg        __NR_recvmsg
+    #define __NR_sys_sendmsg        __NR_sendmsg
+    #define __NR_sys_shutdown       __NR_shutdown
+    #define __NR_sys_rt_sigaction   __NR_rt_sigaction
+    #define __NR_sys_rt_sigprocmask __NR_rt_sigprocmask
+    #define __NR_sys_socket         __NR_socket
+    #define __NR_sys_socketpair     __NR_socketpair
+    static inline _syscall3(int, sys_recvmsg,        int,   s,
+                            struct msghdr*,          m, int, f);
+    static inline _syscall3(int, sys_sendmsg,        int,   s,
+                            const struct msghdr*,    m, int, f);
+    static inline _syscall2(int, sys_shutdown,       int,   s,
+                            int,                     h);
+    static inline _syscall4(int, sys_rt_sigaction,   int,   s,
+                            const struct sigaction*, a,
+                            struct sigaction*,       o, int,      c);
+    static inline _syscall4(int, sys_rt_sigprocmask, int,   h,
+                            const sigset_t*,         s, sigset_t*, o, int,  c);
+    static inline _syscall3(int, sys_socket,         int,   d,
+                            int,                     t, int,       p);
+    static inline _syscall4(int, sys_socketpair,     int,   d,
+                            int,                     t, int,       p, int*, s);
+    #define sys_sigaction(s,a,o)    sys_rt_sigaction((s), (a), (o), \
+                                                     (_NSIG+7)/8)
+    #define sys_sigprocmask(h,s,o)  sys_rt_sigprocmask((h), (s),(o), \
+                                                       (_NSIG+7)/8)
+  #endif
+  #if defined(__x86_64__) || defined(__ARM_ARCH_3__)
+    #define __NR_sys_wait4          __NR_wait4
+
+    static inline _syscall4(pid_t, sys_wait4,        pid_t, p,
+                            int*,                    s, int,       o,
+                            struct rusage*,          r);
+
+    #define sys_waitpid(p,s,o)      sys_wait4((p), (s), (o), 0)
+  #endif
+  #if defined(__i386__) || defined(__ARM_ARCH_3__)
+    #define __NR_sys_sigaction   __NR_sigaction
+    #define __NR_sys_sigprocmask __NR_sigprocmask
+    #define __NR_sys__socketcall __NR_socketcall
+
+    static inline _syscall3(int, sys_sigaction,      int,   s,
+                            const struct sigaction*, a, struct sigaction*, o);
+    static inline _syscall3(int, sys_sigprocmask,    int,   h,
+                            const sigset_t*,         s, sigset_t*,         o);
+    static inline _syscall2(int,   sys__socketcall,  int,   c,
+                            va_list,                 a);
+    static inline int sys_socketcall(int op, ...) {
+      int rc;
+      va_list ap;
+      va_start(ap, op);
+      rc = sys__socketcall(op, ap);
+      va_end(ap);
+      return rc;
+    }
+    #define sys_recvmsg(s,m,f)      sys_socketcall(17,      (s), (m), (f))
+    #define sys_sendmsg(s,m,f)      sys_socketcall(16,      (s), (m), (f))
+    #define sys_shutdown(s,h)       sys_socketcall(13,      (s), (h))
+    #define sys_socket(d,t,p)       sys_socketcall(1,       (d), (t), (p))
+    #define sys_socketpair(d,t,p,s) sys_socketcall(8,       (d), (t), (p),(s))
+  #endif
+  #if defined(__i386__)
+    #define __NR_sys_waitpid     __NR_waitpid
+    static inline _syscall3(pid_t, sys_waitpid,      pid_t, p,
+                            int*,              s,    int,   o);
+  #endif
+  #define __NR_sys_close        __NR_close
+  #define __NR_sys_dup          __NR_dup
+  #define __NR_sys_dup2         __NR_dup2
+  #define __NR_sys_execve       __NR_execve
+  #define __NR_sys__exit        __NR_exit
+  #define __NR_sys_fcntl        __NR_fcntl
+  #define __NR_sys_fork         __NR_fork
+  #define __NR_sys_fstat        __NR_fstat
+  #define __NR_sys_getdents     __NR_getdents
+  #define __NR_sys_getegid      __NR_getegid
+  #define __NR_sys_geteuid      __NR_geteuid
+  #define __NR_sys_getpgrp      __NR_getpgrp
+  #define __NR_sys_getpid       __NR_getpid
+  #define __NR_sys_getppid      __NR_getppid
+  #define __NR_sys_getpriority  __NR_getpriority
+  #define __NR_sys_getrlimit    __NR_getrlimit
+  #define __NR_sys_getsid       __NR_getsid
+  #define __NR__gettid          __NR_gettid
+  #define __NR_sys_kill         __NR_kill
+  #define __NR_sys_lseek        __NR_lseek
+  #define __NR_sys_open         __NR_open
+  #define __NR_sys_pipe         __NR_pipe
+  #define __NR_sys_prctl        __NR_prctl
+  #define __NR_sys_ptrace       __NR_ptrace
+  #define __NR_sys_read         __NR_read
+  #define __NR_sys_readlink     __NR_readlink
+  #define __NR_sys_sched_yield  __NR_sched_yield
+  #define __NR_sys_stat         __NR_stat
+  #define __NR_sys_write        __NR_write
+  static inline _syscall1(int,     sys_close,       int,         f);
+  static inline _syscall1(int,     sys_dup,         int,         f);
+  static inline _syscall2(int,     sys_dup2,        int,         s,
+                          int,            d);
+  static inline _syscall3(int,     sys_execve,      const char*, f,
+                          const char*const*,a,const char*const*, e);
+  static inline _syscall1(int,     sys__exit,       int,         e);
+  static inline _syscall3(int,     sys_fcntl,       int,         f,
+                          int,            c, long,   a);
+  static inline _syscall0(pid_t,   sys_fork);
+  static inline _syscall2(int,     sys_fstat,       int,         f,
+                          struct stat*,   b);
+  static inline _syscall3(int,   sys_getdents,      int,         f,
+                          struct dirent*, d, int,    c);
+  static inline _syscall0(gid_t,   sys_getegid);
+  static inline _syscall0(uid_t,   sys_geteuid);
+  static inline _syscall0(pid_t,   sys_getpgrp);
+  static inline _syscall0(pid_t,   sys_getpid);
+  static inline _syscall0(pid_t,   sys_getppid);
+  static inline _syscall2(int,     sys_getpriority, int,         a,
+                          int,            b);
+  static inline _syscall2(int,     sys_getrlimit,   int,         r,
+                          struct rlimit*, l);
+  static inline _syscall1(pid_t,   sys_getsid,      pid_t,       p);
+  static inline _syscall0(pid_t,   _gettid);
+  static inline _syscall2(int,     sys_kill,        pid_t,       p,
+                          int,            s);
+  static inline _syscall3(off_t,   sys_lseek,       int,         f,
+                          off_t,          o, int,    w);
+  static inline _syscall3(int,     sys_open,        const char*, p,
+                          int,            f, int,    m);
+  static inline _syscall1(int,     sys_pipe,        int*,        p);
+  static inline _syscall2(int,     sys_prctl,       int,         o,
+                          long,           a);
+  static inline _syscall4(long,    sys_ptrace,      int,         r,
+                          pid_t,          p, void *, a, void *, d);
+  static inline _syscall3(ssize_t, sys_read,        int,         f,
+                          void *,         b, size_t, c);
+  static inline _syscall3(int,     sys_readlink,    const char*, p,
+                          char*,          b, size_t, s);
+  static inline _syscall0(int,     sys_sched_yield);
+  static inline _syscall2(int,     sys_stat,        const char*, f,
+                          struct stat*,   b);
+  static inline _syscall3(ssize_t, sys_write,        int,        f,
+                          const void *,   b, size_t, c);
+
+  static inline int sys_sysconf(int name) {
+    extern int __getpagesize(void);
+    switch (name) {
+      case _SC_OPEN_MAX: {
+        struct rlimit ru;
+        return sys_getrlimit(RLIMIT_NOFILE, &ru) < 0 ? 8192 : ru.rlim_cur;
+      }
+      case _SC_PAGESIZE:
+        return __getpagesize();
+      default:
+        errno = ENOSYS;
+        return -1;
+    }
+  }
+
+  static inline pid_t sys_gettid() {
+    pid_t tid = _gettid();
+    if (tid != -1) {
+      return tid;
+    }
+    return sys_getpid();
+  }
+
+  static inline void sys_ptrace_detach(pid_t pid) {
+    /* PTRACE_DETACH can sometimes forget to wake up the tracee and it
+     * then sends job control signals to the real parent, rather than to
+     * the tracer. We reduce the risk of this happening by starting a
+     * whole new time slice, and then quickly sending a SIGCONT signal
+     * right after detaching from the tracee.
+     */
+    sys_sched_yield();
+    sys_ptrace(PTRACE_DETACH, pid, (void *)0, (void *)0);
+    sys_kill(pid, SIGCONT);
+  }
+  #undef REG
+  #undef BODY
+#endif
+
+
+#endif
+#endif
diff --git a/src/base/linuxthreads.c b/src/base/linuxthreads.c
index adc1e8e..9e13e83 100644
--- a/src/base/linuxthreads.c
+++ b/src/base/linuxthreads.c
@@ -35,68 +35,15 @@
 
 #ifdef THREADS
 
-#include <errno.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/prctl.h>
-#include <sys/ptrace.h>
-#include <sys/types.h>
 #include <sys/socket.h>
-#include <sys/syscall.h>
 #include <sys/wait.h>
-#include <unistd.h>
 
+#include "base/linux_syscall_support.h"
 #include "base/thread_lister.h"
 
-#ifndef O_DIRECTORY
-#define O_DIRECTORY 0200000
-#endif
-
-#if __BOUNDED_POINTERS__
-  #error "Need to port invocations of syscalls for bounded ptrs"
-#else
-  /* (Most of) the code in this file gets executed after threads have been
-   * suspended. As a consequence, we cannot call any functions that acquire
-   * locks. Unfortunately, libc wraps most system calls (e.g. in order to
-   * implement pthread_atfork, and to make calls cancellable), which means
-   * we cannot call these functions. Instead, we have to call syscall()
-   * directly.
-   */
-  #include <asm/stat.h>
-  #include <asm/posix_types.h>
-  #include <asm/types.h>
-  #include <linux/dirent.h>
-  #include <stdarg.h>
-  #include <syscall.h>
-  #ifdef __x86_64__
-    #define sys_socket(d,t,p)  syscall(SYS_socket,   (d), (t), (p))
-    #define sys_waitpid(p,s,o) syscall(SYS_wait4,    (p), (s), (o), (void *)0)
-  #else
-    static int sys_socketcall(int op, ...) {
-      int rc;
-      va_list ap;
-      va_start(ap, op);
-      rc = syscall(SYS_socketcall, op, ap);
-      va_end(ap);
-      return rc;
-    }
-    #define sys_socket(d,t,p)  sys_socketcall(1,     (d), (t), (p))
-    #define sys_waitpid(p,s,o) syscall(SYS_waitpid,  (p), (s), (o))
-  #endif
-
-  #define sys_close(f)         syscall(SYS_close,    (f))
-  #define sys_fcntl(f,c,a)     syscall(SYS_fcntl,    (f), (c), (a))
-  #define sys_fstat(f,b)       syscall(SYS_fstat,    (f), (b))
-  #define sys_getdents(f,d,c)  syscall(SYS_getdents, (f), (d), (c))
-  #define sys_getpid()         syscall(SYS_getpid)
-  #define sys_lseek(f,o,w)     syscall(SYS_lseek,    (f), (o), (w))
-  #define sys_open(f,p,m)      syscall(SYS_open,     (f), (p), (m))
-  #define sys_prctl(o,a)       syscall(SYS_prctl,    (o), (a))
-  #define sys_ptrace(r,p,a,d)  syscall(SYS_ptrace,   (r), (p), (a), (d))
-  #define sys_stat(f,b)        syscall(SYS_stat,     (f), (b))
-#endif
-
 
 /* itoa() is not a standard function, and we cannot safely call printf()
  * after suspending threads. So, we just implement our own copy. A
@@ -149,13 +96,15 @@ static int c_open(const char *fname, int flags, int mode) {
  * 'callback' is supposed to do or arrange for ResumeAllProcessThreads.
  * We return -1 on error and the return value of 'callback' on success.
  */
-int GetAllProcessThreads(void *parameter,
-                         GetAllProcessThreadsCallBack callback) {
-  int              marker = -1, proc = -1, dumpable = 1;
-  int              num_threads = 0, max_threads = 0;
-  char             marker_name[48], *marker_path;
-  struct stat      proc_sb, marker_sb;
-  pid_t            my_pid = sys_getpid();
+int ListAllProcessThreads(void *parameter,
+                          ListAllProcessThreadsCallBack callback, ...) {
+  static const char *const proc_paths[] = { "/proc/self/task/", "/proc/", 0 };
+  const char *const *proc_path = proc_paths;
+  int               marker = -1, proc = -1, dumpable = 1;
+  int               num_threads = 0, max_threads = 0;
+  char              marker_name[48], *marker_path;
+  struct stat       proc_sb, marker_sb;
+  pid_t             my_pid = sys_getpid();
 
   /* Create "marker" that we can use to detect threads sharing the same
    * address space and the same file handles. By setting the FD_CLOEXEC flag
@@ -186,9 +135,12 @@ int GetAllProcessThreads(void *parameter,
      * a separate "task" directory. We check there first, and then fall back
      * on the older naming convention if necessary.
      */
-    if (((proc = c_open("/proc/self/task/", O_RDONLY|O_DIRECTORY, 0)) < 0 &&
-         (proc = c_open("/proc/", O_RDONLY|O_DIRECTORY, 0)) < 0) ||
-        sys_fstat(proc, &proc_sb) < 0)
+    if ((proc = c_open(*proc_path, O_RDONLY|O_DIRECTORY, 0)) < 0) {
+      if (*++proc_path != NULL)
+        continue;
+      goto failure;
+    }
+    if (sys_fstat(proc, &proc_sb) < 0)
       goto failure;
 
     /* Since we are suspending threads, we cannot call any libc functions that
@@ -251,7 +203,7 @@ int GetAllProcessThreads(void *parameter,
               if (sys_stat(fname, &tmp_sb) >= 0 &&
                   marker_sb.st_dev == tmp_sb.st_dev &&
                   marker_sb.st_ino == tmp_sb.st_ino) {
-                int i, j;
+                long i, j;
 
                 /* Found one of our threads, make sure it is no duplicate    */
                 for (i = 0; i < num_threads; i++) {
@@ -282,7 +234,7 @@ int GetAllProcessThreads(void *parameter,
                 }
                 while (sys_waitpid(pid, (void *)0, __WALL) < 0) {
                   if (errno != EINTR) {
-                    sys_ptrace(PTRACE_DETACH, pid, (void *)0, (void *)0);
+                    sys_ptrace_detach(pid);
                     goto next_entry;
                   }
                 }
@@ -293,7 +245,7 @@ int GetAllProcessThreads(void *parameter,
                    * show the "marker". This is probably a forked child
                    * process rather than a thread.
                    */
-                  sys_ptrace(PTRACE_DETACH, pid, (void *)0, (void *)0);
+                  sys_ptrace_detach(pid);
                 } else {
                   pids[num_threads++] = pid;
                   added_entries++;
@@ -304,19 +256,27 @@ int GetAllProcessThreads(void *parameter,
        next_entry:;
         }
       }
-      NO_INTR(sys_close(marker));
       NO_INTR(sys_close(proc));
 
-      /* Now we are ready to call the callback,
-       * which takes care of resuming the threads for us.
+      /* If we failed to find any threads, try looking somewhere else in
+       * /proc. Maybe threads are reported differently on this system.
        */
-      result = callback(parameter, num_threads, pids);
-
-      /* Restore the "dumpable" state of the process                         */
-      if (!dumpable)
-        sys_prctl(PR_SET_DUMPABLE, dumpable);
-      return result;
-
+      if (num_threads > 1 || !*++proc_path) {
+        va_list ap;
+        NO_INTR(sys_close(marker));
+
+        /* Now we are ready to call the callback,
+         * which takes care of resuming the threads for us.
+         */
+        va_start(ap, callback);
+        result = callback(parameter, num_threads, pids, ap);
+        va_end(ap);
+  
+        /* Restore the "dumpable" state of the process                       */
+        if (!dumpable)
+          sys_prctl(PR_SET_DUMPABLE, dumpable);
+        return result;
+      }
    detach_threads:
       /* Resume all threads prior to retrying the operation                  */
       ResumeAllProcessThreads(num_threads, pids);
@@ -336,11 +296,11 @@ failure:
 }
 
 /* This function resumes the list of all linux threads that
- * GetAllProcessThreads pauses before giving to its callback.
+ * ListAllProcessThreads pauses before giving to its callback.
  */
 void ResumeAllProcessThreads(int num_threads, pid_t *thread_pids) {
   while (num_threads-- > 0) {
-    sys_ptrace(PTRACE_DETACH, thread_pids[num_threads], (void *)0, (void *)0);
+    sys_ptrace_detach(thread_pids[num_threads]);
   }
 }
 
diff --git a/src/base/linuxthreads.h b/src/base/linuxthreads.h
index 636fd6c..a6d4298 100644
--- a/src/base/linuxthreads.h
+++ b/src/base/linuxthreads.h
@@ -40,7 +40,8 @@
 /* We currently only support x86-32 and x86-64 on Linux. Porting to other
  * related platforms should not be difficult.
  */
-#if (defined(__i386__) || defined(__x86_64__)) && defined(__linux)
+#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__)) && \
+    defined(__linux)
 
 /* Define the THREADS symbol to make sure that there is exactly one core dumper
  * built into the library.
diff --git a/src/base/thread_lister.c b/src/base/thread_lister.c
index 8de404d..7eca594 100644
--- a/src/base/thread_lister.c
+++ b/src/base/thread_lister.c
@@ -43,9 +43,15 @@
  * or if the multi-threading code has not been ported, yet.
  */
 
-int GetAllProcessThreads(void *parameter,
-                         GetAllProcessThreadsCallBack callback) {
-  return callback(parameter, 0, NULL);
+int ListAllProcessThreads(void *parameter,
+                          ListAllProcessThreadsCallBack callback, ...) {
+  int     rc;
+  va_list ap;
+
+  va_start(ap, callback);
+  rc = callback(parameter, 0, NULL, ap);
+  va_end(ap);
+  return rc;
 }
 
 void ResumeAllProcessThreads(int num_threads, pid_t *thread_pids) {
diff --git a/src/base/thread_lister.h b/src/base/thread_lister.h
index 6bae064..49bf3bf 100644
--- a/src/base/thread_lister.h
+++ b/src/base/thread_lister.h
@@ -34,15 +34,17 @@
 #ifndef _THREAD_LISTER_H
 #define _THREAD_LISTER_H
 
+#include <stdarg.h>
 #include <sys/types.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef int (*GetAllProcessThreadsCallBack)(void *parameter,
-                                            int num_threads,
-                                            pid_t *thread_pids);
+typedef int (*ListAllProcessThreadsCallBack)(void *parameter,
+                                             int num_threads,
+                                             pid_t *thread_pids,
+                                             va_list ap);
 
 /* This function gets the list of all linux threads of the current process
  * but this one and passes them to the 'callback' along with the 'parameter'
@@ -51,11 +53,11 @@ typedef int (*GetAllProcessThreadsCallBack)(void *parameter,
  * 'callback' is supposed to do or arrange for ResumeAllProcessThreads.
  * We return -1 on error and the return value of 'callback' on success.
  */
-int GetAllProcessThreads(void *parameter,
-                         GetAllProcessThreadsCallBack callback);
+int ListAllProcessThreads(void *parameter,
+                          ListAllProcessThreadsCallBack callback, ...);
 
 /* This function resumes the list of all linux threads that
- * GetAllProcessThreads pauses before giving to its callback.
+ * ListAllProcessThreads pauses before giving to its callback.
  */
 void ResumeAllProcessThreads(int num_threads, pid_t *thread_pids);
 
diff --git a/src/google/heap-checker.h b/src/google/heap-checker.h
index ef6c343..c9607ca 100644
--- a/src/google/heap-checker.h
+++ b/src/google/heap-checker.h
@@ -484,9 +484,15 @@ class HeapLeakChecker {
   static void DoMainHeapCheck();
 
   // Type of task for UseProcMaps
-  enum ProcMapsTask { RECORD_GLOBAL_DATA_LOCKED, DISABLE_LIBRARY_ALLOCS };
+  enum ProcMapsTask { 
+    RECORD_GLOBAL_DATA_LOCKED, 
+    DISABLE_LIBRARY_ALLOCS 
+  };
+  // Success/Error Return codes for UseProcMaps.
+  enum ProcMapsResult { PROC_MAPS_USED, CANT_OPEN_PROC_MAPS,
+                        NO_SHARED_LIBS_IN_PROC_MAPS };
   // Read /proc/self/maps, parse it, and do the 'proc_maps_task' for each line.
-  static void UseProcMaps(ProcMapsTask proc_maps_task);
+  static ProcMapsResult UseProcMaps(ProcMapsTask proc_maps_task);
   // A ProcMapsTask to disable allocations from 'library'
   // that is mapped to [start_address..end_address)
   // (only if library is a certain system library).
diff --git a/src/google/malloc_extension.h b/src/google/malloc_extension.h
index 3de0955..e088154 100644
--- a/src/google/malloc_extension.h
+++ b/src/google/malloc_extension.h
@@ -79,6 +79,15 @@ class MallocExtension {
   // contents of "*result" are preserved.
   virtual void GetHeapSample(std::string* result);
 
+  // Get a string that contains the stack traces that caused growth in
+  // the address space size.  The format of the returned string is
+  // equivalent to the output of the heap profiler and can therefore
+  // be passed to "pprof".
+  //
+  // The generated data is *appended* to "*result".  I.e., the old
+  // contents of "*result" are preserved.
+  virtual void GetHeapGrowthStacks(std::string* result);
+
   // -------------------------------------------------------------------
   // Control operations for getting and setting malloc implementation
   // specific parameters.  Some currently useful properties:
@@ -155,6 +164,10 @@ class MallocExtension {
   // This is an internal extension.  Callers should use the more
   // convenient "GetHeapSample(string*)" method defined above.
   virtual void** ReadStackTraces();
+
+  // Like ReadStackTraces(), but returns stack traces that caused growth
+  // in the address space size.
+  virtual void** ReadHeapGrowthStackTraces();
 };
 
 #endif  // _GOOGLE_MALLOC_EXTENSION_H__
diff --git a/src/google/profiler.h b/src/google/profiler.h
index 249d7be..5eea0de 100644
--- a/src/google/profiler.h
+++ b/src/google/profiler.h
@@ -32,20 +32,28 @@
 //
 // Module for CPU profiling based on periodic pc-sampling.
 //
-// To use this module, link it into your program.  To activate it
-// at runtime, set the environment variable "CPUPROFILE" to be the
-// name of the file in which the profile data should be written.
-// (If you don't set the environment variable, no profiling will
-// happen, and the program should run without any slowdowns.)
+// To use this module, link it into your program.  There should
+// be no slowdown caused by this unless you activate the profiler
+// using one of the steps given below.
 //
-// Once you have done this, there are two ways to determine which
-// region(s) of code should be profiled:
+// To activate the profiler, do one of the following:
 //
-// 1. If you set the "PROFILESELECTED" environment variable,
-//    only regions of code that are surrounded with "ProfilerEnable()"
-//    and "ProfilerDisable()" will be profiled.
-// 2. Otherwise, the main thread, and any thread that has had 
-//    ProfilerRegisterThread() called on it, will be profiled.
+//    1. Before starting the program, set the environment variable
+//       "CPUPROFILE" to be the name of the file to which the profile
+//       data should be written.
+//
+//    2. Programmatically, start and stop the profiler using
+//       the routines "ProfilerStart(filename)" and "ProfilerStop()".
+//
+// All threads in the program are profiled whenever profiling is on.
+// There used to be a mechanism where a subset of the threads could be
+// profiled, but that functionality no longer exists (it would not
+// work correctly in new systems since the interval timer used by the
+// profiler is a per-address-space setting in new systems instead of
+// being a per-thread setting in 2.4 and earlier systems).
+//
+// Limitation: on 2.4 and earlier kernels, just the main thread will
+// be profiled.
 //
 // Use pprof to view the resulting profile output.  If you have dot and
 // gv installed, you can also get a graphical representation of CPU usage.
@@ -56,6 +64,8 @@
 #ifndef _GOOGLE_PROFILER_H
 #define _GOOGLE_PROFILER_H
 
+#include <time.h>       // For time_t
+
 // Start profiling and write profile info into fname.
 extern bool ProfilerStart(const char* fname);
 
@@ -63,24 +73,35 @@ extern bool ProfilerStart(const char* fname);
 // the currently accumulated profiling data will be cleared.
 extern void ProfilerStop();
 
+// Flush any currently buffered profiling state to the profile file.
+// Has no effect if the profiler has not been started.
+extern void ProfilerFlush();
 
-// These functions have no effect if profiling has not been activated
-// globally (by specifying the "CPUPROFILE" environment variable or by
-// calling ProfilerStart() ).
-
-// Profile in the given thread.  This is most usefully called when a
-// new thread is first entered.  Note this may not work if
-// PROFILESELECTED is set.
-extern void ProfilerRegisterThread();
 
-// Turn profiling on and off, if PROFILESELECTED has been called.
+// DEPRECATED: these functions were used to enable/disable profiling
+// in the current thread, but no longer do anything.
 extern void ProfilerEnable();
 extern void ProfilerDisable();
 
-// Write out the current profile information to disk.
-extern void ProfilerFlush();
+// Returns true if profiling is currently enabled
+extern bool ProfilingIsEnabledForAllThreads();
+
+// Routine for registering new threads with the profiler.  This is
+// most usefully called when a new thread is first entered.
+extern void ProfilerRegisterThread();
+
+// Stores state about profiler's current status into "*state".
+struct ProfilerState {
+  bool   enabled;                // Is profiling currently enabled?
+  time_t start_time;             // If enabled, when was profiling started?
+  char   profile_name[1024];     // Name of profile file being written, or '\0'
+  int    samples_gathered;       // Number of samples gathered so far (or 0)
+};
+extern void ProfilerGetCurrentState(ProfilerState* state);
 
 // ------------------------- ProfilerThreadState -----------------------
+// DEPRECATED: this class is no longer needed.
+//
 // A small helper class that allows a thread to periodically check if
 // profiling has been enabled or disabled, and to react appropriately
 // to ensure that activity in the current thread is included in the
@@ -92,15 +113,13 @@ extern void ProfilerFlush();
 //    profile_state.ThreadCheck();
 //  }
 class ProfilerThreadState {
-public:
-  ProfilerThreadState();
+ public:
+  ProfilerThreadState() { }
 
   // Called in a thread to enable or disable profiling on the thread
   // based on whether profiling is currently on or off.
-  void ThreadCheck();
-
-private:
-  bool          was_enabled_;   // True if profiling was on in our last call
+  // DEPRECATED: No longer needed
+  void ThreadCheck() { }
 };
 
 #endif /* _GOOGLE_PROFILER_H */
diff --git a/src/heap-checker.cc b/src/heap-checker.cc
index c0ea994..f081f97 100644
--- a/src/heap-checker.cc
+++ b/src/heap-checker.cc
@@ -451,12 +451,14 @@ static bool IsLibraryNamed(const char* library, const char* library_base) {
 void HeapLeakChecker::DisableLibraryAllocs(const char* library,
                                            void* start_address,
                                            void* end_address) {
+  int depth = 0;
   // TODO(maxim): maybe this should be extended to also use objdump
   //              and pick the text portion of the library more precisely.
   if (IsLibraryNamed(library, "/libpthread")  ||
-        // pthread has a lot of small "system" leaks we don't care about
+        // libpthread has a lot of small "system" leaks we don't care about.
+        // In particular it allocates memory to store data supplied via
+        // pthread_setspecific (which can be the only pointer to a heap object).
       IsLibraryNamed(library, "/libdl")  ||
-      IsLibraryNamed(library, "/ld")  ||
         // library loaders leak some "system" heap that we don't care about
       IsLibraryNamed(library, "/libcrypto")
       // Sometimes libcrypto of OpenSSH is compiled with -fomit-frame-pointer
@@ -464,16 +466,36 @@ void HeapLeakChecker::DisableLibraryAllocs(const char* library,
       // is so important for making crypto usable).  We ignore all its
       // allocations because we can't see the call stacks.
      ) {
+    depth = 1;  // only disable allocation calls directly from the library code
+  } else if (IsLibraryNamed(library, "/ld")
+               // library loader leaks some "system" heap
+               // (e.g. thread-local storage) that we don't care about
+            ) {
+    depth = 2;  // disable allocation calls directly from the library code
+                // and at depth 2 from it.
+    // We need depth 2 here solely because of a libc bug that
+    // forces us to jump through __memalign_hook and MemalignOverride hoops
+    // in tcmalloc.cc.
+    // Those buggy __libc_memalign() calls are in ld-linux.so and happen for
+    // thread-local storage allocations that we want to ignore here.
+    // We go with the depth-2 hack as a workaround for this libc bug:
+    // otherwise we'd need to extend MallocHook interface
+    // so that correct stack depth adjustment can be propagated from
+    // the exceptional case of MemalignOverride.
+    // Using depth 2 here should not mask real leaks because ld-linux.so
+    // does not call user code.
+  }
+  if (depth) {
     HeapProfiler::MESSAGE(1, "HeapChecker: "
-                          "Disabling direct allocations from %s :\n",
-                          library);
+                          "Disabling allocations from %s at depth %d:\n",
+                          library, depth);
     DisableChecksFromTo(start_address, end_address,
-                        1);  // only disable allocation calls directly
-                             // from the library code
+                        depth);
   }
 }
 
-void HeapLeakChecker::UseProcMaps(ProcMapsTask proc_maps_task) {
+HeapLeakChecker::ProcMapsResult
+HeapLeakChecker::UseProcMaps(ProcMapsTask proc_maps_task) {
   FILE* const fp = fopen("/proc/self/maps", "r");
   if (!fp) {
     int errsv = errno;
@@ -481,27 +503,29 @@ void HeapLeakChecker::UseProcMaps(ProcMapsTask proc_maps_task) {
                           "Could not open /proc/self/maps: errno=%d.  "
                           "Libraries will not be handled correctly.\n",
                           errsv);
-    return;
+    return CANT_OPEN_PROC_MAPS;
   }
   char proc_map_line[1024];
+  bool saw_shared_lib = false;
   while (fgets(proc_map_line, sizeof(proc_map_line), fp) != NULL) {
     // All lines starting like
     // "401dc000-4030f000 r??p 00132000 03:01 13991972  lib/bin"
     // identify a data and code sections of a shared library or our binary
     uint64 start_address, end_address, file_offset, inode;
     int size;
-    char permissions[5];
+    char permissions[5], *filename;
     if (sscanf(proc_map_line, LLX"-"LLX" %4s "LLX" %*x:%*x "LLD" %n",
                &start_address, &end_address, permissions,
                &file_offset, &inode, &size) != 5) continue;
     proc_map_line[strlen(proc_map_line) - 1] = '\0';  // zap the newline
+    filename = proc_map_line + size;
     HeapProfiler::MESSAGE(4, "HeapChecker: "
                           "Looking at /proc/self/maps line:\n  %s\n",
                           proc_map_line);
     if (proc_maps_task == DISABLE_LIBRARY_ALLOCS  &&
         strncmp(permissions, "r-xp", 4) == 0  &&  inode != 0) {
       if (start_address >= end_address)  abort();
-      DisableLibraryAllocs(proc_map_line + size,
+      DisableLibraryAllocs(filename,
                            reinterpret_cast<void*>(start_address),
                            reinterpret_cast<void*>(end_address));
     }
@@ -514,8 +538,21 @@ void HeapLeakChecker::UseProcMaps(ProcMapsTask proc_maps_task) {
       if (start_address >= end_address)  abort();
       RecordGlobalDataLocked(proc_map_line + size, start_address, file_offset);
     }
+    // Determine if any shared libraries are present.
+    if (strstr(filename, "lib") && strstr(filename, ".so")) {
+      saw_shared_lib = true;
+    }
   }
   fclose(fp);
+
+  if (!saw_shared_lib) {
+    HeapProfiler::MESSAGE(-1, "HeapChecker: "
+                              "No shared libs detected.  "
+                              "Will likely report false leak positives "
+                              "for statically linked executables.\n");
+    return NO_SHARED_LIBS_IN_PROC_MAPS;
+  }
+  return PROC_MAPS_USED;
 }
 
 // Total number and size of live objects dropped from the profile.
@@ -527,11 +564,12 @@ static int64 live_bytes_total = 0;
 static int last_num_threads = 0;
 static pid_t* last_thread_pids = NULL;
 
-// Callback for GetAllProcessThreads to ignore
+// Callback for ListAllProcessThreads to ignore
 // thread stacks and registers for all our threads.
 static int IgnoreLiveThreads(void* parameter,
                              int num_threads,
-                             pid_t* thread_pids) {
+                             pid_t* thread_pids,
+                             va_list ap) {
   last_num_threads = num_threads;
   assert(last_thread_pids == NULL);
   last_thread_pids = new pid_t[num_threads];
@@ -547,7 +585,7 @@ static int IgnoreLiveThreads(void* parameter,
     i386_regs thread_regs;
 #define sys_ptrace(r,p,a,d)  syscall(SYS_ptrace, (r), (p), (a), (d))
     // We use sys_ptrace to avoid thread locking
-    // because this is called from GetAllProcessThreads
+    // because this is called from ListAllProcessThreads
     // when all but this thread are suspended.
     // (This does not seem to matter much though: allocations and
     //  logging with HeapProfiler::MESSAGE seem to work just fine.)
@@ -600,8 +638,8 @@ IgnoreAllLiveObjectsLocked(const StackExtent& self_stack) {
   if (HeapProfiler::ignored_objects_)  abort();
   HeapProfiler::ignored_objects_ = new HeapProfiler::IgnoredObjectSet;
   // Record global data as live:
-  // We need to do it before we stop the threads in GetAllProcessThreads below;
-  // otherwise deadlocks are possible
+  // We need to do it before we stop the threads in ListAllProcessThreads
+  // below; otherwise deadlocks are possible
   // when we try to fork to execute objdump in UseProcMaps.
   if (FLAGS_heap_check_ignore_global_live) {
     library_live_objects = new LibraryLiveObjectsStacks;
@@ -613,7 +651,7 @@ IgnoreAllLiveObjectsLocked(const StackExtent& self_stack) {
     // and keep them suspended for the whole time of liveness checking
     // (they can't (de)allocate due to profiler's lock but they could still
     //  mess with the pointer graph while we walk it).
-    int r = GetAllProcessThreads(NULL, IgnoreLiveThreads);
+    int r = ListAllProcessThreads(NULL, IgnoreLiveThreads);
     if (r == -1) {
       HeapProfiler::MESSAGE(0, "HeapChecker: Could not find thread stacks; "
                                "may get false leak reports\n");
@@ -1312,9 +1350,15 @@ void HeapLeakChecker::InternalInitStart(const string& heap_check_type) {
     assert(heap_checker_pid == getpid());
     heap_checker_on = true;
     if (!HeapProfiler::is_on_)  abort();
-    UseProcMaps(DISABLE_LIBRARY_ALLOCS);
+    ProcMapsResult pm_result = UseProcMaps(DISABLE_LIBRARY_ALLOCS);
       // might neeed to do this more than once
       // if one later dynamically loads libraries that we want disabled
+    if (pm_result != HeapLeakChecker::PROC_MAPS_USED) {
+      heap_checker_on = false;
+      HeapProfiler::MESSAGE(0, "HeapChecker: Turning itself off\n");
+      HeapProfiler::StopForLeaks();
+      return;
+    }
 
     // make a good place and name for heap profile leak dumps
     profile_prefix = new string(dump_directory());
diff --git a/src/heap-profiler.cc b/src/heap-profiler.cc
index bfee34d..45eb908 100644
--- a/src/heap-profiler.cc
+++ b/src/heap-profiler.cc
@@ -134,6 +134,7 @@ void HeapProfiler::MESSAGE(int level, const char* format, ...) {
   va_start(ap, format);
   char buf[600];
   vsnprintf(buf, sizeof(buf), format, ap);
+  va_end(ap);
   write(STDERR_FILENO, buf, strlen(buf));
 }
 
diff --git a/src/internal_logging.cc b/src/internal_logging.cc
index 16b040e..8c403c5 100644
--- a/src/internal_logging.cc
+++ b/src/internal_logging.cc
@@ -42,6 +42,7 @@ void TCMalloc_MESSAGE(const char* format, ...) {
   va_start(ap, format);
   char buf[800];
   vsnprintf(buf, sizeof(buf), format, ap);
+  va_end(ap);
   write(STDERR_FILENO, buf, strlen(buf));
 }
 
@@ -50,6 +51,7 @@ void TCMalloc_Printer::printf(const char* format, ...) {
     va_list ap;
     va_start(ap, format);
     const int r = vsnprintf(buf_, left_, format, ap);
+    va_end(ap);
     if (r < 0) {
       // Perhaps an old glibc that returns -1 on truncation?
       left_ = 0;
diff --git a/src/internal_spinlock.h b/src/internal_spinlock.h
index 2015763..79c1279 100644
--- a/src/internal_spinlock.h
+++ b/src/internal_spinlock.h
@@ -45,7 +45,7 @@
 #endif
 #include <stdlib.h>	/* for abort() */
 
-#if defined __i386__ && defined __GNUC__
+#if (defined __i386__ || defined __x86_64__) && defined __GNUC__
 
 static void TCMalloc_SlowLock(volatile unsigned int* lockword);
 
diff --git a/src/malloc_extension.cc b/src/malloc_extension.cc
index 8ca58a7..1a42e6e 100644
--- a/src/malloc_extension.cc
+++ b/src/malloc_extension.cc
@@ -108,6 +108,10 @@ void** MallocExtension::ReadStackTraces() {
   return NULL;
 }
 
+void** MallocExtension::ReadHeapGrowthStackTraces() {
+  return NULL;
+}
+
 // The current malloc extension object.  We also keep a pointer to
 // the default implementation so that the heap-leak checker does not
 // complain about a memory leak.
@@ -178,9 +182,37 @@ struct StackTraceEqual {
 
 typedef HASH_NAMESPACE::hash_set<void**, StackTraceHash, StackTraceEqual> StackTraceTable;
 
-void DebugStringWriter(const char* str, void* arg) {
-  string* result = reinterpret_cast<string*>(arg);
-  *result += str;
+void PrintHeader(string* result, const char* label, void** entries) {
+  // Compute the total count and total size
+  uintptr_t total_count = 0;
+  uintptr_t total_size = 0;
+  for (void** entry = entries; Count(entry) != 0; entry += 3 + Depth(entry)) {
+    total_count += Count(entry);
+    total_size += Size(entry);
+  }
+
+  char buf[200];
+  snprintf(buf, sizeof(buf),
+           "heap profile: %6lld: %8lld [%6lld: %8lld] @ %s\n",
+           static_cast<long long>(total_count),
+           static_cast<long long>(total_size),
+           static_cast<long long>(total_count),
+           static_cast<long long>(total_size),
+           label);
+  *result += buf;
+}
+
+void PrintStackEntry(string* result, void** entry) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%6d: %8d [%6d: %8d] @",
+           int(Count(entry)), int(Size(entry)),
+           int(Count(entry)), int(Size(entry)));
+  *result += buf;
+  for (int i = 0; i < Depth(entry); i++) {
+    snprintf(buf, sizeof(buf), " %p", PC(entry, i));
+    *result += buf;
+  }
+  *result += "\n";
 }
 
 }
@@ -188,18 +220,16 @@ void DebugStringWriter(const char* str, void* arg) {
 void MallocExtension::GetHeapSample(string* result) {
   void** entries = ReadStackTraces();
   if (entries == NULL) {
-    *result += "this malloc implementation does not support sampling\n";
+    *result += "This malloc implementation does not support sampling.\n"
+               "As of 2005/01/26, only tcmalloc supports sampling, and you\n"
+               "are probably running a binary that does not use tcmalloc.\n";
     return;
   }
 
   // Group together all entries with same stack trace
   StackTraceTable table;
-  int total_count = 0;
-  int total_size = 0;
   for (void** entry = entries; Count(entry) != 0; entry += 3 + Depth(entry)) {
     StackTraceTable::iterator iter = table.find(entry);
-    total_count += Count(entry);
-    total_size += Size(entry);
     if (iter == table.end()) {
       // New occurrence
       table.insert(entry);
@@ -210,27 +240,37 @@ void MallocExtension::GetHeapSample(string* result) {
     }
   }
 
-  char buf[100];
-  snprintf(buf, sizeof(buf), "heap profile: %6d: %8d [%6d: %8d] @\n",
-           total_count, total_size, total_count, total_size);
-  *result += buf;
+  PrintHeader(result, "heap", entries);
   for (StackTraceTable::iterator iter = table.begin();
        iter != table.end();
        ++iter) {
-    void** entry = *iter;
-    snprintf(buf, sizeof(buf), "%6d: %8d [%6d: %8d] @",
-             int(Count(entry)), int(Size(entry)),
-             int(Count(entry)), int(Size(entry)));
-    *result += buf;
-    for (int i = 0; i < Depth(entry); i++) {
-      snprintf(buf, sizeof(buf), " %p", PC(entry, i));
-      *result += buf;
-    }
-    *result += "\n";
+    PrintStackEntry(result, *iter);
   }
 
   // TODO(menage) Get this working in google-perftools
   //DumpAddressMap(DebugStringWriter, result);
+}
+
+void MallocExtension::GetHeapGrowthStacks(std::string* result) {
+  void** entries = ReadHeapGrowthStackTraces();
+  if (entries == NULL) {
+    *result += "This malloc implementation does not support "
+               "ReadHeapGrowhStackTraces().\n"
+               "As of 2005/09/27, only tcmalloc supports this, and you\n"
+               "are probably running a binary that does not use tcmalloc.\n";
+    return;
+  }
 
+  // Do not canonicalize the stack entries, so that we get a
+  // time-ordered list of stack traces, which may be useful if the
+  // client wants to focus on the latest stack traces.
+
+  PrintHeader(result, "growth", entries);
+  for (void** entry = entries; Count(entry) != 0; entry += 3 + Depth(entry)) {
+    PrintStackEntry(result, entry);
+  }
   delete[] entries;
+
+  // TODO(menage) Get this working in google-perftools
+  //DumpAddressMap(DebugStringWriter, result);
 }
diff --git a/src/malloc_hook.cc b/src/malloc_hook.cc
index a238fd1..3047031 100644
--- a/src/malloc_hook.cc
+++ b/src/malloc_hook.cc
@@ -40,17 +40,24 @@ MallocHook::MunmapHook MallocHook::munmap_hook_ = NULL;
 
 // On Linux/x86, we override mmap/munmap and provide support for
 // calling the related hooks.  
-#if defined(__i386__) && defined(__linux)
+//
+// We define mmap() and mmap64(), which somewhat reimplement libc's mmap
+// syscall stubs.  Unfortunately libc only exports the stubs via weak symbols 
+// (which we're overriding with our mmap64() and mmap() wrappers) so we can't 
+// just call through to them.
 
+
+#if defined(__linux) && (defined(__i386__) || defined(__x86_64__))
 #include <unistd.h>
 #include <syscall.h>
 #include <sys/mman.h>
 #include <errno.h>
 
-// This somewhat reimplements libc's mmap syscall stubs. Unfortunately
-// libc only exports the stubs via weak symbols (which we're
-// overriding with our mmap64() and mmap() wrappers) so we can't just
-// call through to them.
+// The x86-32 case and the x86-64 case differ:
+// 32b has a mmap2() syscall, 64b does not.
+// 64b and 32b have different calling conventions for mmap().
+# if defined(__i386__) 
+
 extern "C" void* mmap64(void *start, size_t length,
                         int prot, int flags, 
                         int fd, __off64_t offset) __THROW {
@@ -98,6 +105,29 @@ extern "C" void* mmap64(void *start, size_t length,
 
 }
 
+//--------------------------------------------------------------------------//
+
+# elif defined(__x86_64__)
+
+#define __NR_wrapped_mmap   __NR_mmap
+#define __NR_wrapped_munmap __NR_munmap
+static inline _syscall6(void *, wrapped_mmap, void  *,  start,  
+                        size_t, length, int, prot, int, flags, int,
+                        fd, __off64_t, offset);
+static inline _syscall2(int, wrapped_munmap, void *, start, size_t, length);
+
+extern "C" void* mmap64(void *start, size_t length,
+                        int prot, int flags, 
+                        int fd, __off64_t offset) __THROW {
+
+  void *result;
+  result = wrapped_mmap(start, length, prot, flags, fd, offset );
+  MallocHook::InvokeMmapHook(result, start, length, prot, flags, fd, offset);
+  return result;
+}
+
+# endif
+
 extern "C" void* mmap(void *start, size_t length,
                       int prot, int flags, 
                       int fd, off_t offset) __THROW {
diff --git a/src/pagemap.h b/src/pagemap.h
index 50ff1bf..1fdde99 100644
--- a/src/pagemap.h
+++ b/src/pagemap.h
@@ -76,6 +76,8 @@ class TCMalloc_PageMap1 {
     return true;
   }
 
+  void PreallocateMoreMemory() {}
+
   // REQUIRES "k" is in range "[0,2^BITS-1]".
   // REQUIRES "k" has been ensured before.
   //
@@ -152,6 +154,11 @@ class TCMalloc_PageMap2 {
     }
     return true;
   }
+
+  void PreallocateMoreMemory() {
+    // Allocate enough to keep track of all possible pages
+    Ensure(0, 1 << BITS);
+  }
 };
 
 // Three-level radix tree
@@ -236,6 +243,9 @@ class TCMalloc_PageMap3 {
     }
     return true;
   }
+
+  void PreallocateMoreMemory() {
+  }
 };
 
 #endif  // TCMALLOC_PAGEMAP_H__
diff --git a/src/pprof b/src/pprof
index 494235d..46366bc 100755
--- a/src/pprof
+++ b/src/pprof
@@ -34,7 +34,7 @@
 # or by the heap profiler (common/debugallocation.cc)
 #
 # The profile contains a sequence of entries of the form:
-#	<count> <stack trace>
+#       <count> <stack trace>
 # This program parses the profile, and generates user-readable
 # output.
 #
@@ -87,6 +87,7 @@ Usage: pprof [options] <program> <profile>
 Options:
    --cum               Sort by cumulative data
    --base=<base>       Subtract <base> from <profile> before display
+   --interactive       Run in interactive mode (interactive "help" gives help)
    
 Reporting Granularity:
    --addresses         Report at address level
@@ -191,10 +192,14 @@ $main::opt_alloc_space   = 0;
 $main::opt_alloc_objects = 0;
 $main::opt_show_bytes    = 0;
 $main::opt_drop_negative = 0;
+$main::opt_interactive   = 0;
 
 # Are we printing a heap profile?
 $main::heap_profile = 0;
 
+# Are we printing a lock profile?
+$main::lock_profile = 0;
+
 GetOptions("help!"          => \$main::opt_help,
 	   "version!"       => \$main::opt_version,
 	   "cum!"           => \$main::opt_cum,
@@ -211,6 +216,7 @@ GetOptions("help!"          => \$main::opt_help,
 	   "ps!"            => \$main::opt_ps,
 	   "pdf!"           => \$main::opt_pdf,
 	   "gif!"           => \$main::opt_gif,
+	   "interactive!"   => \$main::opt_interactive,
 	   "nodecount=i"    => \$main::opt_nodecount,
 	   "nodefraction=f" => \$main::opt_nodefraction,
 	   "edgefraction=f" => \$main::opt_edgefraction,
@@ -283,35 +289,43 @@ if ($modes == 0) {
 }
 
 my $prog = shift || fatal("Did not specify program");
-my $pfile = shift || fatal("Did not specify profile file");
+my $pfile_arg = shift || fatal("Did not specify profile file");
 
 ##### Main section #####
 
 # Setup tmp-file name and handler to clean it up
 $main::tmpfile_sym = "/tmp/pprof$$.sym";
-$main::tmpfile_ps = "/tmp/pprof$$.ps";
+$main::tmpfile_ps = "/tmp/pprof$$";
+$main::next_tmpfile = 0;
+$main::collected_profile = undef;
 $SIG{'INT'} = \&sighandler;
 
 # Read profile data
+my $pfile = FetchDynamicProfile($prog, $pfile_arg);
 my $data = ReadProfile($prog, $pfile);
 my $profile = $data->{profile};
-my $libs = $data->{libs};	# Info about main program and shared libraries
+my $libs = $data->{libs};       # Info about main program and shared libraries
 
 # List of function names to skip
 $main::skip = ();
 if ($main::heap_profile) {
   foreach my $name ('calloc',
-		    'cfree',
-		    'malloc',
-		    'free',
-		    'memalign',
-		    'pvalloc',
-		    'valloc',
-		    'realloc',
-		    '__builtin_delete',
-		    '__builtin_new',
-		    '__builtin_vec_delete',
-		    '__builtin_vec_new') {
+                    'cfree',
+                    'malloc',
+                    'free',
+                    'memalign',
+                    'pvalloc',
+                    'valloc',
+                    'realloc',
+                    '__builtin_delete',
+                    '__builtin_new',
+                    '__builtin_vec_delete',
+                    '__builtin_vec_new') {
+    $main::skip{$name} = 1;
+  }
+}
+if ($main::lock_profile) {
+  foreach my $name ('Mutex::Unlock') {
     $main::skip{$name} = 1;
   }
 }
@@ -347,41 +361,246 @@ my $flat = FlatProfile($reduced);
 my $cumulative = CumulativeProfile($reduced);
 
 # Print
-if ($main::opt_disasm) {
-  PrintDisassembly($libs, $flat, $cumulative);
-} elsif ($main::opt_list) {
-  PrintListing($libs, $flat, $cumulative);
-} elsif ($main::opt_text) {
-  PrintText($symbols, $flat, $cumulative, $total);
-} else {
-  PrintDot($prog, $symbols, $profile, $flat, $cumulative, $total);
-  if ($main::opt_gv) {
-    # Some versions of gv use -scale, and some use --scale.  *sigh*
-    # We use --help to determine if gv expects one dash or two.
-    system("$GV --help >/dev/null 2>&1 " .
-	   "&& $GV --scale=$main::opt_scale $main::tmpfile_ps " .
-	   "|| $GV -scale $main::opt_scale $main::tmpfile_ps")
+if (!$main::opt_interactive) {
+  if ($main::opt_disasm) {
+    PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
+  } elsif ($main::opt_list) {
+    PrintListing($libs, $flat, $cumulative, $main::opt_list);
+  } elsif ($main::opt_text) {
+    PrintText($symbols, $flat, $cumulative, $total, -1);
+  } else {
+    if (PrintDot($prog, $symbols, $profile, $flat, $cumulative, $total)) {
+      if ($main::opt_gv) {
+	# Some versions of gv use -scale, and some use --scale.  *sigh*
+	# We use --help to determine if gv expects one dash or two.
+	system("$GV --help >/dev/null 2>&1 " .
+	       "&& $GV --scale=$main::opt_scale $main::tmpfile_ps " .
+	       "|| $GV -scale $main::opt_scale $main::tmpfile_ps")
+      }
+    } else {
+      exit(1);
+    }
   }
+} else {
+  InteractiveMode();
 }
 
 cleanup();
 exit(0);
 
+
+##### Interactive helper routines #####
+sub InteractiveMode {
+  $| = 1;	# Make output unbuffered for interactive mode
+  my $orig_profile = $profile;
+  while (1) {
+    print "(pprof) ";
+    $_ = <STDIN>;
+    if (!defined($_)) {
+      print "\n";
+      last;
+    }
+    if (m/^ *quit/) {
+      last;
+    }
+    if (m/^ *help/) {
+      InteractiveHelpMessage();
+      next;
+    }
+    # Clear all the options
+    $main::opt_lines = 0;
+    $main::opt_text = 0;
+    $main::opt_disasm = 0;
+    $main::opt_list = 0;
+    $main::opt_gv = 0;
+    $main::opt_cum = 0;
+
+    if (m/^ *(text|top)(\d*) *(.*)/) {
+      $main::opt_text = 1;
+
+      my $line_limit = ($2 ne "") ? int($2) : 10;
+
+      my $routine;
+      my $ignore;
+      ($routine, $ignore) = ParseInteractiveArgs($3);
+
+      my $profile = ProcessProfile($orig_profile, "", $ignore);
+      my $reduced = ReduceProfile($symbols, $profile);
+
+      # Get derived profiles
+      my $flat = FlatProfile($reduced);
+      my $cumulative = CumulativeProfile($reduced);
+
+      PrintText($symbols, $flat, $cumulative, $total, $line_limit);
+      next;
+    }
+    if (m/^ *list *(.+)/) {
+      $main::opt_list = 1;
+
+      my $routine;
+      my $ignore;
+      ($routine, $ignore) = ParseInteractiveArgs($1);
+
+      my $profile = ProcessProfile($orig_profile, "", $ignore);
+      my $reduced = ReduceProfile($symbols, $profile);
+
+      # Get derived profiles
+      my $flat = FlatProfile($reduced);
+      my $cumulative = CumulativeProfile($reduced);
+
+      PrintListing($libs, $flat, $cumulative, $routine);
+      next;
+    }
+    if (m/^ *disasm *(.+)/) {
+      $main::opt_disasm = 1;
+
+      my $routine;
+      my $ignore;
+      ($routine, $ignore) = ParseInteractiveArgs($1);
+
+      # Process current profile to account for various settings
+      my $profile = ProcessProfile($orig_profile, "", $ignore);
+      my $reduced = ReduceProfile($symbols, $profile);
+
+      # Get derived profiles
+      my $flat = FlatProfile($reduced);
+      my $cumulative = CumulativeProfile($reduced);
+
+      PrintDisassembly($libs, $flat, $cumulative, $routine);
+      next;
+    }
+    if (m/^ *gv *(.*)/) {
+      $main::opt_gv = 1;
+
+      my $focus;
+      my $ignore;
+      ($focus, $ignore) = ParseInteractiveArgs($1);
+
+      # Process current profile to account for various settings
+      my $profile = ProcessProfile($orig_profile, $focus, $ignore);
+      my $reduced = ReduceProfile($symbols, $profile);
+
+      # Get derived profiles
+      my $flat = FlatProfile($reduced);
+      my $cumulative = CumulativeProfile($reduced);
+
+      if (PrintDot($prog, $symbols, $profile, $flat, $cumulative, $total)) {
+	system("gv -scale $main::opt_scale -noresize " . 
+	       PsTempName($main::next_tmpfile) . " &");
+	$main::next_tmpfile++;
+      }
+      next;
+    }
+  }
+}
+
+
+sub ProcessProfile {
+  my $orig_profile = shift;
+  my $focus = shift;
+  my $ignore = shift;
+
+  # Process current profile to account for various settings
+  my $profile = $orig_profile;
+  my $total_count = TotalProfile($profile);
+  print "Total: ", $total_count, " samples\n";
+  if ($focus ne '') {
+    $profile = FocusProfile($symbols, $profile, $focus);
+    my $focus_count = TotalProfile($profile);
+    printf "After focusing on '%s': %d samples of %d (%0.1f%%)\n", 
+      $focus, $focus_count, $total_count, ($focus_count*100.0) / $total_count;
+  }
+  if ($ignore ne '') {
+    $profile = IgnoreProfile($symbols, $profile, $ignore);
+    my $ignore_count = TotalProfile($profile);
+    printf "After ignoring '%s': %d samples of %d (%0.1f%%)\n", 
+      $ignore, $ignore_count, $total_count, 
+	($ignore_count*100.0) / $total_count;
+  }
+
+  return $profile;
+}
+
+sub InteractiveHelpMessage {
+  print <<ENDOFHELP;
+Interactive pprof mode
+
+Commands:
+  gv
+  gv [focus] [-ignore1] [-ignore2]
+      Show graphical hierarchical display of current profile.  Without
+      any arguments, shows all samples in the profile.  With the optional
+      "focus" argument, restricts the samples shown to just those where
+      the "focus" regular expression matches a routine name on the stack 
+      trace.
+
+  list [routine_regexp] [-ignore1] [-ignore2]
+      Show source listing of routines whose names match "routine_regexp"
+  
+  top [--cum] [-ignore1] [-ignore2]
+  top20 [--cum] [-ignore1] [-ignore2]
+  top37 [--cum] [-ignore1] [-ignore2]
+      Show top lines ordered by flat profile count, or cumulative count
+      if --cum is specified.  If a number is present after 'top', the
+      top K routines will be shown (defaults to showing the top 10)
+  
+  disasm [routine_regexp] [-ignore1] [-ignore2]
+      Show disassembly of routines whose names match "routine_regexp",
+      annotated with sample counts.
+
+  help - This listing
+  quit or ^D - End pprof
+
+For commands that accept optional -ignore tags, samples where any routine in
+the stack trace matches the regular expression in any of the -ignore
+parameters will be ignored.
+
+ENDOFHELP
+}
+sub ParseInteractiveArgs {
+  my $args = shift;
+  my $focus = "";
+  my $ignore = "";
+  my @x = split(/ +/, $args);
+  foreach $a (@x) {
+    if ($a =~ m/^(--|-)lines$/) {
+      $main::opt_lines = 1;
+    } elsif ($a =~ m/^(--|-)cum$/) {
+      $main::opt_cum = 1;
+    } elsif ($a =~ m/^-(.*)/) {
+      $ignore .= (($ignore ne "") ? "|" : "" ) . $1;
+    } else {
+      $focus .= (($focus ne "") ? "|" : "" ) . $a;
+    }
+  }
+  if ($ignore ne "") {
+    print "Ignoring samples in call stacks that match '$ignore'\n";
+  }
+  return ($focus, $ignore);
+}
+
 ##### Output code #####
 
+sub PsTempName {
+  my $fnum = shift;
+  return "$main::tmpfile_ps" . "." . "$fnum" . ".ps";
+}
+  
 # Print text output
 sub PrintText {
   my $symbols = shift;
   my $flat = shift;
   my $cumulative = shift;
   my $total = shift;
+  my $line_limit = shift;
 
   # Which profile to sort by?
   my $s = $main::opt_cum ? $cumulative : $flat;
 
   my $running_sum = 0;
+  my $lines = 0;
   foreach my $k (sort { GetEntry($s,$b) <=> GetEntry($s, $a) }
-		 keys(%{$cumulative})) {
+                 keys(%{$cumulative})) {
     my $f = GetEntry($flat, $k);
     my $c = GetEntry($cumulative, $k);
     $running_sum += $f;
@@ -390,7 +609,7 @@ sub PrintText {
     if (exists($symbols->{$k})) {
       $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1];
       if ($main::opt_addresses) {
-	$sym = $k . " " . $sym;
+        $sym = $k . " " . $sym;
       }
     }
 
@@ -403,6 +622,8 @@ sub PrintText {
              Percent($c, $total),
              $sym);
     }
+    $lines++;
+    last if ($line_limit >= 0 && $lines > $line_limit);
   }
 }
 
@@ -411,9 +632,10 @@ sub PrintDisassembly {
   my $libs = shift;
   my $flat = shift;
   my $cumulative = shift;
+  my $disasm_opts = shift;
 
   foreach my $lib (@{$libs}) {
-    my $symbol_table = GetProcedureBoundaries($lib->[0], $main::opt_disasm);
+    my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts);
     my $offset = $lib->[1] - $lib->[3];
     foreach my $routine (keys(%{$symbol_table})) {
       my $start_addr = $symbol_table->{$routine}->[0];
@@ -422,8 +644,8 @@ sub PrintDisassembly {
       my $total_flat = 0;
       my $total_cum = 0;
       for (my $addr = $start_addr; $addr < $end_addr; $addr++) {
-	$total_flat += GetEntry($flat, sprintf("0x%x", $addr+$offset));
-	$total_cum += GetEntry($cumulative, sprintf("0x%x", $addr+$offset));
+        $total_flat += GetEntry($flat, sprintf("0x%x", $addr+$offset));
+        $total_cum += GetEntry($cumulative, sprintf("0x%x", $addr+$offset));
       }
 
       # Skip disassembly if there are no samples in routine
@@ -431,26 +653,26 @@ sub PrintDisassembly {
 
       print "ROUTINE ====================== $routine\n";
       printf "%6s %6s Total samples (flat / cumulative)\n",
-	Unparse($total_flat), Unparse($total_cum);
+        Unparse($total_flat), Unparse($total_cum);
 
       my @instructions = Disassemble($lib->[0], $offset,
-				     $start_addr, $end_addr);
+                                     $start_addr, $end_addr);
       foreach my $e (@instructions) {
-	my $location = ($e->[2] >= 0) ? "$e->[1]:$e->[2]" : "";
-	$location =~ s|.*/||;	# Remove directory portion, if any
-	if (length($location) >= 20) {
-	  # For long locations, just show the last 20 characters
-	  $location = substr($location, -20);
-	}
-	my $f = GetEntry($flat, $e->[0]);
-	my $c = GetEntry($cumulative, $e->[0]);
-	my $address = $e->[0];	$address =~ s/^0x//;
-	printf("%6s %6s %-20s %8s: %6s\n",
-	       UnparseAlt($f),
-	       UnparseAlt($c),
-	       $location,
-	       $address,
-	       $e->[3]);
+        my $location = ($e->[2] >= 0) ? "$e->[1]:$e->[2]" : "";
+        $location =~ s|.*/||;   # Remove directory portion, if any
+        if (length($location) >= 20) {
+          # For long locations, just show the last 20 characters
+          $location = substr($location, -20);
+        }
+        my $f = GetEntry($flat, $e->[0]);
+        my $c = GetEntry($cumulative, $e->[0]);
+        my $address = $e->[0];  $address =~ s/^0x//;
+        printf("%6s %6s %-20s %8s: %6s\n",
+               UnparseAlt($f),
+               UnparseAlt($c),
+               $location,
+               $address,
+               $e->[3]);
       }
       close(OBJDUMP);
     }
@@ -458,9 +680,9 @@ sub PrintDisassembly {
 }
 
 # Return reference to array of tuples of the form:
-#	[address, filename, linenumber, instruction]
+#       [address, filename, linenumber, instruction]
 # E.g.,
-#	["0x806c43d", "/foo/bar.cc", 131, "ret"]
+#       ["0x806c43d", "/foo/bar.cc", 131, "ret"]
 sub Disassemble {
   my $prog = shift;
   my $offset = shift;
@@ -468,8 +690,8 @@ sub Disassemble {
   my $end_addr = shift;
 
   my $cmd = sprintf("$OBJDUMP -d -l --no-show-raw-insn " .
-		    "--start-address=%d --stop-address=%d $prog",
-		    $start_addr, $end_addr);
+                    "--start-address=%d --stop-address=%d $prog",
+                    $start_addr, $end_addr);
   open(OBJDUMP, "$cmd |") || error("$OBJDUMP: $!\n");
   my @result = ();
   my $filename = "";
@@ -501,26 +723,38 @@ sub PrintListing {
   my $libs = shift;
   my $flat = shift;
   my $cumulative = shift;
+  my $list_opts = shift;
 
   foreach my $lib (@{$libs}) {
-    my $symbol_table = GetProcedureBoundaries($lib->[0], $main::opt_list);
+    my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts);
     my $offset = $lib->[1] - $lib->[3];
     foreach my $routine (sort ByName keys(%{$symbol_table})) {
       # Print if there are any samples in this routine
       my $start_addr = $symbol_table->{$routine}->[0];
       my $end_addr = $symbol_table->{$routine}->[1];
       for (my $addr = $start_addr; $addr < $end_addr; $addr++) {
-	if (defined($cumulative->{sprintf("0x%x", $addr+$offset)})) {
-	  PrintSource($lib->[0], $offset,
-		      $routine, $flat, $cumulative,
-		      $start_addr, $end_addr);
-	  last;
-	}
+        if (defined($cumulative->{sprintf("0x%x", $addr+$offset)})) {
+          PrintSource($lib->[0], $offset,
+                      $routine, $flat, $cumulative,
+                      $start_addr, $end_addr);
+          last;
+        }
       }
     }
   }
 }
 
+# Returns the number of leading whitespace characters on $line if the
+# line has any non-whitespace characters.  Otherwise, returns -1.
+sub Indentation {
+  my $line = shift;
+  # Match the argument explicitly rather than relying on the caller's $_.
+  if ($line =~ m/^(\s*)\S/) {
+    return length($1);
+  }
+  return -1;
+}
+
 # Print source-listing for one routine
 sub PrintSource {
   my $prog = shift;
@@ -560,15 +794,45 @@ sub PrintSource {
     }
   }
 
+  # Hack 3: Extend last line forward until its indentation is less than 
+  # the indentation we saw on $firstline
+  my $oldlastline = $lastline;
+  {
+    if (!open(FILE, "<$filename")) {
+      print STDERR "$filename: $!\n";
+      return;
+    }
+    my $l = 0;
+    my $first_indentation = -1;
+    while (<FILE>) {
+      $l++;
+      my $indent = Indentation($_);
+      if ($l >= $firstline) {
+	if ($first_indentation < 0 && $indent >= 0) {
+	  $first_indentation = $indent;
+	  last if ($first_indentation == 0);
+	}
+      }
+      if ($l >= $lastline && $indent >= 0) {
+	if ($indent >= $first_indentation) {
+	  $lastline = $l+1;
+	} else {
+	  last;
+	}
+      }
+    }
+    close(FILE);
+  }
+
   # Assign all samples to the range $firstline,$lastline,
-  # Hack 3: If an instruction does not occur in the range, its samples
+  # Hack 4: If an instruction does not occur in the range, its samples
   # are moved to the next instruction that occurs in the range.
   my $samples1 = {};
   my $samples2 = {};
-  my $running1 = 0;	# Unassigned flat counts
-  my $running2 = 0;	# Unassigned cumulative counts
-  my $total1 = 0;	# Total flat counts
-  my $total2 = 0;	# Total cumulative counts
+  my $running1 = 0;     # Unassigned flat counts
+  my $running2 = 0;     # Unassigned cumulative counts
+  my $total1 = 0;       # Total flat counts
+  my $total2 = 0;       # Total cumulative counts
   foreach my $e (@instructions) {
     my $c1 = GetEntry($flat, $e->[0]);
     my $c2 = GetEntry($cumulative, $e->[0]);
@@ -594,12 +858,12 @@ sub PrintSource {
   AddEntry($samples2, $lastline, $running2);
 
   printf("ROUTINE ====================== %s in %s\n" .
-	 "%6s %6s Total %s (flat / cumulative)\n",
-	 ShortFunctionName($routine),
-	 $filename,
-	 Units(),
-	 Unparse($total1),
-	 Unparse($total2));
+         "%6s %6s Total %s (flat / cumulative)\n",
+         ShortFunctionName($routine),
+         $filename,
+         Units(),
+         Unparse($total1),
+         Unparse($total2));
   if (!open(FILE, "<$filename")) {
     print STDERR "$filename: $!\n";
     return;
@@ -607,14 +871,15 @@ sub PrintSource {
   my $l = 0;
   while (<FILE>) {
     $l++;
-    if ($l >= $firstline - 5 && $l <= $lastline + 5) {
+    if ($l >= $firstline - 5 && 
+	(($l <= $oldlastline + 5) || ($l <= $lastline))) {
       chop;
       my $text = $_;
       printf("%6s %6s %4d: %s\n",
-	     UnparseAlt(GetEntry($samples1, $l)),
-	     UnparseAlt(GetEntry($samples2, $l)),
-	     $l,
-	     $text);
+             UnparseAlt(GetEntry($samples1, $l)),
+             UnparseAlt(GetEntry($samples2, $l)),
+             $l,
+             $text);
     };
   }
   close(FILE);
@@ -638,7 +903,7 @@ sub PrintDot {
   # Find nodes to include
   my @list = (sort { abs(GetEntry($cumulative, $b)) <=>
                      abs(GetEntry($cumulative, $a)) }
-	      keys(%{$cumulative}));
+              keys(%{$cumulative}));
   my $last = $nodecount - 1;
   if ($last > $#list) {
     $last = $#list;
@@ -649,17 +914,20 @@ sub PrintDot {
   }
   if ($last < 0) {
     print STDERR "No nodes to print\n";
-    exit(1);
+    cleanup();
+    return 0;
   }
 
-  printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n",
-		 Unparse($nodelimit), Units(),
-		 Unparse($edgelimit), Units());
+  if ($nodelimit > 0 || $edgelimit > 0) {
+    printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n",
+		   Unparse($nodelimit), Units(),
+		   Unparse($edgelimit), Units());
+  }
 
   # Open DOT output file
   my $output;
   if ($main::opt_gv) {
-    $output = "| $DOT -Tps >$main::tmpfile_ps";
+    $output = "| $DOT -Tps >" . PsTempName($main::next_tmpfile);
   } elsif ($main::opt_ps) {
     $output = "| $DOT -Tps";
   } elsif ($main::opt_pdf) {
@@ -673,26 +941,26 @@ sub PrintDot {
 
   # Title
   printf DOT ("digraph \"%s; %s %s\" {\n",
-	      $prog,
-	      Unparse($overall_total),
-	      Units());
+              $prog,
+              Unparse($overall_total),
+              Units());
   if ($main::opt_pdf) {
-    # The output is more printable if we set the page size for dot. 
+    # The output is more printable if we set the page size for dot.
     printf DOT ("size=\"8,11\"\n");
-  }   
+  }
   printf DOT ("node [width=0.375,height=0.25];\n");
 
   # Print legend
   printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," .
-	      "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n",
-	      $prog,
-	      sprintf("Total %s: %s", Units(), Unparse($overall_total)),
-	      sprintf("Focusing on: %s", Unparse($local_total)),
-	      sprintf("Dropped nodes with <= %s abs(%s)",
-		      Unparse($nodelimit), Units()),
-	      sprintf("Dropped edges with <= %s %s",
-		      Unparse($edgelimit), Units())
-	      );
+              "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n",
+              $prog,
+              sprintf("Total %s: %s", Units(), Unparse($overall_total)),
+              sprintf("Focusing on: %s", Unparse($local_total)),
+              sprintf("Dropped nodes with <= %s abs(%s)",
+                      Unparse($nodelimit), Units()),
+              sprintf("Dropped edges with <= %s %s",
+                      Unparse($edgelimit), Units())
+              );
 
   # Print nodes
   my %node = ();
@@ -739,13 +1007,13 @@ sub PrintDot {
     for (my $i = 1; $i <= $#addrs; $i++) {
       my $src = OutputKey($symbols, $addrs[$i]);
       my $dst = OutputKey($symbols, $addrs[$i-1]);
-      #next if ($src eq $dst);	# Avoid self-edges?
+      #next if ($src eq $dst);  # Avoid self-edges?
       if (exists($node{$src}) && exists($node{$dst})) {
-	my $e = "$src\001$dst";
-	if (!exists($edge{$e})) {
-	  $edge{$e} = 0;
-	}
-	$edge{$e} += $n;
+        my $e = "$src\001$dst";
+        if (!exists($edge{$e})) {
+          $edge{$e} = 0;
+        }
+        $edge{$e} += $n;
       }
     }
   }
@@ -757,25 +1025,26 @@ sub PrintDot {
 
     if (abs($n) > $edgelimit) {
       # Compute line width based on edge count
-      my $fraction = $local_total ? (3 * ($n / $local_total)) : 0;
+      my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0);
       if ($fraction > 1) { $fraction = 1; }
       my $w = $fraction * 2;
       #if ($w < 1) { $w = 1; }
 
       # Use a slightly squashed function of the edge count as the weight
       printf DOT ("N%s -> N%s [label=%s, weight=%d, " .
-		  "style=\"setlinewidth(%f)\"];\n", 
-		  $node{$x[0]},
-		  $node{$x[1]},
-		  Unparse($n),
-		  int($n ** 0.7),
-		  $w);
+                  "style=\"setlinewidth(%f)\"];\n",
+                  $node{$x[0]},
+                  $node{$x[1]},
+                  Unparse($n),
+                  int(abs($n) ** 0.7),
+                  $w);
     }
   }
 
   print DOT ("}\n");
 
   close(DOT);
+  return 1;
 }
 
 # Generate the key under which a given address should be counted
@@ -919,14 +1188,14 @@ sub ReduceProfile {
     my @addrs = split(/\n/, $k);
     my @path = ();
     my %seen = ();
-    $seen{''} = 1;	# So that empty keys are skipped
+    $seen{''} = 1;      # So that empty keys are skipped
     foreach my $a (@addrs) {
       # To avoid double-counting due to recursion, skip a stack-trace
       # entry if it has already been seen
       my $key = OutputKey($symbols, $a);
       if (!$seen{$key}) {
-	$seen{$key} = 1;
-	push(@path, $key);
+        $seen{$key} = 1;
+        push(@path, $key);
       }
     }
     my $reduced_path = join("\n", @path);
@@ -946,12 +1215,12 @@ sub FocusProfile {
     my @addrs = split(/\n/, $k);
     foreach my $a (@addrs) {
       # Reply if it matches either the address/shortname/fileline
-      if (($a =~ m/$focus/o) ||
-	  (exists($symbols->{$a}) &&
-	   (($symbols->{$a}->[0] =~ m/$focus/o) ||
-	    ($symbols->{$a}->[1] =~ m/$focus/o)))) {
-	AddEntry($result, $k, $count);
-	last;
+      if (($a =~ m/$focus/) ||
+          (exists($symbols->{$a}) &&
+           (($symbols->{$a}->[0] =~ m/$focus/) ||
+            ($symbols->{$a}->[1] =~ m/$focus/)))) {
+        AddEntry($result, $k, $count);
+        last;
       }
     }
   }
@@ -970,12 +1239,12 @@ sub IgnoreProfile {
     my $matched = 0;
     foreach my $a (@addrs) {
       # Reply if it matches either the address/shortname/fileline
-      if (($a =~ m/$ignore/o) ||
-	  (exists($symbols->{$a}) &&
-	   (($symbols->{$a}->[0] =~ m/$ignore/o) ||
-	    ($symbols->{$a}->[1] =~ m/$ignore/o)))) {
-	$matched = 1;
-	last;
+      if (($a =~ m/$ignore/) ||
+          (exists($symbols->{$a}) &&
+           (($symbols->{$a}->[0] =~ m/$ignore/) ||
+            ($symbols->{$a}->[1] =~ m/$ignore/)))) {
+        $matched = 1;
+        last;
       }
     }
     if (!$matched) {
@@ -1041,6 +1310,16 @@ sub AddEntry {
   $profile->{$k} += $n;
 }
 
+##### Code to profile a server dynamically #####
+
+sub FetchDynamicProfile {
+  # Stub: currently hands the given profile name straight back
+  my ($binary_name, $profile_name) = @_;
+
+  # TODO: Add support for fetching profiles dynamically from a server
+  return $profile_name;
+}
+
 ##### Parsing code #####
 
 # Parse profile generated by common/profiler.cc and return a reference
@@ -1054,19 +1333,30 @@ sub ReadProfile {
   my $prog = shift;
   my $fname = shift;
 
+  $main::heap_profile = 0;
+  $main::lock_profile = 0;
+
   # Look at first line to see if it is a heap or a CPU profile
   open(PROFILE, "<$fname") || error("$fname: $!\n");
-  binmode PROFILE;	# New perls do UTF-8 processing
+  binmode PROFILE;      # New perls do UTF-8 processing
   my $header = <PROFILE>;
   if ($header =~ m/^heap profile:/) {
     $main::heap_profile = 1;
-    return ReadHeapProfile($prog, $fname);
+    return ReadHeapProfile($prog, $fname, $header);
+  } elsif ($header =~ m/^--- *contentionz/ ) {
+    $main::lock_profile = 1;
+    return ReadSynchProfile($prog, $fname);
+  } elsif ($header =~ m/^--- *Stacks:/ ) {
+    print STDERR
+      "Old format contention profile: mistakenly reports " .
+      "condition variable signals as lock contentions.\n";
+    $main::lock_profile = 1;
+    return ReadSynchProfile($prog, $fname);
   } else {
     # Need to unread the line we just read
     close(PROFILE);
     open(PROFILE, "<$fname") || error("$fname: $!\n");
-    binmode PROFILE;	# New perls do UTF-8 processing
-    $main::heap_profile = 0;
+    binmode PROFILE;    # New perls do UTF-8 processing
     return ReadCPUProfile($prog, $fname);
   }
 }
@@ -1143,6 +1433,7 @@ sub ReadCPUProfile {
 sub ReadHeapProfile {
   my $prog = shift;
   my $fname = shift;
+  my $header = shift;
 
   my $index = 1;
   if ($main::opt_inuse_space) {
@@ -1155,6 +1446,15 @@ sub ReadHeapProfile {
     $index = 2;
   }
 
+  # Find the type of this profile
+  chomp($header);
+  my $type = "unknown";
+  if ($header =~ m/^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*(.*))?/) {
+    if (defined($6) && ($6 ne '')) {
+      $type = $6;
+    }
+  }
+
   my $profile = {};
   my $pcs = {};
   my $map = "";
@@ -1162,15 +1462,24 @@ sub ReadHeapProfile {
     if (/^MAPPED_LIBRARIES:/) {
       # Read the /proc/self/maps data
       while (<PROFILE>) {
-	$map .= $_;
+        $map .= $_;
       }
       last;
     }
 
     if (/^--- Memory map:/) {
       # Read /proc/self/maps data as formatted by DumpAddressMap()
+      my $buildvar = "";
       while (<PROFILE>) {
-	$map .= $_;
+	# Parse "build=<dir>" specification if supplied
+	if (m/^\s*build=(.*)\n/) {
+	  $buildvar = $1;
+	}
+
+	# Expand "$build" variable if available
+	$_ =~ s/\$build\b/$buildvar/g;
+
+        $map .= $_;
       }
       last;
     }
@@ -1181,7 +1490,9 @@ sub ReadHeapProfile {
     s/\s*$//;
     if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) {
       my $stack = $5;
-      my @counts = ($1, $2, $3, $4);
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+
+      my @counts = ($n1, $s1, $n2, $s2);
       my $n = $counts[$index];
       my $k = "";
       foreach my $e (split(/\s+/, $stack)) {
@@ -1203,6 +1514,46 @@ sub ReadHeapProfile {
   return $r;
 }
 
+# Parse a contention ("synch") profile from the PROFILE filehandle and
+# return a hash reference with the keys 'version', 'period', 'profile',
+# 'libs' and 'pcs'.  $fname and $header are accepted but not consulted.
+sub ReadSynchProfile {
+  my ($prog, $fname, $header) = @_;
+  my $map = '';                 # Accumulated memory-map text
+  my $profile = {};             # Stack trace (one pc per line) -> count
+  my $pcs = {};                 # Set of every pc seen, keyed by hex() value
+  my $sampling_period = 1;      # Used unless the profile specifies one
+
+  while (defined(my $line = <PROFILE>)) {
+    if ( $line =~ /^(slow release).*thread \d+  \@\s*(.*?)\s*$/ ||
+         $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
+      # Sample entry; a non-numeric first capture counts as one sample
+      my ($count, $stack) = ($1, $2);
+      $count = 1 unless ($count =~ /^\d+$/);
+
+      my @frames = split(/\s+/, $stack);
+      foreach my $pc (@frames) {
+        $pcs->{hex($pc)} = 1;
+      }
+      AddEntry($profile, join("\n", @frames), $count);
+    } elsif ( $line =~ /sampling period = (\d+)/ ) {
+      $sampling_period = $1;
+    } else {
+      # Anything else is treated as part of the memory map
+      $map .= $line;
+    }
+  }
+  close PROFILE;
+
+  return {
+    'version' => 0,
+    'period'  => $sampling_period,
+    'profile' => $profile,
+    'libs'    => ParseLibraries($prog, $map, $pcs),
+    'pcs'     => $pcs,
+  };
+}
+
 ##### Symbol extraction #####
 
 # Split /proc/pid/maps dump into a list of libraries
@@ -1227,7 +1578,7 @@ sub ParseLibraries {
       $lib = $4;
     } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) {
       # Cooked line from DumpAddressMap.  Example:
-      #	  40000000-40015000: /lib/ld-2.3.2.so
+      #   40000000-40015000: /lib/ld-2.3.2.so
       $start = hex($1);
       $finish = hex($2);
       $offset = 0;
@@ -1244,10 +1595,10 @@ sub ParseLibraries {
       #  10 .text         00104b2c  420156f0  420156f0  000156f0  2**4
       my @x = split;
       if (($#x >= 6) && ($x[1] eq '.text')) {
-	my $vma = hex($x[3]);
-	my $file_offset = hex($x[5]);
-	$offset += $vma - $file_offset;
-	last;
+        my $vma = hex($x[3]);
+        my $file_offset = hex($x[5]);
+        $offset += $vma - $file_offset;
+        last;
       }
     }
     close(OBJDUMP);
@@ -1285,8 +1636,8 @@ sub ExtractSymbols {
     my $contained = [];
     foreach my $pc (keys(%{$pcset})) {
       if (!$seen{$pc} && ($pc >= $start) && ($pc <= $finish)) {
-	$seen{$pc} = 1;
-	push(@{$contained}, $pc);
+        $seen{$pc} = 1;
+        push(@{$contained}, $pc);
       }
     }
     # Map to symbols
@@ -1307,7 +1658,10 @@ sub MapToSymbols {
   if ($#{$pclist} < 0) { return; }
 
   MapSymbolsWithNM($image, $offset, $pclist, $symbols);
-  if ($main::opt_lines || $main::opt_files || $main::opt_list) {
+  if ($main::opt_interactive ||
+      $main::opt_lines       ||
+      $main::opt_files       ||
+      $main::opt_list) {
     GetLineNumbers($image, $offset, $pclist, $symbols);
   }
 }
@@ -1396,7 +1750,7 @@ sub ShortFunctionName {
   my $function = shift;
   while ($function =~ s/\([^()]*\)//g) { }   # Remove argument types
   while ($function =~ s/<[^<>]*>//g)  { }    # Remove template arguments
-  $function =~ s/^.*\s+(\w+::)/$1/;	     # Remove leading type
+  $function =~ s/^.*\s+(\w+::)/$1/;          # Remove leading type
   return $function;
 }
 
@@ -1404,7 +1758,20 @@ sub ShortFunctionName {
 
 sub cleanup {
   unlink($main::tmpfile_sym);
-  unlink($main::tmpfile_ps);
+  for (my $i = 0; $i < $main::next_tmpfile; $i++) {
+    unlink(PsTempName($i));
+  }
+  # We leave any collected profiles in $HOME/pprof in case the user wants
+  # to look at them later.  We print a message informing them of this.
+  if (defined($main::collected_profile)) {
+    print STDERR "Dynamically gathered profile is in $main::collected_profile\n";
+    print STDERR "If you want to investigate this profile further, you can do:\n";
+    print STDERR "\n";
+    print STDERR "  pprof \\\n";
+    print STDERR "    $prog \\\n";
+    print STDERR "    $main::collected_profile\n";
+    print STDERR "\n";
+  }
 }
 
 sub sighandler {
@@ -1421,11 +1788,11 @@ sub error {
 
 # Return a list of all routines that match $regexp.
 # For each routine, the following list is returned:
-#	$result->[i]->[0]	Routine name
-#	$result->[i]->[1]	Start address
-#	$result->[i]->[2]	Finish address
-#	$result->[i]->[3]	Image file name (program or shared library)
-#	$result->[i]->[4]	Offset for image in address space
+#       $result->[i]->[0]       Routine name
+#       $result->[i]->[1]       Start address
+#       $result->[i]->[2]       Finish address
+#       $result->[i]->[3]       Image file name (program or shared library)
+#       $result->[i]->[4]       Offset for image in address space
 sub GetMatchingRoutines {
 }
 
@@ -1446,7 +1813,7 @@ sub GetProcedureBoundaries {
       my $start_val = $1;
       my $this_routine = $2;
       if (defined($routine) && $routine =~ m/$regexp/) {
-	$symbol_table->{$routine} = [hex($last_start), hex($start_val)];
+        $symbol_table->{$routine} = [hex($last_start), hex($start_val)];
       }
       $last_start = $start_val;
       $routine = $this_routine;
diff --git a/src/profiler.cc b/src/profiler.cc
index 2597917..7197d42 100644
--- a/src/profiler.cc
+++ b/src/profiler.cc
@@ -31,6 +31,11 @@
 // Author: Sanjay Ghemawat
 //
 // Profile current program by sampling stack-trace every so often
+//
+// TODO: Detect whether or not setitimer() applies to all threads in
+// the process.  If so, instead of starting and stopping by changing
+// the signal handler, start and stop by calling setitimer() and
+// do nothing in the per-thread registration code.
 
 #include "config.h"
 #include <assert.h>
@@ -137,9 +142,6 @@ class ProfileData {
   // Is profiling turned on at all
   inline bool enabled() { return out_ >= 0; }
     
-  // Should we automatically profile all threads
-  inline bool profile_all() { return (out_ >= 0) && profile_all_; }
-
   // What is the frequency of interrupts (ticks per second)
   inline int frequency() { return frequency_; }
 
@@ -152,6 +154,8 @@ class ProfileData {
   bool Start(const char* fname);
   // Stop profiling and flush the data
   void Stop();
+
+  void GetCurrentState(ProfilerState* state);
   
  private:
   static const int kMaxStackDepth = 64;         // Max stack depth profiled
@@ -177,20 +181,24 @@ class ProfileData {
   };
 
 #ifdef HAVE_PTHREAD
-  pthread_mutex_t lock_;        // Cannot use "Mutex" in signal handlers
-  pthread_mutex_t flush_lock_;  // Acquired during explicit flushes
+  // Invariant: table_lock_ is only grabbed by handler, or by other code
+  // when the signal is being ignored (via SIG_IGN).
+  //
+  // Locking order is "state_lock_" first, and then "table_lock_"
+  pthread_mutex_t state_lock_;  // Protects filename, etc.(not used in handler)
+  pthread_mutex_t table_lock_;  // Cannot use "Mutex" in signal handlers
 #endif
   Bucket*       hash_;          // hash table
   
   Slot*         evict_;         // evicted entries
   int           num_evicted_;   // how many evicted entries?
   int           out_;           // fd for output file
-  bool          profile_all_;   // profile all threads automatically?
   int           count_;         // How many interrupts recorded
   int           evictions_;     // How many evictions
   size_t        total_bytes_;   // How much output
   char*         fname_;         // Profile file name
   int           frequency_;     // Interrupts per second
+  time_t        start_time_;    // Start time, or 0
 
   // Add "pc -> count" to eviction buffer
   void Evict(const Entry& entry);
@@ -226,19 +234,15 @@ ProfileData::ProfileData() :
   evict_(0),
   num_evicted_(0),
   out_(-1),
-  profile_all_(false),
   count_(0),
   evictions_(0),
   total_bytes_(0),
   fname_(0),
-  frequency_(0) {
+  frequency_(0),
+  start_time_(0) {
 
-  PCALL(pthread_mutex_init(&lock_, NULL));
-  PCALL(pthread_mutex_init(&flush_lock_, NULL));
-
-  if (getenv("PROFILESELECTED") == NULL) {
-    profile_all_ = true;
-  }
+  PCALL(pthread_mutex_init(&state_lock_, NULL));
+  PCALL(pthread_mutex_init(&table_lock_, NULL));
 
   // Get frequency of interrupts (if specified)
   char junk;
@@ -251,7 +255,12 @@ ProfileData::ProfileData() :
     frequency_ = kDefaultFrequency;
   }
 
-  // Should profiling be enabled?
+  // Ignore signals until we decide to turn profiling on
+  SetHandler(SIG_IGN);
+
+  ProfilerRegisterThread();
+
+  // Should profiling be enabled automatically at start?
   char* cpuprofile = getenv("CPUPROFILE");
   if (!cpuprofile || cpuprofile[0] == '\0') {
     return;
@@ -294,10 +303,10 @@ ProfileData::ProfileData() :
 }
 
 bool ProfileData::Start(const char* fname) {
-  LOCK(&lock_);
+  LOCK(&state_lock_);
   if (enabled()) {
     // profiling is already enabled
-    UNLOCK(&lock_);
+    UNLOCK(&state_lock_);
     return false;
   }
 
@@ -305,19 +314,23 @@ bool ProfileData::Start(const char* fname) {
   int fd = open(fname, O_CREAT | O_WRONLY | O_TRUNC, 0666);
   if (fd < 0) {
     // Can't open outfile for write
-    UNLOCK(&lock_);
+    UNLOCK(&state_lock_);
     return false;
   }
+
+  start_time_ = time(NULL);
+  fname_ = strdup(fname);
+
+  LOCK(&table_lock_);
   
   // Reset counters 
   num_evicted_ = 0;
   count_       = 0;
   evictions_   = 0;
   total_bytes_ = 0;
-  // But leave profile_all_ and frequency_ alone (i.e., ProfilerStart()
-  // doesn't affect their values originally set in the constructor)
+  // But leave frequency_ alone (i.e., ProfilerStart() doesn't affect
+  // the value originally set in the constructor)
 
-  fname_ = strdup(fname);
   out_  = fd;
 
   hash_ = new Bucket[kBuckets];
@@ -331,13 +344,12 @@ bool ProfileData::Start(const char* fname) {
   evict_[num_evicted_++] = 1000000 / frequency_;  // Period (microseconds)
   evict_[num_evicted_++] = 0;                     // Padding
 
+  UNLOCK(&table_lock_);
+
   // Setup handler for SIGPROF interrupts
   SetHandler((void (*)(int)) prof_handler);
 
-  // Start profiling on this thread if automatic profiling is on
-  ProfilerRegisterThread();
-
-  UNLOCK(&lock_);
+  UNLOCK(&state_lock_);
   return true;
 }
 
@@ -348,15 +360,18 @@ ProfileData::~ProfileData() {
 
 // Stop profiling and write out any collected profile data
 void ProfileData::Stop() {
+  LOCK(&state_lock_);
+
   // Prevent handler from running anymore
   SetHandler(SIG_IGN);
 
   // This lock prevents interference with signal handlers in other threads
-  LOCK(&lock_);
+  LOCK(&table_lock_);
 
   if (out_ < 0) {
     // Profiling is not enabled
-    UNLOCK(&lock_);
+    UNLOCK(&table_lock_);
+    UNLOCK(&state_lock_);
     return;
   }
 
@@ -401,15 +416,35 @@ void ProfileData::Stop() {
   evict_ = 0;
   free(fname_);
   fname_ = 0;
+  start_time_ = 0;
 
   out_ = -1;
-  UNLOCK(&lock_);
+  UNLOCK(&table_lock_);
+  UNLOCK(&state_lock_);
+}
+
+// Snapshot the profiler's status into *state (zeroed/empty when disabled).
+void ProfileData::GetCurrentState(ProfilerState* state) {
+  LOCK(&state_lock_);
+  const bool active = enabled();
+  state->enabled = active;
+  state->start_time = active ? start_time_ : 0;
+  state->samples_gathered = active ? count_ : 0;
+  if (active) {
+    // Copy as much of the file name as fits, always NUL-terminated.
+    const int capacity = sizeof(state->profile_name);
+    strncpy(state->profile_name, fname_, capacity);
+    state->profile_name[capacity - 1] = '\0';
+  } else {
+    state->profile_name[0] = '\0';
+  }
+  UNLOCK(&state_lock_);
+}
 
 void ProfileData::SetHandler(void (*handler)(int)) {
   struct sigaction sa;
   sa.sa_handler = handler;
-  sa.sa_flags   = 0;
+  sa.sa_flags   = SA_RESTART;
   sigemptyset(&sa.sa_mask);
   if (sigaction(SIGPROF, &sa, NULL) != 0) {
     perror("sigaction(SIGPROF)");
@@ -423,9 +458,9 @@ void ProfileData::FlushTable() {
     return;
   }
 
-  LOCK(&flush_lock_); {
+  LOCK(&state_lock_); {
     SetHandler(SIG_IGN);       // Disable timer interrupts while we're flushing
-    LOCK(&lock_); {
+    LOCK(&table_lock_); {
       // Move data from hash table to eviction buffer
       for (int b = 0; b < kBuckets; b++) {
         Bucket* bucket = &hash_[b];
@@ -440,9 +475,9 @@ void ProfileData::FlushTable() {
 
       // Write out all pending data
       FlushEvicted();
-    } UNLOCK(&lock_);
+    } UNLOCK(&table_lock_);
     SetHandler((void (*)(int)) prof_handler);
-  } UNLOCK(&flush_lock_);
+  } UNLOCK(&state_lock_);
 }
 
 // Record the specified "pc" in the profile data
@@ -456,12 +491,12 @@ void ProfileData::Add(unsigned long pc) {
   // Make hash-value
   Slot h = 0;
   for (int i = 0; i < depth; i++) {
-    Slot pc = reinterpret_cast<Slot>(stack[i]);
+    Slot slot = reinterpret_cast<Slot>(stack[i]);
     h = (h << 8) | (h >> (8*(sizeof(h)-1)));
-    h += (pc * 31) + (pc * 7) + (pc * 3);
+    h += (slot * 31) + (slot * 7) + (slot * 3);
   }
 
-  LOCK(&lock_);
+  LOCK(&table_lock_);
   count_++;
 
   // See if table already has an entry for this stack trace
@@ -505,7 +540,7 @@ void ProfileData::Add(unsigned long pc) {
       e->stack[i] = reinterpret_cast<Slot>(stack[i]);
     }
   }
-  UNLOCK(&lock_);
+  UNLOCK(&table_lock_);
 }
 
 // Write all evicted data to the profile file
@@ -538,49 +573,29 @@ void ProfileData::prof_handler(int sig, SigStructure sig_structure) {
   errno = saved_errno;
 }
 
-// Start interval timer for the current thread
-void ProfilerEnable() {
-  // Generate periodic interrupts
-  if (pdata.enabled()) {
-    // TODO: Randomize the initial interrupt value?
-    // TODO: Randmize the inter-interrupt period on every interrupt?
-    struct itimerval timer;
-    timer.it_interval.tv_sec = 0;
-    timer.it_interval.tv_usec = 1000000 / pdata.frequency();
-    timer.it_value = timer.it_interval;
-    setitimer(ITIMER_PROF, &timer, 0);
-  }
-}
-
-static void ProfilerTurnOffIntervalTimer() {
+// Start interval timer for the current thread.  We do this for
+// every known thread.  If profiling is off, the generated signals
+// are ignored, otherwise they are captured by prof_handler().
+void ProfilerRegisterThread() {
+  // TODO: Randomize the initial interrupt value?
+  // TODO: Randomize the inter-interrupt period on every interrupt?
   struct itimerval timer;
   timer.it_interval.tv_sec = 0;
-  timer.it_interval.tv_usec = 0;
+  timer.it_interval.tv_usec = 1000000 / pdata.frequency();
   timer.it_value = timer.it_interval;
   setitimer(ITIMER_PROF, &timer, 0);
 }
 
-// Stop interval timer for the current thread
-void ProfilerDisable() {
-  if (pdata.enabled()) {
-    ProfilerTurnOffIntervalTimer();
-  }
-}
+// DEPRECATED routines
+void ProfilerEnable() { }
+void ProfilerDisable() { }
 
 void ProfilerFlush() {
-  if (pdata.enabled()) {
-    pdata.FlushTable();
-  }
-}
-
-void ProfilerRegisterThread() {
-  if (pdata.profile_all()) {
-    ProfilerEnable();
-  }
+  pdata.FlushTable();
 }
 
 bool ProfilingIsEnabledForAllThreads() { 
-  return pdata.profile_all();
+  return pdata.enabled();
 }
 
 bool ProfilerStart(const char* fname) {
@@ -591,24 +606,10 @@ void ProfilerStop() {
   pdata.Stop();
 }
 
-
-ProfilerThreadState::ProfilerThreadState() {
-  was_enabled_ = pdata.profile_all();
+void ProfilerGetCurrentState(ProfilerState* state) {
+  pdata.GetCurrentState(state);
 }
 
-void ProfilerThreadState::ThreadCheck() {
-  bool is_enabled = pdata.profile_all();
-  if (was_enabled_ != is_enabled) {
-    if (is_enabled) {
-      LOG("Enabling profiling in thread");
-      ProfilerRegisterThread();
-    } else {
-      LOG("Profiling disabled in thread");
-      ProfilerTurnOffIntervalTimer();
-    }
-    was_enabled_ = is_enabled;
-  }
-}
 
 REGISTER_MODULE_INITIALIZER(profiler, {
   if (!FLAGS_cpu_profile.empty()) {
diff --git a/src/stacktrace.cc b/src/stacktrace.cc
index dcc8bc3..708d6ce 100644
--- a/src/stacktrace.cc
+++ b/src/stacktrace.cc
@@ -46,6 +46,26 @@
 
 #include <stdint.h>   // for uintptr_t
 
+// Given a pointer to a stack frame, return the calling stackframe
+// (the pointer stored at *old_sp), or NULL if that pointer fails the
+// sanity checks below and so is unlikely to be a real frame pointer.
+static void **NextStackFrame(void **old_sp) {
+  void **new_sp = (void **) *old_sp;
+
+  // Check that the transition from frame pointer old_sp to frame
+  // pointer new_sp isn't clearly bogus
+  if (new_sp <= old_sp) return NULL;  // must be at a strictly higher address
+  if ((uintptr_t)new_sp & (sizeof(void *) - 1)) return NULL;  // misaligned
+#ifdef __i386__
+  // 32-bit x86 builds only.  On 64-bit machines, the stack pointer can
+  // be very close to 0xffffffff, so we explicitly check for a pointer
+  // into the last two pages in the address space
+  if ((uintptr_t)new_sp >= 0xffffe000) return NULL;
+#endif
+  if ((uintptr_t)new_sp - (uintptr_t)old_sp > 100000) return NULL;  // too far
+  return new_sp;
+}
+
 // Note: the code for GetStackExtent below is pretty similar to this one;
 //       change both if chaning one.
 int GetStackTrace(void** result, int max_depth, int skip_count) {
@@ -68,18 +88,18 @@ int GetStackTrace(void** result, int max_depth, int skip_count) {
   int n = 0;
   skip_count++;         // Do not include the "GetStackTrace" frame
   while (sp && n < max_depth) {
+    if (*(sp+1) == (void *)0) {
+      // In 64-bit code, we often see a frame that
+      // points to itself and has a return address of 0.
+      break;
+    }
     if (skip_count > 0) {
       skip_count--;
     } else {
       result[n++] = *(sp+1);
     }
-    void** new_sp = (void**) *sp;
-
-    // A little bit of sanity checking to avoid crashes
-    if (new_sp < sp ||
-        (uintptr_t)new_sp - (uintptr_t)sp > 100000) {
-      break;
-    }
+    void** new_sp = NextStackFrame(sp);
+    if (!new_sp) break;
     sp = new_sp;
   }
   return n;
@@ -112,16 +132,14 @@ bool GetStackExtent(void* sp,  void** stack_top, void** stack_bottom) {
   }
 
   while (cur_sp) {
-    void** new_sp = (void**)*cur_sp;
-    // A little bit of sanity checking to avoid crashes
-    if (new_sp < cur_sp ||
-        (uintptr_t)new_sp - (uintptr_t)cur_sp > 100000) {
+    void** new_sp = NextStackFrame(cur_sp);
+    if (!new_sp) {
       *stack_bottom = (void*)cur_sp;
       return true;
     }
     cur_sp = new_sp;
     if (*stack_top == NULL)  *stack_top = (void*)cur_sp;
-      // get out of the stack frame for this call
+    // get out of the stack frame for this call
   }
   return false;
 }
diff --git a/src/system-alloc.cc b/src/system-alloc.cc
index 40f0046..dee2490 100644
--- a/src/system-alloc.cc
+++ b/src/system-alloc.cc
@@ -83,6 +83,10 @@ DEFINE_int32(malloc_devmem_limit, 0,
 #ifdef HAVE_SBRK
 
 static void* TrySbrk(size_t size, size_t alignment) {
+  // sbrk will release memory if passed a negative number, so we do
+  // a strict check here
+  if (static_cast<ptrdiff_t>(size + alignment) < 0) return NULL;
+
   size = ((size + alignment - 1) / alignment) * alignment;
   void* result = sbrk(size);
   if (result == reinterpret_cast<void*>(-1)) {
@@ -131,6 +135,11 @@ static void* TryMmap(size_t size, size_t alignment) {
   if (alignment > pagesize) {
     extra = alignment - pagesize;
   }
+
+  // Note: size + extra does not overflow since:
+  //            size + alignment < (1<<NBITS).
+  // and        extra <= alignment
+  // therefore  size + extra < (1<<NBITS)
   void* result = mmap(NULL, size + extra,
                       PROT_READ|PROT_WRITE,
                       MAP_PRIVATE|MAP_ANONYMOUS,
@@ -200,10 +209,16 @@ static void* TryDevMem(size_t size, size_t alignment) {
   }
   
   // check to see if we have any memory left
-  if (physmem_limit != 0 && physmem_base + size + extra > physmem_limit) {
+  if (physmem_limit != 0 &&
+      ((size + extra) > (physmem_limit - physmem_base))) {
     devmem_failure = true;
     return NULL;
   }
+
+  // Note: size + extra does not overflow since:
+  //            size + alignment < (1<<NBITS).
+  // and        extra <= alignment
+  // therefore  size + extra < (1<<NBITS)
   void *result = mmap(0, size + extra, PROT_WRITE|PROT_READ,
                       MAP_SHARED, physmem_fd, physmem_base);
   if (result == reinterpret_cast<void*>(MAP_FAILED)) {
@@ -233,6 +248,9 @@ static void* TryDevMem(size_t size, size_t alignment) {
 }
 
 void* TCMalloc_SystemAlloc(size_t size, size_t alignment) {
+  // Discard requests that overflow
+  if (size + alignment < size) return NULL;
+
   if (TCMallocDebug::level >= TCMallocDebug::kVerbose) {
     MESSAGE("TCMalloc_SystemAlloc(%" PRIuS ", %" PRIuS")\n", 
             size, alignment);
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 748f8f5..2eb9ef4 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -110,6 +110,10 @@ static const size_t kAlignShift = 3;
 static const size_t kAlignment  = 1 << kAlignShift;
 static const size_t kNumClasses = 170;
 
+// Allocate a big block of memory for the pagemap once the heap has
+// grown to at least 128MB of system memory.
+static const size_t kPageMapBigAllocationThreshold = 128 << 20;
+
 // Minimum number of pages to fetch from system at a time.  Must be
 // significantly bigger than kBlockSize to amortize system-call
 // overhead, and also to reduce external fragementation.  Also, we
@@ -167,12 +171,12 @@ static size_t class_to_size[kNumClasses];
 static size_t class_to_pages[kNumClasses];
 
 // Return floor(log2(n)) for n > 0.
-#if defined __i386__ && defined __GNUC__
+#if (defined __i386__ || defined __x86_64__) && defined __GNUC__
 static inline int LgFloor(size_t n) {
   // "ro" for the input spec means the input can come from either a
   // register ("r") or offsetable memory ("o").
-  int result;
-  __asm__("bsrl  %1, %0"
+  size_t result;
+  __asm__("bsr  %1, %0"
           : "=r" (result)               // Output spec
           : "ro" (n)                    // Input spec
           : "cc"                        // Clobbers condition-codes
@@ -307,7 +311,7 @@ template <class T>
 class PageHeapAllocator {
  private:
   // How much to allocate from system at a time
-  static const int kAllocIncrement = 32 << 10;
+  static const int kAllocIncrement = 128 << 10;
 
   // Aligned size of T
   static const size_t kAlignedSize
@@ -330,6 +334,8 @@ class PageHeapAllocator {
     free_area_ = NULL;
     free_avail_ = 0;
     free_list_ = NULL;
+    // Reserve some space at the beginning to avoid fragmentation.
+    Delete(New());
   }
 
   T* New() {
@@ -517,6 +523,12 @@ struct StackTrace {
 static PageHeapAllocator<StackTrace> stacktrace_allocator;
 static Span sampled_objects;
 
+// Linked list of stack traces recorded every time we allocate memory
+// from the system.  Useful for finding allocation sites that increase
+// the footprint of the system.  The pointer to the next list element
+// is stored in trace->stack[kMaxStackDepth-1].
+static StackTrace* growth_stacks = NULL;
+
 // -------------------------------------------------------------------------
 // Map from page-id to per-page data
 // -------------------------------------------------------------------------
@@ -548,6 +560,8 @@ class TCMalloc_PageHeap {
   TCMalloc_PageHeap();
 
   // Allocate a run of "n" pages.  Returns zero if out of memory.
+  // Caller should not pass "n == 0" -- instead, n should have
+  // been rounded up already.
   Span* New(Length n);
 
   // Delete the span "[p, p+n-1]".
@@ -635,10 +649,12 @@ TCMalloc_PageHeap::TCMalloc_PageHeap() : pagemap_(MetaDataAlloc),
 
 Span* TCMalloc_PageHeap::New(Length n) {
   ASSERT(Check());
-  if (n == 0) n = 1;
+
+  // n==0 occurs iff pages() overflowed when we added kPageSize-1 to n
+  if (n == 0) return NULL;
 
   // Find first size >= n that has a non-empty list
-  for (int s = n; s < kMaxPages; s++) {
+  for (Length s = n; s < kMaxPages; s++) {
     if (!DLL_IsEmpty(&free_[s])) {
       Span* result = free_[s].next;
       Carve(result, n);
@@ -815,6 +831,14 @@ void TCMalloc_PageHeap::Dump(TCMalloc_Printer* out) {
               (cumulative << kPageShift) / 1048576.0);
 }
 
+static void RecordGrowth(size_t growth) {  // record a growth of "growth" bytes
+  StackTrace* t = stacktrace_allocator.New();
+  t->depth = GetStackTrace(t->stack, kMaxStackDepth-1, 4);  // keep last slot free
+  t->size = growth;
+  t->stack[kMaxStackDepth-1] = reinterpret_cast<void*>(growth_stacks);  // next link
+  growth_stacks = t;  // the new trace becomes the head of the list
+}
+
 bool TCMalloc_PageHeap::GrowHeap(Length n) {
   ASSERT(kMaxPages >= kMinSystemAlloc);
   Length ask = (n>kMinSystemAlloc) ? n : static_cast<Length>(kMinSystemAlloc);
@@ -827,10 +851,22 @@ bool TCMalloc_PageHeap::GrowHeap(Length n) {
     }
     if (ptr == NULL) return false;
   }
+  RecordGrowth(ask << kPageShift);
+
+  uint64_t old_system_bytes = system_bytes_;
   system_bytes_ += (ask << kPageShift);
   const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
   ASSERT(p > 0);
 
+  // If we already have a lot of pages allocated, just preallocate a bunch of
+  // memory for the page map. This prevents fragmentation by pagemap metadata
+  // when a program keeps allocating and freeing large blocks.
+
+  if (old_system_bytes < kPageMapBigAllocationThreshold
+      && system_bytes_ >= kPageMapBigAllocationThreshold) {
+    pagemap_.PreallocateMoreMemory();
+  }
+
   // Make sure pagemap_ has entries for all of the new pages.
   // Plus ensure one before and one after so coalescing code
   // does not need bounds-checking.
@@ -928,7 +964,7 @@ class TCMalloc_ThreadCache {
 
   size_t        size_;                  // Combined size of data
   pthread_t     tid_;                   // Which thread owns it
-  bool          setspecific_;           // Called pthread_setspecific?
+  bool          in_setspecific_;        // In call to pthread_setspecific?
   FreeList      list_[kNumClasses];     // Array indexed by size-class
 
   // We sample allocations, biased by the size of the allocation
@@ -1193,7 +1229,7 @@ void TCMalloc_ThreadCache::Init(pthread_t tid) {
   next_ = NULL;
   prev_ = NULL;
   tid_  = tid;
-  setspecific_ = false;
+  in_setspecific_ = false;
   for (size_t cl = 0; cl < kNumClasses; ++cl) {
     list_[cl].Init();
   }
@@ -1409,12 +1445,13 @@ void* TCMalloc_ThreadCache::CreateCacheIfNecessary() {
   }
 
   // We call pthread_setspecific() outside the lock because it may
-  // call malloc() recursively.  The recursive call will never get
-  // here again because it will find the already allocated heap in the
-  // linked list of heaps.
-  if (!heap->setspecific_ && tsd_inited) {
-    heap->setspecific_ = true;
+  // call malloc() recursively.  We check for the recursive call using
+  // the "in_setspecific_" flag so that we can avoid calling
+  // pthread_setspecific() if we are already inside pthread_setspecific().
+  if (!heap->in_setspecific_ && tsd_inited) {
+    heap->in_setspecific_ = true;
     perftools_pthread_setspecific(heap_key, heap);
+    heap->in_setspecific_ = false;
   }
   return heap;
 }
@@ -1600,6 +1637,50 @@ static void** DumpStackTraces() {
   return result;
 }
 
+static void** DumpHeapGrowthStackTraces() {  // flatten growth_stacks to an array
+  // Count how much space we need (the lock holder's scope ends before "new")
+  int needed_slots = 0;
+  {
+    SpinLockHolder h(&pageheap_lock);
+    for (StackTrace* t = growth_stacks;
+         t != NULL;
+         t = reinterpret_cast<StackTrace*>(t->stack[kMaxStackDepth-1])) {
+      needed_slots += 3 + t->depth;  // 3 header slots plus one per frame
+    }
+    needed_slots += 100;            // Slop in case list grows
+    needed_slots += needed_slots/8; // An extra 12.5% slop
+  }
+
+  void** result = new void*[needed_slots];
+  if (result == NULL) {
+    MESSAGE("tcmalloc: could not allocate %d slots for stack traces\n",
+            needed_slots);
+    return NULL;
+  }
+
+  SpinLockHolder h(&pageheap_lock);
+  int used_slots = 0;
+  for (StackTrace* t = growth_stacks;
+       t != NULL;
+       t = reinterpret_cast<StackTrace*>(t->stack[kMaxStackDepth-1])) {
+    ASSERT(used_slots < needed_slots);  // Need to leave room for terminator
+    if (used_slots + 3 + t->depth >= needed_slots) {
+      // No more room; stop here and return the entries that fit
+      break;
+    }
+
+    result[used_slots+0] = reinterpret_cast<void*>(1);         // always 1
+    result[used_slots+1] = reinterpret_cast<void*>(t->size);   // bytes grown
+    result[used_slots+2] = reinterpret_cast<void*>(t->depth);  // frame count
+    for (int d = 0; d < t->depth; d++) {
+      result[used_slots+3+d] = t->stack[d];
+    }
+    used_slots += 3 + t->depth;
+  }
+  result[used_slots] = reinterpret_cast<void*>(0);  // terminator entry
+  return result;
+}
+
 // TCMalloc's support for extra malloc interfaces
 class TCMallocImplementation : public MallocExtension {
  public:
@@ -1619,6 +1700,10 @@ class TCMallocImplementation : public MallocExtension {
     return DumpStackTraces();
   }
 
+  virtual void** ReadHeapGrowthStackTraces() {
+    return DumpHeapGrowthStackTraces();
+  }
+
   virtual bool GetNumericProperty(const char* name, size_t* value) {
     ASSERT(name != NULL);
 
@@ -1681,15 +1766,6 @@ class TCMallocImplementation : public MallocExtension {
   }
 };
 
-// RedHat 9's pthread manager allocates an object directly by calling
-// a __libc_XXX() routine.  This memory block is not known to tcmalloc.
-// At cleanup time, the pthread manager calls free() on this
-// pointer, which then crashes.
-//
-// We hack around this problem by disabling all deallocations
-// after a global object destructor in this module has been called.
-static bool tcmalloc_is_destroyed = false;
-
 //-------------------------------------------------------------------
 // Helpers for the exported routines below
 //-------------------------------------------------------------------
@@ -1744,22 +1820,11 @@ static inline void* do_malloc(size_t size) {
 static inline void do_free(void* ptr) {
   if (TCMallocDebug::level >= TCMallocDebug::kVerbose) 
     MESSAGE("In tcmalloc do_free(%p)\n", ptr);
-  if (ptr == NULL  ||  tcmalloc_is_destroyed) return;
+  if (ptr == NULL) return;
   ASSERT(pageheap != NULL);  // Should not call free() before malloc()
   const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
   Span* span = pageheap->GetDescriptor(p);
 
-  if (span == NULL) {
-    // We've seen systems where a piece of memory allocated using the
-    // allocator built in to libc is deallocated using free() and
-    // therefore ends up inside tcmalloc which can't find the
-    // corresponding span.  We silently throw this object on the floor
-    // instead of crashing.
-    MESSAGE("tcmalloc: ignoring potential glibc-2.3.5 induced free "
-            "of an unknown object %p\n", ptr);
-    return;
-  }
-
   ASSERT(span != NULL);
   ASSERT(!span->free);
   const size_t cl = span->sizeclass;
@@ -1796,6 +1861,8 @@ static inline void do_free(void* ptr) {
 static void* do_memalign(size_t align, size_t size) {
   ASSERT((align & (align - 1)) == 0);
   ASSERT(align > 0);
+  if (size + align < size) return NULL;         // Overflow
+
   if (pageheap == NULL) TCMalloc_ThreadCache::InitModule();
 
   // Allocate at least one byte to avoid boundary conditions below
@@ -1920,11 +1987,15 @@ extern "C" void free(void* ptr) {
 }
 
 extern "C" void* calloc(size_t n, size_t elem_size) {
-  void* result = do_malloc(n * elem_size);
+  // Overflow check
+  const size_t size = n * elem_size;
+  if (elem_size != 0 && size / elem_size != n) return NULL;
+
+  void* result = do_malloc(size);
   if (result != NULL) {
-    memset(result, 0, n * elem_size);
+    memset(result, 0, size);
   }
-  MallocHook::InvokeNewHook(result, n * elem_size);
+  MallocHook::InvokeNewHook(result, size);
   return result;
 }
 
@@ -2118,3 +2189,17 @@ extern "C" {
   }
 #endif
 }
+
+// Override __libc_memalign in libc on linux boxes specially, by defining
+// __memalign_hook to point at MemalignOverride.  They have a bug in libc
+// that causes them to (very rarely) allocate with __libc_memalign() yet
+// deallocate with free() and the definitions above don't catch it.
+// This function is an exception to the rule of calling the MallocHook
+// method from the stack frame of the allocation function;
+// heap-checker handles this special case explicitly.
+static void *MemalignOverride(size_t align, size_t size, const void *caller) {
+  void* result = do_memalign(align, size);  // "caller" is not used
+  MallocHook::InvokeNewHook(result, size);
+  return result;
+}
+void *(*__memalign_hook)(size_t, size_t, const void *) = MemalignOverride;
diff --git a/src/tests/tcmalloc_large_unittest.cc b/src/tests/tcmalloc_large_unittest.cc
new file mode 100644
index 0000000..fd9f885
--- /dev/null
+++ b/src/tests/tcmalloc_large_unittest.cc
@@ -0,0 +1,137 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Michael Chastain
+//
+// This is a unit test for large allocations in malloc and friends.
+// "Large" means "so large that they overflow the address space".
+// For 32 bits, this means allocations near 2^32 bytes and 2^31 bytes.
+// For 64 bits, this means allocations near 2^64 bytes and 2^63 bytes.
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <set>
+
+#define CHECK(b)  do {  /* on failure: print the expression, exit(1) */  \
+  if (b) {} else { fprintf(stderr, "TEST FAILED: %s\n", #b); exit(1); } \
+} while (0)
+
+// Alloc a size that should always fail, via both malloc and realloc.
+
+void TryAllocExpectFail(size_t size) {
+  void* p1 = malloc(size);
+  CHECK(p1 == NULL);
+
+  void* p2 = malloc(1);
+  CHECK(p2 != NULL);
+
+  void* p3 = realloc(p2, size);
+  CHECK(p3 == NULL);
+
+  free(p2);  // realloc failed (checked above), so p2 is still ours to free
+}
+
+// Alloc a size that might work and might fail.
+// If it does work, touch some pages.
+
+void TryAllocMightFail(size_t size) {
+  unsigned char* p = static_cast<unsigned char*>(malloc(size));
+  if ( p != NULL ) {
+    unsigned char volatile* vp = p;  // prevent optimizations
+    static const size_t kPoints = 1024;
+
+    for ( size_t i = 0; i < kPoints; ++i ) {  // write kPoints spaced-out bytes
+      vp[i * (size / kPoints)] = static_cast<unsigned char>(i);
+    }
+
+    for ( size_t i = 0; i < kPoints; ++i ) {  // then read them all back
+      CHECK(vp[i * (size / kPoints)] == static_cast<unsigned char>(i));
+    }
+
+    vp[size-1] = 'M';  // also touch the very last byte of the block
+    CHECK(vp[size-1] == 'M');
+  }
+
+  free(p);  // p may be NULL here if the allocation failed
+}
+
+int main (int argc, char** argv) {
+  // Allocate some 0-byte objects.  They better be unique.  0 bytes is not
+  // large but it exercises some paths related to large-allocation code.
+  // NOTE(review): "new char" actually requests 1 byte, not 0 -- confirm.
+  {
+    static const int kZeroTimes = 1024;
+    printf("Test malloc(0) x %d\n", kZeroTimes);
+    std::set<char*> p_set;
+    for ( int i = 0; i < kZeroTimes; ++i ) {
+      char* p = new char;
+      CHECK(p != NULL);
+      CHECK(p_set.find(p) == p_set.end());
+      p_set.insert(p_set.end(), p);
+    }
+    // Just leak the memory.
+  }
+
+  // Grab some memory so that some later allocations are guaranteed to fail.
+  printf("Test small malloc\n");
+  void* p_small = malloc(4*1048576);
+  CHECK(p_small != NULL);
+
+  // Test sizes up near the maximum size_t.
+  // These allocations test the wrap-around code.
+  printf("Test malloc(0 - N)\n");
+  const size_t zero = 0;
+  static const size_t kMinusNTimes = 16384;
+  for ( size_t i = 1; i < kMinusNTimes; ++i ) {
+    TryAllocExpectFail(zero - i);  // unsigned wrap-around: max size_t - (i-1)
+  }
+
+  // Test sizes a bit smaller.
+  // The small malloc above guarantees that all these return NULL.
+  printf("Test malloc(0 - 1048576 - N)\n");
+  static const size_t kMinusMBMinusNTimes = 16384;
+  for ( size_t i = 0; i < kMinusMBMinusNTimes; ++i) {
+    TryAllocExpectFail(zero - 1048576 - i);
+  }
+
+  // Test sizes at half of size_t.
+  // These might or might not fail to allocate.
+  printf("Test malloc(max/2 +- N)\n");
+  static const size_t kHalfPlusMinusTimes = 64;
+  const size_t half = (zero - 2) / 2 + 1;  // 2^(N-1) for an N-bit size_t
+  for ( size_t i = 0; i < kHalfPlusMinusTimes; ++i) {
+    TryAllocMightFail(half - i);
+    TryAllocMightFail(half + i);
+  }
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/tcmalloc_unittest.cc b/src/tests/tcmalloc_unittest.cc
index 91da8b3..22a6c29 100644
--- a/src/tests/tcmalloc_unittest.cc
+++ b/src/tests/tcmalloc_unittest.cc
@@ -35,13 +35,12 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+#include <stdint.h>
 #include "google/malloc_extension.h"
 
 #define BUFSIZE (100 << 10)
 
 int main(int argc, char **argv) {
-
-  
   char *buf1 = (char *)malloc(BUFSIZE);
   memset(buf1, 0, BUFSIZE);
   printf("Allocated buf1 via malloc() at %p\n", buf1);