38 files changed, 2004 insertions, 502 deletions
@@ -1,3 +1,13 @@ +Tue Mar 18 14:30:44 2008 Google Inc. <opensource@google.com> + + * google-perftools: version 0.96 release + * major atomicops rewrite; fixed atomic ops code for linux/ppc (vchen) + * nix the stacktrace library; now build structure is simpler (csilvers) + * Speed up heap-checker, and reduce extraneous logging (maxim) + * Improve itimer code for NPTL case (cgd) + * Add source code annotations for use by valgrind, etc (kcc) + * PORTING: Fix high resolution timers for Mac OS X (adlr) + Tue Feb 19 12:01:31 2008 Google Inc. <opensource@google.com> * google-perftools: version 0.95.1 release (bugfix release) @@ -79,7 +79,7 @@ Perftools has been tested on the following systems: Windows XP, Visual Studio 2005 (VC++ 8) (x86) Windows XP, MinGW 5.1.3 (x86) -It works in its full generality on the Linux x86 and x86_64 systems +It works in its full generality on the Linux systems tested (though see 64-bit notes above). Portions of perftools work on the other systems. The basic memory-allocation library, tcmalloc_minimal, works on all systems. The cpu-profiler also works @@ -125,20 +125,14 @@ above, by linking in libtcmalloc_minimal. I have not tested other *BSD systems, but they are probably similar. -** Linux/PPC: - - I've tested on a PowerPC Linux box using qemu against Debian Etch - (4.0). Most of the tests pass. The heap-checker unittest does not - pass for reasons which are not yet clear but seem to be related to - the clone() system call. Heap checking may work properly for - single-threaded programs, though I haven't tested that. - ** Mac OS X: I've tested OS X 10.5 [Leopard], OS X 10.4 [Tiger] and OS X 10.3 - [Panther] on both intel (x86) and PowerPC systems. For Panther/ppc + [Panther] on both intel (x86) and PowerPC systems. For Panther systems, perftools does not work at all: it depends on a header - file, OSAtomic.h, which is new in 10.4. + file, OSAtomic.h, which is new in 10.4. (It's possible to get the + code working for Panther/i386 without too much work; if you're + interested in exploring this, drop an e-mail.) For the other seven systems, the binaries and libraries that successfully build are exactly the same as for FreeBSD. 
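The "major atomicops rewrite" noted above replaces the old pointer-sized AtomicWord primitives with explicitly sized Atomic32/Atomic64 operations in a new base::subtle namespace, named for their barrier semantics (NoBarrier_, Acquire_, Release_, Barrier_). A minimal sketch of how client code uses the reworked interface, assuming the src/base/atomicops.h dispatch header referenced in the build files below; the SpinLock wrapper itself is illustrative and not part of this patch:

#include "base/atomicops.h"

class SpinLock {
 public:
  SpinLock() : state_(0) {}
  void Lock() {
    // Acquire semantics: reads and writes in the critical section cannot
    // be reordered above the compare-and-swap. The call returns the value
    // it found at the address, so seeing 0 means we took the lock.
    while (base::subtle::Acquire_CompareAndSwap(&state_, 0, 1) != 0) {
      // Spin; a production lock would back off or yield here.
    }
  }
  void Unlock() {
    // Release semantics: reads and writes in the critical section cannot
    // be reordered below this store.
    base::subtle::Release_Store(&state_, 0);
  }
 private:
  Atomic32 state_;  // an explicitly 32-bit type in every port changed below
};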
diff --git a/Makefile.am b/Makefile.am index 6ac4748..e13086f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,9 +81,11 @@ dist_doc_DATA += doc/index.html doc/designstyle.css # This is a 'convenience library' -- it's not actually installed or anything LOGGING_INCLUDES = src/base/logging.h \ src/base/commandlineflags.h \ - src/base/basictypes.h + src/base/basictypes.h \ + src/base/dynamic_annotations.h noinst_LTLIBRARIES += liblogging.la liblogging_la_SOURCES = src/base/logging.cc \ + src/base/dynamic_annotations.cc \ $(LOGGING_INCLUDES) SYSINFO_INCLUDES = src/base/sysinfo.h \ @@ -153,7 +155,7 @@ low_level_alloc_unittest_SOURCES = src/base/low_level_alloc.cc \ src/malloc_hook.cc \ src/tests/low_level_alloc_unittest.cc \ $(LOW_LEVEL_ALLOC_UNITTEST_INCLUDES) -low_level_alloc_unittest_LDADD = $(LIBSPINLOCK) libstacktrace.la +low_level_alloc_unittest_LDADD = libstacktrace.la if !MINGW TESTS += atomicops_unittest @@ -181,11 +183,9 @@ STACKTRACE_INCLUDES = $(S_STACKTRACE_INCLUDES) $(SG_STACKTRACE_INCLUDES) googleinclude_HEADERS += $(SG_STACKTRACE_INCLUDES) ### Making the library -lib_LTLIBRARIES += libstacktrace.la +noinst_LTLIBRARIES += libstacktrace.la libstacktrace_la_SOURCES = src/stacktrace.cc \ $(STACKTRACE_INCLUDES) -# TODO(csilvers): only add these two things when stacktrace.cc would -# #include "stacktrace_libunwind-inl.h" libstacktrace_la_LIBADD = $(UNWIND_LIBS) $(LIBSPINLOCK) STACKTRACE_SYMBOLS = '(GetStackTrace)' libstacktrace_la_LDFLAGS = -export-symbols-regex $(STACKTRACE_SYMBOLS) @@ -257,19 +257,9 @@ libtcmalloc_minimal_la_SOURCES = src/internal_logging.cc \ libtcmalloc_minimal_la_CXXFLAGS = $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_minimal_la_LDFLAGS = $(PTHREAD_CFLAGS) libtcmalloc_minimal_la_LIBADD = $(PTHREAD_LIBS) \ - libstacktrace.la $(LIBSPINLOCK) - -# Whenever we link in tcmalloc_minimal, we also need to link in -# libstacktrace.so (we also need libspinlock and liblogging, but those -# are created as .a's, not .so's). libtool should do this for us, via -# the LIBADD above. But on some systems, -rpath doesn't work -# properly, and whatever libtool does fails. So we just manually link -# in -lstacktrace whenever linking in -ltcmalloc_minimal. -# (Note this isn't a problem for an *installed* tcmalloc, because then -# everything lives in /usr/lib or /usr/local/lib, which is on the -# linker search path, so the value of -rpath doesn't matter.) -# Remember tcmalloc should always be linked in last! -LIBTCMALLOC_MINIMAL = libstacktrace.la libtcmalloc_minimal.la + libstacktrace.la + +LIBTCMALLOC_MINIMAL = libtcmalloc_minimal.la ### Unittests @@ -495,12 +485,9 @@ libtcmalloc_la_SOURCES = src/internal_logging.cc \ libtcmalloc_la_CXXFLAGS = $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_la_LDFLAGS = $(PTHREAD_CFLAGS) libtcmalloc_la_LIBADD = $(PTHREAD_LIBS) \ - libstacktrace.la $(LIBSPINLOCK) + libstacktrace.la -# See discussion above (under LIBTCMALLOC_MINIMAL) for why we do this. -# Basically it's to work around systems where --rpath doesn't work right. -# Remember tcmalloc should always be linked in last! -LIBTCMALLOC = libstacktrace.la libtcmalloc.la +LIBTCMALLOC = libtcmalloc.la ### Unittests @@ -518,6 +505,7 @@ tcmalloc_unittest_LDADD = $(LIBTCMALLOC) liblogging.la $(PTHREAD_LIBS) # This makes sure it's safe to link in both tcmalloc and tcmalloc_minimal. # (One would never do this on purpose, but perhaps by accident...)
+# We also link in libprofiler to make sure that works too TESTS += tcmalloc_both_unittest tcmalloc_both_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ src/tests/testutil.h src/tests/testutil.cc \ @@ -525,7 +513,7 @@ tcmalloc_both_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ tcmalloc_both_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) tcmalloc_both_unittest_LDFLAGS = $(PTHREAD_CFLAGS) tcmalloc_both_unittest_LDADD = $(LIBTCMALLOC) $(LIBTCMALLOC_MINIMAL) \ - liblogging.la $(PTHREAD_LIBS) + libprofiler.la liblogging.la $(PTHREAD_LIBS) TESTS += tcmalloc_large_unittest tcmalloc_large_unittest_SOURCES = src/tests/tcmalloc_large_unittest.cc @@ -619,7 +607,7 @@ lib_LTLIBRARIES += libprofiler.la libprofiler_la_SOURCES = src/profiler.cc \ src/profiledata.cc \ $(CPU_PROFILER_INCLUDES) -libprofiler_la_LIBADD = $(LIBSPINLOCK) libstacktrace.la +libprofiler_la_LIBADD = libstacktrace.la # We have to include ProfileData for profiledata_unittest CPU_PROFILER_SYMBOLS = '(ProfilerStart|ProfilerStop|ProfilerEnable|ProfilerDisable|ProfilerFlush|ProfilerRegisterThread|ProfileData)' libprofiler_la_LDFLAGS = -export-symbols-regex $(CPU_PROFILER_SYMBOLS) diff --git a/Makefile.in b/Makefile.in index f47672a..888586b 100644 --- a/Makefile.in +++ b/Makefile.in @@ -78,6 +78,7 @@ EXTRA_PROGRAMS = ptmalloc_unittest1$(EXEEXT) \ # This makes sure it's safe to link in both tcmalloc and tcmalloc_minimal. # (One would never do this on purpose, but perhaps by accident...) +# We also link in libprofiler to make sure that works too ### Unittests @MINGW_FALSE@am__append_12 = tcmalloc_unittest tcmalloc_both_unittest \ @@ -147,14 +148,10 @@ libLTLIBRARIES_INSTALL = $(INSTALL) LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES) liblogging_la_LIBADD = am__objects_1 = -am_liblogging_la_OBJECTS = logging.lo $(am__objects_1) +am_liblogging_la_OBJECTS = logging.lo dynamic_annotations.lo \ + $(am__objects_1) liblogging_la_OBJECTS = $(am_liblogging_la_OBJECTS) -@MINGW_FALSE@am__DEPENDENCIES_1 = libspinlock.la libsysinfo.la \ -@MINGW_FALSE@ liblogging.la -@MINGW_TRUE@am__DEPENDENCIES_1 = libwindows.la libsysinfo.la \ -@MINGW_TRUE@ liblogging.la -@MINGW_FALSE@libprofiler_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ -@MINGW_FALSE@ libstacktrace.la +@MINGW_FALSE@libprofiler_la_DEPENDENCIES = libstacktrace.la am__libprofiler_la_SOURCES_DIST = src/profiler.cc src/profiledata.cc \ src/profiledata.h src/getpc.h src/base/basictypes.h \ src/base/commandlineflags.h src/base/googleinit.h \ @@ -163,7 +160,8 @@ am__libprofiler_la_SOURCES_DIST = src/profiler.cc src/profiledata.cc \ src/base/atomicops-internals-macosx.h \ src/base/atomicops-internals-linuxppc.h \ src/base/atomicops-internals-x86-msvc.h \ - src/base/atomicops-internals-x86.h src/google/profiler.h \ + src/base/atomicops-internals-x86.h \ + src/base/dynamic_annotations.h src/google/profiler.h \ src/google/stacktrace.h @MINGW_FALSE@am__objects_2 = $(am__objects_1) $(am__objects_1) @MINGW_FALSE@am__objects_3 = $(am__objects_2) $(am__objects_1) @@ -182,17 +180,21 @@ am__libspinlock_la_SOURCES_DIST = src/base/spinlock.cc \ @MINGW_FALSE@ atomicops-internals-x86.lo $(am__objects_1) libspinlock_la_OBJECTS = $(am_libspinlock_la_OBJECTS) @MINGW_FALSE@am_libspinlock_la_rpath = -am__DEPENDENCIES_2 = -libstacktrace_la_DEPENDENCIES = $(am__DEPENDENCIES_2) \ - $(am__DEPENDENCIES_1) +am__DEPENDENCIES_1 = +@MINGW_FALSE@am__DEPENDENCIES_2 = libspinlock.la libsysinfo.la \ +@MINGW_FALSE@ liblogging.la +@MINGW_TRUE@am__DEPENDENCIES_2 = libwindows.la libsysinfo.la \ +@MINGW_TRUE@ liblogging.la 
+libstacktrace_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ + $(am__DEPENDENCIES_2) am__objects_4 = $(am__objects_1) $(am__objects_1) am_libstacktrace_la_OBJECTS = stacktrace.lo $(am__objects_4) libstacktrace_la_OBJECTS = $(am_libstacktrace_la_OBJECTS) libsysinfo_la_LIBADD = am_libsysinfo_la_OBJECTS = sysinfo.lo $(am__objects_1) libsysinfo_la_OBJECTS = $(am_libsysinfo_la_OBJECTS) -@MINGW_FALSE@libtcmalloc_la_DEPENDENCIES = $(am__DEPENDENCIES_2) \ -@MINGW_FALSE@ libstacktrace.la $(am__DEPENDENCIES_1) +@MINGW_FALSE@libtcmalloc_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ +@MINGW_FALSE@ libstacktrace.la am__libtcmalloc_la_SOURCES_DIST = src/internal_logging.cc \ src/system-alloc.cc src/memfs_malloc.cc src/tcmalloc.cc \ src/malloc_hook.cc src/malloc_extension.cc \ @@ -212,9 +214,10 @@ am__libtcmalloc_la_SOURCES_DIST = src/internal_logging.cc \ src/base/atomicops-internals-linuxppc.h \ src/base/atomicops-internals-x86-msvc.h \ src/base/atomicops-internals-x86.h src/base/logging.h \ - src/google/malloc_hook.h src/google/malloc_extension.h \ - src/google/heap-profiler.h src/google/heap-checker.h \ - src/google/stacktrace.h src/heap-checker-bcad.cc + src/base/dynamic_annotations.h src/google/malloc_hook.h \ + src/google/malloc_extension.h src/google/heap-profiler.h \ + src/google/heap-checker.h src/google/stacktrace.h \ + src/heap-checker-bcad.cc @MINGW_FALSE@am__objects_5 = libtcmalloc_la-system-alloc.lo @MINGW_FALSE@am__objects_6 = libtcmalloc_la-maybe_threads.lo @MINGW_FALSE@am_libtcmalloc_la_OBJECTS = \ @@ -234,8 +237,8 @@ am__libtcmalloc_la_SOURCES_DIST = src/internal_logging.cc \ @MINGW_FALSE@ libtcmalloc_la-heap-checker-bcad.lo libtcmalloc_la_OBJECTS = $(am_libtcmalloc_la_OBJECTS) @MINGW_FALSE@am_libtcmalloc_la_rpath = -rpath $(libdir) -libtcmalloc_minimal_la_DEPENDENCIES = $(am__DEPENDENCIES_2) \ - libstacktrace.la $(am__DEPENDENCIES_1) +libtcmalloc_minimal_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \ + libstacktrace.la am__libtcmalloc_minimal_la_SOURCES_DIST = src/internal_logging.cc \ src/system-alloc.cc src/memfs_malloc.cc src/tcmalloc.cc \ src/malloc_hook.cc src/malloc_extension.cc \ @@ -310,17 +313,18 @@ am__atomicops_unittest_SOURCES_DIST = src/tests/atomicops_unittest.cc \ src/base/atomicops.h src/base/atomicops-internals-macosx.h \ src/base/atomicops-internals-x86-msvc.h \ src/base/atomicops-internals-x86.h src/base/logging.h \ - src/base/commandlineflags.h src/base/basictypes.h + src/base/commandlineflags.h src/base/basictypes.h \ + src/base/dynamic_annotations.h @MINGW_FALSE@am__objects_11 = $(am__objects_1) @MINGW_FALSE@am_atomicops_unittest_OBJECTS = \ @MINGW_FALSE@ atomicops_unittest.$(OBJEXT) $(am__objects_11) atomicops_unittest_OBJECTS = $(am_atomicops_unittest_OBJECTS) -@MINGW_FALSE@atomicops_unittest_DEPENDENCIES = $(am__DEPENDENCIES_1) +@MINGW_FALSE@atomicops_unittest_DEPENDENCIES = $(am__DEPENDENCIES_2) am_frag_unittest_OBJECTS = frag_unittest-frag_unittest.$(OBJEXT) frag_unittest_OBJECTS = $(am_frag_unittest_OBJECTS) -am__DEPENDENCIES_3 = libstacktrace.la libtcmalloc_minimal.la +am__DEPENDENCIES_3 = libtcmalloc_minimal.la frag_unittest_DEPENDENCIES = $(am__DEPENDENCIES_3) \ - $(am__DEPENDENCIES_2) + $(am__DEPENDENCIES_1) am__getpc_test_SOURCES_DIST = src/tests/getpc_test.cc src/getpc.h @MINGW_FALSE@am_getpc_test_OBJECTS = getpc_test.$(OBJEXT) getpc_test_OBJECTS = $(am_getpc_test_OBJECTS) @@ -335,13 +339,14 @@ am__heap_checker_unittest_SOURCES_DIST = \ src/tests/heap-checker_unittest.cc src/config_for_unittests.h \ src/memory_region_map.h src/base/commandlineflags.h \ 
src/base/googleinit.h src/google/heap-checker.h \ - src/base/logging.h src/base/basictypes.h + src/base/logging.h src/base/basictypes.h \ + src/base/dynamic_annotations.h @MINGW_FALSE@am_heap_checker_unittest_OBJECTS = heap_checker_unittest-heap-checker_unittest.$(OBJEXT) \ @MINGW_FALSE@ $(am__objects_11) heap_checker_unittest_OBJECTS = $(am_heap_checker_unittest_OBJECTS) -@MINGW_FALSE@am__DEPENDENCIES_4 = libstacktrace.la libtcmalloc.la +@MINGW_FALSE@am__DEPENDENCIES_4 = libtcmalloc.la @MINGW_FALSE@heap_checker_unittest_DEPENDENCIES = \ -@MINGW_FALSE@ $(am__DEPENDENCIES_2) liblogging.la \ +@MINGW_FALSE@ $(am__DEPENDENCIES_1) liblogging.la \ @MINGW_FALSE@ $(am__DEPENDENCIES_4) am__heap_checker_unittest_sh_SOURCES_DIST = \ src/tests/heap-checker_unittest.sh @@ -356,7 +361,7 @@ am__heap_profiler_unittest_SOURCES_DIST = \ @MINGW_FALSE@ $(am__objects_1) heap_profiler_unittest_OBJECTS = $(am_heap_profiler_unittest_OBJECTS) @MINGW_FALSE@heap_profiler_unittest_DEPENDENCIES = \ -@MINGW_FALSE@ $(am__DEPENDENCIES_4) $(am__DEPENDENCIES_2) +@MINGW_FALSE@ $(am__DEPENDENCIES_4) $(am__DEPENDENCIES_1) am__heap_profiler_unittest_sh_SOURCES_DIST = \ src/tests/heap-profiler_unittest.sh am_heap_profiler_unittest_sh_OBJECTS = @@ -372,20 +377,19 @@ am__low_level_alloc_unittest_SOURCES_DIST = \ src/base/atomicops-internals-linuxppc.h \ src/base/atomicops-internals-x86-msvc.h \ src/base/atomicops-internals-x86.h src/base/logging.h \ - src/base/commandlineflags.h + src/base/commandlineflags.h src/base/dynamic_annotations.h am_low_level_alloc_unittest_OBJECTS = low_level_alloc.$(OBJEXT) \ malloc_hook.$(OBJEXT) low_level_alloc_unittest.$(OBJEXT) \ $(am__objects_4) low_level_alloc_unittest_OBJECTS = \ $(am_low_level_alloc_unittest_OBJECTS) -low_level_alloc_unittest_DEPENDENCIES = $(am__DEPENDENCIES_1) \ - libstacktrace.la +low_level_alloc_unittest_DEPENDENCIES = libstacktrace.la am_markidle_unittest_OBJECTS = \ markidle_unittest-markidle_unittest.$(OBJEXT) \ markidle_unittest-testutil.$(OBJEXT) markidle_unittest_OBJECTS = $(am_markidle_unittest_OBJECTS) markidle_unittest_DEPENDENCIES = $(am__DEPENDENCIES_3) \ - $(am__DEPENDENCIES_2) + $(am__DEPENDENCIES_1) am_maybe_threads_unittest_sh_OBJECTS = maybe_threads_unittest_sh_OBJECTS = \ $(am_maybe_threads_unittest_sh_OBJECTS) @@ -394,7 +398,7 @@ am_memalign_unittest_OBJECTS = \ memalign_unittest-memalign_unittest.$(OBJEXT) memalign_unittest_OBJECTS = $(am_memalign_unittest_OBJECTS) memalign_unittest_DEPENDENCIES = $(am__DEPENDENCIES_3) \ - $(am__DEPENDENCIES_2) + $(am__DEPENDENCIES_1) am_packed_cache_test_OBJECTS = packed-cache_test.$(OBJEXT) packed_cache_test_OBJECTS = $(am_packed_cache_test_OBJECTS) packed_cache_test_LDADD = $(LDADD) @@ -437,7 +441,7 @@ am__profiler3_unittest_SOURCES_DIST = src/tests/profiler_unittest.cc \ @MINGW_FALSE@am_profiler3_unittest_OBJECTS = $(am__objects_14) profiler3_unittest_OBJECTS = $(am_profiler3_unittest_OBJECTS) @MINGW_FALSE@profiler3_unittest_DEPENDENCIES = $(am__DEPENDENCIES_5) \ -@MINGW_FALSE@ $(am__DEPENDENCIES_2) +@MINGW_FALSE@ $(am__DEPENDENCIES_1) am__profiler4_unittest_SOURCES_DIST = src/tests/profiler_unittest.cc \ src/tests/testutil.h src/tests/testutil.cc \ src/config_for_unittests.h src/google/profiler.h @@ -455,11 +459,11 @@ profiler_unittest_sh_LDADD = $(LDADD) am_ptmalloc_unittest1_OBJECTS = ptmalloc_unittest1-t-test1.$(OBJEXT) \ $(am__objects_1) ptmalloc_unittest1_OBJECTS = $(am_ptmalloc_unittest1_OBJECTS) -ptmalloc_unittest1_DEPENDENCIES = $(am__DEPENDENCIES_2) +ptmalloc_unittest1_DEPENDENCIES = $(am__DEPENDENCIES_1) 
am_ptmalloc_unittest2_OBJECTS = ptmalloc_unittest2-t-test2.$(OBJEXT) \ $(am__objects_1) ptmalloc_unittest2_OBJECTS = $(am_ptmalloc_unittest2_OBJECTS) -ptmalloc_unittest2_DEPENDENCIES = $(am__DEPENDENCIES_2) +ptmalloc_unittest2_DEPENDENCIES = $(am__DEPENDENCIES_1) am__objects_16 = $(am__objects_4) $(am__objects_1) am_stacktrace_unittest_OBJECTS = stacktrace_unittest.$(OBJEXT) \ $(am__objects_16) @@ -470,7 +474,7 @@ am__system_alloc_unittest_SOURCES_DIST = src/config_for_unittests.h \ @MINGW_FALSE@am_system_alloc_unittest_OBJECTS = system_alloc_unittest-system-alloc_unittest.$(OBJEXT) system_alloc_unittest_OBJECTS = $(am_system_alloc_unittest_OBJECTS) @MINGW_FALSE@system_alloc_unittest_DEPENDENCIES = \ -@MINGW_FALSE@ $(am__DEPENDENCIES_3) $(am__DEPENDENCIES_2) +@MINGW_FALSE@ $(am__DEPENDENCIES_3) $(am__DEPENDENCIES_1) am__tcmalloc_both_unittest_SOURCES_DIST = \ src/tests/tcmalloc_unittest.cc src/tests/testutil.h \ src/tests/testutil.cc src/config_for_unittests.h \ @@ -481,19 +485,20 @@ am__tcmalloc_both_unittest_SOURCES_DIST = \ tcmalloc_both_unittest_OBJECTS = $(am_tcmalloc_both_unittest_OBJECTS) @MINGW_FALSE@tcmalloc_both_unittest_DEPENDENCIES = \ @MINGW_FALSE@ $(am__DEPENDENCIES_4) $(am__DEPENDENCIES_3) \ -@MINGW_FALSE@ liblogging.la $(am__DEPENDENCIES_2) +@MINGW_FALSE@ libprofiler.la liblogging.la \ +@MINGW_FALSE@ $(am__DEPENDENCIES_1) am__tcmalloc_large_unittest_SOURCES_DIST = \ src/tests/tcmalloc_large_unittest.cc @MINGW_FALSE@am_tcmalloc_large_unittest_OBJECTS = tcmalloc_large_unittest-tcmalloc_large_unittest.$(OBJEXT) tcmalloc_large_unittest_OBJECTS = \ $(am_tcmalloc_large_unittest_OBJECTS) @MINGW_FALSE@tcmalloc_large_unittest_DEPENDENCIES = \ -@MINGW_FALSE@ $(am__DEPENDENCIES_4) $(am__DEPENDENCIES_2) +@MINGW_FALSE@ $(am__DEPENDENCIES_4) $(am__DEPENDENCIES_1) am_tcmalloc_minimal_large_unittest_OBJECTS = tcmalloc_minimal_large_unittest-tcmalloc_large_unittest.$(OBJEXT) tcmalloc_minimal_large_unittest_OBJECTS = \ $(am_tcmalloc_minimal_large_unittest_OBJECTS) tcmalloc_minimal_large_unittest_DEPENDENCIES = $(am__DEPENDENCIES_3) \ - $(am__DEPENDENCIES_2) + $(am__DEPENDENCIES_1) am__tcmalloc_minimal_unittest_SOURCES_DIST = \ src/tests/tcmalloc_unittest.cc src/tests/testutil.h \ src/tests/testutil.cc src/config_for_unittests.h \ @@ -504,7 +509,7 @@ am_tcmalloc_minimal_unittest_OBJECTS = \ tcmalloc_minimal_unittest_OBJECTS = \ $(am_tcmalloc_minimal_unittest_OBJECTS) tcmalloc_minimal_unittest_DEPENDENCIES = $(am__DEPENDENCIES_3) \ - liblogging.la $(am__DEPENDENCIES_2) + liblogging.la $(am__DEPENDENCIES_1) am__tcmalloc_unittest_SOURCES_DIST = src/tests/tcmalloc_unittest.cc \ src/tcmalloc.h src/tests/testutil.h src/tests/testutil.cc \ src/config_for_unittests.h src/google/malloc_extension.h @@ -514,14 +519,14 @@ am__tcmalloc_unittest_SOURCES_DIST = src/tests/tcmalloc_unittest.cc \ @MINGW_FALSE@ $(am__objects_1) tcmalloc_unittest_OBJECTS = $(am_tcmalloc_unittest_OBJECTS) @MINGW_FALSE@tcmalloc_unittest_DEPENDENCIES = $(am__DEPENDENCIES_4) \ -@MINGW_FALSE@ liblogging.la $(am__DEPENDENCIES_2) +@MINGW_FALSE@ liblogging.la $(am__DEPENDENCIES_1) am_thread_dealloc_unittest_OBJECTS = \ thread_dealloc_unittest-thread_dealloc_unittest.$(OBJEXT) \ thread_dealloc_unittest-testutil.$(OBJEXT) thread_dealloc_unittest_OBJECTS = \ $(am_thread_dealloc_unittest_OBJECTS) thread_dealloc_unittest_DEPENDENCIES = $(am__DEPENDENCIES_3) \ - $(am__DEPENDENCIES_2) + $(am__DEPENDENCIES_1) binSCRIPT_INSTALL = $(INSTALL_SCRIPT) SCRIPTS = $(bin_SCRIPTS) $(noinst_SCRIPTS) DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir)/src @@ -836,13 +841,12 @@ dist_doc_DATA = AUTHORS COPYING ChangeLog INSTALL NEWS README \ # We'll add to this later, on a library-by-library basis ### Making the library +lib_LTLIBRARIES = libtcmalloc_minimal.la $(am__append_11) +# This is for 'convenience libraries' -- basically just a container for sources ### Making the library -lib_LTLIBRARIES = libstacktrace.la libtcmalloc_minimal.la \ - $(am__append_11) -# This is for 'convenience libraries' -- basically just a container for sources noinst_LTLIBRARIES = liblogging.la libsysinfo.la $(am__append_4) \ - $(am__append_5) + $(am__append_5) libstacktrace.la WINDOWS_PROJECTS = google-perftools.sln \ vsprojects/low_level_alloc_unittest/low_level_alloc_unittest.vcproj \ vsprojects/libtcmalloc_minimal/libtcmalloc_minimal.vcproj \ @@ -896,9 +900,11 @@ noinst_SCRIPTS = $(maybe_threads_unittest_sh_SOURCES) $(am__append_13) # This is a 'convenience library' -- it's not actually installed or anything LOGGING_INCLUDES = src/base/logging.h \ src/base/commandlineflags.h \ - src/base/basictypes.h + src/base/basictypes.h \ + src/base/dynamic_annotations.h liblogging_la_SOURCES = src/base/logging.cc \ + src/base/dynamic_annotations.cc \ $(LOGGING_INCLUDES) SYSINFO_INCLUDES = src/base/sysinfo.h \ @@ -963,7 +969,7 @@ low_level_alloc_unittest_SOURCES = src/base/low_level_alloc.cc \ src/tests/low_level_alloc_unittest.cc \ $(LOW_LEVEL_ALLOC_UNITTEST_INCLUDES) -low_level_alloc_unittest_LDADD = $(LIBSPINLOCK) libstacktrace.la +low_level_alloc_unittest_LDADD = libstacktrace.la @MINGW_FALSE@ATOMICOPS_UNITTEST_INCLUDES = src/base/atomicops.h \ @MINGW_FALSE@ src/base/atomicops-internals-macosx.h \ @MINGW_FALSE@ src/base/atomicops-internals-x86-msvc.h \ @@ -989,8 +995,6 @@ STACKTRACE_INCLUDES = $(S_STACKTRACE_INCLUDES) $(SG_STACKTRACE_INCLUDES) libstacktrace_la_SOURCES = src/stacktrace.cc \ $(STACKTRACE_INCLUDES) -# TODO(csilvers): only add these two things when stacktrace.cc would -# #include "stacktrace_libunwind-inl.h" libstacktrace_la_LIBADD = $(UNWIND_LIBS) $(LIBSPINLOCK) STACKTRACE_SYMBOLS = '(GetStackTrace)' libstacktrace_la_LDFLAGS = -export-symbols-regex $(STACKTRACE_SYMBOLS) @@ -1042,20 +1046,9 @@ libtcmalloc_minimal_la_SOURCES = src/internal_logging.cc \ libtcmalloc_minimal_la_CXXFLAGS = $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) libtcmalloc_minimal_la_LDFLAGS = $(PTHREAD_CFLAGS) libtcmalloc_minimal_la_LIBADD = $(PTHREAD_LIBS) \ - libstacktrace.la $(LIBSPINLOCK) - - -# Whenever we link in tcmalloc_minimal, we also need to link in -# libstacktrace.so (we also need libspinlock and liblogging, but those -# are created as .a's, not .so's). libtool should do this for us, via -# the LIBADD above. But on some systems, -rpath doesn't work -# properly, and whatever libtool does fails. So we just manually link -# in -lstacktrace whenever linking in -ltcmalloc_minimal. -# (Note this isn't a problem for an *installed* tcmalloc, because then -# everything lives in /usr/lib or /usr/local/lib, which is on the -# linker search path, so the value of -rpath doesn't matter.) -# Remember tcmalloc should always be linked in last! 
-LIBTCMALLOC_MINIMAL = libstacktrace.la libtcmalloc_minimal.la + libstacktrace.la + +LIBTCMALLOC_MINIMAL = libtcmalloc_minimal.la tcmalloc_minimal_unittest_SOURCES = src/tests/tcmalloc_unittest.cc \ src/tests/testutil.h src/tests/testutil.cc \ $(TCMALLOC_UNITTEST_INCLUDES) @@ -1185,13 +1178,9 @@ ptmalloc_unittest2_LDADD = $(PTHREAD_LIBS) @MINGW_FALSE@libtcmalloc_la_CXXFLAGS = $(PTHREAD_CFLAGS) -DNDEBUG $(AM_CXXFLAGS) @MINGW_FALSE@libtcmalloc_la_LDFLAGS = $(PTHREAD_CFLAGS) @MINGW_FALSE@libtcmalloc_la_LIBADD = $(PTHREAD_LIBS) \ -@MINGW_FALSE@ libstacktrace.la $(LIBSPINLOCK) - +@MINGW_FALSE@ libstacktrace.la -# See discussion above (under LIBTCMALLOC_MINIMAL) for why we do this. -# Basically it's to work around systems where --rpath doesn't work right. -# Remember tcmalloc should always be linked in last! -@MINGW_FALSE@LIBTCMALLOC = libstacktrace.la libtcmalloc.la +@MINGW_FALSE@LIBTCMALLOC = libtcmalloc.la @MINGW_FALSE@TCMALLOC_UNITTEST_INCLUDES = src/config_for_unittests.h \ @MINGW_FALSE@ src/google/malloc_extension.h @@ -1210,7 +1199,7 @@ ptmalloc_unittest2_LDADD = $(PTHREAD_LIBS) @MINGW_FALSE@tcmalloc_both_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) @MINGW_FALSE@tcmalloc_both_unittest_LDFLAGS = $(PTHREAD_CFLAGS) @MINGW_FALSE@tcmalloc_both_unittest_LDADD = $(LIBTCMALLOC) $(LIBTCMALLOC_MINIMAL) \ -@MINGW_FALSE@ liblogging.la $(PTHREAD_LIBS) +@MINGW_FALSE@ libprofiler.la liblogging.la $(PTHREAD_LIBS) @MINGW_FALSE@tcmalloc_large_unittest_SOURCES = src/tests/tcmalloc_large_unittest.cc @MINGW_FALSE@tcmalloc_large_unittest_CXXFLAGS = $(PTHREAD_CFLAGS) $(AM_CXXFLAGS) @@ -1267,7 +1256,7 @@ ptmalloc_unittest2_LDADD = $(PTHREAD_LIBS) @MINGW_FALSE@ src/profiledata.cc \ @MINGW_FALSE@ $(CPU_PROFILER_INCLUDES) -@MINGW_FALSE@libprofiler_la_LIBADD = $(LIBSPINLOCK) libstacktrace.la +@MINGW_FALSE@libprofiler_la_LIBADD = libstacktrace.la # We have to include ProfileData for profiledata_unittest @MINGW_FALSE@CPU_PROFILER_SYMBOLS = '(ProfilerStart|ProfilerStop|ProfilerEnable|ProfilerDisable|ProfilerFlush|ProfilerRegisterThread|ProfileData)' @MINGW_FALSE@libprofiler_la_LDFLAGS = -export-symbols-regex $(CPU_PROFILER_SYMBOLS) @@ -1413,7 +1402,7 @@ libprofiler.la: $(libprofiler_la_OBJECTS) $(libprofiler_la_DEPENDENCIES) libspinlock.la: $(libspinlock_la_OBJECTS) $(libspinlock_la_DEPENDENCIES) $(CXXLINK) $(am_libspinlock_la_rpath) $(libspinlock_la_LDFLAGS) $(libspinlock_la_OBJECTS) $(libspinlock_la_LIBADD) $(LIBS) libstacktrace.la: $(libstacktrace_la_OBJECTS) $(libstacktrace_la_DEPENDENCIES) - $(CXXLINK) -rpath $(libdir) $(libstacktrace_la_LDFLAGS) $(libstacktrace_la_OBJECTS) $(libstacktrace_la_LIBADD) $(LIBS) + $(CXXLINK) $(libstacktrace_la_LDFLAGS) $(libstacktrace_la_OBJECTS) $(libstacktrace_la_LIBADD) $(LIBS) libsysinfo.la: $(libsysinfo_la_OBJECTS) $(libsysinfo_la_DEPENDENCIES) $(CXXLINK) $(libsysinfo_la_LDFLAGS) $(libsysinfo_la_OBJECTS) $(libsysinfo_la_LIBADD) $(LIBS) libtcmalloc.la: $(libtcmalloc_la_OBJECTS) $(libtcmalloc_la_DEPENDENCIES) @@ -1545,6 +1534,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/addressmap_unittest-addressmap_unittest.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomicops-internals-x86.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomicops_unittest.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dynamic_annotations.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/frag_unittest-frag_unittest.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getpc_test.Po@am__quote@ 
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/heap_checker_unittest-heap-checker_unittest.Po@am__quote@ @@ -1706,6 +1696,13 @@ logging.lo: src/base/logging.cc @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o logging.lo `test -f 'src/base/logging.cc' || echo '$(srcdir)/'`src/base/logging.cc +dynamic_annotations.lo: src/base/dynamic_annotations.cc +@am__fastdepCXX_TRUE@ if $(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT dynamic_annotations.lo -MD -MP -MF "$(DEPDIR)/dynamic_annotations.Tpo" -c -o dynamic_annotations.lo `test -f 'src/base/dynamic_annotations.cc' || echo '$(srcdir)/'`src/base/dynamic_annotations.cc; \ +@am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/dynamic_annotations.Tpo" "$(DEPDIR)/dynamic_annotations.Plo"; else rm -f "$(DEPDIR)/dynamic_annotations.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='src/base/dynamic_annotations.cc' object='dynamic_annotations.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o dynamic_annotations.lo `test -f 'src/base/dynamic_annotations.cc' || echo '$(srcdir)/'`src/base/dynamic_annotations.cc + profiler.lo: src/profiler.cc @am__fastdepCXX_TRUE@ if $(LIBTOOL) --tag=CXX --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT profiler.lo -MD -MP -MF "$(DEPDIR)/profiler.Tpo" -c -o profiler.lo `test -f 'src/profiler.cc' || echo '$(srcdir)/'`src/profiler.cc; \ @am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/profiler.Tpo" "$(DEPDIR)/profiler.Plo"; else rm -f "$(DEPDIR)/profiler.Tpo"; exit 1; fi @@ -41,8 +41,7 @@ CPU PROFILER STACKTRACE -1) Document and advertise libstacktrace -2) Remove dependency on linux/x86 +1) Remove dependency on linux/x86 --- -4 April 2007 +11 March 2008 @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.59 for google-perftools 0.95.1. +# Generated by GNU Autoconf 2.59 for google-perftools 0.96. # # Report bugs to <opensource@google.com>. # @@ -423,8 +423,8 @@ SHELL=${CONFIG_SHELL-/bin/sh} # Identity of this package. PACKAGE_NAME='google-perftools' PACKAGE_TARNAME='google-perftools' -PACKAGE_VERSION='0.95.1' -PACKAGE_STRING='google-perftools 0.95.1' +PACKAGE_VERSION='0.96' +PACKAGE_STRING='google-perftools 0.96' PACKAGE_BUGREPORT='opensource@google.com' ac_unique_file="README" @@ -954,7 +954,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures google-perftools 0.95.1 to adapt to many kinds of systems. +\`configure' configures google-perftools 0.96 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1021,7 +1021,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of google-perftools 0.95.1:";; + short | recursive ) echo "Configuration of google-perftools 0.96:";; esac cat <<\_ACEOF @@ -1162,7 +1162,7 @@ fi test -n "$ac_init_help" && exit 0 if $ac_init_version; then cat <<\_ACEOF -google-perftools configure 0.95.1 +google-perftools configure 0.96 generated by GNU Autoconf 2.59 Copyright (C) 2003 Free Software Foundation, Inc. @@ -1176,7 +1176,7 @@ cat >&5 <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by google-perftools $as_me 0.95.1, which was +It was created by google-perftools $as_me 0.96, which was generated by GNU Autoconf 2.59. Invocation command line was $ $0 $@ @@ -1904,7 +1904,7 @@ fi # Define the identity of the package. PACKAGE='google-perftools' - VERSION='0.95.1' + VERSION='0.96' cat >>confdefs.h <<_ACEOF @@ -24092,7 +24092,7 @@ _ASBOX } >&5 cat >&5 <<_CSEOF -This file was extended by google-perftools $as_me 0.95.1, which was +This file was extended by google-perftools $as_me 0.96, which was generated by GNU Autoconf 2.59. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -24155,7 +24155,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF ac_cs_version="\\ -google-perftools config.status 0.95.1 +google-perftools config.status 0.96 configured by $0, generated by GNU Autoconf 2.59, with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\" diff --git a/configure.ac b/configure.ac index 291170e..2249f62 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # make sure we're interpreted by some minimal autoconf AC_PREREQ(2.57) -AC_INIT(google-perftools, 0.95.1, opensource@google.com) +AC_INIT(google-perftools, 0.96, opensource@google.com) # The argument here is just something that should be in the current directory # (for sanity checking) AC_CONFIG_SRCDIR(README) diff --git a/doc/cpuprofile.html b/doc/cpuprofile.html index d30a8aa..c3ba2de 100644 --- a/doc/cpuprofile.html +++ b/doc/cpuprofile.html @@ -30,7 +30,9 @@ profiler data file format is documented separately, <p>To install the CPU profiler into your executable, add <code>-lprofiler</code> to the link-time step for your executable. (It's also probably possible to add in the profiler at run-time using -<code>LD_PRELOAD</code>, but this isn't necessarily recommended.)</p> +<code>LD_PRELOAD</code>, e.g. +<code>% env LD_PRELOAD="/usr/lib/libprofiler.so" <binary></code>, +but this isn't necessarily recommended.)</p> <p>This does <i>not</i> turn on CPU profiling; it just inserts the code. For that reason, it's practical to just always link @@ -78,6 +80,7 @@ environment variables.</p> <tr valign=top> <td><code>CPUPROFILE_FREQUENCY=<i>x</i></code></td> + <td>default: 100</td> <td> How many interrupts/second the cpu-profiler samples. </td> diff --git a/doc/heapprofile.html b/doc/heapprofile.html index b2d500c..c3b9aa2 100644 --- a/doc/heapprofile.html +++ b/doc/heapprofile.html @@ -41,7 +41,7 @@ application, running the code, and analyzing the output.</p> Also, while we don't necessarily recommend this form of usage, it's possible to add in the profiler at run-time using <code>LD_PRELOAD</code>: -<pre>% env LD_PRELOAD="/usr/lib/libtcmalloc.so" <binary></pre> +<pre>% env LD_PRELOAD="/usr/lib/libtcmalloc.so" <binary></pre> <p>This does <i>not</i> turn on heap profiling; it just inserts the code. 
For that reason, it's practical to just always link diff --git a/packages/deb/changelog b/packages/deb/changelog index 167d6c0..ae3a422 100644 --- a/packages/deb/changelog +++ b/packages/deb/changelog @@ -1,3 +1,9 @@ +google-perftools (0.96-1) unstable; urgency=low + + * New upstream release. + + -- Google Inc. <opensource@google.com> Tue, 18 Mar 2008 14:30:44 -0700 + google-perftools (0.95-1) unstable; urgency=low * New upstream release. diff --git a/packages/rpm/rpm.spec b/packages/rpm/rpm.spec index 15c7e63..2c16b7c 100644 --- a/packages/rpm/rpm.spec +++ b/packages/rpm/rpm.spec @@ -66,8 +66,6 @@ rm -rf $RPM_BUILD_ROOT %doc doc/cpuprofile.html doc/cpuprofile-fileformat.html %doc doc/pprof-test-big.gif doc/pprof-test.gif doc/pprof-vsnprintf-big.gif doc/pprof-vsnprintf.gif -%{prefix}/lib/libstacktrace.so.0 -%{prefix}/lib/libstacktrace.so.0.0.0 %{prefix}/lib/libtcmalloc.so.0 %{prefix}/lib/libtcmalloc.so.0.0.0 %{prefix}/lib/libtcmalloc_minimal.so.0 @@ -81,9 +79,6 @@ rm -rf $RPM_BUILD_ROOT %defattr(-,root,root) %{prefix}/include/google -%{prefix}/lib/libstacktrace.a -%{prefix}/lib/libstacktrace.la -%{prefix}/lib/libstacktrace.so %{prefix}/lib/libtcmalloc.a %{prefix}/lib/libtcmalloc.la %{prefix}/lib/libtcmalloc.so diff --git a/src/addressmap-inl.h b/src/addressmap-inl.h index a8cbb77..e1ce1bf 100644 --- a/src/addressmap-inl.h +++ b/src/addressmap-inl.h @@ -91,6 +91,14 @@ #include <sys/types.h> // our last best hope #endif +// This class is thread-unsafe -- that is, instances of this class can +// not be accessed concurrently by multiple threads -- because the +// callback function for Iterate() may mutate contained values. If the +// callback functions you pass do not mutate their Value* argument, +// AddressMap can be treated as thread-compatible -- that is, it's +// safe for multiple threads to call "const" methods on this class, +// but not safe for one thread to call const methods on this class +// while another thread is calling non-const methods on the class. template <class Value> class AddressMap { public: @@ -200,6 +208,8 @@ class AddressMap { // Find cluster object for specified address. If not found // and "create" is true, create the object. If not found // and "create" is false, return NULL. + // + // This method is bitwise-const if create is false. Cluster* FindCluster(Number address, bool create) { // Look in hashtable const Number cluster_id = address >> (kBlockBits + kClusterBits); diff --git a/src/base/atomicops-internals-linuxppc.h b/src/base/atomicops-internals-linuxppc.h index 6ddc5a8..09d227a 100644 --- a/src/base/atomicops-internals-linuxppc.h +++ b/src/base/atomicops-internals-linuxppc.h @@ -37,6 +37,10 @@ #ifndef BASE_ATOMICOPS_INTERNALS_LINUXPPC_H__ #define BASE_ATOMICOPS_INTERNALS_LINUXPPC_H__ +// int32_t and intptr_t seems to be equal on ppc-linux +// There are no Atomic64 implementations in this file. +typedef int32_t Atomic32; + #define LWSYNC_ON_SMP #define PPC405_ERR77(a, b) #define ISYNC_ON_SMP @@ -44,23 +48,38 @@ /* Adapted from atomic_add in asm-powerpc/atomic.h */ inline int32_t OSAtomicAdd32(int32_t amount, int32_t *value) { - int t; + int32_t t; + __asm__ __volatile__( +"1: lwarx %0,0,%3 # atomic_add\n\ + add %0,%2,%0\n" + PPC405_ERR77(0,%3) +" stwcx. %0,0,%3 \n\ + bne- 1b" + : "=&r" (t), "+m" (*value) + : "r" (amount), "r" (value) + : "cc", "memory"); + return t; +} + +inline int32_t OSAtomicAdd32Barrier(int32_t amount, int32_t *value) { + int32_t t; __asm__ __volatile__( "1: lwarx %0,0,%3 # atomic_add\n\ add %0,%2,%0\n" PPC405_ERR77(0,%3) " stwcx. 
%0,0,%3 \n\ bne- 1b" + ISYNC_ON_SMP : "=&r" (t), "+m" (*value) : "r" (amount), "r" (value) - : "cc"); - return *value; + : "cc", "memory"); + return t; } /* Adapted from __cmpxchg_u32 in asm-powerpc/atomic.h */ inline bool OSAtomicCompareAndSwap32(int32_t old_value, int32_t new_value, int32_t *value) { - unsigned int prev; + int32_t prev; __asm__ __volatile__ ( LWSYNC_ON_SMP "1: lwarx %0,0,%2 # __cmpxchg_u32\n\ @@ -69,20 +88,19 @@ inline bool OSAtomicCompareAndSwap32(int32_t old_value, int32_t new_value, PPC405_ERR77(0,%2) " stwcx. %4,0,%2\n\ bne- 1b" - ISYNC_ON_SMP "\n\ 2:" : "=&r" (prev), "+m" (*value) : "r" (value), "r" (old_value), "r" (new_value) : "cc", "memory"); - return true; + return prev == old_value; } /* Adapted from __cmpxchg_u32 in asm-powerpc/atomic.h */ inline int32_t OSAtomicCompareAndSwap32Barrier(int32_t old_value, int32_t new_value, int32_t *value) { - unsigned int prev; + int32_t prev; __asm__ __volatile__ ( LWSYNC_ON_SMP "1: lwarx %0,0,%2 # __cmpxchg_u32\n\ @@ -97,32 +115,27 @@ inline int32_t OSAtomicCompareAndSwap32Barrier(int32_t old_value, : "=&r" (prev), "+m" (*value) : "r" (value), "r" (old_value), "r" (new_value) : "cc", "memory"); - return true; + return prev == old_value; } +namespace base { +namespace subtle { + +typedef int64_t Atomic64; // Defined but unused + inline void MemoryBarrier() { // TODO } -// int32_t and intptr_t seems to be equal on ppc-linux -// therefore we have no extra Atomic32 function versions. -typedef int32_t Atomic32; -typedef intptr_t AtomicWord; - -#define OSAtomicCastIntPtr(p) \ - reinterpret_cast<int32_t *>(const_cast<AtomicWord *>(p)) -#define OSAtomicCompareAndSwapIntPtr OSAtomicCompareAndSwap32 -#define OSAtomicAddIntPtr OSAtomicAdd32 -#define OSAtomicCompareAndSwapIntPtrBarrier OSAtomicCompareAndSwap32Barrier +// 32-bit Versions. 
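The return-value changes in OSAtomicCompareAndSwap32 and its Barrier variant above (return prev == old_value instead of return true) are the substance of the linux/ppc fix noted in the ChangeLog: the old code reported success even when the stwcx. store-conditional lost a race, so retry loops built on top of it could exit without their update ever landing. A hedged, illustrative caller (not from the patch, modeled on the exchange loop that follows) shows why the result matters:

#include <stdint.h>

// Atomically OR a mask into *value using the patched CAS above.
int32_t AtomicFetchOr(int32_t *value, int32_t mask) {
  int32_t old_value;
  do {
    old_value = *value;
    // With the old unconditional "return true", a lost race would fall
    // out of this loop with the OR silently dropped; reporting failure
    // forces a retry until the swap really commits.
  } while (!OSAtomicCompareAndSwap32(old_value, old_value | mask, value));
  return old_value;
}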
- -inline AtomicWord CompareAndSwap(volatile AtomicWord *ptr, - AtomicWord old_value, - AtomicWord new_value) { - AtomicWord prev_value; +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value; do { - if (OSAtomicCompareAndSwapIntPtr(old_value, new_value, - OSAtomicCastIntPtr(ptr))) { + if (OSAtomicCompareAndSwap32(old_value, new_value, + const_cast<Atomic32*>(ptr))) { return old_value; } prev_value = *ptr; @@ -130,28 +143,33 @@ inline AtomicWord CompareAndSwap(volatile AtomicWord *ptr, return prev_value; } -inline AtomicWord AtomicExchange(volatile AtomicWord *ptr, - AtomicWord new_value) { - AtomicWord old_value; +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + Atomic32 old_value; do { old_value = *ptr; - } while (!OSAtomicCompareAndSwapIntPtr(old_value, new_value, - OSAtomicCastIntPtr(ptr))); + } while (!OSAtomicCompareAndSwap32(old_value, new_value, + const_cast<Atomic32*>(ptr))); return old_value; } +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return OSAtomicAdd32(increment, const_cast<Atomic32*>(ptr)); +} -inline AtomicWord AtomicIncrement(volatile AtomicWord *ptr, AtomicWord increment) { - return OSAtomicAddIntPtr(increment, OSAtomicCastIntPtr(ptr)); +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return OSAtomicAdd32Barrier(increment, const_cast<Atomic32*>(ptr)); } -inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord *ptr, - AtomicWord old_value, - AtomicWord new_value) { - AtomicWord prev_value; +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value; do { - if (OSAtomicCompareAndSwapIntPtrBarrier(old_value, new_value, - OSAtomicCastIntPtr(ptr))) { + if (OSAtomicCompareAndSwap32Barrier(old_value, new_value, + const_cast<Atomic32*>(ptr))) { return old_value; } prev_value = *ptr; @@ -159,35 +177,49 @@ inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord *ptr, return prev_value; } -inline AtomicWord Release_CompareAndSwap(volatile AtomicWord *ptr, - AtomicWord old_value, - AtomicWord new_value) { +inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { // The ppc interface does not distinguish between Acquire and // Release memory barriers; they are equivalent. return Acquire_CompareAndSwap(ptr, old_value, new_value); } +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} -inline void Acquire_Store(volatile AtomicWord *ptr, AtomicWord value) { +inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) { *ptr = value; MemoryBarrier(); } -inline void Release_Store(volatile AtomicWord *ptr, AtomicWord value) { +inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) { MemoryBarrier(); *ptr = value; } -inline AtomicWord Acquire_Load(volatile const AtomicWord *ptr) { - AtomicWord value = *ptr; +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) { + Atomic32 value = *ptr; MemoryBarrier(); return value; } -inline AtomicWord Release_Load(volatile const AtomicWord *ptr) { +inline Atomic32 Release_Load(volatile const Atomic32 *ptr) { MemoryBarrier(); return *ptr; } +} // namespace base::subtle +} // namespace base +// NOTE(vchen): The following is also deprecated. 
New callers should use +// the base::subtle namespace. +inline void MemoryBarrier() { + base::subtle::MemoryBarrier(); +} #endif // BASE_ATOMICOPS_INTERNALS_LINUXPPC_H__ diff --git a/src/base/atomicops-internals-macosx.h b/src/base/atomicops-internals-macosx.h index 9d3a486..18c2d3c 100644 --- a/src/base/atomicops-internals-macosx.h +++ b/src/base/atomicops-internals-macosx.h @@ -39,35 +39,86 @@ #define BASE_ATOMICOPS_INTERNALS_MACOSX_H__ typedef int32_t Atomic32; -typedef intptr_t AtomicWord; + +// MacOS uses long for intptr_t, AtomicWord and Atomic32 are always different +// on the Mac, even when they are the same size. Similarly, on __ppc64__, +// AtomicWord and Atomic64 are always different. Thus, we need explicit +// casting. +#ifdef __LP64__ +#define AtomicWordCastType base::subtle::Atomic64 +#else +#define AtomicWordCastType Atomic32 +#endif + +#if defined(__LP64__) || defined(__i386__) +#define BASE_HAS_ATOMIC64 1 // Use only in tests and base/atomic* +#endif #include <libkern/OSAtomic.h> -#ifdef __LP64__ // Indicates 64-bit pointers under OS -#define OSAtomicCastIntPtr(p) \ - reinterpret_cast<int64_t *>(const_cast<AtomicWord *>(p)) -#define OSAtomicCompareAndSwapIntPtr OSAtomicCompareAndSwap64 -#define OSAtomicAddIntPtr OSAtomicAdd64 -#define OSAtomicCompareAndSwapIntPtrBarrier OSAtomicCompareAndSwap64Barrier -#else -#define OSAtomicCastIntPtr(p) \ - reinterpret_cast<int32_t *>(const_cast<AtomicWord *>(p)) -#define OSAtomicCompareAndSwapIntPtr OSAtomicCompareAndSwap32 -#define OSAtomicAddIntPtr OSAtomicAdd32 -#define OSAtomicCompareAndSwapIntPtrBarrier OSAtomicCompareAndSwap32Barrier +#if !defined(__LP64__) && defined(__ppc__) + +// The Mac 64-bit OSAtomic implementations are not available for 32-bit PowerPC, +// while the underlying assembly instructions are available only on some +// implementations of PowerPC. + +// The following inline functions will fail with the error message at compile +// time ONLY IF they are called. So it is safe to use this header if user +// code only calls AtomicWord and Atomic32 operations. +// +// NOTE(vchen): Implementation notes to implement the atomic ops below may +// be found in "PowerPC Virtual Environment Architecture, Book II, +// Version 2.02", January 28, 2005, Appendix B, page 46. Unfortunately, +// extra care must be taken to ensure data are properly 8-byte aligned, and +// that data are returned correctly according to Mac OS X ABI specs. + +inline int64_t OSAtomicCompareAndSwap64( + int64_t oldValue, int64_t newValue, int64_t *theValue) { + __asm__ __volatile__( + "_OSAtomicCompareAndSwap64_not_supported_for_32_bit_ppc\n\t"); + return 0; +} + +inline int64_t OSAtomicAdd64(int64_t theAmount, int64_t *theValue) { + __asm__ __volatile__( + "_OSAtomicAdd64_not_supported_for_32_bit_ppc\n\t"); + return 0; +} + +inline int64_t OSAtomicCompareAndSwap64Barrier( + int64_t oldValue, int64_t newValue, int64_t *theValue) { + int64_t prev = OSAtomicCompareAndSwap64(oldValue, newValue, theValue); + OSMemoryBarrier(); + return prev; +} + +inline int64_t OSAtomicAdd64Barrier( + int64_t theAmount, int64_t *theValue) { + int64_t new_val = OSAtomicAdd64(theAmount, theValue); + OSMemoryBarrier(); + return new_val; +} #endif + +namespace base { +namespace subtle { + +typedef int64_t Atomic64; + inline void MemoryBarrier() { OSMemoryBarrier(); } -inline AtomicWord CompareAndSwap(volatile AtomicWord *ptr, - AtomicWord old_value, - AtomicWord new_value) { - AtomicWord prev_value; +// 32-bit Versions.
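The _not_supported_for_32_bit_ppc stubs above lean on a property of C++ inline functions: their bodies are only emitted (and the bogus instruction mnemonic only reaches the assembler) if some caller actually uses them, so the build breaks only for code that invokes a 64-bit op on ppc32. The same trick in isolation, as a sketch with hypothetical names:

// Compiles and links cleanly as long as nothing calls Unsupported64BitAdd;
// any caller triggers an assembler error naming the missing operation.
inline long long Unsupported64BitAdd(long long amount, long long *value) {
  __asm__ __volatile__(
      "_Unsupported64BitAdd_not_available_on_this_cpu\n\t");
  return 0;
}

int main() {
  return 0;  // never references the stub, so the bad mnemonic is never assembled
}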
+ +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value; do { - if (OSAtomicCompareAndSwapIntPtr(old_value, new_value, - OSAtomicCastIntPtr(ptr))) { + if (OSAtomicCompareAndSwap32(old_value, new_value, + const_cast<Atomic32*>(ptr))) { return old_value; } prev_value = *ptr; @@ -75,28 +126,33 @@ inline AtomicWord CompareAndSwap(volatile AtomicWord *ptr, return prev_value; } -inline AtomicWord AtomicExchange(volatile AtomicWord *ptr, - AtomicWord new_value) { - AtomicWord old_value; +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + Atomic32 old_value; do { old_value = *ptr; - } while (!OSAtomicCompareAndSwapIntPtr(old_value, new_value, - OSAtomicCastIntPtr(ptr))); + } while (!OSAtomicCompareAndSwap32(old_value, new_value, + const_cast<Atomic32*>(ptr))); return old_value; } +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return OSAtomicAdd32(increment, const_cast<Atomic32*>(ptr)); +} -inline AtomicWord AtomicIncrement(volatile AtomicWord *ptr, AtomicWord increment) { - return OSAtomicAddIntPtr(increment, OSAtomicCastIntPtr(ptr)); +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return OSAtomicAdd32Barrier(increment, const_cast<Atomic32*>(ptr)); } -inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord *ptr, - AtomicWord old_value, - AtomicWord new_value) { - AtomicWord prev_value; +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value; do { - if (OSAtomicCompareAndSwapIntPtrBarrier(old_value, new_value, - OSAtomicCastIntPtr(ptr))) { + if (OSAtomicCompareAndSwap32Barrier(old_value, new_value, + const_cast<Atomic32*>(ptr))) { return old_value; } prev_value = *ptr; @@ -104,48 +160,50 @@ inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord *ptr, return prev_value; } -inline AtomicWord Release_CompareAndSwap(volatile AtomicWord *ptr, - AtomicWord old_value, - AtomicWord new_value) { - // The lib kern interface does not distinguish between - // Acquire and Release memory barriers; they are equivalent. +inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { return Acquire_CompareAndSwap(ptr, old_value, new_value); } +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} -inline void Acquire_Store(volatile AtomicWord *ptr, AtomicWord value) { +inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) { *ptr = value; MemoryBarrier(); } -inline void Release_Store(volatile AtomicWord *ptr, AtomicWord value) { +inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) { MemoryBarrier(); *ptr = value; } -inline AtomicWord Acquire_Load(volatile const AtomicWord *ptr) { - AtomicWord value = *ptr; +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) { + Atomic32 value = *ptr; MemoryBarrier(); return value; } -inline AtomicWord Release_Load(volatile const AtomicWord *ptr) { +inline Atomic32 Release_Load(volatile const Atomic32 *ptr) { MemoryBarrier(); return *ptr; } +// 64-bit version -// MacOS uses long for intptr_t, AtomicWord and Atomic32 are always different -// on the Mac, even when they are the same size. Thus, we always provide -// Atomic32 versions. 
- -inline Atomic32 CompareAndSwap(volatile Atomic32 *ptr, - Atomic32 old_value, - Atomic32 new_value) { - Atomic32 prev_value; +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev_value; do { - if (OSAtomicCompareAndSwap32(old_value, new_value, - const_cast<Atomic32*>(ptr))) { + if (OSAtomicCompareAndSwap64(old_value, new_value, + const_cast<Atomic64*>(ptr))) { return old_value; } prev_value = *ptr; @@ -153,27 +211,33 @@ inline Atomic32 CompareAndSwap(volatile Atomic32 *ptr, return prev_value; } -inline Atomic32 AtomicExchange(volatile Atomic32 *ptr, - Atomic32 new_value) { - Atomic32 old_value; +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + Atomic64 old_value; do { old_value = *ptr; - } while (!OSAtomicCompareAndSwap32(old_value, new_value, - const_cast<Atomic32*>(ptr))); + } while (!OSAtomicCompareAndSwap64(old_value, new_value, + const_cast<Atomic64*>(ptr))); return old_value; } -inline Atomic32 AtomicIncrement(volatile Atomic32 *ptr, Atomic32 increment) { - return OSAtomicAdd32(increment, const_cast<Atomic32*>(ptr)); +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64 *ptr, + Atomic64 increment) { + return OSAtomicAdd64(increment, const_cast<Atomic64*>(ptr)); } -inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr, - Atomic32 old_value, - Atomic32 new_value) { - Atomic32 prev_value; +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64 *ptr, + Atomic64 increment) { + return OSAtomicAdd64Barrier(increment, const_cast<Atomic64*>(ptr)); +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev_value; do { - if (OSAtomicCompareAndSwap32Barrier(old_value, new_value, - const_cast<Atomic32*>(ptr))) { + if (OSAtomicCompareAndSwap64Barrier(old_value, new_value, + const_cast<Atomic64*>(ptr))) { return old_value; } prev_value = *ptr; @@ -181,32 +245,116 @@ inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr, return prev_value; } -inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr, - Atomic32 old_value, - Atomic32 new_value) { +inline Atomic64 Release_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + // The lib kern interface does not distinguish between + // Acquire and Release memory barriers; they are equivalent. 
return Acquire_CompareAndSwap(ptr, old_value, new_value); } +#ifdef __LP64__ -inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) { +// 64-bit implementation on 64-bit platform + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) { *ptr = value; MemoryBarrier(); } -inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) { +inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) { MemoryBarrier(); *ptr = value; } -inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) { - Atomic32 value = *ptr; +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) { + Atomic64 value = *ptr; MemoryBarrier(); return value; } -inline Atomic32 Release_Load(volatile const Atomic32 *ptr) { +inline Atomic64 Release_Load(volatile const Atomic64 *ptr) { MemoryBarrier(); return *ptr; } +#else + +// 64-bit implementation on 32-bit platform + +#if defined(__ppc__) + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + __asm__ __volatile__( + "_NoBarrier_Store_not_supported_for_32_bit_ppc\n\t"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + __asm__ __volatile__( + "_NoBarrier_Load_not_supported_for_32_bit_ppc\n\t"); + return 0; +} + +#elif defined(__i386__) + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + __asm__ __volatile__("movq %1, %%mm0\n\t" // Use mmx reg for 64-bit atomic + "movq %%mm0, %0\n\t" // moves (ptr could be read-only) + "emms\n\t" // Reset FP registers + : "=m" (*ptr) + : "m" (value) + : "memory", "%mm0"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + Atomic64 value; + __asm__ __volatile__("movq %1, %%mm0\n\t" // Use mmx reg for 64-bit atomic + "movq %%mm0, %0\n\t" // moves (ptr could be read-only) + "emms\n\t" // Reset FP registers + : "=m" (value) + : "m" (*ptr) + : "memory", "%mm0"); + return value; +} +#endif + + +inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) { + NoBarrier_Store(ptr, value); + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) { + MemoryBarrier(); + NoBarrier_Store(ptr, value); +} + +inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) { + Atomic64 value = NoBarrier_Load(ptr); + MemoryBarrier(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64 *ptr) { + MemoryBarrier(); + return NoBarrier_Load(ptr); +} +#endif // __LP64__ + +} // namespace base::subtle +} // namespace base + +// NOTE(vchen): The following is also deprecated. New callers should use +// the base::subtle namespace. 
+inline void MemoryBarrier() {
+  base::subtle::MemoryBarrier();
+}
 
 #endif  // BASE_ATOMICOPS_INTERNALS_MACOSX_H__
diff --git a/src/base/atomicops-internals-x86-msvc.h b/src/base/atomicops-internals-x86-msvc.h
index cce120c..4fc2d6e 100644
--- a/src/base/atomicops-internals-x86-msvc.h
+++ b/src/base/atomicops-internals-x86-msvc.h
@@ -39,78 +39,153 @@
 #define BASE_ATOMICOPS_INTERNALS_X86_MSVC_H__
 
 #include "base/basictypes.h"  // For COMPILE_ASSERT
 
-typedef intptr_t AtomicWord;
-#ifdef _WIN64
-typedef LONG Atomic32;
-#else
-typedef AtomicWord Atomic32;
+typedef int32 Atomic32;
+
+#if defined(_WIN64)
+#define BASE_HAS_ATOMIC64 1  // Use only in tests and base/atomic*
 #endif
 
-COMPILE_ASSERT(sizeof(AtomicWord) == sizeof(PVOID), atomic_word_is_atomic);
+namespace base {
+namespace subtle {
 
-inline AtomicWord CompareAndSwap(volatile AtomicWord* ptr,
-                                 AtomicWord old_value,
-                                 AtomicWord new_value) {
-  PVOID result = InterlockedCompareExchangePointer(
-    reinterpret_cast<volatile PVOID*>(ptr),
-    reinterpret_cast<PVOID>(new_value), reinterpret_cast<PVOID>(old_value));
-  return reinterpret_cast<AtomicWord>(result);
-}
+typedef int64 Atomic64;
 
-inline AtomicWord AtomicExchange(volatile AtomicWord* ptr,
-                                 AtomicWord new_value) {
-  PVOID result = InterlockedExchangePointer(
-    const_cast<PVOID*>(reinterpret_cast<volatile PVOID*>(ptr)),
-    reinterpret_cast<PVOID>(new_value));
-  return reinterpret_cast<AtomicWord>(result);
-}
+// 32-bit low-level operations on any platform
 
-#ifdef _WIN64
-inline Atomic32 AtomicIncrement(volatile Atomic32* ptr, Atomic32 increment) {
-  // InterlockedExchangeAdd returns *ptr before being incremented
-  // and we must return nonzero iff *ptr is nonzero after being
-  // incremented.
-  return InterlockedExchangeAdd(ptr, increment) + increment;
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  LONG result = InterlockedCompareExchange(
+      reinterpret_cast<volatile LONG*>(ptr),
+      static_cast<LONG>(new_value),
+      static_cast<LONG>(old_value));
+  return static_cast<Atomic32>(result);
 }
 
-inline AtomicWord AtomicIncrement(volatile AtomicWord* ptr, AtomicWord increment) {
-  return InterlockedExchangeAdd64(
-      reinterpret_cast<volatile LONGLONG*>(ptr),
-      static_cast<LONGLONG>(increment)) + increment;
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  LONG result = InterlockedExchange(
+      reinterpret_cast<volatile LONG*>(ptr),
+      static_cast<LONG>(new_value));
+  return static_cast<Atomic32>(result);
 }
-#else
-inline AtomicWord AtomicIncrement(volatile AtomicWord* ptr, AtomicWord increment) {
-  return InterlockedExchangeAdd(
+
+inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
+                                        Atomic32 increment) {
+  return InterlockedExchangeAdd(
       reinterpret_cast<volatile LONG*>(ptr),
       static_cast<LONG>(increment)) + increment;
 }
-#endif
 
-inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr,
-                                         AtomicWord old_value,
-                                         AtomicWord new_value) {
-  return CompareAndSwap(ptr, old_value, new_value);
+inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
+                                          Atomic32 increment) {
+  return Barrier_AtomicIncrement(ptr, increment);
 }
 
-inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr,
-                                         AtomicWord old_value,
-                                         AtomicWord new_value) {
-  return CompareAndSwap(ptr, old_value, new_value);
-}
+}  // namespace base::subtle
+}  // namespace base
 
 // In msvc8/vs2005, winnt.h already contains a definition for MemoryBarrier.
+// Define it outside the namespace.
 #if !(defined(_MSC_VER) && _MSC_VER >= 1400)
 inline void MemoryBarrier() {
-  AtomicWord value = 0;
-  AtomicExchange(&value, 0);  // acts as a barrier
+  Atomic32 value = 0;
+  base::subtle::NoBarrier_AtomicExchange(&value, 0);
+  // actually acts as a barrier in this implementation
 }
 #endif
 
-inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) {
-  AtomicExchange(ptr, value);
+namespace base {
+namespace subtle {
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
 }
 
-inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  NoBarrier_AtomicExchange(ptr, value);
+  // acts as a barrier in this implementation
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;  // works w/o barrier for current Intel chips as of June 2005
+  // See comments in Atomic64 version of Release_Store() below.
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr;
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+// 64-bit operations
+
+#if defined(_WIN64)
+
+// 64-bit low-level operations on 64-bit platform.
+
+COMPILE_ASSERT(sizeof(Atomic64) == sizeof(PVOID), atomic_word_is_atomic);
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  PVOID result = InterlockedCompareExchangePointer(
+    reinterpret_cast<volatile PVOID*>(ptr),
+    reinterpret_cast<PVOID>(new_value), reinterpret_cast<PVOID>(old_value));
+  return reinterpret_cast<Atomic64>(result);
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  PVOID result = InterlockedExchangePointer(
+    const_cast<PVOID*>(reinterpret_cast<volatile PVOID*>(ptr)),
+    reinterpret_cast<PVOID>(new_value));
+  return reinterpret_cast<Atomic64>(result);
+}
+
+inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr,
+                                        Atomic64 increment) {
+  return InterlockedExchangeAdd64(
+      reinterpret_cast<volatile LONGLONG*>(ptr),
+      static_cast<LONGLONG>(increment)) + increment;
+}
+
+inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr,
+                                          Atomic64 increment) {
+  return Barrier_AtomicIncrement(ptr, increment);
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_AtomicExchange(ptr, value);
+  // acts as a barrier in this implementation
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
   *ptr = value;  // works w/o barrier for current Intel chips as of June 2005
   // When new chips come out, check:
@@ -121,15 +196,179 @@ inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
   //  http://developer.intel.com/design/pentium4/manuals/index_new.htm
 }
 
-inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) {
-  AtomicWord value = *ptr;
-  MemoryBarrier();
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = *ptr;
   return value;
 }
 
-inline AtomicWord Release_Load(volatile const AtomicWord* ptr) {
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
   MemoryBarrier();
   return *ptr;
 }
+
+#else  // defined(_WIN64)
+
+// 64-bit low-level operations on 32-bit platform
+
+// TBD(vchen): The GNU assembly below must be converted to MSVC inline
+// assembly.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+inline void NotImplementedFatalError(const char *function_name) {
+  fprintf(stderr, "64-bit %s() not implemented on this platform\n",
+          function_name);
+  abort();
+}
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+#if 0  // Not implemented
+  Atomic64 prev;
+  __asm__ __volatile__("movl (%3), %%ebx\n\t"    // Move 64-bit new_value into
+                       "movl 4(%3), %%ecx\n\t"   // ecx:ebx
+                       "lock; cmpxchg8b %1\n\t"  // If edx:eax (old_value) same
+                       : "=A" (prev)             // as contents of ptr:
+                       : "m" (*ptr),             //   ecx:ebx => ptr
+                         "0" (old_value),        // else:
+                         "r" (&new_value)        //   old *ptr => edx:eax
+                       : "memory", "%ebx", "%ecx");
+  return prev;
+#else
+  NotImplementedFatalError("NoBarrier_CompareAndSwap");
+  return 0;
+#endif
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+#if 0  // Not implemented
+  __asm__ __volatile__(
+      "movl (%2), %%ebx\n\t"    // Move 64-bit new_value into
+      "movl 4(%2), %%ecx\n\t"   // ecx:ebx
+      "0:\n\t"
+      "movl %1, %%eax\n\t"      // Read contents of ptr into
+      "movl 4%1, %%edx\n\t"     // edx:eax
+      "lock; cmpxchg8b %1\n\t"  // Attempt cmpxchg; if *ptr
+      "jnz 0b\n\t"              // is no longer edx:eax, loop
+      : "=A" (new_value)
+      : "m" (*ptr),
+        "r" (&new_value)
+      : "memory", "%ebx", "%ecx");
+  return new_value;  // Now it's the previous value.
+#else
+  NotImplementedFatalError("NoBarrier_AtomicExchange");
+  return 0;
+#endif
+}
+
+inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr,
+                                          Atomic64 increment) {
+#if 0  // Not implemented
+  Atomic64 temp = increment;
+  __asm__ __volatile__(
+      "0:\n\t"
+      "movl (%3), %%ebx\n\t"    // Move 64-bit increment into
+      "movl 4(%3), %%ecx\n\t"   // ecx:ebx
+      "movl (%2), %%eax\n\t"    // Read contents of ptr into
+      "movl 4(%2), %%edx\n\t"   // edx:eax
+      "add %%eax, %%ebx\n\t"    // sum => ecx:ebx
+      "adc %%edx, %%ecx\n\t"    // edx:eax still has old *ptr
+      "lock; cmpxchg8b (%2)\n\t"// Attempt cmpxchg; if *ptr
+      "jnz 0b\n\t"              // is no longer edx:eax, loop
+      : "=A"(temp), "+m"(*ptr)
+      : "D" (ptr), "S" (&increment)
+      : "memory", "%ebx", "%ecx");
+  // temp now contains the previous value of *ptr
+  return temp + increment;
+#else
+  NotImplementedFatalError("NoBarrier_AtomicIncrement");
+  return 0;
+#endif
+}
+
+inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr,
+                                        Atomic64 increment) {
+#if 0  // Not implemented
+  Atomic64 new_val = NoBarrier_AtomicIncrement(ptr, increment);
+  if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) {
+    __asm__ __volatile__("lfence" : : : "memory");
+  }
+  return new_val;
+#else
+  NotImplementedFatalError("Barrier_AtomicIncrement");
+  return 0;
+#endif
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+#if 0  // Not implemented
+  __asm {
+    mov mm0, value;  // Use mmx reg for 64-bit atomic moves
+    mov ptr, mm0;
+    emms;            // Empty mmx state to enable FP registers
+  }
+#else
+  NotImplementedFatalError("NoBarrier_Store");
+#endif
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_AtomicExchange(ptr, value);
+  // acts as a barrier in this implementation
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_Store(ptr, value);
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+#if 0  // Not implemented
+  Atomic64 value;
+  __asm {
+    mov mm0, ptr;    // Use mmx reg for 64-bit atomic moves
+    mov value, mm0;
+    emms;            // Empty mmx state to enable FP registers
+  }
+  return value;
+#else
+  NotImplementedFatalError("NoBarrier_Load");
+  return 0;
+#endif
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = NoBarrier_Load(ptr);
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return NoBarrier_Load(ptr);
+}
+
+#endif  // defined(_WIN64)
+
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+}  // namespace base::subtle
+}  // namespace base
+
 #endif  // BASE_ATOMICOPS_INTERNALS_X86_MSVC_H__
diff --git a/src/base/atomicops-internals-x86.cc b/src/base/atomicops-internals-x86.cc
index 9d61fd7..1a6c24f 100644
--- a/src/base/atomicops-internals-x86.cc
+++ b/src/base/atomicops-internals-x86.cc
@@ -40,6 +40,12 @@
 #include "base/logging.h"
 #include <string.h>
 
+// This file only makes sense with atomicops-internals-x86.h -- it
+// depends on structs that are defined in that file.  If atomicops.h
+// doesn't sub-include that file, then we aren't needed, and shouldn't
+// try to do anything.
+#ifdef BASE_ATOMICOPS_INTERNALS_X86_H__
+
 // Inline cpuid instruction.  In PIC compilations, %ebx contains the address
 // of the global offset table.  To avoid breaking such executables, this code
 // must preserve that register's value across cpuid instructions.
@@ -117,3 +123,5 @@ REGISTER_MODULE_INITIALIZER(atomicops_x86, {
 });
 
 #endif
+
+#endif  /* ifdef BASE_ATOMICOPS_INTERNALS_X86_H__ */
diff --git a/src/base/atomicops-internals-x86.h b/src/base/atomicops-internals-x86.h
index db3d4d2..68839cc 100644
--- a/src/base/atomicops-internals-x86.h
+++ b/src/base/atomicops-internals-x86.h
@@ -38,17 +38,13 @@
 #ifndef BASE_ATOMICOPS_INTERNALS_X86_H__
 #define BASE_ATOMICOPS_INTERNALS_X86_H__
 
-typedef intptr_t AtomicWord;
 typedef int32_t Atomic32;
 
+#define BASE_HAS_ATOMIC64 1  // Use only in tests and base/atomic*
+
+
+// NOTE(vchen): x86 does not need to define AtomicWordCastType, because it
+// already matches Atomic32 or Atomic64, depending on the platform.
 
-// There are a couple places we need to specialize opcodes to account for the
-// different AtomicWord sizes on x86_64 and 32-bit platforms.
-// This macro is undefined after its last use, below.
-#if defined(__x86_64__)
-#define ATOMICOPS_WORD_SUFFIX "q"
-#else
-#define ATOMICOPS_WORD_SUFFIX "l"
-#endif
 
 // This struct is not part of the public API of this module; clients may not
 // use it.
@@ -62,63 +58,89 @@ struct AtomicOps_x86CPUFeatureStruct {
 };
 extern struct AtomicOps_x86CPUFeatureStruct AtomicOps_Internalx86CPUFeatures;
 
-inline AtomicWord CompareAndSwap(volatile AtomicWord* ptr,
-                                 AtomicWord old_value,
-                                 AtomicWord new_value) {
-  AtomicWord prev;
-  __asm__ __volatile__("lock; cmpxchg" ATOMICOPS_WORD_SUFFIX " %1,%2"
+
+#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory")
+
+
+namespace base {
+namespace subtle {
+
+typedef int64_t Atomic64;
+
+// 32-bit low-level operations on any platform.
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev;
+  __asm__ __volatile__("lock; cmpxchgl %1,%2"
                        : "=a" (prev)
                        : "q" (new_value), "m" (*ptr), "0" (old_value)
                        : "memory");
   return prev;
 }
 
-inline AtomicWord AtomicExchange(volatile AtomicWord* ptr,
-                                 AtomicWord new_value) {
-  __asm__ __volatile__("xchg" ATOMICOPS_WORD_SUFFIX " %1,%0"  // The lock prefix
-                       : "=r" (new_value)                     // is implicit for
-                       : "m" (*ptr), "0" (new_value)          // xchg.
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  __asm__ __volatile__("xchgl %1,%0"  // The lock prefix is implicit for xchg.
+                       : "=r" (new_value)
+                       : "m" (*ptr), "0" (new_value)
                        : "memory");
   return new_value;  // Now it's the previous value.
 }
 
-inline AtomicWord AtomicIncrement(volatile AtomicWord* ptr, AtomicWord increment) {
-  AtomicWord temp = increment;
-  __asm__ __volatile__("lock; xadd" ATOMICOPS_WORD_SUFFIX " %0,%1"
+inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
+                                          Atomic32 increment) {
+  Atomic32 temp = increment;
+  __asm__ __volatile__("lock; xaddl %0,%1"
                        : "+r" (temp), "+m" (*ptr)
                        : : "memory");
-  // temp now contains the previous value of *ptr
+  // temp now holds the old value of *ptr
   return temp + increment;
 }
 
-#undef ATOMICOPS_WORD_SUFFIX
-
+inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
+                                        Atomic32 increment) {
+  Atomic32 temp = increment;
+  __asm__ __volatile__("lock; xaddl %0,%1"
+                       : "+r" (temp), "+m" (*ptr)
+                       : : "memory");
+  // temp now holds the old value of *ptr
+  if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) {
+    __asm__ __volatile__("lfence" : : : "memory");
+  }
+  return temp + increment;
+}
 
-inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr,
-                                         AtomicWord old_value,
-                                         AtomicWord new_value) {
-  AtomicWord x = CompareAndSwap(ptr, old_value, new_value);
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 x = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
   if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) {
     __asm__ __volatile__("lfence" : : : "memory");
   }
   return x;
 }
 
-inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr,
-                                         AtomicWord old_value,
-                                         AtomicWord new_value) {
-  return CompareAndSwap(ptr, old_value, new_value);
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
 }
 
-#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory")
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
 
 #if defined(__x86_64__)
 
+// 64-bit implementations of memory barrier can be simpler, because
+// "mfence" is guaranteed to exist.
 inline void MemoryBarrier() {
   __asm__ __volatile__("mfence" : : : "memory");
 }
 
-inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) {
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
   *ptr = value;
   MemoryBarrier();
 }
@@ -129,24 +151,103 @@ inline void MemoryBarrier() {
   if (AtomicOps_Internalx86CPUFeatures.has_sse2) {
     __asm__ __volatile__("mfence" : : : "memory");
   } else {  // mfence is faster but not present on PIII
-    AtomicWord x = 0;
-    AtomicExchange(&x, 0);
+    Atomic32 x = 0;
+    NoBarrier_AtomicExchange(&x, 0);  // acts as a barrier on PIII
   }
 }
 
-inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) {
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
   if (AtomicOps_Internalx86CPUFeatures.has_sse2) {
     *ptr = value;
     __asm__ __volatile__("mfence" : : : "memory");
   } else {
-    AtomicExchange(ptr, value);
+    NoBarrier_AtomicExchange(ptr, value);
+                          // acts as a barrier on PIII
   }
 }
-
 #endif
 
-inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  ATOMICOPS_COMPILER_BARRIER();
+  *ptr = value;  // An x86 store acts as a release barrier.
+  // See comments in Atomic64 version of Release_Store(), below.
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr;  // An x86 load acts as an acquire barrier.
+  // See comments in Atomic64 version of Release_Store(), below.
+  ATOMICOPS_COMPILER_BARRIER();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+#if defined(__x86_64__)
+
+// 64-bit low-level operations on 64-bit platform.
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  Atomic64 prev;
+  __asm__ __volatile__("lock; cmpxchgq %1,%2"
+                       : "=a" (prev)
+                       : "q" (new_value), "m" (*ptr), "0" (old_value)
+                       : "memory");
+  return prev;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  __asm__ __volatile__("xchgq %1,%0"  // The lock prefix is implicit for xchg.
+                       : "=r" (new_value)
+                       : "m" (*ptr), "0" (new_value)
+                       : "memory");
+  return new_value;  // Now it's the previous value.
+}
+
+inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr,
+                                          Atomic64 increment) {
+  Atomic64 temp = increment;
+  __asm__ __volatile__("lock; xaddq %0,%1"
+                       : "+r" (temp), "+m" (*ptr)
+                       : : "memory");
+  // temp now contains the previous value of *ptr
+  return temp + increment;
+}
+
+inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr,
+                                        Atomic64 increment) {
+  Atomic64 temp = increment;
+  __asm__ __volatile__("lock; xaddq %0,%1"
+                       : "+r" (temp), "+m" (*ptr)
+                       : : "memory");
+  // temp now contains the previous value of *ptr
+  if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) {
+    __asm__ __volatile__("lfence" : : : "memory");
+  }
+  return temp + increment;
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
   ATOMICOPS_COMPILER_BARRIER();
+  *ptr = value;  // An x86 store acts as a release barrier
                  // for current AMD/Intel chips as of Jan 2008.
                  // See also Acquire_Load(), below.
@@ -165,94 +266,155 @@ inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
   // either flushing cache lines or non-temporal cache hints.
 }
 
-inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) {
-  AtomicWord value = *ptr;  // An x86 load acts as a acquire barrier,
-                            // for current AMD/Intel chips as of Jan 2008.
-                            // See also Release_Store(), above.
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = *ptr;  // An x86 load acts as an acquire barrier,
+                          // for current AMD/Intel chips as of Jan 2008.
+                          // See also Release_Store(), above.
   ATOMICOPS_COMPILER_BARRIER();
   return value;
 }
 
-inline AtomicWord Release_Load(volatile const AtomicWord* ptr) {
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
   MemoryBarrier();
   return *ptr;
 }
 
-// When Atomic32 and AtomicWord are different types, we need to copy
-// the preceding methods for Atomic32.
-
-#ifndef INT32_EQUALS_INTPTR
-
-inline Atomic32 CompareAndSwap(volatile Atomic32* ptr,
-                               Atomic32 old_value,
-                               Atomic32 new_value) {
-  Atomic32 prev;
-  __asm__ __volatile__("lock; cmpxchgl %1,%2"
-                       : "=a" (prev)
-                       : "q" (new_value), "m" (*ptr), "0" (old_value)
-                       : "memory");
+#else  // defined(__x86_64__)
+
+// 64-bit low-level operations on 32-bit platform.
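[Editorial sketch, not part of the diff: why callers must use the Load/Store
routines for Atomic64 on a 32-bit build.  A plain 64-bit assignment compiles
to two 32-bit moves, so another thread can observe a half-written value; the
routines below instead use a single 8-byte movq (or lock; cmpxchg8b).  "flag"
is a hypothetical caller-side variable used only for illustration.]

    base::subtle::Atomic64 flag = 0;
    base::subtle::NoBarrier_Store(&flag, 0x100000001LL);   // one atomic movq
    base::subtle::Atomic64 v = base::subtle::NoBarrier_Load(&flag);
    // By contrast, "flag = 0x100000001LL;" is two 32-bit stores here,
    // and a concurrent reader could see 0x100000000 or 0x1.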
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  Atomic64 prev;
+  __asm__ __volatile__("movl (%3), %%ebx\n\t"    // Move 64-bit new_value into
+                       "movl 4(%3), %%ecx\n\t"   // ecx:ebx
+                       "lock; cmpxchg8b %1\n\t"  // If edx:eax (old_value) same
+                       : "=A" (prev)             // as contents of ptr:
+                       : "m" (*ptr),             //   ecx:ebx => ptr
+                         "0" (old_value),        // else:
+                         "r" (&new_value)        //   old *ptr => edx:eax
+                       : "memory", "%ebx", "%ecx");
   return prev;
 }
 
-inline Atomic32 AtomicExchange(volatile Atomic32* ptr,
-                               Atomic32 new_value) {
-  __asm__ __volatile__("xchgl %1,%0"  // The lock prefix is implicit for xchg.
-                       : "=r" (new_value)
-                       : "m" (*ptr), "0" (new_value)
-                       : "memory");
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  __asm__ __volatile__(
+      "movl (%2), %%ebx\n\t"    // Move 64-bit new_value into
+      "movl 4(%2), %%ecx\n\t"   // ecx:ebx
+      "0:\n\t"
+      "movl %1, %%eax\n\t"      // Read contents of ptr into
+      "movl 4%1, %%edx\n\t"     // edx:eax
+      "lock; cmpxchg8b %1\n\t"  // Attempt cmpxchg; if *ptr
+      "jnz 0b\n\t"              // is no longer edx:eax, loop
+      : "=&A" (new_value)
+      : "m" (*ptr),
+        "r" (&new_value)
+      : "memory", "%ebx", "%ecx");
   return new_value;  // Now it's the previous value.
 }
 
-inline Atomic32 AtomicIncrement(volatile Atomic32* ptr, Atomic32 increment) {
-  Atomic32 temp = increment;
-  __asm__ __volatile__("lock; xaddl %0,%1"
-                       : "+r" (temp), "+m" (*ptr)
-                       : : "memory");
-  // temp now holds the old value of *ptr
+inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr,
+                                          Atomic64 increment) {
+  Atomic64 temp = increment;
+  __asm__ __volatile__(
+      "0:\n\t"
+      "movl (%3), %%ebx\n\t"    // Move 64-bit increment into
+      "movl 4(%3), %%ecx\n\t"   // ecx:ebx
+      "movl (%2), %%eax\n\t"    // Read contents of ptr into
+      "movl 4(%2), %%edx\n\t"   // edx:eax
+      "add %%eax, %%ebx\n\t"    // sum => ecx:ebx
+      "adc %%edx, %%ecx\n\t"    // edx:eax still has old *ptr
+      "lock; cmpxchg8b (%2)\n\t"// Attempt cmpxchg; if *ptr
+      "jnz 0b\n\t"              // is no longer edx:eax, loop
+      : "=A"(temp), "+m"(*ptr)
+      : "D" (ptr), "S" (&increment)
+      : "memory", "%ebx", "%ecx");
+  // temp now contains the previous value of *ptr
   return temp + increment;
 }
 
-inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
-                                       Atomic32 old_value,
-                                       Atomic32 new_value) {
-  Atomic32 x = CompareAndSwap(ptr, old_value, new_value);
+inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr,
+                                        Atomic64 increment) {
+  Atomic64 new_val = NoBarrier_AtomicIncrement(ptr, increment);
   if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) {
     __asm__ __volatile__("lfence" : : : "memory");
   }
-  return x;
+  return new_val;
 }
 
-inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
-                                       Atomic32 old_value,
-                                       Atomic32 new_value) {
-  return CompareAndSwap(ptr, old_value, new_value);
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  __asm__ __volatile__("movq %1, %%mm0\n\t"  // Use mmx reg for 64-bit atomic
+                       "movq %%mm0, %0\n\t"  // moves (ptr could be read-only)
+                       "emms\n\t"            // Empty mmx state/Reset FP regs
+                       : "=m" (*ptr)
+                       : "m" (value)
+                       : "memory", "%mm0");
 }
 
-inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
-  *ptr = value;
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_Store(ptr, value);
   MemoryBarrier();
 }
 
-inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
   ATOMICOPS_COMPILER_BARRIER();
-  *ptr = value;  // An x86 store acts as a release barrier.
-  // See comments in AtomicWord version of Release_Store(), above.
+  NoBarrier_Store(ptr, value);
 }
 
-inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
-  Atomic32 value = *ptr;  // An x86 load acts as a acquire barrier.
-  // See comments in AtomicWord version of Release_Store(), above.
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  Atomic64 value;
+  __asm__ __volatile__("movq %1, %%mm0\n\t"  // Use mmx reg for 64-bit atomic
+                       "movq %%mm0, %0\n\t"  // moves (ptr could be read-only)
+                       "emms\n\t"            // Empty mmx state/Reset FP regs
+                       : "=m" (value)
+                       : "m" (*ptr)
+                       : "%mm0");            // Do not mark mem as clobbered
+  return value;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = NoBarrier_Load(ptr);
   ATOMICOPS_COMPILER_BARRIER();
   return value;
 }
 
-inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
   MemoryBarrier();
-  return *ptr;
+  return NoBarrier_Load(ptr);
+}
+
+#endif  // defined(__x86_64__)
+
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 x = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+  if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) {
+    __asm__ __volatile__("lfence" : : : "memory");
+  }
+  return x;
 }
 
-#endif  /* INT32_EQUALS_INTPTR */
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+}  // namespace base::subtle
+}  // namespace base
 
 #undef ATOMICOPS_COMPILER_BARRIER
 
+// NOTE(vchen): The following is also deprecated.  New callers should use
+// the base::subtle namespace.
+inline void MemoryBarrier() {
+  base::subtle::MemoryBarrier();
+}
 
 #endif  // BASE_ATOMICOPS_INTERNALS_X86_H__
diff --git a/src/base/atomicops.h b/src/base/atomicops.h
index 81c365f..834f518 100644
--- a/src/base/atomicops.h
+++ b/src/base/atomicops.h
@@ -31,10 +31,38 @@
  * Author: Sanjay Ghemawat
  */
 
+// For atomic operations on statistics counters, see atomic_stats_counter.h.
+// For atomic operations on sequence numbers, see atomic_sequence_num.h.
+// For atomic operations on reference counts, see atomic_refcount.h.
+
 // Some fast atomic operations -- typically with machine-dependent
 // implementations.  This file may need editing as Google code is
 // ported to different architectures.
 
+// The routines exported by this module are subtle.  If you use them, even if
+// you get the code right, it will depend on careful reasoning about atomicity
+// and memory ordering; it will be less readable, and harder to maintain.  If
+// you plan to use these routines, you should have a good reason, such as solid
+// evidence that performance would otherwise suffer, or there being no
+// alternative.  You should assume only properties explicitly guaranteed by the
+// specifications in this file.  You are almost certainly _not_ writing code
+// just for the x86; if you assume x86 semantics, x86 hardware bugs and
+// implementations on other architectures will cause your code to break.  If
+// you do not know what you are doing, avoid these routines, and use a Mutex.
+//
+// It is incorrect to make direct assignments to/from an atomic variable.
+// You should use one of the Load or Store routines.  The NoBarrier
+// versions are provided when no barriers are needed:
+//   NoBarrier_Store()
+//   NoBarrier_Load()
+// Although there is currently no compiler enforcement, you are encouraged
+// to use these.  Moreover, if you choose to use the base::subtle::Atomic64
+// type, you MUST use one of the Load or Store routines to get correct
+// behavior on 32-bit platforms.
+//
+// The intent is eventually to put all of these routines in namespace
+// base::subtle.
+
 #ifndef THREAD_ATOMICOPS_H__
 #define THREAD_ATOMICOPS_H__
 
@@ -43,18 +71,25 @@
 
 // ------------------------------------------------------------------------
 // Include the platform specific implementations of the types
-// and operations listed below.
+// and operations listed below.  Implementations are to provide Atomic32
+// and Atomic64 operations.  If there is a mismatch between intptr_t and
+// the Atomic32 or Atomic64 types for a platform, the platform-specific
+// header should define the macro AtomicWordCastType in a clause similar
+// to the following:
+//   #if ...pointers are 64 bits...
+//   # define AtomicWordCastType base::subtle::Atomic64
+//   #else
+//   # define AtomicWordCastType Atomic32
+//   #endif
 // TODO(csilvers): figure out ARCH_PIII/ARCH_K8 (perhaps via ./configure?)
 // ------------------------------------------------------------------------
 
-// macosx.h should work correctly for Darwin/x86 as well, but the
-// x86.h version works fine as well, so we'll go with that.
 // TODO(csilvers): match piii, not just __i386.  Also, match k8
-#if defined(__MACH__) && defined(__APPLE__) && defined(__ppc__)
+#if defined(__MACH__) && defined(__APPLE__)
 #include "base/atomicops-internals-macosx.h"
 #elif defined(__GNUC__) && (defined(__i386) || defined(ARCH_K8))
 #include "base/atomicops-internals-x86.h"
-#elif defined(__i386) && defined(MSVC)
+#elif defined(__i386) && defined(_MSC_VER)
 #include "base/atomicops-internals-x86-msvc.h"
 #elif defined(__linux__) && defined(__PPC__)
 #include "base/atomicops-internals-linuxppc.h"
@@ -66,21 +101,19 @@
 #include "base/atomicops-internals-x86.h"
 #endif
 
-// ------------------------------------------------------------------------
-// Commented out type definitions and method declarations for documentation
-// of the interface provided by this module.
-// ------------------------------------------------------------------------
-
-#if 0
-
 // Signed type that can hold a pointer and supports the atomic ops below, as
 // well as atomic loads and stores.  Instances must be naturally-aligned.
 typedef intptr_t AtomicWord;
 
-// Signed 32-bit type that supports the atomic ops below, as well as atomic
-// loads and stores.  Instances must be naturally aligned.  This type differs
-// from AtomicWord in 64-bit binaries where AtomicWord is 64-bits.
-typedef int32_t Atomic32;
+#ifdef AtomicWordCastType
+// ------------------------------------------------------------------------
+// This section is needed only when explicit type casting is required to
+// cast AtomicWord to one of the basic atomic types (Atomic64 or Atomic32).
+// It also serves to document the AtomicWord interface.
+// ------------------------------------------------------------------------
+
+namespace base {
+namespace subtle {
 
 // Atomically execute:
 //      result = *ptr;
@@ -92,18 +125,36 @@ typedef int32_t Atomic32;
 //      if (result == old_value) *ptr = new_value;
 //      return result;
 //
 // Always return the old value of "*ptr"
 //
 // This routine implies no memory barriers.
-AtomicWord CompareAndSwap(volatile AtomicWord* ptr,
-                          AtomicWord old_value,
-                          AtomicWord new_value);
+inline AtomicWord NoBarrier_CompareAndSwap(volatile AtomicWord* ptr,
+                                           AtomicWord old_value,
+                                           AtomicWord new_value) {
+  return NoBarrier_CompareAndSwap(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr),
+      old_value, new_value);
+}
 
 // Atomically store new_value into *ptr, returning the previous value held in
 // *ptr.  This routine implies no memory barriers.
-AtomicWord AtomicExchange(volatile AtomicWord* ptr, AtomicWord new_value);
+inline AtomicWord NoBarrier_AtomicExchange(volatile AtomicWord* ptr,
+                                           AtomicWord new_value) {
+  return NoBarrier_AtomicExchange(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), new_value);
+}
 
 // Atomically increment *ptr by "increment".  Returns the new value of
 // *ptr with the increment applied.  This routine implies no memory
 // barriers.
-AtomicWord AtomicIncrement(volatile AtomicWord* ptr, AtomicWord increment);
+inline AtomicWord NoBarrier_AtomicIncrement(volatile AtomicWord* ptr,
+                                            AtomicWord increment) {
+  return NoBarrier_AtomicIncrement(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), increment);
+}
+
+inline AtomicWord Barrier_AtomicIncrement(volatile AtomicWord* ptr,
+                                          AtomicWord increment) {
+  return Barrier_AtomicIncrement(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), increment);
+}
 
 // ------------------------------------------------------------------------
 // These following lower-level operations are typically useful only to people
@@ -112,38 +163,221 @@ AtomicWord AtomicIncrement(volatile AtomicWord* ptr, AtomicWord increment);
 // implementing higher-level synchronization operations like spinlocks,
 // mutexes, and condition-variables.  They combine CompareAndSwap(), a load, or
 // a store with appropriate memory-ordering instructions.  "Acquire" operations
 // ensure that no later memory access can be reordered ahead of the operation.
 // "Release" operations ensure that no previous memory access can be reordered
-// after the operation.
+// after the operation.  "Barrier" operations have both "Acquire" and "Release"
+// semantics.  A MemoryBarrier() has "Barrier" semantics, but does no memory
+// access.
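[Editorial sketch, not part of the diff: a minimal publish/consume pairing to
make the acquire/release contract above concrete.  "payload" and "ready" are
hypothetical caller-side variables, not part of this interface.]

    int payload = 0;         // plain data, published via "ready"
    Atomic32 ready = 0;

    // Publisher thread: Release_Store keeps the payload write from being
    // reordered after the flag write.
    payload = 42;
    base::subtle::Release_Store(&ready, 1);

    // Consumer thread: Acquire_Load keeps the payload read from being
    // reordered before the flag read, so once ready reads as 1 the
    // consumer is guaranteed to observe payload == 42.
    if (base::subtle::Acquire_Load(&ready) == 1) {
      // use payload here
    }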
// ------------------------------------------------------------------------
 
-AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr,
-                                  AtomicWord old_value,
-                                  AtomicWord new_value);
-AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr,
-                                  AtomicWord old_value,
-                                  AtomicWord new_value);
-void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value);
-void Release_Store(volatile AtomicWord* ptr, AtomicWord value);
-AtomicWord Acquire_Load(volatile const AtomicWord* ptr);
-AtomicWord Release_Load(volatile const AtomicWord* ptr);
+inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return base::subtle::Acquire_CompareAndSwap(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr),
+      old_value, new_value);
+}
+
+inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return base::subtle::Release_CompareAndSwap(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr),
+      old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile AtomicWord *ptr, AtomicWord value) {
+  NoBarrier_Store(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), value);
+}
+
+inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return base::subtle::Acquire_Store(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), value);
+}
+
+inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return base::subtle::Release_Store(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), value);
+}
+
+inline AtomicWord NoBarrier_Load(volatile const AtomicWord *ptr) {
+  return NoBarrier_Load(
+      reinterpret_cast<volatile const AtomicWordCastType*>(ptr));
+}
+
+inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) {
+  return base::subtle::Acquire_Load(
+      reinterpret_cast<volatile const AtomicWordCastType*>(ptr));
+}
+
+inline AtomicWord Release_Load(volatile const AtomicWord* ptr) {
+  return base::subtle::Release_Load(
+      reinterpret_cast<volatile const AtomicWordCastType*>(ptr));
+}
+
+}  // namespace base::subtle
+}  // namespace base
+#endif  // AtomicWordCastType
+
+// ------------------------------------------------------------------------
+// Commented out type definitions and method declarations for documentation
+// of the interface provided by this module.
+// ------------------------------------------------------------------------
+
+#if 0
+
+// Signed 32-bit type that supports the atomic ops below, as well as atomic
+// loads and stores.  Instances must be naturally aligned.  This type differs
+// from AtomicWord in 64-bit binaries where AtomicWord is 64-bits.
+typedef int32_t Atomic32;
 
 // Corresponding operations on Atomic32
-Atomic32 CompareAndSwap(volatile Atomic32* ptr,
-                        Atomic32 old_value,
-                        Atomic32 new_value);
-Atomic32 AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value);
-Atomic32 AtomicIncrement(volatile Atomic32* ptr, Atomic32 increment);
+namespace base {
+namespace subtle {
+
+// Signed 64-bit type that supports the atomic ops below, as well as atomic
+// loads and stores.  Instances must be naturally aligned.  This type differs
+// from AtomicWord in 32-bit binaries where AtomicWord is 32-bits.
+typedef int64_t Atomic64;
+
+Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                  Atomic32 old_value,
+                                  Atomic32 new_value);
+Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value);
+Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, Atomic32 increment);
+Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
+                                 Atomic32 increment);
 Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                 Atomic32 old_value,
                                 Atomic32 new_value);
 Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                 Atomic32 old_value,
                                 Atomic32 new_value);
+void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value);
 void Acquire_Store(volatile Atomic32* ptr, Atomic32 value);
 void Release_Store(volatile Atomic32* ptr, Atomic32 value);
+Atomic32 NoBarrier_Load(volatile const Atomic32* ptr);
 Atomic32 Acquire_Load(volatile const Atomic32* ptr);
 Atomic32 Release_Load(volatile const Atomic32* ptr);
 
+// Corresponding operations on Atomic64
+Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                  Atomic64 old_value,
+                                  Atomic64 new_value);
+Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, Atomic64 new_value);
+Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, Atomic64 increment);
+Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, Atomic64 increment);
+
+Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                Atomic64 old_value,
+                                Atomic64 new_value);
+Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                Atomic64 old_value,
+                                Atomic64 new_value);
+void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value);
+void Acquire_Store(volatile Atomic64* ptr, Atomic64 value);
+void Release_Store(volatile Atomic64* ptr, Atomic64 value);
+Atomic64 NoBarrier_Load(volatile const Atomic64* ptr);
+Atomic64 Acquire_Load(volatile const Atomic64* ptr);
+Atomic64 Release_Load(volatile const Atomic64* ptr);
+
+}  // namespace base::subtle
+}  // namespace base
+
 void MemoryBarrier();
 
-#endif
+#endif  // 0
+
+
+// ------------------------------------------------------------------------
+// The following are to be deprecated when all uses have been changed to
+// use the base::subtle namespace.
+// ------------------------------------------------------------------------
+
+#ifdef AtomicWordCastType
+// AtomicWord versions to be deprecated
+inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return base::subtle::Acquire_Store(ptr, value);
+}
+
+inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return base::subtle::Release_Store(ptr, value);
+}
+
+inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) {
+  return base::subtle::Acquire_Load(ptr);
+}
+
+inline AtomicWord Release_Load(volatile const AtomicWord* ptr) {
+  return base::subtle::Release_Load(ptr);
+}
+#endif  // AtomicWordCastType
+
+// 32-bit Acquire/Release operations to be deprecated.
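[Editorial sketch, not part of the diff: migrating a caller off the
deprecated global spellings that follow only changes the qualification.
"counter" is a hypothetical caller-side variable.]

    Atomic32 counter = 0;
    Acquire_Store(&counter, 1);                // deprecated global spelling
    base::subtle::Acquire_Store(&counter, 1);  // preferred spelling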
+ +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value); +} +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value); +} +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + base::subtle::Acquire_Store(ptr, value); +} +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + return base::subtle::Release_Store(ptr, value); +} +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + return base::subtle::Acquire_Load(ptr); +} +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + return base::subtle::Release_Load(ptr); +} + +#ifdef BASE_HAS_ATOMIC64 + +// 64-bit Acquire/Release operations to be deprecated. + +inline base::subtle::Atomic64 Acquire_CompareAndSwap( + volatile base::subtle::Atomic64* ptr, + base::subtle::Atomic64 old_value, base::subtle::Atomic64 new_value) { + return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value); +} +inline base::subtle::Atomic64 Release_CompareAndSwap( + volatile base::subtle::Atomic64* ptr, + base::subtle::Atomic64 old_value, base::subtle::Atomic64 new_value) { + return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value); +} +inline void Acquire_Store( + volatile base::subtle::Atomic64* ptr, base::subtle::Atomic64 value) { + base::subtle::Acquire_Store(ptr, value); +} +inline void Release_Store( + volatile base::subtle::Atomic64* ptr, base::subtle::Atomic64 value) { + return base::subtle::Release_Store(ptr, value); +} +inline base::subtle::Atomic64 Acquire_Load( + volatile const base::subtle::Atomic64* ptr) { + return base::subtle::Acquire_Load(ptr); +} +inline base::subtle::Atomic64 Release_Load( + volatile const base::subtle::Atomic64* ptr) { + return base::subtle::Release_Load(ptr); +} + +#endif // BASE_HAS_ATOMIC64 #endif // THREAD_ATOMICOPS_H__ diff --git a/src/base/basictypes.h b/src/base/basictypes.h index d9d2774..97f96d6 100644 --- a/src/base/basictypes.h +++ b/src/base/basictypes.h @@ -275,4 +275,20 @@ class AssignAttributeStartEnd { #endif // HAVE___ATTRIBUTE__ and __ELF__ or __MACH__ +// The following enum should be used only as a constructor argument to indicate +// that the variable has static storage class, and that the constructor should +// do nothing to its state. It indicates to the reader that it is legal to +// declare a static nistance of the class, provided the constructor is given +// the base::LINKER_INITIALIZED argument. Normally, it is unsafe to declare a +// static variable that has a constructor or a destructor because invocation +// order is undefined. 
However, IF the type can be initialized by filling with +// zeroes (which the loader does for static variables), AND the destructor also +// does nothing to the storage, then a constructor declared as +// explicit MyClass(base::LinkerInitialized x) {} +// and invoked as +// static MyClass my_variable_name(base::LINKER_INITIALIZED); +namespace base { +enum LinkerInitialized { LINKER_INITIALIZED }; +} + #endif // _BASICTYPES_H_ diff --git a/src/base/cycleclock.h b/src/base/cycleclock.h index 20681cb..a5fecb1 100644 --- a/src/base/cycleclock.h +++ b/src/base/cycleclock.h @@ -50,7 +50,18 @@ struct CycleClock { // This should return the number of cycles since power-on static inline int64 Now() { -#if defined(__i386__) +#if defined(__MACH__) && defined(__APPLE__) + // this goes at the top because we need ALL Macs, regardless + // of architecture, to return the number of "mach time units" + // that have passes since startup. See sysinfo.cc where + // InitializeSystemInfo() sets the supposed cpu clock frequency of macs + // to the number of mach time units per second, not actual + // CPU clock frequency (which can change in the face of CPU + // frequency scaling). also note that when the Mac sleeps, + // this counter pauses; it does not continue counting, nor resets + // to zero. + return mach_absolute_time(); +#elif defined(__i386__) int64 ret; __asm__ volatile ("rdtsc" : "=A" (ret) ); @@ -79,8 +90,6 @@ struct CycleClock { return itc; #elif defined(_MSC_VER) && defined(_M_IX86) _asm rdtsc -#elif defined(__MACH__) && defined(__APPLE__) - return mach_absolute_time(); #else // We could define __alpha here as well, but it only has a 32-bit // timer (good for like 4 seconds), which isn't very useful. diff --git a/src/base/dynamic_annotations.cc b/src/base/dynamic_annotations.cc new file mode 100644 index 0000000..0f441ed --- /dev/null +++ b/src/base/dynamic_annotations.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2008, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --- + * Author: Kostya Serebryany + */ + +#include "base/dynamic_annotations.h" + +// Each function is empty and called (via a macro) only in debug mode. +// The arguments are captured by dynamic tools at runtime. + +extern "C" void AnnotateRWLockCreate(const char *file, int line, void *lock){} +extern "C" void AnnotateRWLockDestroy(const char *file, int line, void *lock){} +extern "C" void AnnotateRWLockAcquired(const char *file, int line, + void *lock, long is_w){} +extern "C" void AnnotateRWLockReleased(const char *file, int line, + void *lock, long is_w){} +extern "C" void AnnotateCondVarWait(const char *file, int line, void *cv, + void *lock){} +extern "C" void AnnotateCondVarSignal(const char *file, int line, void *cv){} +extern "C" void AnnotateCondVarSignalAll(const char *file, int line, void *cv){} +extern "C" void AnnotatePCQCreate(const char *file, int line, void *pcq){} +extern "C" void AnnotatePCQDestroy(const char *file, int line, void *pcq){} +extern "C" void AnnotatePCQPut(const char *file, int line, void *pcq){} +extern "C" void AnnotatePCQGet(const char *file, int line, void *pcq){} +extern "C" void AnnotateNewMemory(const char *file, int line, void *mem, + long size){} +extern "C" void AnnotateExpectRace(const char *file, int line, void *mem, + const char *description){} +extern "C" void AnnotateBenignRace(const char *file, int line, void *mem, + const char *description){} +extern "C" void AnnotateMutexIsUsedAsCondVar(const char *file, int line, + void *mu){} +extern "C" void AnnotateTraceMemory(const char *file, int line, + const void *arg){} +extern "C" void AnnotateNoOp(const char *file, int line, const void *arg){} diff --git a/src/base/dynamic_annotations.h b/src/base/dynamic_annotations.h new file mode 100644 index 0000000..d619ffa --- /dev/null +++ b/src/base/dynamic_annotations.h @@ -0,0 +1,188 @@ +/* Copyright (c) 2008, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --- + * Author: Kostya Serebryany + */ + +// This file defines dynamic annotations for use with dynamic analysis +// tool such as valgrind, PIN, etc. +// +// Dynamic annotation is a source code annotation that affects +// the generated code (that is, the annotation is not a comment). +// Each such annotation is attached to a particular +// instruction and/or to a particular object (address) in the program. +// +// The annotations that should be used by users are macros +// (e.g. ANNOTATE_NEW_MEMORY). +// +// Actual implementation of these macros may differ depending on the +// dynamic analysis tool being used. +// +// This file supports the following dynamic analysis tools: +// - None (NDEBUG is defined). +// Macros are defined empty. +// - Helgrind (NDEBUG is not defined). +// Macros are defined as calls to non-inlinable empty functions +// that are intercepted by helgrind. +// +#ifndef _BASE_DYNAMIC_ANNOTATIONS_H__ +#define _BASE_DYNAMIC_ANNOTATIONS_H__ + + +// All the annotation macros are in effect only in debug mode. +#ifndef NDEBUG + + // Report that "lock" has been created. + #define ANNOTATE_RWLOCK_CREATE(lock) \ + AnnotateRWLockCreate(__FILE__, __LINE__, lock) + + // Report that "lock" is about to be destroyed. + #define ANNOTATE_RWLOCK_DESTROY(lock) \ + AnnotateRWLockDestroy(__FILE__, __LINE__, lock) + + // Report that "lock" has been acquired. + // is_w=1 for writer lock, is_w=0 for reader lock. + #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) \ + AnnotateRWLockAcquired(__FILE__, __LINE__, lock, is_w) + + // Report that "lock" is about to be relased. + #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) \ + AnnotateRWLockReleased(__FILE__, __LINE__, lock, is_w) + + // Report that wait on 'cv' has succeeded and 'lock' is held. + #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) \ + AnnotateCondVarWait(__FILE__, __LINE__, cv, lock) + + // Report that wait on 'cv' has succeeded. Variant w/o lock. + #define ANNOTATE_CONDVAR_WAIT(cv) \ + AnnotateCondVarWait(__FILE__, __LINE__, cv, NULL) + + // Report that we are about to signal on 'cv'. + #define ANNOTATE_CONDVAR_SIGNAL(cv) \ + AnnotateCondVarSignal(__FILE__, __LINE__, cv) + + // Report that we are about to signal_all on 'cv'. + #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) \ + AnnotateCondVarSignalAll(__FILE__, __LINE__, cv) + + // Report that "pcq" (ProducerConsumerQueue) has been created. + #define ANNOTATE_PCQ_CREATE(pcq) \ + AnnotatePCQCreate(__FILE__, __LINE__, pcq) + + // Report that "pcq" is about to be destroyed. + #define ANNOTATE_PCQ_DESTROY(pcq) \ + AnnotatePCQDestroy(__FILE__, __LINE__, pcq) + + // Report that we are about to put an element into 'pcq'. + #define ANNOTATE_PCQ_PUT(pcq) \ + AnnotatePCQPut(__FILE__, __LINE__, pcq) + + // Report that we've just got an element from 'pcq'. + #define ANNOTATE_PCQ_GET(pcq) \ + AnnotatePCQGet(__FILE__, __LINE__, pcq) + + // Report that a new memory 'mem' of size 'size' has been allocated. + #define ANNOTATE_NEW_MEMORY(mem, size) \ + AnnotateNewMemory(__FILE__, __LINE__, mem, size) + + // Report that we expect a race on 'mem'. + // To use only in unit tests for a race detector. + #define ANNOTATE_EXPECT_RACE(mem, description) \ + AnnotateExpectRace(__FILE__, __LINE__, mem, description) + + // Report that we may have a benign race on 'mem'. + // Insert at the point where 'mem' exists, preferably close to the point + // where the race happens. 
+ #define ANNOTATE_BENIGN_RACE(mem, description) \ + AnnotateBenignRace(__FILE__, __LINE__, mem, description) + + // Report that the mutex 'mu' will be used with LockWhen/Await. + #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) \ + AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu) + + // Request to trace every access to 'arg'. + #define ANNOTATE_TRACE_MEMORY(arg) \ + AnnotateTraceMemory(__FILE__, __LINE__, arg) + + // A no-op. Insert where you like to test the interceptors. + #define ANNOTATE_NO_OP(arg) \ + AnnotateNoOp(__FILE__, __LINE__, arg) + +#else // NDEBUG is defined + + #define ANNOTATE_RWLOCK_CREATE(lock) // empty + #define ANNOTATE_RWLOCK_DESTROY(lock) // empty + #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) // empty + #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) // empty + #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) // empty + #define ANNOTATE_CONDVAR_WAIT(cv) // empty + #define ANNOTATE_CONDVAR_SIGNAL(cv) // empty + #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) // empty + #define ANNOTATE_PCQ_CREATE(pcq) // empty + #define ANNOTATE_PCQ_DESTROY(pcq) // empty + #define ANNOTATE_PCQ_PUT(pcq) // empty + #define ANNOTATE_PCQ_GET(pcq) // empty + #define ANNOTATE_NEW_MEMORY(mem, size) // empty + #define ANNOTATE_EXPECT_RACE(mem, description) // empty + #define ANNOTATE_BENIGN_RACE(mem, description) // empty + #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) // empty + #define ANNOTATE_TRACE_MEMORY(arg) // empty + #define ANNOTATE_NO_OP(arg) // empty + +#endif // NDEBUG + +// Use the macros above rather than using these functions directly. +extern "C" void AnnotateRWLockCreate(const char *file, int line, void *lock); +extern "C" void AnnotateRWLockDestroy(const char *file, int line, void *lock); +extern "C" void AnnotateRWLockAcquired(const char *file, int line, + void *lock, long is_w); +extern "C" void AnnotateRWLockReleased(const char *file, int line, + void *lock, long is_w); +extern "C" void AnnotateCondVarWait(const char *file, int line, void *cv, + void *lock); +extern "C" void AnnotateCondVarSignal(const char *file, int line, void *cv); +extern "C" void AnnotateCondVarSignalAll(const char *file, int line, void *cv); +extern "C" void AnnotatePCQCreate(const char *file, int line, void *pcq); +extern "C" void AnnotatePCQDestroy(const char *file, int line, void *pcq); +extern "C" void AnnotatePCQPut(const char *file, int line, void *pcq); +extern "C" void AnnotatePCQGet(const char *file, int line, void *pcq); +extern "C" void AnnotateNewMemory(const char *file, int line, void *mem, + long size); +extern "C" void AnnotateExpectRace(const char *file, int line, void *mem, + const char *description); +extern "C" void AnnotateBenignRace(const char *file, int line, void *mem, + const char *description); +extern "C" void AnnotateMutexIsUsedAsCondVar(const char *file, int line, + void *mu); +extern "C" void AnnotateTraceMemory(const char *file, int line, + const void *arg); +extern "C" void AnnotateNoOp(const char *file, int line, const void *arg); + +#endif // _BASE_DYNAMIC_ANNOTATIONS_H__ diff --git a/src/base/linux_syscall_support.h b/src/base/linux_syscall_support.h index 9972806..979452c 100644 --- a/src/base/linux_syscall_support.h +++ b/src/base/linux_syscall_support.h @@ -1325,9 +1325,9 @@ struct kernel_statfs { } LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) { - /* For real-time signals, the kernel does not know how to return from - * a signal handler. Instead, it relies on user space to provide a - * restorer function that calls the rt_sigreturn() system call. 
+ /* On i386, the kernel does not know how to return from a signal + * handler. Instead, it relies on user space to provide a + * restorer function that calls the {rt_,}sigreturn() system call. * Unfortunately, we cannot just reference the glibc version of this * function, as glibc goes out of its way to make it inaccessible. */ @@ -1342,6 +1342,25 @@ struct kernel_statfs { : "i" (__NR_rt_sigreturn)); return res; } + LSS_INLINE void (*LSS_NAME(restore)(void))(void) { + /* On i386, the kernel does not know how to return from a signal + * handler. Instead, it relies on user space to provide a + * restorer function that calls the {rt_,}sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:pop %%eax\n" + "movl %1,%%eax\n" + "int $0x80\n" + "2:popl %0\n" + "addl $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_sigreturn)); + return res; + } #elif defined(__x86_64__) /* There are no known problems with any of the _syscallX() macros * currently shipping for x86_64, but we still need to be able to define @@ -2190,14 +2209,16 @@ struct kernel_statfs { * This function must have a "magic" signature that the "gdb" * (and maybe the kernel?) can recognize. */ - struct kernel_sigaction a; - if (act != NULL) { - a = *act; + if (act != NULL && !(act->sa_flags & SA_RESTORER)) { + struct kernel_sigaction a = *act; a.sa_flags |= SA_RESTORER; a.sa_restorer = LSS_NAME(restore_rt)(); + return LSS_NAME(rt_sigaction)(signum, &a, oldact, + (KERNEL_NSIG+7)/8); + } else { + return LSS_NAME(rt_sigaction)(signum, act, oldact, + (KERNEL_NSIG+7)/8); } - return LSS_NAME(rt_sigaction)(signum, act ? &a : act, oldact, - (KERNEL_NSIG+7)/8); } LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) { @@ -2394,8 +2415,11 @@ struct kernel_statfs { * * TODO: Test whether ARM needs a restorer */ - a.sa_flags |= SA_RESTORER; - a.sa_restorer = LSS_NAME(restore_rt)(); + if (!(a.sa_flags & SA_RESTORER)) { + a.sa_flags |= SA_RESTORER; + a.sa_restorer = (a.sa_flags & SA_SIGINFO) + ? LSS_NAME(restore_rt)() : LSS_NAME(restore)(); + } #endif } rc = LSS_NAME(rt_sigaction)(signum, act ? &a : act, oldact, diff --git a/src/base/low_level_alloc.cc b/src/base/low_level_alloc.cc index 7d1459b..95ecf3b 100644 --- a/src/base/low_level_alloc.cc +++ b/src/base/low_level_alloc.cc @@ -37,6 +37,7 @@ // it should not be used when performance is key. #include "base/low_level_alloc.h" +#include "base/dynamic_annotations.h" #include "base/spinlock.h" #include "base/logging.h" #include <google/malloc_hook.h> @@ -400,12 +401,12 @@ void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) { void *new_pages = mmap(0, new_pages_size, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); RAW_CHECK(new_pages != MAP_FAILED, "mmap error"); + arena->mu.Lock(); s = reinterpret_cast<AllocList *>(new_pages); s->header.size = new_pages_size; // Pretend the block is allocated; call AddToFreelist() to free it. 
  s->header.magic = Magic(kMagicAllocated, &s->header);
  s->header.arena = arena;
- arena->mu.Lock();
  AddToFreelist(&s->levels, arena);  // insert new region into free list
  }
  AllocList *prev[kMaxLevel];
@@ -426,6 +427,7 @@ void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) {
  arena->mu.Unlock();
  result = &s->levels;
  }
+ ANNOTATE_NEW_MEMORY(result, request);
  return result;
}
diff --git a/src/base/spinlock.cc b/src/base/spinlock.cc
index 2e601eb..3cce898 100644
--- a/src/base/spinlock.cc
+++ b/src/base/spinlock.cc
@@ -51,6 +51,9 @@ void SubmitSpinLockProfileData(const void *, int64) {}
 static int adaptive_spin_count = 0;
+const base::LinkerInitialized SpinLock::LINKER_INITIALIZED =
+    base::LINKER_INITIALIZED;
+
 struct SpinLock_InitHelper {
   SpinLock_InitHelper() {
     // On multi-cpu machines, spin for longer before yielding
diff --git a/src/base/spinlock.h b/src/base/spinlock.h
index 0ad8587..2fd5e87 100644
--- a/src/base/spinlock.h
+++ b/src/base/spinlock.h
@@ -36,7 +36,7 @@
 // half the cost of a Mutex because the unlock just does a store instead
 // of a compare-and-swap which is expensive).
-// Spinlock is async signal safe.
+// SpinLock is async signal safe.
 // If used within a signal handler, all lock holders
 // should block the signal even outside the signal handler.
@@ -46,6 +46,7 @@
 #include "config.h"
 #include "base/basictypes.h"
 #include "base/atomicops.h"
+#include "base/dynamic_annotations.h"
 class SpinLock {
  public:
@@ -53,15 +54,14 @@ class SpinLock {
   // Special constructor for use with static SpinLock objects.  E.g.,
   //
-  //    static SpinLock lock(SpinLock::LINKER_INITIALIZED);
+  //    static SpinLock lock(base::LINKER_INITIALIZED);
   //
   // When initialized using this constructor, we depend on the fact
   // that the linker has already initialized the memory appropriately.
   // A SpinLock constructed like this can be freely used from global
   // initializers without worrying about the order in which global
   // initializers run.
-  enum StaticInitializer { LINKER_INITIALIZED };
-  explicit SpinLock(StaticInitializer x) {
+  explicit SpinLock(base::LinkerInitialized x) {
     // Does nothing; lockword_ is already initialized
   }
@@ -69,10 +69,15 @@ class SpinLock {
     if (Acquire_CompareAndSwap(&lockword_, 0, 1) != 0) {
       SlowLock();
     }
+    ANNOTATE_RWLOCK_ACQUIRED(this, 1);
   }
   inline bool TryLock() {
-    return (Acquire_CompareAndSwap(&lockword_, 0, 1) == 0);
+    bool res = (Acquire_CompareAndSwap(&lockword_, 0, 1) == 0);
+    if (res) {
+      ANNOTATE_RWLOCK_ACQUIRED(this, 1);
+    }
+    return res;
   }
   inline void Unlock() {
@@ -80,7 +85,7 @@ class SpinLock {
     extern void SubmitSpinLockProfileData(const void *, int64);
     int64 wait_timestamp = static_cast<uint32>(lockword_);
-
+    ANNOTATE_RWLOCK_RELEASED(this, 1);
     Release_Store(&lockword_, 0);
     // Collect contention profile info if this lock was contended.
     // The lockword_ value indicates when the waiter started waiting
@@ -110,8 +115,11 @@ class SpinLock {
   // Waits this long should be very rare.
   enum { PROFILE_TIMESTAMP_SHIFT = 7 };
+  static const base::LinkerInitialized LINKER_INITIALIZED;  // backwards compat
 private:
-  // Lock-state: 0 means unlocked, 1 means locked
+  // Lock-state: 0 means unlocked; 1 means locked with no waiters; values
+  // greater than 1 indicate locked with waiters, where the value is the time
+  // the first waiter started waiting and is used for contention profiling.
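+  // (A plausible reading of the encoding, given Unlock() above: a waiter
+  // records the time it began waiting, scaled down by PROFILE_TIMESTAMP_SHIFT,
+  // and Unlock() recovers that via static_cast<uint32>(lockword_) to compute
+  // the contention time it hands to SubmitSpinLockProfileData().)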
  volatile AtomicWord lockword_;
  void SlowLock();
diff --git a/src/base/sysinfo.cc b/src/base/sysinfo.cc
index df0452a..ad2cc35 100644
--- a/src/base/sysinfo.cc
+++ b/src/base/sysinfo.cc
@@ -48,30 +48,34 @@
 #include <sys/sysctl.h>
 #elif defined __sun__         // Solaris
 #include <procfs.h>           // for, e.g., prmap_t
-#elif defined _MSC_VER        // Windows
+#elif defined WIN32           // Windows
 #include <process.h>          // for getpid() (actually, _getpid())
 #include <shlwapi.h>          // for SHGetValueA()
+#include <tlhelp32.h>         // for Module32First()
 #endif
 #include "base/sysinfo.h"
 #include "base/commandlineflags.h"
 #include "base/logging.h"
 #include "base/cycleclock.h"
-#if defined(WIN32) && defined(MODULEENTRY32)
+#ifdef WIN32
+#ifdef MODULEENTRY32
 // In a change from the usual W-A pattern, there is no A variant of
 // MODULEENTRY32.  Tlhelp32.h #defines the W variant, but not the A.
-// We want the original A variants, and this #undef is the only
-// way I see to get them.
+// In unicode mode, tlhelp32.h #defines MODULEENTRY32 to be
+// MODULEENTRY32W.  These #undefs are the only way I see to get back
+// access to the original, ascii struct (and related functions).
 #undef MODULEENTRY32
 #undef Module32First
 #undef Module32Next
 #undef PMODULEENTRY32
 #undef LPMODULEENTRY32
+#endif  /* MODULEENTRY32 */
 // MinGW doesn't seem to define this, perhaps some windowsen don't either.
 #ifndef TH32CS_SNAPMODULE32
 #define TH32CS_SNAPMODULE32 0
-#endif
-#endif
+#endif  /* TH32CS_SNAPMODULE32 */
+#endif  /* WIN32 */
 // Re-run fn until it doesn't cause EINTR.
 #define NO_INTR(fn)  do {} while ((fn) < 0 && errno == EINTR)
@@ -335,8 +339,20 @@ static void InitializeSystemInfo() {
   // TODO(csilvers): also figure out cpuinfo_num_cpus
 #elif defined(__MACH__) && defined(__APPLE__)
-  // TODO(csilvers): can we do better than this?
-  cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
+  // We report "mach time units" per second.  The current number of elapsed
+  // mach time units can be found by calling uint64 mach_absolute_time();
+  // while not as precise as actual CPU cycles, it is accurate in the face
+  // of CPU frequency scaling and multi-cpu/core machines.
+  // Our mac users have these types of machines, and accuracy
+  // (i.e. correctness) trumps precision.
+  // See cycleclock.h: CycleClock::Now(), which returns the number of mach
+  // time units on Mac OS X.
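+  // (Worked example: Intel Macs typically report numer == denom == 1,
+  // i.e. one mach time unit per nanosecond, so the code below yields
+  // cpuinfo_cycles_per_second == 1e9; a hypothetical numer/denom of 4/1
+  // -- 4ns per unit -- would instead yield 0.25 * 1e9 = 2.5e8.)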
+ mach_timebase_info_data_t timebase_info; + mach_timebase_info(&timebase_info); + double mach_time_units_per_nanosecond = + static_cast<double>(timebase_info.denom) / + static_cast<double>(timebase_info.numer); + cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9; int num_cpus = 0; size_t size = sizeof(num_cpus); diff --git a/src/heap-checker.cc b/src/heap-checker.cc index 432e733..0fea028 100644 --- a/src/heap-checker.cc +++ b/src/heap-checker.cc @@ -1116,7 +1116,7 @@ void HeapLeakChecker::IgnoreAllLiveObjectsLocked(const void* self_stack_top) { IgnoreNonThreadLiveObjectsLocked(); } if (live_objects_total) { - RAW_VLOG(0, "Ignoring %"PRId64" reachable objects of %"PRId64" bytes", + RAW_VLOG(1, "Ignoring %"PRId64" reachable objects of %"PRId64" bytes", live_objects_total, live_bytes_total); } // Free these: we made them here and heap_profile never saw them @@ -1229,6 +1229,7 @@ void HeapLeakChecker::IgnoreLiveObjectsLocked(const char* name, // log call stacks to help debug how come something is not a leak HeapProfileTable::AllocInfo alloc; bool r = heap_profile->FindAllocDetails(ptr, &alloc); + r = r; // suppress compiler warning in non-debug mode RAW_DCHECK(r, ""); // sanity RAW_LOG(INFO, "New live %p object's alloc stack:", ptr); for (int i = 0; i < alloc.stack_depth; ++i) { @@ -1377,7 +1378,7 @@ void HeapLeakChecker::DumpProfileLocked(ProfileType profile_type, const void* self_stack_top, size_t* alloc_bytes, size_t* alloc_objects) { - RAW_VLOG(0, "%s check \"%s\"%s", + RAW_VLOG(1, "%s check \"%s\"%s", (profile_type == START_PROFILE ? "Starting" : "At an end point for"), name_, @@ -1640,6 +1641,9 @@ bool HeapLeakChecker::DoNoLeaksOnce(CheckType check_type, size_t end_inuse_allocs; DumpProfileLocked(END_PROFILE, &a_local_var, &end_inuse_bytes, &end_inuse_allocs); + // DumpProfileLocked via IgnoreAllLiveObjectsLocked sets these: + const int64 live_objects = live_objects_total; + const int64 live_bytes = live_bytes_total; const bool use_initial_profile = !(FLAGS_heap_check_before_constructors && this == main_heap_checker); if (!use_initial_profile) { // compare against empty initial profile @@ -1746,8 +1750,10 @@ bool HeapLeakChecker::DoNoLeaksOnce(CheckType check_type, } } } else { - RAW_VLOG(0, "No leaks found for check \"%s\" " - "(but no 100%% guarantee that there aren't any)", name_); + RAW_LOG(INFO, "No leaks found for check \"%s\" " + "(but no 100%% guarantee that there aren't any): " + "found %"PRId64" reachable heap objects of %"PRId64" bytes", + name_, live_objects, live_bytes); } return !see_leaks; } else { @@ -1946,6 +1952,8 @@ void HeapLeakChecker::InternalInitStart() { // (i.e. nm will list __builtin_new and __builtin_vec_new as undefined). // If this happens, it is a BUILD bug to be fixed. + RAW_LOG(WARNING, "Heap leak checker is active -- Performance may suffer"); + if (FLAGS_heap_check != "local") { // Schedule registered heap cleanup atexit(RunHeapCleanups); @@ -1995,7 +2003,7 @@ bool HeapLeakChecker::NoGlobalLeaks() { CheckFullness fullness = check_type == NO_LEAKS ? USE_PPROF : USE_COUNTS; // use pprof if it can help ignore false leaks ReportMode report_mode = FLAGS_heap_check_report ? 
                                  PPROF_REPORT : NO_REPORT;
-    RAW_VLOG(0, "Checking for whole-program memory leaks");
+    RAW_VLOG(1, "Checking for whole-program memory leaks");
     result = main_hc->DoNoLeaks(check_type, fullness, report_mode);
   }
   return result;
@@ -2090,7 +2098,7 @@ void HeapLeakChecker::BeforeConstructors() {
   heap_profile = new (Allocator::Allocate(sizeof(HeapProfileTable)))
                    HeapProfileTable(&Allocator::Allocate, &Allocator::Free);
   heap_checker_lock.Unlock();
-  RAW_VLOG(0, "Starting tracking the heap");
+  RAW_VLOG(1, "Starting tracking the heap");
   heap_checker_on = true;
   // Run silencing if we are called from the first global c-tor,
   // not from the first mmap/sbrk/alloc call:
@@ -2273,8 +2281,7 @@ void HeapLeakChecker::DisableChecksFromToLocked(const void* start_address,
         val.start_address != value.start_address) {
       RAW_LOG(FATAL, "Two DisableChecksToHereFrom calls conflict: "
                      "(%p, %p, %d) vs. (%p, %p, %d)",
-              (void*)value.start_address, end_address,
-              value.max_depth,
+              (void*)val.start_address, end_address, val.max_depth,
               start_address, end_address, max_depth);
     }
   }
diff --git a/src/heap-profile-table.cc b/src/heap-profile-table.cc
index ec591b2..b090fc7 100644
--- a/src/heap-profile-table.cc
+++ b/src/heap-profile-table.cc
@@ -421,7 +421,7 @@ void HeapProfileTable::CleanupOldProfiles(const char* prefix) {
     const char* fname = g.gl_pathv[i];
     if ((strlen(fname) >= prefix_length) &&
         (memcmp(fname, prefix, prefix_length) == 0)) {
-      RAW_VLOG(0, "Removing old heap profile %s", fname);
+      RAW_VLOG(1, "Removing old heap profile %s", fname);
       unlink(fname);
     }
   }
diff --git a/src/memfs_malloc.cc b/src/memfs_malloc.cc
index 93de8bf..21de3ee 100644
--- a/src/memfs_malloc.cc
+++ b/src/memfs_malloc.cc
@@ -168,7 +168,12 @@ static void InitSystemAllocator() {
   int hugetlb_fd = open(hugetlbfs_path.c_str(),
                         O_RDWR | O_CREAT | O_EXCL, 0600);
-  CHECK_ERR(hugetlb_fd);
+  if (hugetlb_fd == -1) {
+    RAW_LOG(WARNING, "unable to create memfs_malloc_path file %s: %s",
+            hugetlbfs_path.c_str(), strerror(errno));
+    return;
+  }
+
   // Clean up memory on process exit
   CHECK_ERR(unlink(hugetlbfs_path.c_str()));
diff --git a/src/pprof b/src/pprof
--- a/src/pprof
+++ b/src/pprof
@@ -2444,6 +2444,16 @@ sub FindLibrary {
   return $file;
 }
+# Return path to library with debugging symbols.
+# For libc libraries, the copy in /usr/lib/debug contains debugging symbols
+sub DebuggingLibrary {
+  my $file = shift;
+  if ($file =~ m|^/| && -f "/usr/lib/debug$file") {
+    return "/usr/lib/debug$file";
+  }
+  return undef;
+}
+
 # Parse text section header of a library using objdump
 sub ParseTextSectionHeader {
   my $lib = shift;
@@ -2527,10 +2537,17 @@ sub ParseLibraries {
     $lib = FindLibrary($lib);
-    my $text = ParseTextSectionHeader($lib);
-    if (defined($text)) {
-      my $vma_offset = AddressSub($text->{vma}, $text->{file_offset});
-      $offset = AddressAdd($offset, $vma_offset);
+    # Check for pre-relocated libraries, which use pre-relocated symbol tables
+    # and thus require adjusting the offset that we'll use to translate
+    # VM addresses into symbol table addresses.
+    # Only do this if we're not going to fetch the symbol table from a
+    # debugging copy of the library.
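+    # (Illustration with made-up numbers: a prelinked library whose .text
+    # section reports vma 0xb7001000 at file offset 0x1000 would get
+    # vma_offset 0xb7000000 added to $offset below, so that mapped
+    # addresses translate into the pre-relocated symbol-table addresses.)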
+    if (!DebuggingLibrary($lib)) {
+      my $text = ParseTextSectionHeader($lib);
+      if (defined($text)) {
+        my $vma_offset = AddressSub($text->{vma}, $text->{file_offset});
+        $offset = AddressAdd($offset, $vma_offset);
+      }
     }
     push(@{$result}, [$lib, $start, $finish, $offset]);
@@ -3004,9 +3021,11 @@ sub GetProcedureBoundaries {
   my $regexp = shift;
   # For libc libraries, the copy in /usr/lib/debug contains debugging symbols
-  if ($image =~ m|^/| && -f "/usr/lib/debug$image") {
-    $image = "/usr/lib/debug$image";
+  my $debugging = DebuggingLibrary($image);
+  if ($debugging) {
+    $image = $debugging;
   }
+
   my $nm = $obj_tool_map{"nm"};
   my $cppfilt = $obj_tool_map{"c++filt"};
diff --git a/src/profiler.cc b/src/profiler.cc
index 79cb4f8..1ea5601 100644
--- a/src/profiler.cc
+++ b/src/profiler.cc
@@ -32,11 +32,6 @@
 //   Chris Demetriou (refactoring)
 //
 // Profile current program by sampling stack-trace every so often
-//
-// TODO: Detect whether or not setitimer() applies to all threads in
-// the process.  If so, instead of starting and stopping by changing
-// the signal handler, start and stop by calling setitimer() and
-// do nothing in the per-thread registration code.
 #include "config.h"
 #include "getpc.h"      // should be first to get the _GNU_SOURCE dfn
@@ -85,9 +80,23 @@ class CpuProfiler {
   void GetCurrentState(ProfilerState* state);
-  // Start interval timer for the current thread.  We do this for
-  // every known thread.  If profiling is off, the generated signals
-  // are ignored, otherwise they are captured by prof_handler().
+  // Register the current thread with the profiler.  This should be
+  // called only once per thread.
+  //
+  // The profiler attempts to determine whether or not timers are
+  // shared by all threads in the process.  (With LinuxThreads, and
+  // with NPTL on some Linux kernel versions, each thread has separate
+  // timers.)
+  //
+  // On systems which have a separate interval timer for each thread,
+  // this function starts the timer for the current thread.  Profiling
+  // is disabled by ignoring the resulting signals, and enabled by
+  // setting their handler to be prof_handler.
+  //
+  // Prior to determining whether timers are shared, this function
+  // will unconditionally start the timer.  However, if this function
+  // determines that timers are shared, then it will stop the timer if
+  // profiling is not currently enabled.
   void RegisterThread();
   static CpuProfiler instance_;
@@ -125,6 +134,32 @@ class CpuProfiler {
   bool          (*filter_)(void*);
   void*         filter_arg_;
+  // Whether or not the threading system provides interval timers
+  // that are shared by all threads in a process.
+  enum {
+    TIMERS_UNTOUCHED,  // No timer initialization attempted yet.
+    TIMERS_ONE_SET,    // First thread has registered and set timer.
+    TIMERS_SHARED,     // Timers are shared by all threads.
+    TIMERS_SEPARATE    // Timers are separate in each thread.
+  } timer_sharing_;
+
+  // Start the interval timer used for profiling.  If the thread
+  // library shares timers between threads, this is used to enable and
+  // disable the timer when starting and stopping profiling.  If
+  // timers are not shared, this is used to enable the timer in each
+  // thread.
+  void StartTimer();
+
+  // Stop the interval timer used for profiling.  Used only if the
+  // thread library shares timers between threads.
+  void StopTimer();
+
+  // Returns true if the profiling interval timer is enabled in the
+  // current thread.  This actually checks the kernel's interval timer
+  // setting.  (It is used to detect whether timers are shared or
+  // separate.)
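+  // (Note: getitimer() reports the live countdown in it_value -- nonzero
+  // while a timer is armed -- so a second thread that observes a nonzero
+  // it_value is seeing the first thread's setitimer(), i.e. the timers
+  // are shared.)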
+  bool IsTimerRunning();
+
   // Sets the timer interrupt signal handler to one that stores the pc.
   static void EnableHandler();
@@ -141,7 +176,8 @@ class CpuProfiler {
 CpuProfiler CpuProfiler::instance_;
 // Initialize profiling: activated if getenv("CPUPROFILE") exists.
-CpuProfiler::CpuProfiler() {
+CpuProfiler::CpuProfiler()
+    : timer_sharing_(TIMERS_UNTOUCHED) {
   // Get frequency of interrupts (if specified)
   char junk;
   const char* fr = getenv("CPUPROFILE_FREQUENCY");
@@ -204,6 +240,10 @@ bool CpuProfiler::Start(const char* fname,
     // with signal delivered to this thread.
   }
+  if (timer_sharing_ == TIMERS_SHARED) {
+    StartTimer();
+  }
+
   // Setup handler for SIGPROF interrupts
   EnableHandler();
@@ -224,11 +264,15 @@ void CpuProfiler::Stop() {
   // Ignore timer signals.  Note that the handler may have just
   // started and might not have taken signal_lock_ yet.  Holding
-  // signal_lock_ here along with the semantics of collector_.Add()
+  // signal_lock_ below along with the semantics of collector_.Add()
   // (which does nothing if collection is not enabled) prevents that
   // late sample from causing a problem.
   DisableHandler();
+  if (timer_sharing_ == TIMERS_SHARED) {
+    StopTimer();
+  }
+
   {
     SpinLockHolder sl(&signal_lock_);
     collector_.Stop();
@@ -273,6 +317,53 @@ void CpuProfiler::GetCurrentState(ProfilerState* state) {
 }
 void CpuProfiler::RegisterThread() {
+  SpinLockHolder cl(&control_lock_);
+
+  // We try to detect whether timers are being shared by setting a
+  // timer in the first call to this function, then checking whether
+  // it's set in the second call.
+  //
+  // Note that this detection method requires that the first two calls
+  // to RegisterThread be made from different threads.  (Subsequent
+  // calls will see timer_sharing_ set to either TIMERS_SEPARATE or
+  // TIMERS_SHARED, and won't try to detect the timer sharing type.)
+  //
+  // Also note that if timer settings were inherited across new thread
+  // creation but *not* shared, this approach wouldn't work.  That's
+  // not an issue for any Linux threading implementation, and should
+  // not be a problem for a POSIX-compliant threads implementation.
+  switch (timer_sharing_) {
+    case TIMERS_UNTOUCHED:
+      StartTimer();
+      timer_sharing_ = TIMERS_ONE_SET;
+      break;
+    case TIMERS_ONE_SET:
+      // If the timer is running, that means that the main thread's
+      // timer setup is seen in this (second) thread -- and therefore
+      // that timers are shared.
+      if (IsTimerRunning()) {
+        timer_sharing_ = TIMERS_SHARED;
+        // If profiling has already been enabled, we have to keep the
+        // timer running.  If not, we disable the timer here and
+        // re-enable it in Start().
+        if (!collector_.enabled()) {
+          StopTimer();
+        }
+      } else {
+        timer_sharing_ = TIMERS_SEPARATE;
+        StartTimer();
+      }
+      break;
+    case TIMERS_SHARED:
+      // Nothing needed.
+      break;
+    case TIMERS_SEPARATE:
+      StartTimer();
+      break;
+  }
+}
+
+void CpuProfiler::StartTimer() {
   // TODO: Randomize the initial interrupt value?
   // TODO: Randomize the inter-interrupt period on every interrupt?
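  // (The unchanged setup below is elided by this hunk; it presumably arms
  // it_interval and it_value with 1000000/frequency_ usec -- 10ms at the
  // default CPUPROFILE_FREQUENCY of 100 -- so SIGPROF fires roughly 100
  // times per second of consumed CPU time.)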
  struct itimerval timer;
@@ -282,6 +373,19 @@
   setitimer(ITIMER_PROF, &timer, 0);
 }
+void CpuProfiler::StopTimer() {
+  struct itimerval timer;
+  memset(&timer, 0, sizeof timer);
+  setitimer(ITIMER_PROF, &timer, 0);
+}
+
+bool CpuProfiler::IsTimerRunning() {
+  itimerval current_timer;
+  RAW_CHECK(getitimer(ITIMER_PROF, &current_timer) == 0, "getitimer failed");
+  return (current_timer.it_value.tv_sec != 0 ||
+          current_timer.it_value.tv_usec != 0);
+}
+
 void CpuProfiler::EnableHandler() {
   struct sigaction sa;
   sa.sa_sigaction = prof_handler;
diff --git a/src/tests/atomicops_unittest.cc b/src/tests/atomicops_unittest.cc
index 26fd896..25a518e 100644
--- a/src/tests/atomicops_unittest.cc
+++ b/src/tests/atomicops_unittest.cc
@@ -34,14 +34,16 @@
 #include "base/logging.h"
 #include "base/atomicops.h"
+#define GG_ULONGLONG(x)  static_cast<uint64>(x)
+
 template <class AtomicType>
 static void TestAtomicIncrement() {
   // For now, we just test single threaded execution
-  // use a guard value to make sure the AtomicIncrement doesn't go
+  // use a guard value to make sure the NoBarrier_AtomicIncrement doesn't go
   // outside the expected address bounds.  This is in particular to
   // test that some future change to the asm code doesn't cause the
-  // 32-bit AtomicIncrement doesn't do the wrong thing on 64-bit
+  // 32-bit NoBarrier_AtomicIncrement to do the wrong thing on 64-bit
   // machines.
   struct {
     AtomicType prev_word;
@@ -57,55 +59,208 @@ static void TestAtomicIncrement() {
   s.count = 0;
   s.next_word = next_word_value;
-  CHECK_EQ(AtomicIncrement(&s.count, 1), 1);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, 1), 1);
   CHECK_EQ(s.count, 1);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
-  CHECK_EQ(AtomicIncrement(&s.count, 2), 3);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, 2), 3);
   CHECK_EQ(s.count, 3);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
-  CHECK_EQ(AtomicIncrement(&s.count, 3), 6);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, 3), 6);
   CHECK_EQ(s.count, 6);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
-  CHECK_EQ(AtomicIncrement(&s.count, -3), 3);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, -3), 3);
   CHECK_EQ(s.count, 3);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
-  CHECK_EQ(AtomicIncrement(&s.count, -2), 1);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, -2), 1);
   CHECK_EQ(s.count, 1);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
-  CHECK_EQ(AtomicIncrement(&s.count, -1), 0);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, -1), 0);
   CHECK_EQ(s.count, 0);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
-  CHECK_EQ(AtomicIncrement(&s.count, -1), -1);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, -1), -1);
   CHECK_EQ(s.count, -1);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
-  CHECK_EQ(AtomicIncrement(&s.count, -4), -5);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, -4), -5);
   CHECK_EQ(s.count, -5);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
-  CHECK_EQ(AtomicIncrement(&s.count, 5), 0);
+  CHECK_EQ(base::subtle::NoBarrier_AtomicIncrement(&s.count, 5), 0);
   CHECK_EQ(s.count, 0);
   CHECK_EQ(s.prev_word, prev_word_value);
   CHECK_EQ(s.next_word, next_word_value);
 }
+
+#define NUM_BITS(T) (sizeof(T) * 8)
+
+
+template <class AtomicType>
+static void TestCompareAndSwap() {
+  AtomicType value = 0;
+  AtomicType prev = base::subtle::NoBarrier_CompareAndSwap(&value, 0, 1);
+  CHECK_EQ(1, value);
+  CHECK_EQ(0, prev);
+
+  // Use test value that has non-zero bits in both halves, more for testing
+  // 64-bit implementation on 32-bit platforms.
+  const AtomicType k_test_val = (GG_ULONGLONG(1) <<
+                                 (NUM_BITS(AtomicType) - 2)) + 11;
+  value = k_test_val;
+  prev = base::subtle::NoBarrier_CompareAndSwap(&value, 0, 5);
+  CHECK_EQ(k_test_val, value);
+  CHECK_EQ(k_test_val, prev);
+
+  value = k_test_val;
+  prev = base::subtle::NoBarrier_CompareAndSwap(&value, k_test_val, 5);
+  CHECK_EQ(5, value);
+  CHECK_EQ(k_test_val, prev);
+}
+
+
+template <class AtomicType>
+static void TestAtomicExchange() {
+  AtomicType value = 0;
+  AtomicType new_value = base::subtle::NoBarrier_AtomicExchange(&value, 1);
+  CHECK_EQ(1, value);
+  CHECK_EQ(0, new_value);
+
+  // Use test value that has non-zero bits in both halves, more for testing
+  // 64-bit implementation on 32-bit platforms.
+  const AtomicType k_test_val = (GG_ULONGLONG(1) <<
+                                 (NUM_BITS(AtomicType) - 2)) + 11;
+  value = k_test_val;
+  new_value = base::subtle::NoBarrier_AtomicExchange(&value, k_test_val);
+  CHECK_EQ(k_test_val, value);
+  CHECK_EQ(k_test_val, new_value);
+
+  value = k_test_val;
+  new_value = base::subtle::NoBarrier_AtomicExchange(&value, 5);
+  CHECK_EQ(5, value);
+  CHECK_EQ(k_test_val, new_value);
+}
+
+
+template <class AtomicType>
+static void TestAtomicIncrementBounds() {
+  // Test at rollover boundary between int_max and int_min
+  AtomicType test_val = (GG_ULONGLONG(1) <<
+                         (NUM_BITS(AtomicType) - 1));
+  AtomicType value = -1 ^ test_val;
+  AtomicType new_value = base::subtle::NoBarrier_AtomicIncrement(&value, 1);
+  CHECK_EQ(test_val, value);
+  CHECK_EQ(value, new_value);
+
+  base::subtle::NoBarrier_AtomicIncrement(&value, -1);
+  CHECK_EQ(-1 ^ test_val, value);
+
+  // Test at 32-bit boundary for 64-bit atomic type.
+  test_val = GG_ULONGLONG(1) << (NUM_BITS(AtomicType) / 2);
+  value = test_val - 1;
+  new_value = base::subtle::NoBarrier_AtomicIncrement(&value, 1);
+  CHECK_EQ(test_val, value);
+  CHECK_EQ(value, new_value);
+
+  base::subtle::NoBarrier_AtomicIncrement(&value, -1);
+  CHECK_EQ(test_val - 1, value);
+}
+
+// This is a simple sanity check that values are correct.  Not testing
+// atomicity
+template <class AtomicType>
+static void TestStore() {
+  const AtomicType kVal1 = static_cast<AtomicType>(0xa5a5a5a5a5a5a5a5LL);
+  const AtomicType kVal2 = static_cast<AtomicType>(-1);
+
+  AtomicType value;
+
+  base::subtle::NoBarrier_Store(&value, kVal1);
+  CHECK_EQ(kVal1, value);
+  base::subtle::NoBarrier_Store(&value, kVal2);
+  CHECK_EQ(kVal2, value);
+
+  base::subtle::Acquire_Store(&value, kVal1);
+  CHECK_EQ(kVal1, value);
+  base::subtle::Acquire_Store(&value, kVal2);
+  CHECK_EQ(kVal2, value);
+
+  base::subtle::Release_Store(&value, kVal1);
+  CHECK_EQ(kVal1, value);
+  base::subtle::Release_Store(&value, kVal2);
+  CHECK_EQ(kVal2, value);
+}
+
+// This is a simple sanity check that values are correct.  Not testing
+// atomicity
+template <class AtomicType>
+static void TestLoad() {
+  const AtomicType kVal1 = static_cast<AtomicType>(0xa5a5a5a5a5a5a5a5LL);
+  const AtomicType kVal2 = static_cast<AtomicType>(-1);
+
+  AtomicType value;
+
+  value = kVal1;
+  CHECK_EQ(kVal1, base::subtle::NoBarrier_Load(&value));
+  value = kVal2;
+  CHECK_EQ(kVal2, base::subtle::NoBarrier_Load(&value));
+
+  value = kVal1;
+  CHECK_EQ(kVal1, base::subtle::Acquire_Load(&value));
+  value = kVal2;
+  CHECK_EQ(kVal2, base::subtle::Acquire_Load(&value));
+
+  value = kVal1;
+  CHECK_EQ(kVal1, base::subtle::Release_Load(&value));
+  value = kVal2;
+  CHECK_EQ(kVal2, base::subtle::Release_Load(&value));
+}
+
+template <class AtomicType>
+static void TestAtomicOps() {
+  TestCompareAndSwap<AtomicType>();
+  TestAtomicExchange<AtomicType>();
+  TestAtomicIncrementBounds<AtomicType>();
+  TestStore<AtomicType>();
+  TestLoad<AtomicType>();
+}
+
 int main(int argc, char** argv) {
   TestAtomicIncrement<AtomicWord>();
   TestAtomicIncrement<Atomic32>();
+
+  TestAtomicOps<AtomicWord>();
+  TestAtomicOps<Atomic32>();
+
+  // I've commented the Atomic64 tests out for now, because Atomic64
+  // doesn't work on x86 systems that are not compiled to support mmx
+  // registers.  Since I want this project to be as portable as
+  // possible -- that is, not to assume we've compiled for mmx or even
+  // that the processor supports it -- and we don't actually use
+  // Atomic64 anywhere, I've commented it out of the test for now.
+  // (Luckily, if we ever do use Atomic64 by accident, we'll get told
+  // via a compiler error rather than some obscure runtime failure, so
+  // this course of action is safe.)
+  // If we ever *do* want to enable this, try adding -msse (or -mmmx?)
+  // to the CXXFLAGS in Makefile.am.
+#if 0 && defined(BASE_HAS_ATOMIC64)
+  TestAtomicIncrement<base::subtle::Atomic64>();
+  TestAtomicOps<base::subtle::Atomic64>();
+#endif
+
   printf("PASS\n");
   return 0;
 }
diff --git a/src/tests/heap-checker-death_unittest.sh b/src/tests/heap-checker-death_unittest.sh
index 5e2e8ae..4f4242c 100755
--- a/src/tests/heap-checker-death_unittest.sh
+++ b/src/tests/heap-checker-death_unittest.sh
@@ -139,7 +139,7 @@ EARLY_MSG="Starting tracking the heap$"
 Test 60 0 "$EARLY_MSG" "" \
   HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
-  || exit 5
+  PERFTOOLS_VERBOSE=1 || exit 5
 Test 60 0 "MemoryRegionMap Init$" "" \
   HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
   PERFTOOLS_VERBOSE=2 || exit 6
diff --git a/src/tests/maybe_threads_unittest.sh b/src/tests/maybe_threads_unittest.sh
index 1c1a1d0..6e0e5f9 100755
--- a/src/tests/maybe_threads_unittest.sh
+++ b/src/tests/maybe_threads_unittest.sh
@@ -61,8 +61,15 @@ UNITTEST_DIR=`$UNITTEST_DIR/low_level_alloc_unittest --help 2>&1 \
   | awk '{print $2; exit;}' \
   | xargs dirname`
-# We need to set the library-path too: libtcmalloc depends on libstacktrace
-# (Note we try several different names: OS X uses its own libpath varname).
-LD_LIBRARY_PATH="$UNITTEST_DIR" DYLD_LIBRARY_PATH="$UNITTEST_DIR" \
-LD_PRELOAD="$UNITTEST_DIR/libtcmalloc_minimal.so" \
-    $UNITTEST_DIR/low_level_alloc_unittest
+# Figure out where libtcmalloc lives.  It should be in UNITTEST_DIR,
+# but with libtool it might be in a subdir.
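+# (libtool builds typically leave a wrapper script in the build directory
+# and hide the real shared object under .libs/, hence the two probes below.)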
+if [ -e "$UNITTEST_DIR/libtcmalloc_minimal.so" ]; then + LIB_PATH="$UNITTEST_DIR/libtcmalloc_minimal.so" +elif [ -e "$UNITTEST_DIR/.libs/libtcmalloc_minimal.so" ]; then + LIB_PATH="$UNITTEST_DIR/.libs/libtcmalloc_minimal.so" +else + echo "Cannot run $0: cannot find libtcmalloc_minimal.so" + exit 2 +fi + +LD_PRELOAD="$LIB_PATH" $UNITTEST_DIR/low_level_alloc_unittest diff --git a/vsprojects/libtcmalloc_minimal/libtcmalloc_minimal.vcproj b/vsprojects/libtcmalloc_minimal/libtcmalloc_minimal.vcproj index 212c860..0a8ba08 100755 --- a/vsprojects/libtcmalloc_minimal/libtcmalloc_minimal.vcproj +++ b/vsprojects/libtcmalloc_minimal/libtcmalloc_minimal.vcproj @@ -117,6 +117,23 @@ Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
<File
+ RelativePath="..\..\src\base\dynamic_annotations.cc">
+ <FileConfiguration
+ Name="Debug|Win32">
+ <Tool
+ Name="VCCLCompilerTool"
+ AdditionalIncludeDirectories="..\..\src\windows; ..\..\src"
+ RuntimeLibrary="3"/>
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32">
+ <Tool
+ Name="VCCLCompilerTool"
+ AdditionalIncludeDirectories="..\..\src\windows; ..\..\src"
+ RuntimeLibrary="2"/>
+ </FileConfiguration>
+ </File>
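+				<!-- RuntimeLibrary above uses the VS2005 encoding: 3 selects the
+				     debug DLL runtime (/MDd), 2 the release DLL runtime (/MD). -->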
+ <File
RelativePath="..\..\src\heap-profile-table.cc">
<FileConfiguration
Name="Debug|Win32">
diff --git a/vsprojects/low_level_alloc_unittest/low_level_alloc_unittest.vcproj b/vsprojects/low_level_alloc_unittest/low_level_alloc_unittest.vcproj index 85760c1..0c0b8d0 100755 --- a/vsprojects/low_level_alloc_unittest/low_level_alloc_unittest.vcproj +++ b/vsprojects/low_level_alloc_unittest/low_level_alloc_unittest.vcproj @@ -113,6 +113,23 @@ Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
<File
+ RelativePath="..\..\src\base\dynamic_annotations.cc">
+ <FileConfiguration
+ Name="Debug|Win32">
+ <Tool
+ Name="VCCLCompilerTool"
+ AdditionalIncludeDirectories="..\..\src\windows; ..\..\src"
+ RuntimeLibrary="3"/>
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32">
+ <Tool
+ Name="VCCLCompilerTool"
+ AdditionalIncludeDirectories="..\..\src\windows; ..\..\src"
+ RuntimeLibrary="2"/>
+ </FileConfiguration>
+ </File>
+ <File
RelativePath="..\..\src\base\logging.cc">
<FileConfiguration
Name="Debug|Win32">
diff --git a/vsprojects/tcmalloc_minimal_unittest-static/tcmalloc_minimal_unittest-static.vcproj b/vsprojects/tcmalloc_minimal_unittest-static/tcmalloc_minimal_unittest-static.vcproj index fbee663..eda0f08 100755 --- a/vsprojects/tcmalloc_minimal_unittest-static/tcmalloc_minimal_unittest-static.vcproj +++ b/vsprojects/tcmalloc_minimal_unittest-static/tcmalloc_minimal_unittest-static.vcproj @@ -113,6 +113,23 @@ Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
<File
+ RelativePath="..\..\src\base\dynamic_annotations.cc">
+ <FileConfiguration
+ Name="Debug|Win32">
+ <Tool
+ Name="VCCLCompilerTool"
+ AdditionalIncludeDirectories="..\..\src\windows; ..\..\src"
+ RuntimeLibrary="3"/>
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32">
+ <Tool
+ Name="VCCLCompilerTool"
+ AdditionalIncludeDirectories="..\..\src\windows; ..\..\src"
+ RuntimeLibrary="2"/>
+ </FileConfiguration>
+ </File>
+ <File
RelativePath="..\..\src\heap-profile-table.cc">
<FileConfiguration
Name="Debug|Win32">