author     kcc <kcc@138bc75d-0d04-0410-961f-82ee72b054a4>   2014-09-23 17:59:53 +0000
committer  kcc <kcc@138bc75d-0d04-0410-961f-82ee72b054a4>   2014-09-23 17:59:53 +0000
commit     a9586c9cc10d268c2cd7642d03a75bb283b0e266 (patch)
tree       dfe8acd36f160811afc54c8eaf16e8160ba8bd70 /libsanitizer/tsan
parent     4b9426aa0590d7761688e00e9da9393e228dd830 (diff)
[libsanitizer merge from upstream r218156]
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@215527 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libsanitizer/tsan')
41 files changed, 1734 insertions, 1363 deletions
diff --git a/libsanitizer/tsan/Makefile.am b/libsanitizer/tsan/Makefile.am index 5d98e215e05..ab4748e0878 100644 --- a/libsanitizer/tsan/Makefile.am +++ b/libsanitizer/tsan/Makefile.am @@ -32,6 +32,7 @@ tsan_files = \ tsan_rtl_mutex.cc \ tsan_rtl_report.cc \ tsan_rtl_thread.cc \ + tsan_stack_trace.cc \ tsan_stat.cc \ tsan_suppressions.cc \ tsan_symbolize.cc \ diff --git a/libsanitizer/tsan/Makefile.in b/libsanitizer/tsan/Makefile.in index 068aaa87fe8..c6f1ecffa75 100644 --- a/libsanitizer/tsan/Makefile.in +++ b/libsanitizer/tsan/Makefile.in @@ -91,8 +91,8 @@ am__objects_1 = tsan_clock.lo tsan_fd.lo tsan_flags.lo \ tsan_mutexset.lo tsan_platform_linux.lo tsan_platform_mac.lo \ tsan_platform_windows.lo tsan_report.lo tsan_rtl.lo \ tsan_rtl_mutex.lo tsan_rtl_report.lo tsan_rtl_thread.lo \ - tsan_stat.lo tsan_suppressions.lo tsan_symbolize.lo \ - tsan_sync.lo tsan_rtl_amd64.lo + tsan_stack_trace.lo tsan_stat.lo tsan_suppressions.lo \ + tsan_symbolize.lo tsan_sync.lo tsan_rtl_amd64.lo am_libtsan_la_OBJECTS = $(am__objects_1) libtsan_la_OBJECTS = $(am_libtsan_la_OBJECTS) libtsan_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \ @@ -301,6 +301,7 @@ tsan_files = \ tsan_rtl_mutex.cc \ tsan_rtl_report.cc \ tsan_rtl_thread.cc \ + tsan_stack_trace.cc \ tsan_stat.cc \ tsan_suppressions.cc \ tsan_symbolize.cc \ @@ -449,6 +450,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_rtl_mutex.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_rtl_report.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_rtl_thread.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_stack_trace.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_stat.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_suppressions.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tsan_symbolize.Plo@am__quote@ diff --git a/libsanitizer/tsan/tsan_clock.cc b/libsanitizer/tsan/tsan_clock.cc index b944cc50d54..a84caa952a6 100644 --- a/libsanitizer/tsan/tsan_clock.cc +++ b/libsanitizer/tsan/tsan_clock.cc @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "tsan_clock.h" #include "tsan_rtl.h" +#include "sanitizer_common/sanitizer_placement_new.h" // SyncClock and ThreadClock implement vector clocks for sync variables // (mutexes, atomic variables, file descriptors, etc) and threads, respectively. @@ -100,13 +101,13 @@ ThreadClock::ThreadClock(unsigned tid, unsigned reused) clk_[tid_].reused = reused_; } -void ThreadClock::acquire(const SyncClock *src) { +void ThreadClock::acquire(ClockCache *c, const SyncClock *src) { DCHECK(nclk_ <= kMaxTid); - DCHECK(src->clk_.Size() <= kMaxTid); + DCHECK(src->size_ <= kMaxTid); CPP_STAT_INC(StatClockAcquire); // Check if it's empty -> no need to do anything. 
- const uptr nclk = src->clk_.Size(); + const uptr nclk = src->size_; if (nclk == 0) { CPP_STAT_INC(StatClockAcquireEmpty); return; @@ -116,12 +117,12 @@ void ThreadClock::acquire(const SyncClock *src) { bool acquired = false; if (nclk > tid_) { CPP_STAT_INC(StatClockAcquireLarge); - if (src->clk_[tid_].reused == reused_) { + if (src->elem(tid_).reused == reused_) { CPP_STAT_INC(StatClockAcquireRepeat); for (unsigned i = 0; i < kDirtyTids; i++) { unsigned tid = src->dirty_tids_[i]; if (tid != kInvalidTid) { - u64 epoch = src->clk_[tid].epoch; + u64 epoch = src->elem(tid).epoch; if (clk_[tid].epoch < epoch) { clk_[tid].epoch = epoch; acquired = true; @@ -140,7 +141,7 @@ void ThreadClock::acquire(const SyncClock *src) { CPP_STAT_INC(StatClockAcquireFull); nclk_ = max(nclk_, nclk); for (uptr i = 0; i < nclk; i++) { - u64 epoch = src->clk_[i].epoch; + u64 epoch = src->elem(i).epoch; if (clk_[i].epoch < epoch) { clk_[i].epoch = epoch; acquired = true; @@ -149,7 +150,7 @@ void ThreadClock::acquire(const SyncClock *src) { // Remember that this thread has acquired this clock. if (nclk > tid_) - src->clk_[tid_].reused = reused_; + src->elem(tid_).reused = reused_; if (acquired) { CPP_STAT_INC(StatClockAcquiredSomething); @@ -157,28 +158,26 @@ void ThreadClock::acquire(const SyncClock *src) { } } -void ThreadClock::release(SyncClock *dst) const { +void ThreadClock::release(ClockCache *c, SyncClock *dst) const { DCHECK_LE(nclk_, kMaxTid); - DCHECK_LE(dst->clk_.Size(), kMaxTid); + DCHECK_LE(dst->size_, kMaxTid); - if (dst->clk_.Size() == 0) { + if (dst->size_ == 0) { // ReleaseStore will correctly set release_store_tid_, // which can be important for future operations. - ReleaseStore(dst); + ReleaseStore(c, dst); return; } CPP_STAT_INC(StatClockRelease); // Check if we need to resize dst. - if (dst->clk_.Size() < nclk_) { - CPP_STAT_INC(StatClockReleaseResize); - dst->clk_.Resize(nclk_); - } + if (dst->size_ < nclk_) + dst->Resize(c, nclk_); // Check if we had not acquired anything from other threads // since the last release on dst. If so, we need to update - // only dst->clk_[tid_]. - if (dst->clk_[tid_].epoch > last_acquire_) { + // only dst->elem(tid_). + if (dst->elem(tid_).epoch > last_acquire_) { UpdateCurrentThread(dst); if (dst->release_store_tid_ != tid_ || dst->release_store_reused_ != reused_) @@ -194,14 +193,15 @@ void ThreadClock::release(SyncClock *dst) const { CPP_STAT_INC(StatClockReleaseAcquired); // Update dst->clk_. for (uptr i = 0; i < nclk_; i++) { - dst->clk_[i].epoch = max(dst->clk_[i].epoch, clk_[i].epoch); - dst->clk_[i].reused = 0; + ClockElem &ce = dst->elem(i); + ce.epoch = max(ce.epoch, clk_[i].epoch); + ce.reused = 0; } // Clear 'acquired' flag in the remaining elements. - if (nclk_ < dst->clk_.Size()) + if (nclk_ < dst->size_) CPP_STAT_INC(StatClockReleaseClearTail); - for (uptr i = nclk_; i < dst->clk_.Size(); i++) - dst->clk_[i].reused = 0; + for (uptr i = nclk_; i < dst->size_; i++) + dst->elem(i).reused = 0; for (unsigned i = 0; i < kDirtyTids; i++) dst->dirty_tids_[i] = kInvalidTid; dst->release_store_tid_ = kInvalidTid; @@ -209,23 +209,21 @@ void ThreadClock::release(SyncClock *dst) const { // If we've acquired dst, remember this fact, // so that we don't need to acquire it on next acquire. 
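// For reference, the semantics acquire()/release() implement, reduced to a
// minimal form (illustrative sketch only; the real ThreadClock/SyncClock add
// reuse counters, dirty-tid fast paths and the dense two-level storage
// introduced by this patch):
//
//   struct VClock {
//     std::vector<u64> epoch;  // epoch[tid] = last observed time of tid
//
//     // acquire(src): everything src has seen now happens-before us.
//     void Acquire(const VClock &src) {
//       if (epoch.size() < src.epoch.size()) epoch.resize(src.epoch.size());
//       for (uptr i = 0; i < src.epoch.size(); i++)
//         epoch[i] = max(epoch[i], src.epoch[i]);
//     }
//
//     // release(dst): publish our entire history into the sync object.
//     void ReleaseTo(VClock *dst) const {
//       if (dst->epoch.size() < epoch.size()) dst->epoch.resize(epoch.size());
//       for (uptr i = 0; i < epoch.size(); i++)
//         dst->epoch[i] = max(dst->epoch[i], epoch[i]);
//     }
//   };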
if (acquired) - dst->clk_[tid_].reused = reused_; + dst->elem(tid_).reused = reused_; } -void ThreadClock::ReleaseStore(SyncClock *dst) const { +void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const { DCHECK(nclk_ <= kMaxTid); - DCHECK(dst->clk_.Size() <= kMaxTid); + DCHECK(dst->size_ <= kMaxTid); CPP_STAT_INC(StatClockStore); // Check if we need to resize dst. - if (dst->clk_.Size() < nclk_) { - CPP_STAT_INC(StatClockStoreResize); - dst->clk_.Resize(nclk_); - } + if (dst->size_ < nclk_) + dst->Resize(c, nclk_); if (dst->release_store_tid_ == tid_ && dst->release_store_reused_ == reused_ && - dst->clk_[tid_].epoch > last_acquire_) { + dst->elem(tid_).epoch > last_acquire_) { CPP_STAT_INC(StatClockStoreFast); UpdateCurrentThread(dst); return; @@ -234,13 +232,17 @@ void ThreadClock::ReleaseStore(SyncClock *dst) const { // O(N) release-store. CPP_STAT_INC(StatClockStoreFull); for (uptr i = 0; i < nclk_; i++) { - dst->clk_[i].epoch = clk_[i].epoch; - dst->clk_[i].reused = 0; + ClockElem &ce = dst->elem(i); + ce.epoch = clk_[i].epoch; + ce.reused = 0; } // Clear the tail of dst->clk_. - if (nclk_ < dst->clk_.Size()) { - internal_memset(&dst->clk_[nclk_], 0, - (dst->clk_.Size() - nclk_) * sizeof(dst->clk_[0])); + if (nclk_ < dst->size_) { + for (uptr i = nclk_; i < dst->size_; i++) { + ClockElem &ce = dst->elem(i); + ce.epoch = 0; + ce.reused = 0; + } CPP_STAT_INC(StatClockStoreTail); } for (unsigned i = 0; i < kDirtyTids; i++) @@ -248,19 +250,19 @@ void ThreadClock::ReleaseStore(SyncClock *dst) const { dst->release_store_tid_ = tid_; dst->release_store_reused_ = reused_; // Rememeber that we don't need to acquire it in future. - dst->clk_[tid_].reused = reused_; + dst->elem(tid_).reused = reused_; } -void ThreadClock::acq_rel(SyncClock *dst) { +void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) { CPP_STAT_INC(StatClockAcquireRelease); - acquire(dst); - ReleaseStore(dst); + acquire(c, dst); + ReleaseStore(c, dst); } // Updates only single element related to the current thread in dst->clk_. void ThreadClock::UpdateCurrentThread(SyncClock *dst) const { // Update the threads time, but preserve 'acquired' flag. - dst->clk_[tid_].epoch = clk_[tid_].epoch; + dst->elem(tid_).epoch = clk_[tid_].epoch; for (unsigned i = 0; i < kDirtyTids; i++) { if (dst->dirty_tids_[i] == tid_) { @@ -275,27 +277,73 @@ void ThreadClock::UpdateCurrentThread(SyncClock *dst) const { } // Reset all 'acquired' flags, O(N). CPP_STAT_INC(StatClockReleaseSlow); - for (uptr i = 0; i < dst->clk_.Size(); i++) { - dst->clk_[i].reused = 0; - } + for (uptr i = 0; i < dst->size_; i++) + dst->elem(i).reused = 0; for (unsigned i = 0; i < kDirtyTids; i++) dst->dirty_tids_[i] = kInvalidTid; } // Checks whether the current threads has already acquired src. bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const { - if (src->clk_[tid_].reused != reused_) + if (src->elem(tid_).reused != reused_) return false; for (unsigned i = 0; i < kDirtyTids; i++) { unsigned tid = src->dirty_tids_[i]; if (tid != kInvalidTid) { - if (clk_[tid].epoch < src->clk_[tid].epoch) + if (clk_[tid].epoch < src->elem(tid).epoch) return false; } } return true; } +void SyncClock::Resize(ClockCache *c, uptr nclk) { + CPP_STAT_INC(StatClockReleaseResize); + if (RoundUpTo(nclk, ClockBlock::kClockCount) <= + RoundUpTo(size_, ClockBlock::kClockCount)) { + // Growing within the same block. + // Memory is already allocated, just increase the size. 
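// Illustrative note: clock storage is carved out of fixed 512-byte
// ClockBlocks (see tsan_clock.h below), each holding 64 8-byte ClockElems,
// so two sizes that round up to the same multiple of kClockCount share the
// same last block:
//   RoundUpTo(n, b) == (n + b - 1) / b * b
//   RoundUpTo(10, 64) == RoundUpTo(60, 64) == 64      // same block, no alloc
//   RoundUpTo(64, 64) == 64, RoundUpTo(65, 64) == 128 // needs a new block
// and the two-level form tops out at 128 * 64 == 8192 clock slots.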
+ size_ = nclk; + return; + } + if (nclk <= ClockBlock::kClockCount) { + // Grow from 0 to one-level table. + CHECK_EQ(size_, 0); + CHECK_EQ(tab_, 0); + CHECK_EQ(tab_idx_, 0); + size_ = nclk; + tab_idx_ = ctx->clock_alloc.Alloc(c); + tab_ = ctx->clock_alloc.Map(tab_idx_); + internal_memset(tab_, 0, sizeof(*tab_)); + return; + } + // Growing two-level table. + if (size_ == 0) { + // Allocate first level table. + tab_idx_ = ctx->clock_alloc.Alloc(c); + tab_ = ctx->clock_alloc.Map(tab_idx_); + internal_memset(tab_, 0, sizeof(*tab_)); + } else if (size_ <= ClockBlock::kClockCount) { + // Transform one-level table to two-level table. + u32 old = tab_idx_; + tab_idx_ = ctx->clock_alloc.Alloc(c); + tab_ = ctx->clock_alloc.Map(tab_idx_); + internal_memset(tab_, 0, sizeof(*tab_)); + tab_->table[0] = old; + } + // At this point we have first level table allocated. + // Add second level tables as necessary. + for (uptr i = RoundUpTo(size_, ClockBlock::kClockCount); + i < nclk; i += ClockBlock::kClockCount) { + u32 idx = ctx->clock_alloc.Alloc(c); + ClockBlock *cb = ctx->clock_alloc.Map(idx); + internal_memset(cb, 0, sizeof(*cb)); + CHECK_EQ(tab_->table[i/ClockBlock::kClockCount], 0); + tab_->table[i/ClockBlock::kClockCount] = idx; + } + size_ = nclk; +} + // Sets a single element in the vector clock. // This function is called only from weird places like AcquireGlobal. void ThreadClock::set(unsigned tid, u64 v) { @@ -319,28 +367,59 @@ void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) { } SyncClock::SyncClock() - : clk_(MBlockClock) { - release_store_tid_ = kInvalidTid; - release_store_reused_ = 0; + : release_store_tid_(kInvalidTid) + , release_store_reused_() + , tab_() + , tab_idx_() + , size_() { for (uptr i = 0; i < kDirtyTids; i++) dirty_tids_[i] = kInvalidTid; } -void SyncClock::Reset() { - clk_.Reset(); +SyncClock::~SyncClock() { + // Reset must be called before dtor. + CHECK_EQ(size_, 0); + CHECK_EQ(tab_, 0); + CHECK_EQ(tab_idx_, 0); +} + +void SyncClock::Reset(ClockCache *c) { + if (size_ == 0) { + // nothing + } else if (size_ <= ClockBlock::kClockCount) { + // One-level table. + ctx->clock_alloc.Free(c, tab_idx_); + } else { + // Two-level table. + for (uptr i = 0; i < size_; i += ClockBlock::kClockCount) + ctx->clock_alloc.Free(c, tab_->table[i / ClockBlock::kClockCount]); + ctx->clock_alloc.Free(c, tab_idx_); + } + tab_ = 0; + tab_idx_ = 0; + size_ = 0; release_store_tid_ = kInvalidTid; release_store_reused_ = 0; for (uptr i = 0; i < kDirtyTids; i++) dirty_tids_[i] = kInvalidTid; } +ClockElem &SyncClock::elem(unsigned tid) const { + DCHECK_LT(tid, size_); + if (size_ <= ClockBlock::kClockCount) + return tab_->clock[tid]; + u32 idx = tab_->table[tid / ClockBlock::kClockCount]; + ClockBlock *cb = ctx->clock_alloc.Map(idx); + return cb->clock[tid % ClockBlock::kClockCount]; +} + void SyncClock::DebugDump(int(*printf)(const char *s, ...)) { printf("clock=["); - for (uptr i = 0; i < clk_.Size(); i++) - printf("%s%llu", i == 0 ? "" : ",", clk_[i].epoch); + for (uptr i = 0; i < size_; i++) + printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch); printf("] reused=["); - for (uptr i = 0; i < clk_.Size(); i++) - printf("%s%llu", i == 0 ? "" : ",", clk_[i].reused); + for (uptr i = 0; i < size_; i++) + printf("%s%llu", i == 0 ? 
"" : ",", elem(i).reused); printf("] release_store_tid=%d/%d dirty_tids=%d/%d", release_store_tid_, release_store_reused_, dirty_tids_[0], dirty_tids_[1]); diff --git a/libsanitizer/tsan/tsan_clock.h b/libsanitizer/tsan/tsan_clock.h index 2ce480b2d16..3deb7f5198c 100644 --- a/libsanitizer/tsan/tsan_clock.h +++ b/libsanitizer/tsan/tsan_clock.h @@ -12,7 +12,7 @@ #define TSAN_CLOCK_H #include "tsan_defs.h" -#include "tsan_vector.h" +#include "tsan_dense_alloc.h" namespace __tsan { @@ -21,36 +21,65 @@ struct ClockElem { u64 reused : 64 - kClkBits; }; +struct ClockBlock { + static const uptr kSize = 512; + static const uptr kTableSize = kSize / sizeof(u32); + static const uptr kClockCount = kSize / sizeof(ClockElem); + + union { + u32 table[kTableSize]; + ClockElem clock[kClockCount]; + }; + + ClockBlock() { + } +}; + +typedef DenseSlabAlloc<ClockBlock, 1<<16, 1<<10> ClockAlloc; +typedef DenseSlabAllocCache ClockCache; + // The clock that lives in sync variables (mutexes, atomics, etc). class SyncClock { public: SyncClock(); + ~SyncClock(); uptr size() const { - return clk_.Size(); + return size_; } u64 get(unsigned tid) const { - DCHECK_LT(tid, clk_.Size()); - return clk_[tid].epoch; + return elem(tid).epoch; } - void Reset(); + void Resize(ClockCache *c, uptr nclk); + void Reset(ClockCache *c); void DebugDump(int(*printf)(const char *s, ...)); private: + friend struct ThreadClock; + static const uptr kDirtyTids = 2; + unsigned release_store_tid_; unsigned release_store_reused_; - static const uptr kDirtyTids = 2; unsigned dirty_tids_[kDirtyTids]; - mutable Vector<ClockElem> clk_; - friend struct ThreadClock; + // tab_ contains indirect pointer to a 512b block using DenseSlabAlloc. + // If size_ <= 64, then tab_ points to an array with 64 ClockElem's. + // Otherwise, tab_ points to an array with 128 u32 elements, + // each pointing to the second-level 512b block with 64 ClockElem's. + ClockBlock *tab_; + u32 tab_idx_; + u32 size_; + + ClockElem &elem(unsigned tid) const; }; // The clock that lives in threads. struct ThreadClock { public: + typedef DenseSlabAllocCache Cache; + explicit ThreadClock(unsigned tid, unsigned reused = 0); u64 get(unsigned tid) const { @@ -73,10 +102,10 @@ struct ThreadClock { return nclk_; } - void acquire(const SyncClock *src); - void release(SyncClock *dst) const; - void acq_rel(SyncClock *dst); - void ReleaseStore(SyncClock *dst) const; + void acquire(ClockCache *c, const SyncClock *src); + void release(ClockCache *c, SyncClock *dst) const; + void acq_rel(ClockCache *c, SyncClock *dst); + void ReleaseStore(ClockCache *c, SyncClock *dst) const; void DebugReset(); void DebugDump(int(*printf)(const char *s, ...)); diff --git a/libsanitizer/tsan/tsan_defs.h b/libsanitizer/tsan/tsan_defs.h index cc65ae8b4a3..a8528cbb44a 100644 --- a/libsanitizer/tsan/tsan_defs.h +++ b/libsanitizer/tsan/tsan_defs.h @@ -52,6 +52,7 @@ const uptr kShadowCnt = TSAN_SHADOW_COUNT; # endif #else // Count of shadow values in a shadow cell. +#define TSAN_SHADOW_COUNT 4 const uptr kShadowCnt = 4; #endif @@ -64,6 +65,13 @@ const uptr kShadowSize = 8; // Shadow memory is kShadowMultiplier times larger than user memory. const uptr kShadowMultiplier = kShadowSize * kShadowCnt / kShadowCell; +// That many user bytes are mapped onto a single meta shadow cell. +// Must be less or equal to minimal memory allocator alignment. +const uptr kMetaShadowCell = 8; + +// Size of a single meta shadow value (u32). 
+const uptr kMetaShadowSize = 4; + #if defined(TSAN_NO_HISTORY) && TSAN_NO_HISTORY const bool kCollectHistory = false; #else @@ -165,7 +173,15 @@ struct ReportStack; class ReportDesc; class RegionAlloc; class StackTrace; -struct MBlock; + +// Descriptor of user's memory block. +struct MBlock { + u64 siz; + u32 stk; + u16 tid; +}; + +COMPILER_CHECK(sizeof(MBlock) == 16); } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_dense_alloc.h b/libsanitizer/tsan/tsan_dense_alloc.h new file mode 100644 index 00000000000..651c112c78a --- /dev/null +++ b/libsanitizer/tsan/tsan_dense_alloc.h @@ -0,0 +1,135 @@ +//===-- tsan_dense_alloc.h --------------------------------------*- C++ -*-===// +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. +// +// A DenseSlabAlloc is a freelist-based allocator of fixed-size objects. +// DenseSlabAllocCache is a thread-local cache for DenseSlabAlloc. +// The only difference with traditional slab allocators is that DenseSlabAlloc +// allocates/free indices of objects and provide a functionality to map +// the index onto the real pointer. The index is u32, that is, 2 times smaller +// than uptr (hense the Dense prefix). +//===----------------------------------------------------------------------===// +#ifndef TSAN_DENSE_ALLOC_H +#define TSAN_DENSE_ALLOC_H + +#include "sanitizer_common/sanitizer_common.h" +#include "tsan_defs.h" +#include "tsan_mutex.h" + +namespace __tsan { + +class DenseSlabAllocCache { + static const uptr kSize = 128; + typedef u32 IndexT; + uptr pos; + IndexT cache[kSize]; + template<typename T, uptr kL1Size, uptr kL2Size> friend class DenseSlabAlloc; +}; + +template<typename T, uptr kL1Size, uptr kL2Size> +class DenseSlabAlloc { + public: + typedef DenseSlabAllocCache Cache; + typedef typename Cache::IndexT IndexT; + + DenseSlabAlloc() { + // Check that kL1Size and kL2Size are sane. + CHECK_EQ(kL1Size & (kL1Size - 1), 0); + CHECK_EQ(kL2Size & (kL2Size - 1), 0); + CHECK_GE(1ull << (sizeof(IndexT) * 8), kL1Size * kL2Size); + // Check that it makes sense to use the dense alloc. + CHECK_GE(sizeof(T), sizeof(IndexT)); + internal_memset(map_, 0, sizeof(map_)); + freelist_ = 0; + fillpos_ = 0; + } + + ~DenseSlabAlloc() { + for (uptr i = 0; i < kL1Size; i++) { + if (map_[i] != 0) + UnmapOrDie(map_[i], kL2Size * sizeof(T)); + } + } + + IndexT Alloc(Cache *c) { + if (c->pos == 0) + Refill(c); + return c->cache[--c->pos]; + } + + void Free(Cache *c, IndexT idx) { + DCHECK_NE(idx, 0); + if (c->pos == Cache::kSize) + Drain(c); + c->cache[c->pos++] = idx; + } + + T *Map(IndexT idx) { + DCHECK_NE(idx, 0); + DCHECK_LE(idx, kL1Size * kL2Size); + return &map_[idx / kL2Size][idx % kL2Size]; + } + + void FlushCache(Cache *c) { + SpinMutexLock lock(&mtx_); + while (c->pos) { + IndexT idx = c->cache[--c->pos]; + *(IndexT*)Map(idx) = freelist_; + freelist_ = idx; + } + } + + void InitCache(Cache *c) { + c->pos = 0; + internal_memset(c->cache, 0, sizeof(c->cache)); + } + + private: + T *map_[kL1Size]; + SpinMutex mtx_; + IndexT freelist_; + uptr fillpos_; + + void Refill(Cache *c) { + SpinMutexLock lock(&mtx_); + if (freelist_ == 0) { + if (fillpos_ == kL1Size) { + Printf("ThreadSanitizer: DenseSlabAllocator overflow. Dying.\n"); + Die(); + } + T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), "DenseSlabAllocator"); + // Reserve 0 as invalid index. 
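// Illustrative note: index 0 doubles as the null link -- a free slot reuses
// its own first word to store the index of the next free slot, so the
// freelist needs no side storage, and Map() is just two array lookups:
// &map_[idx / kL2Size][idx % kL2Size]. Hypothetical use via the ClockAlloc
// typedef from tsan_clock.h:
//   ClockCache cache;
//   ctx->clock_alloc.InitCache(&cache);          // empty per-thread cache
//   u32 idx = ctx->clock_alloc.Alloc(&cache);    // usually a cached pop
//   ClockBlock *cb = ctx->clock_alloc.Map(idx);  // index -> pointer
//   ctx->clock_alloc.Free(&cache, idx);          // push back to the cache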
+ IndexT start = fillpos_ == 0 ? 1 : 0; + for (IndexT i = start; i < kL2Size; i++) { + new(batch + i) T(); + *(IndexT*)(batch + i) = i + 1 + fillpos_ * kL2Size; + } + *(IndexT*)(batch + kL2Size - 1) = 0; + freelist_ = fillpos_ * kL2Size + start; + map_[fillpos_++] = batch; + } + for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) { + IndexT idx = freelist_; + c->cache[c->pos++] = idx; + freelist_ = *(IndexT*)Map(idx); + } + } + + void Drain(Cache *c) { + SpinMutexLock lock(&mtx_); + for (uptr i = 0; i < Cache::kSize / 2; i++) { + IndexT idx = c->cache[--c->pos]; + *(IndexT*)Map(idx) = freelist_; + freelist_ = idx; + } + } +}; + +} // namespace __tsan + +#endif // TSAN_DENSE_ALLOC_H diff --git a/libsanitizer/tsan/tsan_fd.cc b/libsanitizer/tsan/tsan_fd.cc index 7d62ae4c5ff..a74a668c13b 100644 --- a/libsanitizer/tsan/tsan_fd.cc +++ b/libsanitizer/tsan/tsan_fd.cc @@ -45,8 +45,8 @@ static bool bogusfd(int fd) { return fd < 0 || fd >= kTableSize; } -static FdSync *allocsync() { - FdSync *s = (FdSync*)internal_alloc(MBlockFD, sizeof(FdSync)); +static FdSync *allocsync(ThreadState *thr, uptr pc) { + FdSync *s = (FdSync*)user_alloc(thr, pc, sizeof(FdSync)); atomic_store(&s->rc, 1, memory_order_relaxed); return s; } @@ -63,10 +63,7 @@ static void unref(ThreadState *thr, uptr pc, FdSync *s) { CHECK_NE(s, &fdctx.globsync); CHECK_NE(s, &fdctx.filesync); CHECK_NE(s, &fdctx.socksync); - SyncVar *v = ctx->synctab.GetAndRemove(thr, pc, (uptr)s); - if (v) - DestroyAndFree(v); - internal_free(s); + user_free(thr, pc, s); } } } @@ -217,7 +214,7 @@ void FdDup(ThreadState *thr, uptr pc, int oldfd, int newfd) { void FdPipeCreate(ThreadState *thr, uptr pc, int rfd, int wfd) { DPrintf("#%d: FdCreatePipe(%d, %d)\n", thr->tid, rfd, wfd); - FdSync *s = allocsync(); + FdSync *s = allocsync(thr, pc); init(thr, pc, rfd, ref(s)); init(thr, pc, wfd, ref(s)); unref(thr, pc, s); @@ -227,7 +224,7 @@ void FdEventCreate(ThreadState *thr, uptr pc, int fd) { DPrintf("#%d: FdEventCreate(%d)\n", thr->tid, fd); if (bogusfd(fd)) return; - init(thr, pc, fd, allocsync()); + init(thr, pc, fd, allocsync(thr, pc)); } void FdSignalCreate(ThreadState *thr, uptr pc, int fd) { @@ -248,7 +245,7 @@ void FdPollCreate(ThreadState *thr, uptr pc, int fd) { DPrintf("#%d: FdPollCreate(%d)\n", thr->tid, fd); if (bogusfd(fd)) return; - init(thr, pc, fd, allocsync()); + init(thr, pc, fd, allocsync(thr, pc)); } void FdSocketCreate(ThreadState *thr, uptr pc, int fd) { diff --git a/libsanitizer/tsan/tsan_flags.cc b/libsanitizer/tsan/tsan_flags.cc index e241cb6cda4..02bf6527e6a 100644 --- a/libsanitizer/tsan/tsan_flags.cc +++ b/libsanitizer/tsan/tsan_flags.cc @@ -35,7 +35,6 @@ static void ParseFlags(Flags *f, const char *env) { ParseFlag(env, &f->enable_annotations, "enable_annotations", ""); ParseFlag(env, &f->suppress_equal_stacks, "suppress_equal_stacks", ""); ParseFlag(env, &f->suppress_equal_addresses, "suppress_equal_addresses", ""); - ParseFlag(env, &f->suppress_java, "suppress_java", ""); ParseFlag(env, &f->report_bugs, "report_bugs", ""); ParseFlag(env, &f->report_thread_leaks, "report_thread_leaks", ""); ParseFlag(env, &f->report_destroy_locked, "report_destroy_locked", ""); @@ -43,8 +42,6 @@ static void ParseFlags(Flags *f, const char *env) { ParseFlag(env, &f->report_signal_unsafe, "report_signal_unsafe", ""); ParseFlag(env, &f->report_atomic_races, "report_atomic_races", ""); ParseFlag(env, &f->force_seq_cst_atomics, "force_seq_cst_atomics", ""); - ParseFlag(env, &f->suppressions, "suppressions", ""); - ParseFlag(env, 
&f->print_suppressions, "print_suppressions", ""); ParseFlag(env, &f->print_benign, "print_benign", ""); ParseFlag(env, &f->exitcode, "exitcode", ""); ParseFlag(env, &f->halt_on_error, "halt_on_error", ""); @@ -70,7 +67,6 @@ void InitializeFlags(Flags *f, const char *env) { f->enable_annotations = true; f->suppress_equal_stacks = true; f->suppress_equal_addresses = true; - f->suppress_java = false; f->report_bugs = true; f->report_thread_leaks = true; f->report_destroy_locked = true; @@ -78,8 +74,6 @@ void InitializeFlags(Flags *f, const char *env) { f->report_signal_unsafe = true; f->report_atomic_races = true; f->force_seq_cst_atomics = false; - f->suppressions = ""; - f->print_suppressions = false; f->print_benign = false; f->exitcode = 66; f->halt_on_error = false; @@ -97,19 +91,19 @@ void InitializeFlags(Flags *f, const char *env) { // DDFlags f->second_deadlock_stack = false; - SetCommonFlagsDefaults(f); + CommonFlags *cf = common_flags(); + SetCommonFlagsDefaults(cf); // Override some common flags defaults. - f->allow_addr2line = true; + cf->allow_addr2line = true; + cf->detect_deadlocks = true; + cf->print_suppressions = false; // Let a frontend override. ParseFlags(f, __tsan_default_options()); - ParseCommonFlagsFromString(f, __tsan_default_options()); + ParseCommonFlagsFromString(cf, __tsan_default_options()); // Override from command line. ParseFlags(f, env); - ParseCommonFlagsFromString(f, env); - - // Copy back to common flags. - *common_flags() = *f; + ParseCommonFlagsFromString(cf, env); // Sanity check. if (!f->report_bugs) { @@ -118,7 +112,7 @@ void InitializeFlags(Flags *f, const char *env) { f->report_signal_unsafe = false; } - if (f->help) PrintFlagDescriptions(); + if (cf->help) PrintFlagDescriptions(); if (f->history_size < 0 || f->history_size > 7) { Printf("ThreadSanitizer: incorrect value for history_size" diff --git a/libsanitizer/tsan/tsan_flags.h b/libsanitizer/tsan/tsan_flags.h index 4bf459d5981..182d9b298af 100644 --- a/libsanitizer/tsan/tsan_flags.h +++ b/libsanitizer/tsan/tsan_flags.h @@ -17,7 +17,7 @@ namespace __tsan { -struct Flags : CommonFlags, DDFlags { +struct Flags : DDFlags { // Enable dynamic annotations, otherwise they are no-ops. bool enable_annotations; // Suppress a race report if we've already output another race report @@ -26,9 +26,6 @@ struct Flags : CommonFlags, DDFlags { // Suppress a race report if we've already output another race report // on the same address. bool suppress_equal_addresses; - // Suppress weird race reports that can be seen if JVM is embed - // into the process. - bool suppress_java; // Turns off bug reporting entirely (useful for benchmarking). bool report_bugs; // Report thread leaks at exit? @@ -45,10 +42,6 @@ struct Flags : CommonFlags, DDFlags { // If set, all atomics are effectively sequentially consistent (seq_cst), // regardless of what user actually specified. bool force_seq_cst_atomics; - // Suppressions filename. - const char *suppressions; - // Print matched suppressions at exit. - bool print_suppressions; // Print matched "benign" races at exit. bool print_benign; // Override exit status if something was reported. 
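The suppression flags move into the common sanitizer flags, but the two override channels that the parsing code above reads are unchanged: the TSAN_OPTIONS environment string and the __tsan_default_options() hook, which an instrumented program may define to bake in its own defaults. A minimal example of the hook (the particular flag string is illustrative):

extern "C" const char *__tsan_default_options() {
  // Defaults compiled into the program; TSAN_OPTIONS still overrides them.
  return "report_signal_unsafe=0:history_size=3";
}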
diff --git a/libsanitizer/tsan/tsan_interceptors.cc b/libsanitizer/tsan/tsan_interceptors.cc index 19a3b7b0643..b49622b3ad4 100644 --- a/libsanitizer/tsan/tsan_interceptors.cc +++ b/libsanitizer/tsan/tsan_interceptors.cc @@ -45,7 +45,7 @@ DECLARE_REAL(int, pthread_attr_getdetachstate, void *, void *) extern "C" int pthread_attr_setstacksize(void *attr, uptr stacksize); extern "C" int pthread_key_create(unsigned *key, void (*destructor)(void* v)); extern "C" int pthread_setspecific(unsigned key, const void *v); -extern "C" int pthread_mutexattr_gettype(void *a, int *type); +DECLARE_REAL(int, pthread_mutexattr_gettype, void *, void *) extern "C" int pthread_yield(); extern "C" int pthread_sigmask(int how, const __sanitizer_sigset_t *set, __sanitizer_sigset_t *oldset); @@ -91,6 +91,11 @@ typedef void (*sighandler_t)(int sig); #define errno (*__errno_location()) +// 16K loaded modules should be enough for everyone. +static const uptr kMaxModules = 1 << 14; +static LoadedModule *modules; +static uptr nmodules; + struct sigaction_t { union { sighandler_t sa_handler; @@ -122,9 +127,9 @@ struct SignalDesc { }; struct SignalContext { - int in_blocking_func; int int_signal_send; - int pending_signal_count; + atomic_uintptr_t in_blocking_func; + atomic_uintptr_t have_pending_signals; SignalDesc pending_signals[kSigCount]; }; @@ -136,7 +141,7 @@ static LibIgnore *libignore() { } void InitializeLibIgnore() { - libignore()->Init(*GetSuppressionContext()); + libignore()->Init(*SuppressionContext::Get()); libignore()->OnLibraryLoaded(0); } @@ -144,7 +149,7 @@ void InitializeLibIgnore() { static SignalContext *SigCtx(ThreadState *thr) { SignalContext *ctx = (SignalContext*)thr->signal_ctx; - if (ctx == 0 && thr->is_alive) { + if (ctx == 0 && !thr->is_dead) { ctx = (SignalContext*)MmapOrDie(sizeof(*ctx), "SignalContext"); MemoryResetRange(thr, (uptr)&SigCtx, (uptr)ctx, sizeof(*ctx)); thr->signal_ctx = ctx; @@ -189,6 +194,7 @@ ScopedInterceptor::~ScopedInterceptor() { if (!thr_->ignore_interceptors) { ProcessPendingSignals(thr_); FuncExit(thr_); + CheckNoLocks(thr_); } } @@ -205,7 +211,7 @@ ScopedInterceptor::~ScopedInterceptor() { if (REAL(func) == 0) { \ Report("FATAL: ThreadSanitizer: failed to intercept %s\n", #func); \ Die(); \ - } \ + } \ if (thr->ignore_interceptors || thr->in_ignored_lib) \ return REAL(func)(__VA_ARGS__); \ /**/ @@ -218,22 +224,30 @@ ScopedInterceptor::~ScopedInterceptor() { struct BlockingCall { explicit BlockingCall(ThreadState *thr) - : ctx(SigCtx(thr)) { - ctx->in_blocking_func++; + : thr(thr) + , ctx(SigCtx(thr)) { + for (;;) { + atomic_store(&ctx->in_blocking_func, 1, memory_order_relaxed); + if (atomic_load(&ctx->have_pending_signals, memory_order_relaxed) == 0) + break; + atomic_store(&ctx->in_blocking_func, 0, memory_order_relaxed); + ProcessPendingSignals(thr); + } + // When we are in a "blocking call", we process signals asynchronously + // (right when they arrive). In this context we do not expect to be + // executing any user/runtime code. The known interceptor sequence when + // this is not true is: pthread_join -> munmap(stack). It's fine + // to ignore munmap in this case -- we handle stack shadow separately. + thr->ignore_interceptors++; } ~BlockingCall() { - ctx->in_blocking_func--; + thr->ignore_interceptors--; + atomic_store(&ctx->in_blocking_func, 0, memory_order_relaxed); } + ThreadState *thr; SignalContext *ctx; - - // When we are in a "blocking call", we process signals asynchronously - // (right when they arrive). 
In this context we do not expect to be - // executing any user/runtime code. The known interceptor sequence when - // this is not true is: pthread_join -> munmap(stack). It's fine - // to ignore munmap in this case -- we handle stack shadow separately. - ScopedIgnoreInterceptors ignore_interceptors; }; TSAN_INTERCEPTOR(unsigned, sleep, unsigned sec) { @@ -257,74 +271,60 @@ TSAN_INTERCEPTOR(int, nanosleep, void *req, void *rem) { return res; } -TSAN_INTERCEPTOR(void*, dlopen, const char *filename, int flag) { - SCOPED_INTERCEPTOR_RAW(dlopen, filename, flag); - void *res = REAL(dlopen)(filename, flag); - libignore()->OnLibraryLoaded(filename); - return res; -} - -TSAN_INTERCEPTOR(int, dlclose, void *handle) { - SCOPED_INTERCEPTOR_RAW(dlclose, handle); - int res = REAL(dlclose)(handle); - libignore()->OnLibraryUnloaded(); - return res; -} - class AtExitContext { public: AtExitContext() : mtx_(MutexTypeAtExit, StatMtxAtExit) - , pos_() { + , stack_(MBlockAtExit) { } - typedef void(*atexit_t)(); + typedef void(*atexit_cb_t)(); int atexit(ThreadState *thr, uptr pc, bool is_on_exit, - atexit_t f, void *arg) { + atexit_cb_t f, void *arg, void *dso) { Lock l(&mtx_); - if (pos_ == kMaxAtExit) - return 1; Release(thr, pc, (uptr)this); - stack_[pos_] = f; - args_[pos_] = arg; - is_on_exits_[pos_] = is_on_exit; - pos_++; + atexit_t *a = stack_.PushBack(); + a->cb = f; + a->arg = arg; + a->dso = dso; + a->is_on_exit = is_on_exit; return 0; } void exit(ThreadState *thr, uptr pc) { for (;;) { - atexit_t f = 0; - void *arg = 0; - bool is_on_exit = false; + atexit_t a = {}; { Lock l(&mtx_); - if (pos_) { - pos_--; - f = stack_[pos_]; - arg = args_[pos_]; - is_on_exit = is_on_exits_[pos_]; + if (stack_.Size() != 0) { + a = stack_[stack_.Size() - 1]; + stack_.PopBack(); Acquire(thr, pc, (uptr)this); } } - if (f == 0) + if (a.cb == 0) break; - DPrintf("#%d: executing atexit func %p\n", thr->tid, f); - if (is_on_exit) - ((void(*)(int status, void *arg))f)(0, arg); + VPrintf(2, "#%d: executing atexit func %p(%p) dso=%p\n", + thr->tid, a.cb, a.arg, a.dso); + if (a.is_on_exit) + ((void(*)(int status, void *arg))a.cb)(0, a.arg); else - ((void(*)(void *arg, void *dso))f)(arg, 0); + ((void(*)(void *arg, void *dso))a.cb)(a.arg, a.dso); } } private: - static const int kMaxAtExit = 128; + struct atexit_t { + atexit_cb_t cb; + void *arg; + void *dso; + bool is_on_exit; + }; + + static const int kMaxAtExit = 1024; Mutex mtx_; - atexit_t stack_[kMaxAtExit]; - void *args_[kMaxAtExit]; - bool is_on_exits_[kMaxAtExit]; - int pos_; + Vector<atexit_t> stack_; }; static AtExitContext *atexit_ctx; @@ -335,29 +335,42 @@ TSAN_INTERCEPTOR(int, atexit, void (*f)()) { // We want to setup the atexit callback even if we are in ignored lib // or after fork. 
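// Synchronization contract (illustrative): AtExitContext::atexit() above
// does Release(thr, pc, (uptr)this) when a callback is queued, and exit()
// does a matching Acquire before invoking it, so writes made before
// registration are visible inside the callback:
//   int g;
//   void cb() { printf("%d\n", g); }  // prints 1 at process exit
//   g = 1;                            // happens-before the read in cb()
//   atexit(cb);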
SCOPED_INTERCEPTOR_RAW(atexit, f); - return atexit_ctx->atexit(thr, pc, false, (void(*)())f, 0); + return atexit_ctx->atexit(thr, pc, false, (void(*)())f, 0, 0); } TSAN_INTERCEPTOR(int, on_exit, void(*f)(int, void*), void *arg) { if (cur_thread()->in_symbolizer) return 0; SCOPED_TSAN_INTERCEPTOR(on_exit, f, arg); - return atexit_ctx->atexit(thr, pc, true, (void(*)())f, arg); + return atexit_ctx->atexit(thr, pc, true, (void(*)())f, arg, 0); +} + +bool IsSaticModule(void *dso) { + if (modules == 0) + return false; + for (uptr i = 0; i < nmodules; i++) { + if (modules[i].containsAddress((uptr)dso)) + return true; + } + return false; } TSAN_INTERCEPTOR(int, __cxa_atexit, void (*f)(void *a), void *arg, void *dso) { if (cur_thread()->in_symbolizer) return 0; SCOPED_TSAN_INTERCEPTOR(__cxa_atexit, f, arg, dso); - if (dso) { - // Memory allocation in __cxa_atexit will race with free during exit, - // because we do not see synchronization around atexit callback list. - ThreadIgnoreBegin(thr, pc); - int res = REAL(__cxa_atexit)(f, arg, dso); - ThreadIgnoreEnd(thr, pc); - return res; - } - return atexit_ctx->atexit(thr, pc, false, (void(*)())f, arg); + // If it's the main executable or a statically loaded library, + // we will call the callback. + if (dso == 0 || IsSaticModule(dso)) + return atexit_ctx->atexit(thr, pc, false, (void(*)())f, arg, dso); + + // Dynamically load module, don't know when to call the callback for it. + // Memory allocation in __cxa_atexit will race with free during exit, + // because we do not see synchronization around atexit callback list. + ThreadIgnoreBegin(thr, pc); + int res = REAL(__cxa_atexit)(f, arg, dso); + ThreadIgnoreEnd(thr, pc); + return res; } // Cleanup old bufs. @@ -383,6 +396,13 @@ static void SetJmp(ThreadState *thr, uptr sp, uptr mangled_sp) { buf->sp = sp; buf->mangled_sp = mangled_sp; buf->shadow_stack_pos = thr->shadow_stack_pos; + SignalContext *sctx = SigCtx(thr); + buf->int_signal_send = sctx ? sctx->int_signal_send : 0; + buf->in_blocking_func = sctx ? + atomic_load(&sctx->in_blocking_func, memory_order_relaxed) : + false; + buf->in_signal_handler = atomic_load(&thr->in_signal_handler, + memory_order_relaxed); } static void LongJmp(ThreadState *thr, uptr *env) { @@ -395,6 +415,14 @@ static void LongJmp(ThreadState *thr, uptr *env) { // Unwind the stack. 
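// Illustrative note: FuncEntry pushes a frame onto tsan's shadow stack and
// FuncExit pops one; longjmp skips the intervening returns, so the loop
// below pops frames until the depth recorded at SetJmp time is restored --
// otherwise subsequent reports would carry stale call stacks.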
while (thr->shadow_stack_pos > buf->shadow_stack_pos) FuncExit(thr); + SignalContext *sctx = SigCtx(thr); + if (sctx) { + sctx->int_signal_send = buf->int_signal_send; + atomic_store(&sctx->in_blocking_func, buf->in_blocking_func, + memory_order_relaxed); + } + atomic_store(&thr->in_signal_handler, buf->in_signal_handler, + memory_order_relaxed); JmpBufGarbageCollect(thr, buf->sp - 1); // do not collect buf->sp return; } @@ -531,7 +559,7 @@ TSAN_INTERCEPTOR(void, cfree, void *p) { TSAN_INTERCEPTOR(uptr, malloc_usable_size, void *p) { SCOPED_INTERCEPTOR_RAW(malloc_usable_size, p); - return user_alloc_usable_size(thr, pc, p); + return user_alloc_usable_size(p); } #define OPERATOR_NEW_BODY(mangled_name) \ @@ -749,6 +777,11 @@ TSAN_INTERCEPTOR(void*, memalign, uptr align, uptr sz) { return user_alloc(thr, pc, sz, align); } +TSAN_INTERCEPTOR(void*, aligned_alloc, uptr align, uptr sz) { + SCOPED_INTERCEPTOR_RAW(memalign, align, sz); + return user_alloc(thr, pc, sz, align); +} + TSAN_INTERCEPTOR(void*, valloc, uptr sz) { SCOPED_INTERCEPTOR_RAW(valloc, sz); return user_alloc(thr, pc, sz, GetPageSizeCached()); @@ -1040,7 +1073,7 @@ TSAN_INTERCEPTOR(int, pthread_mutex_init, void *m, void *a) { bool recursive = false; if (a) { int type = 0; - if (pthread_mutexattr_gettype(a, &type) == 0) + if (REAL(pthread_mutexattr_gettype)(a, &type) == 0) recursive = (type == PTHREAD_MUTEX_RECURSIVE || type == PTHREAD_MUTEX_RECURSIVE_NP); } @@ -1151,7 +1184,7 @@ TSAN_INTERCEPTOR(int, pthread_rwlock_tryrdlock, void *m) { SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_tryrdlock, m); int res = REAL(pthread_rwlock_tryrdlock)(m); if (res == 0) { - MutexLock(thr, pc, (uptr)m, /*rec=*/1, /*try_lock=*/true); + MutexReadLock(thr, pc, (uptr)m, /*try_lock=*/true); } return res; } @@ -1689,8 +1722,10 @@ TSAN_INTERCEPTOR(int, epoll_wait, int epfd, void *ev, int cnt, int timeout) { namespace __tsan { -static void CallUserSignalHandler(ThreadState *thr, bool sync, bool sigact, - int sig, my_siginfo_t *info, void *uctx) { +static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire, + bool sigact, int sig, my_siginfo_t *info, void *uctx) { + if (acquire) + Acquire(thr, 0, (uptr)&sigactions[sig]); // Ensure that the handler does not spoil errno. const int saved_errno = errno; errno = 99; @@ -1716,8 +1751,8 @@ static void CallUserSignalHandler(ThreadState *thr, bool sync, bool sigact, ThreadRegistryLock l(ctx->thread_registry); ScopedReport rep(ReportTypeErrnoInSignal); if (!IsFiredSuppression(ctx, rep, stack)) { - rep.AddStack(&stack); - OutputReport(ctx, rep, rep.GetReport()->stacks[0]); + rep.AddStack(&stack, true); + OutputReport(thr, rep); } } errno = saved_errno; @@ -1725,10 +1760,11 @@ static void CallUserSignalHandler(ThreadState *thr, bool sync, bool sigact, void ProcessPendingSignals(ThreadState *thr) { SignalContext *sctx = SigCtx(thr); - if (sctx == 0 || sctx->pending_signal_count == 0 || thr->in_signal_handler) + if (sctx == 0 || + atomic_load(&sctx->have_pending_signals, memory_order_relaxed) == 0) return; - thr->in_signal_handler = true; - sctx->pending_signal_count = 0; + atomic_store(&sctx->have_pending_signals, 0, memory_order_relaxed); + atomic_fetch_add(&thr->in_signal_handler, 1, memory_order_relaxed); // These are too big for stack. 
static THREADLOCAL __sanitizer_sigset_t emptyset, oldset; REAL(sigfillset)(&emptyset); @@ -1739,14 +1775,13 @@ void ProcessPendingSignals(ThreadState *thr) { signal->armed = false; if (sigactions[sig].sa_handler != SIG_DFL && sigactions[sig].sa_handler != SIG_IGN) { - CallUserSignalHandler(thr, false, signal->sigaction, + CallUserSignalHandler(thr, false, true, signal->sigaction, sig, &signal->siginfo, &signal->ctx); } } } pthread_sigmask(SIG_SETMASK, &oldset, 0); - CHECK_EQ(thr->in_signal_handler, true); - thr->in_signal_handler = false; + atomic_fetch_add(&thr->in_signal_handler, -1, memory_order_relaxed); } } // namespace __tsan @@ -1772,21 +1807,27 @@ void ALWAYS_INLINE rtl_generic_sighandler(bool sigact, int sig, // If we are in blocking function, we can safely process it now // (but check if we are in a recursive interceptor, // i.e. pthread_join()->munmap()). - (sctx && sctx->in_blocking_func == 1)) { - CHECK_EQ(thr->in_signal_handler, false); - thr->in_signal_handler = true; - if (sctx && sctx->in_blocking_func == 1) { + (sctx && atomic_load(&sctx->in_blocking_func, memory_order_relaxed))) { + atomic_fetch_add(&thr->in_signal_handler, 1, memory_order_relaxed); + if (sctx && atomic_load(&sctx->in_blocking_func, memory_order_relaxed)) { // We ignore interceptors in blocking functions, // temporary enbled them again while we are calling user function. int const i = thr->ignore_interceptors; thr->ignore_interceptors = 0; - CallUserSignalHandler(thr, sync, sigact, sig, info, ctx); + atomic_store(&sctx->in_blocking_func, 0, memory_order_relaxed); + CallUserSignalHandler(thr, sync, true, sigact, sig, info, ctx); thr->ignore_interceptors = i; + atomic_store(&sctx->in_blocking_func, 1, memory_order_relaxed); } else { - CallUserSignalHandler(thr, sync, sigact, sig, info, ctx); + // Be very conservative with when we do acquire in this case. + // It's unsafe to do acquire in async handlers, because ThreadState + // can be in inconsistent state. + // SIGSYS looks relatively safe -- it's synchronous and can actually + // need some global state. + bool acq = (sig == SIGSYS); + CallUserSignalHandler(thr, sync, acq, sigact, sig, info, ctx); } - CHECK_EQ(thr->in_signal_handler, true); - thr->in_signal_handler = false; + atomic_fetch_add(&thr->in_signal_handler, -1, memory_order_relaxed); return; } @@ -1800,7 +1841,7 @@ void ALWAYS_INLINE rtl_generic_sighandler(bool sigact, int sig, internal_memcpy(&signal->siginfo, info, sizeof(*info)); if (ctx) internal_memcpy(&signal->ctx, ctx, sizeof(signal->ctx)); - sctx->pending_signal_count++; + atomic_store(&sctx->have_pending_signals, 1, memory_order_relaxed); } } @@ -1828,6 +1869,7 @@ TSAN_INTERCEPTOR(int, sigaction, int sig, sigaction_t *act, sigaction_t *old) { else newact.sa_handler = rtl_sighandler; } + ReleaseStore(thr, pc, (uptr)&sigactions[sig]); int res = REAL(sigaction)(sig, &newact, 0); return res; } @@ -1911,35 +1953,6 @@ TSAN_INTERCEPTOR(int, getaddrinfo, void *node, void *service, return res; } -// Linux kernel has a bug that leads to kernel deadlock if a process -// maps TBs of memory and then calls mlock(). 
-static void MlockIsUnsupported() { - static atomic_uint8_t printed; - if (atomic_exchange(&printed, 1, memory_order_relaxed)) - return; - VPrintf(1, "INFO: ThreadSanitizer ignores mlock/munlock[all]\n"); -} - -TSAN_INTERCEPTOR(int, mlock, const void *addr, uptr len) { - MlockIsUnsupported(); - return 0; -} - -TSAN_INTERCEPTOR(int, munlock, const void *addr, uptr len) { - MlockIsUnsupported(); - return 0; -} - -TSAN_INTERCEPTOR(int, mlockall, int flags) { - MlockIsUnsupported(); - return 0; -} - -TSAN_INTERCEPTOR(int, munlockall, void) { - MlockIsUnsupported(); - return 0; -} - TSAN_INTERCEPTOR(int, fork, int fake) { if (cur_thread()->in_symbolizer) return REAL(fork)(fake); @@ -2000,6 +2013,18 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, #include "sanitizer_common/sanitizer_platform_interceptors.h" // Causes interceptor recursion (getaddrinfo() and fopen()) #undef SANITIZER_INTERCEPT_GETADDRINFO +// There interceptors do not seem to be strictly necessary for tsan. +// But we see cases where the interceptors consume 70% of execution time. +// Memory blocks passed to fgetgrent_r are "written to" by tsan several times. +// First, there is some recursion (getgrnam_r calls fgetgrent_r), and each +// function "writes to" the buffer. Then, the same memory is "written to" +// twice, first as buf and then as pwbufp (both of them refer to the same +// addresses). +#undef SANITIZER_INTERCEPT_GETPWENT +#undef SANITIZER_INTERCEPT_GETPWENT_R +#undef SANITIZER_INTERCEPT_FGETPWENT +#undef SANITIZER_INTERCEPT_GETPWNAM_AND_FRIENDS +#undef SANITIZER_INTERCEPT_GETPWNAM_R_AND_FRIENDS #define COMMON_INTERCEPT_FUNCTION(name) INTERCEPT_FUNCTION(name) @@ -2019,6 +2044,12 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, ctx = (void *)&_ctx; \ (void) ctx; +#define COMMON_INTERCEPTOR_ENTER_NOIGNORE(ctx, func, ...) \ + SCOPED_INTERCEPTOR_RAW(func, __VA_ARGS__); \ + TsanInterceptorContext _ctx = {thr, caller_pc, pc}; \ + ctx = (void *)&_ctx; \ + (void) ctx; + #define COMMON_INTERCEPTOR_FILE_OPEN(ctx, file, path) \ Acquire(thr, pc, File2addr(path)); \ if (file) { \ @@ -2032,6 +2063,12 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, if (fd >= 0) FdClose(thr, pc, fd); \ } +#define COMMON_INTERCEPTOR_LIBRARY_LOADED(filename, res) \ + libignore()->OnLibraryLoaded(filename) + +#define COMMON_INTERCEPTOR_LIBRARY_UNLOADED() \ + libignore()->OnLibraryUnloaded() + #define COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd) \ FdAcquire(((TsanInterceptorContext *) ctx)->thr, pc, fd) @@ -2367,15 +2404,8 @@ void InitializeInterceptors() { TSAN_INTERCEPT(gettimeofday); TSAN_INTERCEPT(getaddrinfo); - TSAN_INTERCEPT(mlock); - TSAN_INTERCEPT(munlock); - TSAN_INTERCEPT(mlockall); - TSAN_INTERCEPT(munlockall); - TSAN_INTERCEPT(fork); TSAN_INTERCEPT(vfork); - TSAN_INTERCEPT(dlopen); - TSAN_INTERCEPT(dlclose); TSAN_INTERCEPT(on_exit); TSAN_INTERCEPT(__cxa_atexit); TSAN_INTERCEPT(_exit); @@ -2397,6 +2427,11 @@ void InitializeInterceptors() { } FdInit(); + + // Remember list of loaded libraries for atexit interceptors. 
+ modules = (LoadedModule*)MmapOrDie(sizeof(*modules)*kMaxModules, + "LoadedModule"); + nmodules = GetListOfModules(modules, kMaxModules, 0); } void *internal_start_thread(void(*func)(void *arg), void *arg) { diff --git a/libsanitizer/tsan/tsan_interface_ann.cc b/libsanitizer/tsan/tsan_interface_ann.cc index 20e3d9a35ac..85de2e6221f 100644 --- a/libsanitizer/tsan/tsan_interface_ann.cc +++ b/libsanitizer/tsan/tsan_interface_ann.cc @@ -38,6 +38,7 @@ class ScopedAnnotation { ~ScopedAnnotation() { FuncExit(thr_); + CheckNoLocks(thr_); } private: ThreadState *const thr_; @@ -123,8 +124,6 @@ static ExpectRace *FindRace(ExpectRace *list, uptr addr, uptr size) { static bool CheckContains(ExpectRace *list, uptr addr, uptr size) { ExpectRace *race = FindRace(list, addr, size); - if (race == 0 && AlternativeAddress(addr)) - race = FindRace(list, AlternativeAddress(addr), size); if (race == 0) return false; DPrintf("Hit expected/benign race: %s addr=%zx:%d %s:%d\n", diff --git a/libsanitizer/tsan/tsan_interface_atomic.cc b/libsanitizer/tsan/tsan_interface_atomic.cc index 3f5a4ccc9f7..316614dd486 100644 --- a/libsanitizer/tsan/tsan_interface_atomic.cc +++ b/libsanitizer/tsan/tsan_interface_atomic.cc @@ -25,33 +25,23 @@ using namespace __tsan; // NOLINT -#define SCOPED_ATOMIC(func, ...) \ - const uptr callpc = (uptr)__builtin_return_address(0); \ - uptr pc = __sanitizer::StackTrace::GetCurrentPc(); \ - mo = flags()->force_seq_cst_atomics ? (morder)mo_seq_cst : mo; \ - ThreadState *const thr = cur_thread(); \ - if (thr->ignore_interceptors) \ - return NoTsanAtomic##func(__VA_ARGS__); \ - AtomicStatInc(thr, sizeof(*a), mo, StatAtomic##func); \ - ScopedAtomic sa(thr, callpc, a, mo, __func__); \ - return Atomic##func(thr, pc, __VA_ARGS__); \ -/**/ - // These should match declarations from public tsan_interface_atomic.h header. typedef unsigned char a8; typedef unsigned short a16; // NOLINT typedef unsigned int a32; typedef unsigned long long a64; // NOLINT -#if defined(__SIZEOF_INT128__) \ - || (__clang_major__ * 100 + __clang_minor__ >= 302) +#if !defined(TSAN_GO) && (defined(__SIZEOF_INT128__) \ + || (__clang_major__ * 100 + __clang_minor__ >= 302)) __extension__ typedef __int128 a128; # define __TSAN_HAS_INT128 1 #else # define __TSAN_HAS_INT128 0 #endif +#ifndef TSAN_GO // Protects emulation of 128-bit atomic operations. static StaticSpinMutex mutex128; +#endif // Part of ABI, do not change. // http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/atomic?view=markup @@ -64,38 +54,6 @@ typedef enum { mo_seq_cst } morder; -class ScopedAtomic { - public: - ScopedAtomic(ThreadState *thr, uptr pc, const volatile void *a, - morder mo, const char *func) - : thr_(thr) { - FuncEntry(thr_, pc); - DPrintf("#%d: %s(%p, %d)\n", thr_->tid, func, a, mo); - } - ~ScopedAtomic() { - ProcessPendingSignals(thr_); - FuncExit(thr_); - } - private: - ThreadState *thr_; -}; - -static void AtomicStatInc(ThreadState *thr, uptr size, morder mo, StatType t) { - StatInc(thr, StatAtomic); - StatInc(thr, t); - StatInc(thr, size == 1 ? StatAtomic1 - : size == 2 ? StatAtomic2 - : size == 4 ? StatAtomic4 - : size == 8 ? StatAtomic8 - : StatAtomic16); - StatInc(thr, mo == mo_relaxed ? StatAtomicRelaxed - : mo == mo_consume ? StatAtomicConsume - : mo == mo_acquire ? StatAtomicAcquire - : mo == mo_release ? StatAtomicRelease - : mo == mo_acq_rel ? 
StatAtomicAcq_Rel - : StatAtomicSeq_Cst); -} - static bool IsLoadOrder(morder mo) { return mo == mo_relaxed || mo == mo_consume || mo == mo_acquire || mo == mo_seq_cst; @@ -165,7 +123,7 @@ template<typename T> T func_cas(volatile T *v, T cmp, T xch) { // Atomic ops are executed under tsan internal mutex, // here we assume that the atomic variables are not accessed // from non-instrumented code. -#ifndef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 +#if !defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16) && !defined(TSAN_GO) a128 func_xchg(volatile a128 *v, a128 op) { SpinMutexLock lock(&mutex128); a128 cmp = *v; @@ -238,6 +196,7 @@ static int SizeLog() { // this leads to false negatives only in very obscure cases. } +#ifndef TSAN_GO static atomic_uint8_t *to_atomic(const volatile a8 *a) { return (atomic_uint8_t*)a; } @@ -245,6 +204,7 @@ static atomic_uint8_t *to_atomic(const volatile a8 *a) { static atomic_uint16_t *to_atomic(const volatile a16 *a) { return (atomic_uint16_t*)a; } +#endif static atomic_uint32_t *to_atomic(const volatile a32 *a) { return (atomic_uint32_t*)a; @@ -272,10 +232,12 @@ static T NoTsanAtomicLoad(const volatile T *a, morder mo) { return atomic_load(to_atomic(a), to_mo(mo)); } +#if __TSAN_HAS_INT128 && !defined(TSAN_GO) static a128 NoTsanAtomicLoad(const volatile a128 *a, morder mo) { SpinMutexLock lock(&mutex128); return *a; } +#endif template<typename T> static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a, @@ -287,7 +249,7 @@ static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a, MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>()); return NoTsanAtomicLoad(a, mo); } - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, (uptr)a, false); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, false); AcquireImpl(thr, pc, &s->clock); T v = NoTsanAtomicLoad(a, mo); s->mtx.ReadUnlock(); @@ -300,10 +262,12 @@ static void NoTsanAtomicStore(volatile T *a, T v, morder mo) { atomic_store(to_atomic(a), v, to_mo(mo)); } +#if __TSAN_HAS_INT128 && !defined(TSAN_GO) static void NoTsanAtomicStore(volatile a128 *a, a128 v, morder mo) { SpinMutexLock lock(&mutex128); *a = v; } +#endif template<typename T> static void AtomicStore(ThreadState *thr, uptr pc, volatile T *a, T v, @@ -319,7 +283,7 @@ static void AtomicStore(ThreadState *thr, uptr pc, volatile T *a, T v, return; } __sync_synchronize(); - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, (uptr)a, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); @@ -333,7 +297,7 @@ static T AtomicRMW(ThreadState *thr, uptr pc, volatile T *a, T v, morder mo) { MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>()); SyncVar *s = 0; if (mo != mo_relaxed) { - s = ctx->synctab.GetOrCreateAndLock(thr, pc, (uptr)a, true); + s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. 
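// Illustrative note: a shadow value records (tid, epoch), and reports are
// reconstructed by replaying the thread's trace up to that epoch; every
// epoch increment therefore needs a matching trace event, which is why
// IncrementEpoch and TraceAddEvent are paired in all of these paths.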
TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); @@ -432,6 +396,7 @@ static bool NoTsanAtomicCAS(volatile T *a, T *c, T v, morder mo, morder fmo) { return atomic_compare_exchange_strong(to_atomic(a), c, v, to_mo(mo)); } +#if __TSAN_HAS_INT128 static bool NoTsanAtomicCAS(volatile a128 *a, a128 *c, a128 v, morder mo, morder fmo) { a128 old = *c; @@ -441,10 +406,12 @@ static bool NoTsanAtomicCAS(volatile a128 *a, a128 *c, a128 v, *c = cur; return false; } +#endif template<typename T> -static bool NoTsanAtomicCAS(volatile T *a, T c, T v, morder mo, morder fmo) { - return NoTsanAtomicCAS(a, &c, v, mo, fmo); +static T NoTsanAtomicCAS(volatile T *a, T c, T v, morder mo, morder fmo) { + NoTsanAtomicCAS(a, &c, v, mo, fmo); + return c; } template<typename T> @@ -455,7 +422,7 @@ static bool AtomicCAS(ThreadState *thr, uptr pc, SyncVar *s = 0; bool write_lock = mo != mo_acquire && mo != mo_consume; if (mo != mo_relaxed) { - s = ctx->synctab.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock); + s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); @@ -487,6 +454,7 @@ static T AtomicCAS(ThreadState *thr, uptr pc, return c; } +#ifndef TSAN_GO static void NoTsanAtomicFence(morder mo) { __sync_synchronize(); } @@ -495,6 +463,56 @@ static void AtomicFence(ThreadState *thr, uptr pc, morder mo) { // FIXME(dvyukov): not implemented. __sync_synchronize(); } +#endif + +// Interface functions follow. +#ifndef TSAN_GO + +// C/C++ + +#define SCOPED_ATOMIC(func, ...) \ + const uptr callpc = (uptr)__builtin_return_address(0); \ + uptr pc = __sanitizer::StackTrace::GetCurrentPc(); \ + mo = flags()->force_seq_cst_atomics ? (morder)mo_seq_cst : mo; \ + ThreadState *const thr = cur_thread(); \ + if (thr->ignore_interceptors) \ + return NoTsanAtomic##func(__VA_ARGS__); \ + AtomicStatInc(thr, sizeof(*a), mo, StatAtomic##func); \ + ScopedAtomic sa(thr, callpc, a, mo, __func__); \ + return Atomic##func(thr, pc, __VA_ARGS__); \ +/**/ + +class ScopedAtomic { + public: + ScopedAtomic(ThreadState *thr, uptr pc, const volatile void *a, + morder mo, const char *func) + : thr_(thr) { + FuncEntry(thr_, pc); + DPrintf("#%d: %s(%p, %d)\n", thr_->tid, func, a, mo); + } + ~ScopedAtomic() { + ProcessPendingSignals(thr_); + FuncExit(thr_); + } + private: + ThreadState *thr_; +}; + +static void AtomicStatInc(ThreadState *thr, uptr size, morder mo, StatType t) { + StatInc(thr, StatAtomic); + StatInc(thr, t); + StatInc(thr, size == 1 ? StatAtomic1 + : size == 2 ? StatAtomic2 + : size == 4 ? StatAtomic4 + : size == 8 ? StatAtomic8 + : StatAtomic16); + StatInc(thr, mo == mo_relaxed ? StatAtomicRelaxed + : mo == mo_consume ? StatAtomicConsume + : mo == mo_acquire ? StatAtomicAcquire + : mo == mo_release ? StatAtomicRelease + : mo == mo_acq_rel ? StatAtomicAcq_Rel + : StatAtomicSeq_Cst); +} extern "C" { SANITIZER_INTERFACE_ATTRIBUTE @@ -846,3 +864,88 @@ SANITIZER_INTERFACE_ATTRIBUTE void __tsan_atomic_signal_fence(morder mo) { } } // extern "C" + +#else // #ifndef TSAN_GO + +// Go + +#define ATOMIC(func, ...) \ + if (thr->ignore_sync) { \ + NoTsanAtomic##func(__VA_ARGS__); \ + } else { \ + FuncEntry(thr, cpc); \ + Atomic##func(thr, pc, __VA_ARGS__); \ + FuncExit(thr); \ + } \ +/**/ + +#define ATOMIC_RET(func, ret, ...) 
\ + if (thr->ignore_sync) { \ + (ret) = NoTsanAtomic##func(__VA_ARGS__); \ + } else { \ + FuncEntry(thr, cpc); \ + (ret) = Atomic##func(thr, pc, __VA_ARGS__); \ + FuncExit(thr); \ + } \ +/**/ + +extern "C" { +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic32_load(ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + ATOMIC_RET(Load, *(a32*)(a+8), *(a32**)a, mo_acquire); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic64_load(ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + ATOMIC_RET(Load, *(a64*)(a+8), *(a64**)a, mo_acquire); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic32_store(ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + ATOMIC(Store, *(a32**)a, *(a32*)(a+8), mo_release); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic64_store(ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + ATOMIC(Store, *(a64**)a, *(a64*)(a+8), mo_release); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic32_fetch_add(ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + ATOMIC_RET(FetchAdd, *(a32*)(a+16), *(a32**)a, *(a32*)(a+8), mo_acq_rel); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic64_fetch_add(ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + ATOMIC_RET(FetchAdd, *(a64*)(a+16), *(a64**)a, *(a64*)(a+8), mo_acq_rel); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic32_exchange(ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + ATOMIC_RET(Exchange, *(a32*)(a+16), *(a32**)a, *(a32*)(a+8), mo_acq_rel); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic64_exchange(ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + ATOMIC_RET(Exchange, *(a64*)(a+16), *(a64**)a, *(a64*)(a+8), mo_acq_rel); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic32_compare_exchange( + ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + a32 cur = 0; + a32 cmp = *(a32*)(a+8); + ATOMIC_RET(CAS, cur, *(a32**)a, cmp, *(a32*)(a+12), mo_acq_rel, mo_acquire); + *(bool*)(a+16) = (cur == cmp); +} + +SANITIZER_INTERFACE_ATTRIBUTE +void __tsan_go_atomic64_compare_exchange( + ThreadState *thr, uptr cpc, uptr pc, u8 *a) { + a64 cur = 0; + a64 cmp = *(a64*)(a+8); + ATOMIC_RET(CAS, cur, *(a64**)a, cmp, *(a64*)(a+16), mo_acq_rel, mo_acquire); + *(bool*)(a+24) = (cur == cmp); +} +} // extern "C" +#endif // #ifndef TSAN_GO diff --git a/libsanitizer/tsan/tsan_interface_java.cc b/libsanitizer/tsan/tsan_interface_java.cc index 7f451690946..03a9e280ddc 100644 --- a/libsanitizer/tsan/tsan_interface_java.cc +++ b/libsanitizer/tsan/tsan_interface_java.cc @@ -20,54 +20,17 @@ using namespace __tsan; // NOLINT -namespace __tsan { - -const uptr kHeapShadow = 0x300000000000ull; -const uptr kHeapAlignment = 8; +const jptr kHeapAlignment = 8; -struct BlockDesc { - bool begin; - Mutex mtx; - SyncVar *head; - - BlockDesc() - : mtx(MutexTypeJavaMBlock, StatMtxJavaMBlock) - , head() { - CHECK_EQ(begin, false); - begin = true; - } - - ~BlockDesc() { - CHECK_EQ(begin, true); - begin = false; - ThreadState *thr = cur_thread(); - SyncVar *s = head; - while (s) { - SyncVar *s1 = s->next; - StatInc(thr, StatSyncDestroyed); - s->mtx.Lock(); - s->mtx.Unlock(); - thr->mset.Remove(s->GetId()); - DestroyAndFree(s); - s = s1; - } - } -}; +namespace __tsan { struct JavaContext { const uptr heap_begin; const uptr heap_size; - BlockDesc *heap_shadow; JavaContext(jptr heap_begin, jptr heap_size) : heap_begin(heap_begin) , heap_size(heap_size) { - uptr size = heap_size / kHeapAlignment * sizeof(BlockDesc); - heap_shadow = (BlockDesc*)MmapFixedNoReserve(kHeapShadow, size); - if ((uptr)heap_shadow != kHeapShadow) { - 
Printf("ThreadSanitizer: failed to mmap Java heap shadow\n"); - Die(); - } } }; @@ -91,63 +54,6 @@ class ScopedJavaFunc { static u64 jctx_buf[sizeof(JavaContext) / sizeof(u64) + 1]; static JavaContext *jctx; -static BlockDesc *getblock(uptr addr) { - uptr i = (addr - jctx->heap_begin) / kHeapAlignment; - return &jctx->heap_shadow[i]; -} - -static uptr USED getmem(BlockDesc *b) { - uptr i = b - jctx->heap_shadow; - uptr p = jctx->heap_begin + i * kHeapAlignment; - CHECK_GE(p, jctx->heap_begin); - CHECK_LT(p, jctx->heap_begin + jctx->heap_size); - return p; -} - -static BlockDesc *getblockbegin(uptr addr) { - for (BlockDesc *b = getblock(addr);; b--) { - CHECK_GE(b, jctx->heap_shadow); - if (b->begin) - return b; - } - return 0; -} - -SyncVar* GetJavaSync(ThreadState *thr, uptr pc, uptr addr, - bool write_lock, bool create) { - if (jctx == 0 || addr < jctx->heap_begin - || addr >= jctx->heap_begin + jctx->heap_size) - return 0; - BlockDesc *b = getblockbegin(addr); - DPrintf("#%d: GetJavaSync %p->%p\n", thr->tid, addr, b); - Lock l(&b->mtx); - SyncVar *s = b->head; - for (; s; s = s->next) { - if (s->addr == addr) { - DPrintf("#%d: found existing sync for %p\n", thr->tid, addr); - break; - } - } - if (s == 0 && create) { - DPrintf("#%d: creating new sync for %p\n", thr->tid, addr); - s = ctx->synctab.Create(thr, pc, addr); - s->next = b->head; - b->head = s; - } - if (s) { - if (write_lock) - s->mtx.Lock(); - else - s->mtx.ReadLock(); - } - return s; -} - -SyncVar* GetAndRemoveJavaSync(ThreadState *thr, uptr pc, uptr addr) { - // We do not destroy Java mutexes other than in __tsan_java_free(). - return 0; -} - } // namespace __tsan #define SCOPED_JAVA_FUNC(func) \ @@ -190,8 +96,7 @@ void __tsan_java_alloc(jptr ptr, jptr size) { CHECK_GE(ptr, jctx->heap_begin); CHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size); - BlockDesc *b = getblock(ptr); - new(b) BlockDesc(); + OnUserAlloc(thr, pc, ptr, size, false); } void __tsan_java_free(jptr ptr, jptr size) { @@ -204,12 +109,7 @@ void __tsan_java_free(jptr ptr, jptr size) { CHECK_GE(ptr, jctx->heap_begin); CHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size); - BlockDesc *beg = getblock(ptr); - BlockDesc *end = getblock(ptr + size); - for (BlockDesc *b = beg; b != end; b++) { - if (b->begin) - b->~BlockDesc(); - } + ctx->metamap.FreeRange(thr, pc, ptr, size); } void __tsan_java_move(jptr src, jptr dst, jptr size) { @@ -224,42 +124,36 @@ void __tsan_java_move(jptr src, jptr dst, jptr size) { CHECK_LE(src + size, jctx->heap_begin + jctx->heap_size); CHECK_GE(dst, jctx->heap_begin); CHECK_LE(dst + size, jctx->heap_begin + jctx->heap_size); - CHECK(dst >= src + size || src >= dst + size); + CHECK_NE(dst, src); + CHECK_NE(size, 0); // Assuming it's not running concurrently with threads that do // memory accesses and mutex operations (stop-the-world phase). - { // NOLINT - BlockDesc *s = getblock(src); - BlockDesc *d = getblock(dst); - BlockDesc *send = getblock(src + size); - for (; s != send; s++, d++) { - CHECK_EQ(d->begin, false); - if (s->begin) { - DPrintf("#%d: moving block %p->%p\n", thr->tid, getmem(s), getmem(d)); - new(d) BlockDesc; - d->head = s->head; - for (SyncVar *sync = d->head; sync; sync = sync->next) { - uptr newaddr = sync->addr - src + dst; - DPrintf("#%d: moving sync %p->%p\n", thr->tid, sync->addr, newaddr); - sync->addr = newaddr; - } - s->head = 0; - s->~BlockDesc(); - } - } + ctx->metamap.MoveMemory(src, dst, size); + + // Move shadow. 
+ u64 *s = (u64*)MemToShadow(src); + u64 *d = (u64*)MemToShadow(dst); + u64 *send = (u64*)MemToShadow(src + size); + uptr inc = 1; + if (dst > src) { + s = (u64*)MemToShadow(src + size) - 1; + d = (u64*)MemToShadow(dst + size) - 1; + send = (u64*)MemToShadow(src) - 1; + inc = -1; } - - { // NOLINT - u64 *s = (u64*)MemToShadow(src); - u64 *d = (u64*)MemToShadow(dst); - u64 *send = (u64*)MemToShadow(src + size); - for (; s != send; s++, d++) { - *d = *s; - *s = 0; - } + for (; s != send; s += inc, d += inc) { + *d = *s; + *s = 0; } } +void __tsan_java_finalize() { + SCOPED_JAVA_FUNC(__tsan_java_finalize); + DPrintf("#%d: java_mutex_finalize()\n", thr->tid); + AcquireGlobal(thr, 0); +} + void __tsan_java_mutex_lock(jptr addr) { SCOPED_JAVA_FUNC(__tsan_java_mutex_lock); DPrintf("#%d: java_mutex_lock(%p)\n", thr->tid, addr); diff --git a/libsanitizer/tsan/tsan_interface_java.h b/libsanitizer/tsan/tsan_interface_java.h index 885ff289751..9af1f3fe21a 100644 --- a/libsanitizer/tsan/tsan_interface_java.h +++ b/libsanitizer/tsan/tsan_interface_java.h @@ -48,8 +48,13 @@ void __tsan_java_alloc(jptr ptr, jptr size) INTERFACE_ATTRIBUTE; void __tsan_java_free(jptr ptr, jptr size) INTERFACE_ATTRIBUTE; // Callback for memory move by GC. // Can be aggregated for several objects (preferably). -// The ranges must not overlap. +// The ranges can overlap. void __tsan_java_move(jptr src, jptr dst, jptr size) INTERFACE_ATTRIBUTE; +// This function must be called on the finalizer thread +// before executing a batch of finalizers. +// It ensures necessary synchronization between +// java object creation and finalization. +void __tsan_java_finalize() INTERFACE_ATTRIBUTE; // Mutex lock. // Addr is any unique address associated with the mutex. diff --git a/libsanitizer/tsan/tsan_mman.cc b/libsanitizer/tsan/tsan_mman.cc index 3df0531f0c8..2eae60de512 100644 --- a/libsanitizer/tsan/tsan_mman.cc +++ b/libsanitizer/tsan/tsan_mman.cc @@ -8,6 +8,7 @@ // This file is a part of ThreadSanitizer (TSan), a race detector. // //===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_allocator_interface.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_placement_new.h" #include "tsan_mman.h" @@ -16,43 +17,17 @@ #include "tsan_flags.h" // May be overriden by front-end. 
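The hunk below renames these weak hooks from the tsan-specific __tsan_malloc_hook/__tsan_free_hook to the common __sanitizer_ names. Because they are declared WEAK, a front-end (or a test) overrides them simply by providing strong definitions; a minimal sketch of such an override, with size_t standing in for the runtime's uptr:

```c++
#include <cstdio>
#include <cstddef>

// Strong definitions take precedence over the WEAK defaults in the runtime.
extern "C" void __sanitizer_malloc_hook(void *ptr, size_t size) {
  fprintf(stderr, "allocated %p (%zu bytes)\n", ptr, size);
}

extern "C" void __sanitizer_free_hook(void *ptr) {
  fprintf(stderr, "freeing %p\n", ptr);
}
```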
-extern "C" void WEAK __tsan_malloc_hook(void *ptr, uptr size) { +extern "C" void WEAK __sanitizer_malloc_hook(void *ptr, uptr size) { (void)ptr; (void)size; } -extern "C" void WEAK __tsan_free_hook(void *ptr) { +extern "C" void WEAK __sanitizer_free_hook(void *ptr) { (void)ptr; } namespace __tsan { -COMPILER_CHECK(sizeof(MBlock) == 16); - -void MBlock::Lock() { - atomic_uintptr_t *a = reinterpret_cast<atomic_uintptr_t*>(this); - uptr v = atomic_load(a, memory_order_relaxed); - for (int iter = 0;; iter++) { - if (v & 1) { - if (iter < 10) - proc_yield(20); - else - internal_sched_yield(); - v = atomic_load(a, memory_order_relaxed); - continue; - } - if (atomic_compare_exchange_weak(a, &v, v | 1, memory_order_acquire)) - break; - } -} - -void MBlock::Unlock() { - atomic_uintptr_t *a = reinterpret_cast<atomic_uintptr_t*>(this); - uptr v = atomic_load(a, memory_order_relaxed); - DCHECK(v & 1); - atomic_store(a, v & ~1, memory_order_relaxed); -} - struct MapUnmapCallback { void OnMap(uptr p, uptr size) const { } void OnUnmap(uptr p, uptr size) const { @@ -86,15 +61,16 @@ void AllocatorPrintStats() { } static void SignalUnsafeCall(ThreadState *thr, uptr pc) { - if (!thr->in_signal_handler || !flags()->report_signal_unsafe) + if (atomic_load(&thr->in_signal_handler, memory_order_relaxed) == 0 || + !flags()->report_signal_unsafe) return; StackTrace stack; stack.ObtainCurrent(thr, pc); ThreadRegistryLock l(ctx->thread_registry); ScopedReport rep(ReportTypeSignalUnsafe); if (!IsFiredSuppression(ctx, rep, stack)) { - rep.AddStack(&stack); - OutputReport(ctx, rep, rep.GetReport()->stacks[0]); + rep.AddStack(&stack, true); + OutputReport(thr, rep); } } @@ -104,43 +80,36 @@ void *user_alloc(ThreadState *thr, uptr pc, uptr sz, uptr align) { void *p = allocator()->Allocate(&thr->alloc_cache, sz, align); if (p == 0) return 0; - MBlock *b = new(allocator()->GetMetaData(p)) MBlock; - b->Init(sz, thr->tid, CurrentStackId(thr, pc)); - if (ctx && ctx->initialized) { - if (thr->ignore_reads_and_writes == 0) - MemoryRangeImitateWrite(thr, pc, (uptr)p, sz); - else - MemoryResetRange(thr, pc, (uptr)p, sz); - } - DPrintf("#%d: alloc(%zu) = %p\n", thr->tid, sz, p); + if (ctx && ctx->initialized) + OnUserAlloc(thr, pc, (uptr)p, sz, true); SignalUnsafeCall(thr, pc); return p; } void user_free(ThreadState *thr, uptr pc, void *p) { - CHECK_NE(p, (void*)0); - DPrintf("#%d: free(%p)\n", thr->tid, p); - MBlock *b = (MBlock*)allocator()->GetMetaData(p); - if (b->ListHead()) { - MBlock::ScopedLock l(b); - for (SyncVar *s = b->ListHead(); s;) { - SyncVar *res = s; - s = s->next; - StatInc(thr, StatSyncDestroyed); - res->mtx.Lock(); - res->mtx.Unlock(); - DestroyAndFree(res); - } - b->ListReset(); - } - if (ctx && ctx->initialized) { - if (thr->ignore_reads_and_writes == 0) - MemoryRangeFreed(thr, pc, (uptr)p, b->Size()); - } + if (ctx && ctx->initialized) + OnUserFree(thr, pc, (uptr)p, true); allocator()->Deallocate(&thr->alloc_cache, p); SignalUnsafeCall(thr, pc); } +void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) { + DPrintf("#%d: alloc(%zu) = %p\n", thr->tid, sz, p); + ctx->metamap.AllocBlock(thr, pc, p, sz); + if (write && thr->ignore_reads_and_writes == 0) + MemoryRangeImitateWrite(thr, pc, (uptr)p, sz); + else + MemoryResetRange(thr, pc, (uptr)p, sz); +} + +void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write) { + CHECK_NE(p, (void*)0); + uptr sz = ctx->metamap.FreeBlock(thr, pc, p); + DPrintf("#%d: free(%p, %zu)\n", thr->tid, p, sz); + if (write && thr->ignore_reads_and_writes == 0) + 
MemoryRangeFreed(thr, pc, (uptr)p, sz); +} + void *user_realloc(ThreadState *thr, uptr pc, void *p, uptr sz) { void *p2 = 0; // FIXME: Handle "shrinking" more efficiently, @@ -150,9 +119,8 @@ void *user_realloc(ThreadState *thr, uptr pc, void *p, uptr sz) { if (p2 == 0) return 0; if (p) { - MBlock *b = user_mblock(thr, p); - CHECK_NE(b, 0); - internal_memcpy(p2, p, min(b->Size(), sz)); + uptr oldsz = user_alloc_usable_size(p); + internal_memcpy(p2, p, min(oldsz, sz)); } } if (p) @@ -160,39 +128,29 @@ void *user_realloc(ThreadState *thr, uptr pc, void *p, uptr sz) { return p2; } -uptr user_alloc_usable_size(ThreadState *thr, uptr pc, void *p) { +uptr user_alloc_usable_size(const void *p) { if (p == 0) return 0; - MBlock *b = (MBlock*)allocator()->GetMetaData(p); - return b ? b->Size() : 0; -} - -MBlock *user_mblock(ThreadState *thr, void *p) { - CHECK_NE(p, 0); - Allocator *a = allocator(); - void *b = a->GetBlockBegin(p); - if (b == 0) - return 0; - return (MBlock*)a->GetMetaData(b); + MBlock *b = ctx->metamap.GetBlock((uptr)p); + return b ? b->siz : 0; } void invoke_malloc_hook(void *ptr, uptr size) { ThreadState *thr = cur_thread(); if (ctx == 0 || !ctx->initialized || thr->ignore_interceptors) return; - __tsan_malloc_hook(ptr, size); + __sanitizer_malloc_hook(ptr, size); } void invoke_free_hook(void *ptr) { ThreadState *thr = cur_thread(); if (ctx == 0 || !ctx->initialized || thr->ignore_interceptors) return; - __tsan_free_hook(ptr); + __sanitizer_free_hook(ptr); } void *internal_alloc(MBlockType typ, uptr sz) { ThreadState *thr = cur_thread(); - CHECK_LE(sz, InternalSizeClassMap::kMaxSize); if (thr->nomalloc) { thr->nomalloc = 0; // CHECK calls internal_malloc(). CHECK(0); @@ -214,47 +172,42 @@ void internal_free(void *p) { using namespace __tsan; extern "C" { -uptr __tsan_get_current_allocated_bytes() { +uptr __sanitizer_get_current_allocated_bytes() { uptr stats[AllocatorStatCount]; allocator()->GetStats(stats); return stats[AllocatorStatAllocated]; } -uptr __tsan_get_heap_size() { +uptr __sanitizer_get_heap_size() { uptr stats[AllocatorStatCount]; allocator()->GetStats(stats); return stats[AllocatorStatMapped]; } -uptr __tsan_get_free_bytes() { +uptr __sanitizer_get_free_bytes() { return 1; } -uptr __tsan_get_unmapped_bytes() { +uptr __sanitizer_get_unmapped_bytes() { return 1; } -uptr __tsan_get_estimated_allocated_size(uptr size) { +uptr __sanitizer_get_estimated_allocated_size(uptr size) { return size; } -bool __tsan_get_ownership(void *p) { +int __sanitizer_get_ownership(const void *p) { return allocator()->GetBlockBegin(p) != 0; } -uptr __tsan_get_allocated_size(void *p) { - if (p == 0) - return 0; - p = allocator()->GetBlockBegin(p); - if (p == 0) - return 0; - MBlock *b = (MBlock*)allocator()->GetMetaData(p); - return b->Size(); +uptr __sanitizer_get_allocated_size(const void *p) { + return user_alloc_usable_size(p); } void __tsan_on_thread_idle() { ThreadState *thr = cur_thread(); allocator()->SwallowCache(&thr->alloc_cache); internal_allocator()->SwallowCache(&thr->internal_alloc_cache); + ctx->metamap.OnThreadIdle(thr); } } // extern "C" diff --git a/libsanitizer/tsan/tsan_mman.h b/libsanitizer/tsan/tsan_mman.h index 90faaffa1fc..ab8eb83f919 100644 --- a/libsanitizer/tsan/tsan_mman.h +++ b/libsanitizer/tsan/tsan_mman.h @@ -29,10 +29,7 @@ void *user_alloc(ThreadState *thr, uptr pc, uptr sz, void user_free(ThreadState *thr, uptr pc, void *p); void *user_realloc(ThreadState *thr, uptr pc, void *p, uptr sz); void *user_alloc_aligned(ThreadState *thr, uptr pc, uptr sz, uptr 
align); -uptr user_alloc_usable_size(ThreadState *thr, uptr pc, void *p); -// Given the pointer p into a valid allocated block, -// returns the descriptor of the block. -MBlock *user_mblock(ThreadState *thr, void *p); +uptr user_alloc_usable_size(const void *p); // Invoking malloc/free hooks that may be installed by the user. void invoke_malloc_hook(void *ptr, uptr size); @@ -60,7 +57,6 @@ enum MBlockType { MBlockSuppression, MBlockExpectRace, MBlockSignal, - MBlockFD, MBlockJmpBuf, // This must be the last. diff --git a/libsanitizer/tsan/tsan_mutex.cc b/libsanitizer/tsan/tsan_mutex.cc index 0c3bb4a6721..2e49b9d2de9 100644 --- a/libsanitizer/tsan/tsan_mutex.cc +++ b/libsanitizer/tsan/tsan_mutex.cc @@ -29,13 +29,13 @@ static MutexType CanLockTab[MutexTypeCount][MutexTypeCount] = { /*0 MutexTypeInvalid*/ {}, /*1 MutexTypeTrace*/ {MutexTypeLeaf}, /*2 MutexTypeThreads*/ {MutexTypeReport}, - /*3 MutexTypeReport*/ {MutexTypeSyncTab, MutexTypeSyncVar, + /*3 MutexTypeReport*/ {MutexTypeSyncVar, MutexTypeMBlock, MutexTypeJavaMBlock}, /*4 MutexTypeSyncVar*/ {MutexTypeDDetector}, - /*5 MutexTypeSyncTab*/ {MutexTypeSyncVar}, + /*5 MutexTypeSyncTab*/ {}, // unused /*6 MutexTypeSlab*/ {MutexTypeLeaf}, /*7 MutexTypeAnnotations*/ {}, - /*8 MutexTypeAtExit*/ {MutexTypeSyncTab}, + /*8 MutexTypeAtExit*/ {MutexTypeSyncVar}, /*9 MutexTypeMBlock*/ {MutexTypeSyncVar}, /*10 MutexTypeJavaMBlock*/ {MutexTypeSyncVar}, /*11 MutexTypeDDetector*/ {}, @@ -159,7 +159,19 @@ void InternalDeadlockDetector::Unlock(MutexType t) { CHECK(locked_[t]); locked_[t] = 0; } + +void InternalDeadlockDetector::CheckNoLocks() { + for (int i = 0; i != MutexTypeCount; i++) { + CHECK_EQ(locked_[i], 0); + } +} +#endif + +void CheckNoLocks(ThreadState *thr) { +#if TSAN_DEBUG && !TSAN_GO + thr->internal_deadlock_detector.CheckNoLocks(); #endif +} const uptr kUnlocked = 0; const uptr kWriteLock = 1; @@ -220,7 +232,7 @@ void Mutex::Lock() { cmp = kUnlocked; if (atomic_compare_exchange_weak(&state_, &cmp, kWriteLock, memory_order_acquire)) { -#if TSAN_COLLECT_STATS +#if TSAN_COLLECT_STATS && !TSAN_GO StatInc(cur_thread(), stat_type_, backoff.Contention()); #endif return; @@ -248,7 +260,7 @@ void Mutex::ReadLock() { for (Backoff backoff; backoff.Do();) { prev = atomic_load(&state_, memory_order_acquire); if ((prev & kWriteLock) == 0) { -#if TSAN_COLLECT_STATS +#if TSAN_COLLECT_STATS && !TSAN_GO StatInc(cur_thread(), stat_type_, backoff.Contention()); #endif return; diff --git a/libsanitizer/tsan/tsan_mutex.h b/libsanitizer/tsan/tsan_mutex.h index f075ce831e6..c27bc2b4459 100644 --- a/libsanitizer/tsan/tsan_mutex.h +++ b/libsanitizer/tsan/tsan_mutex.h @@ -69,6 +69,7 @@ class InternalDeadlockDetector { InternalDeadlockDetector(); void Lock(MutexType t); void Unlock(MutexType t); + void CheckNoLocks(); private: u64 seq_; u64 locked_[MutexTypeCount]; @@ -76,6 +77,10 @@ class InternalDeadlockDetector { void InitializeMutex(); +// Checks that the current thread does not hold any runtime locks +// (e.g. when returning from an interceptor). 
+void CheckNoLocks(ThreadState *thr); + } // namespace __tsan #endif // TSAN_MUTEX_H diff --git a/libsanitizer/tsan/tsan_platform.h b/libsanitizer/tsan/tsan_platform.h index 60eb1a84995..b7a6376e8ed 100644 --- a/libsanitizer/tsan/tsan_platform.h +++ b/libsanitizer/tsan/tsan_platform.h @@ -14,31 +14,23 @@ C++ linux memory layout: 0000 0000 0000 - 03c0 0000 0000: protected 03c0 0000 0000 - 1000 0000 0000: shadow -1000 0000 0000 - 6000 0000 0000: protected +1000 0000 0000 - 3000 0000 0000: protected +3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) +4000 0000 0000 - 6000 0000 0000: protected 6000 0000 0000 - 6200 0000 0000: traces 6200 0000 0000 - 7d00 0000 0000: - 7d00 0000 0000 - 7e00 0000 0000: heap 7e00 0000 0000 - 7fff ffff ffff: modules and main thread stack -C++ COMPAT linux memory layout: -0000 0000 0000 - 0400 0000 0000: protected -0400 0000 0000 - 1000 0000 0000: shadow -1000 0000 0000 - 2900 0000 0000: protected -2900 0000 0000 - 2c00 0000 0000: modules -2c00 0000 0000 - 6000 0000 0000: - -6000 0000 0000 - 6200 0000 0000: traces -6200 0000 0000 - 7d00 0000 0000: - -7d00 0000 0000 - 7e00 0000 0000: heap -7e00 0000 0000 - 7f00 0000 0000: - -7f00 0000 0000 - 7fff ffff ffff: main thread stack - Go linux and darwin memory layout: 0000 0000 0000 - 0000 1000 0000: executable 0000 1000 0000 - 00f8 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 1000 0000 0000: - 1000 0000 0000 - 1380 0000 0000: shadow -1460 0000 0000 - 6000 0000 0000: - +1460 0000 0000 - 2000 0000 0000: - +3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) +4000 0000 0000 - 6000 0000 0000: - 6000 0000 0000 - 6200 0000 0000: traces 6200 0000 0000 - 7fff ffff ffff: - @@ -49,7 +41,8 @@ Go windows memory layout: 00e0 0000 0000 - 0100 0000 0000: - 0100 0000 0000 - 0560 0000 0000: shadow 0560 0000 0000 - 0760 0000 0000: traces -0760 0000 0000 - 07ff ffff ffff: - +0760 0000 0000 - 07d0 0000 0000: metainfo (memory blocks and sync objects) +07d0 0000 0000 - 07ff ffff ffff: - */ #ifndef TSAN_PLATFORM_H @@ -66,18 +59,16 @@ static const uptr kLinuxAppMemBeg = 0x000000000000ULL; static const uptr kLinuxAppMemEnd = 0x04dfffffffffULL; # if SANITIZER_WINDOWS static const uptr kLinuxShadowMsk = 0x010000000000ULL; -# else +static const uptr kMetaShadow = 0x076000000000ULL; +static const uptr kMetaSize = 0x007000000000ULL; +# else // if SANITIZER_WINDOWS static const uptr kLinuxShadowMsk = 0x200000000000ULL; -# endif -// TSAN_COMPAT_SHADOW is intended for COMPAT virtual memory layout, -// when memory addresses are of the 0x2axxxxxxxxxx form. -// The option is enabled with 'setarch x86_64 -L'. -#elif defined(TSAN_COMPAT_SHADOW) && TSAN_COMPAT_SHADOW -static const uptr kLinuxAppMemBeg = 0x290000000000ULL; -static const uptr kLinuxAppMemEnd = 0x7fffffffffffULL; -static const uptr kAppMemGapBeg = 0x2c0000000000ULL; -static const uptr kAppMemGapEnd = 0x7d0000000000ULL; -#else +static const uptr kMetaShadow = 0x300000000000ULL; +static const uptr kMetaSize = 0x100000000000ULL; +# endif // if SANITIZER_WINDOWS +#else // defined(TSAN_GO) +static const uptr kMetaShadow = 0x300000000000ULL; +static const uptr kMetaSize = 0x100000000000ULL; static const uptr kLinuxAppMemBeg = 0x7cf000000000ULL; static const uptr kLinuxAppMemEnd = 0x7fffffffffffULL; #endif @@ -94,10 +85,16 @@ const uptr kTraceMemSize = 0x020000000000ULL; // This has to be a macro to allow constant initialization of constants below. 
#ifndef TSAN_GO #define MemToShadow(addr) \ - (((addr) & ~(kLinuxAppMemMsk | (kShadowCell - 1))) * kShadowCnt) + ((((uptr)addr) & ~(kLinuxAppMemMsk | (kShadowCell - 1))) * kShadowCnt) +#define MemToMeta(addr) \ + (u32*)(((((uptr)addr) & ~(kLinuxAppMemMsk | (kMetaShadowCell - 1))) \ + / kMetaShadowCell * kMetaShadowSize) | kMetaShadow) #else #define MemToShadow(addr) \ - ((((addr) & ~(kShadowCell - 1)) * kShadowCnt) | kLinuxShadowMsk) + (((((uptr)addr) & ~(kShadowCell - 1)) * kShadowCnt) | kLinuxShadowMsk) +#define MemToMeta(addr) \ + (u32*)(((((uptr)addr) & ~(kMetaShadowCell - 1)) \ + / kMetaShadowCell * kMetaShadowSize) | kMetaShadow) #endif static const uptr kLinuxShadowBeg = MemToShadow(kLinuxAppMemBeg); @@ -105,9 +102,8 @@ static const uptr kLinuxShadowEnd = MemToShadow(kLinuxAppMemEnd) | 0xff; static inline bool IsAppMem(uptr mem) { -#if defined(TSAN_COMPAT_SHADOW) && TSAN_COMPAT_SHADOW - return (mem >= kLinuxAppMemBeg && mem < kAppMemGapBeg) || - (mem >= kAppMemGapEnd && mem <= kLinuxAppMemEnd); +#if defined(TSAN_GO) + return mem <= kLinuxAppMemEnd; #else return mem >= kLinuxAppMemBeg && mem <= kLinuxAppMemEnd; #endif @@ -126,22 +122,11 @@ static inline uptr ShadowToMem(uptr shadow) { #endif } -// For COMPAT mapping returns an alternative address -// that mapped to the same shadow address. -// COMPAT mapping is not quite one-to-one. -static inline uptr AlternativeAddress(uptr addr) { -#if defined(TSAN_COMPAT_SHADOW) && TSAN_COMPAT_SHADOW - return (addr & ~kLinuxAppMemMsk) | 0x280000000000ULL; -#else - return 0; -#endif -} - void FlushShadowMemory(); -void WriteMemoryProfile(char *buf, uptr buf_size); +void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive); uptr GetRSS(); -const char *InitializePlatform(); +void InitializePlatform(); void FinalizePlatform(); // The additional page is to catch shadow stack overflow as paging fault. diff --git a/libsanitizer/tsan/tsan_platform_linux.cc b/libsanitizer/tsan/tsan_platform_linux.cc index 062e84615cc..ba81fd242e3 100644 --- a/libsanitizer/tsan/tsan_platform_linux.cc +++ b/libsanitizer/tsan/tsan_platform_linux.cc @@ -12,7 +12,7 @@ #include "sanitizer_common/sanitizer_platform.h" -#if SANITIZER_LINUX +#if SANITIZER_LINUX || SANITIZER_FREEBSD #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_libc.h" @@ -30,7 +30,6 @@ #include <string.h> #include <stdarg.h> #include <sys/mman.h> -#include <sys/prctl.h> #include <sys/syscall.h> #include <sys/socket.h> #include <sys/time.h> @@ -41,9 +40,10 @@ #include <errno.h> #include <sched.h> #include <dlfcn.h> +#if SANITIZER_LINUX #define __need_res_state #include <resolv.h> -#include <malloc.h> +#endif #ifdef sa_handler # undef sa_handler @@ -53,61 +53,98 @@ # undef sa_sigaction #endif -extern "C" struct mallinfo __libc_mallinfo(); +#if SANITIZER_FREEBSD +extern "C" void *__libc_stack_end; +void *__libc_stack_end = 0; +#endif namespace __tsan { const uptr kPageSize = 4096; +enum { + MemTotal = 0, + MemShadow = 1, + MemMeta = 2, + MemFile = 3, + MemMmap = 4, + MemTrace = 5, + MemHeap = 6, + MemOther = 7, + MemCount = 8, +}; + void FillProfileCallback(uptr start, uptr rss, bool file, uptr *mem, uptr stats_size) { - CHECK_EQ(7, stats_size); - mem[6] += rss; // total + mem[MemTotal] += rss; start >>= 40; - if (start < 0x10) // shadow - mem[0] += rss; - else if (start >= 0x20 && start < 0x30) // compat modules - mem[file ? 1 : 2] += rss; - else if (start >= 0x7e) // modules - mem[file ? 
1 : 2] += rss; - else if (start >= 0x60 && start < 0x62) // traces - mem[3] += rss; - else if (start >= 0x7d && start < 0x7e) // heap - mem[4] += rss; - else // other - mem[5] += rss; + if (start < 0x10) + mem[MemShadow] += rss; + else if (start >= 0x20 && start < 0x30) + mem[file ? MemFile : MemMmap] += rss; + else if (start >= 0x30 && start < 0x40) + mem[MemMeta] += rss; + else if (start >= 0x7e) + mem[file ? MemFile : MemMmap] += rss; + else if (start >= 0x60 && start < 0x62) + mem[MemTrace] += rss; + else if (start >= 0x7d && start < 0x7e) + mem[MemHeap] += rss; + else + mem[MemOther] += rss; } -void WriteMemoryProfile(char *buf, uptr buf_size) { - uptr mem[7] = {}; +void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { + uptr mem[MemCount] = {}; __sanitizer::GetMemoryProfile(FillProfileCallback, mem, 7); - char *buf_pos = buf; - char *buf_end = buf + buf_size; - buf_pos += internal_snprintf(buf_pos, buf_end - buf_pos, - "RSS %zd MB: shadow:%zd file:%zd mmap:%zd trace:%zd heap:%zd other:%zd\n", - mem[6] >> 20, mem[0] >> 20, mem[1] >> 20, mem[2] >> 20, - mem[3] >> 20, mem[4] >> 20, mem[5] >> 20); - struct mallinfo mi = __libc_mallinfo(); - buf_pos += internal_snprintf(buf_pos, buf_end - buf_pos, - "mallinfo: arena=%d mmap=%d fordblks=%d keepcost=%d\n", - mi.arena >> 20, mi.hblkhd >> 20, mi.fordblks >> 20, mi.keepcost >> 20); + internal_snprintf(buf, buf_size, + "RSS %zd MB: shadow:%zd meta:%zd file:%zd mmap:%zd" + " trace:%zd heap:%zd other:%zd nthr=%zd/%zd\n", + mem[MemTotal] >> 20, mem[MemShadow] >> 20, mem[MemMeta] >> 20, + mem[MemFile] >> 20, mem[MemMmap] >> 20, mem[MemTrace] >> 20, + mem[MemHeap] >> 20, mem[MemOther] >> 20, + nlive, nthread); } uptr GetRSS() { - uptr mem[7] = {}; - __sanitizer::GetMemoryProfile(FillProfileCallback, mem, 7); - return mem[6]; + uptr fd = OpenFile("/proc/self/statm", false); + if ((sptr)fd < 0) + return 0; + char buf[64]; + uptr len = internal_read(fd, buf, sizeof(buf) - 1); + internal_close(fd); + if ((sptr)len <= 0) + return 0; + buf[len] = 0; + // The format of the file is: + // 1084 89 69 11 0 79 0 + // We need the second number which is RSS in 4K units. + char *pos = buf; + // Skip the first number. + while (*pos >= '0' && *pos <= '9') + pos++; + // Skip whitespaces. + while (!(*pos >= '0' && *pos <= '9') && *pos != 0) + pos++; + // Read the number. + uptr rss = 0; + while (*pos >= '0' && *pos <= '9') + rss = rss * 10 + *pos++ - '0'; + return rss * 4096; } - +#if SANITIZER_LINUX void FlushShadowMemoryCallback( const SuspendedThreadsList &suspended_threads_list, void *argument) { FlushUnneededShadowMemory(kLinuxShadowBeg, kLinuxShadowEnd - kLinuxShadowBeg); } +#endif void FlushShadowMemory() { +#if SANITIZER_LINUX StopTheWorld(FlushShadowMemoryCallback, 0); +#endif } #ifndef TSAN_GO @@ -121,9 +158,7 @@ static void ProtectRange(uptr beg, uptr end) { Die(); } } -#endif -#ifndef TSAN_GO // Mark shadow for .rodata sections with the special kShadowRodata marker. // Accesses to .rodata can't race, so this saves time, memory and trace space. static void MapRodata() { @@ -182,6 +217,7 @@ static void MapRodata() { } void InitializeShadowMemory() { + // Map memory shadow. uptr shadow = (uptr)MmapFixedNoReserve(kLinuxShadowBeg, kLinuxShadowEnd - kLinuxShadowBeg); if (shadow != kLinuxShadowBeg) { @@ -190,23 +226,56 @@ void InitializeShadowMemory() { "to link with -pie (%p, %p).\n", shadow, kLinuxShadowBeg); Die(); } + // This memory range is used for thread stacks and large user mmaps. 
+ // Frequently a thread uses only a small part of stack and similarly + // a program uses a small part of large mmap. On some programs + // we see 20% memory usage reduction without huge pages for this range. +#ifdef MADV_NOHUGEPAGE + madvise((void*)MemToShadow(0x7f0000000000ULL), + 0x10000000000ULL * kShadowMultiplier, MADV_NOHUGEPAGE); +#endif + DPrintf("memory shadow: %zx-%zx (%zuGB)\n", + kLinuxShadowBeg, kLinuxShadowEnd, + (kLinuxShadowEnd - kLinuxShadowBeg) >> 30); + + // Map meta shadow. + if (MemToMeta(kLinuxAppMemBeg) < (u32*)kMetaShadow) { + Printf("ThreadSanitizer: bad meta shadow (%p -> %p < %p)\n", + kLinuxAppMemBeg, MemToMeta(kLinuxAppMemBeg), kMetaShadow); + Die(); + } + if (MemToMeta(kLinuxAppMemEnd) >= (u32*)(kMetaShadow + kMetaSize)) { + Printf("ThreadSanitizer: bad meta shadow (%p -> %p >= %p)\n", + kLinuxAppMemEnd, MemToMeta(kLinuxAppMemEnd), kMetaShadow + kMetaSize); + Die(); + } + uptr meta = (uptr)MmapFixedNoReserve(kMetaShadow, kMetaSize); + if (meta != kMetaShadow) { + Printf("FATAL: ThreadSanitizer can not mmap the shadow memory\n"); + Printf("FATAL: Make sure to compile with -fPIE and " + "to link with -pie (%p, %p).\n", meta, kMetaShadow); + Die(); + } + DPrintf("meta shadow: %zx-%zx (%zuGB)\n", + kMetaShadow, kMetaShadow + kMetaSize, kMetaSize >> 30); + + // Protect gaps. const uptr kClosedLowBeg = 0x200000; const uptr kClosedLowEnd = kLinuxShadowBeg - 1; const uptr kClosedMidBeg = kLinuxShadowEnd + 1; - const uptr kClosedMidEnd = min(kLinuxAppMemBeg, kTraceMemBegin); + const uptr kClosedMidEnd = min(min(kLinuxAppMemBeg, kTraceMemBegin), + kMetaShadow); + ProtectRange(kClosedLowBeg, kClosedLowEnd); ProtectRange(kClosedMidBeg, kClosedMidEnd); - DPrintf("kClosedLow %zx-%zx (%zuGB)\n", + VPrintf(2, "kClosedLow %zx-%zx (%zuGB)\n", kClosedLowBeg, kClosedLowEnd, (kClosedLowEnd - kClosedLowBeg) >> 30); - DPrintf("kLinuxShadow %zx-%zx (%zuGB)\n", - kLinuxShadowBeg, kLinuxShadowEnd, - (kLinuxShadowEnd - kLinuxShadowBeg) >> 30); - DPrintf("kClosedMid %zx-%zx (%zuGB)\n", + VPrintf(2, "kClosedMid %zx-%zx (%zuGB)\n", kClosedMidBeg, kClosedMidEnd, (kClosedMidEnd - kClosedMidBeg) >> 30); - DPrintf("kLinuxAppMem %zx-%zx (%zuGB)\n", + VPrintf(2, "app mem: %zx-%zx (%zuGB)\n", kLinuxAppMemBeg, kLinuxAppMemEnd, (kLinuxAppMemEnd - kLinuxAppMemBeg) >> 30); - DPrintf("stack %zx\n", (uptr)&shadow); + VPrintf(2, "stack: %zx\n", (uptr)&shadow); MapRodata(); } @@ -261,26 +330,8 @@ static void InitDataSeg() { #endif // #ifndef TSAN_GO -static rlim_t getlim(int res) { - rlimit rlim; - CHECK_EQ(0, getrlimit(res, &rlim)); - return rlim.rlim_cur; -} - -static void setlim(int res, rlim_t lim) { - // The following magic is to prevent clang from replacing it with memset. - volatile rlimit rlim; - rlim.rlim_cur = lim; - rlim.rlim_max = lim; - setrlimit(res, (rlimit*)&rlim); -} - -const char *InitializePlatform() { - void *p = 0; - if (sizeof(p) == 8) { - // Disable core dumps, dumping of 16TB usually takes a bit long. - setlim(RLIMIT_CORE, 0); - } +void InitializePlatform() { + DisableCoreDumperIfNecessary(); // Go maps shadow memory lazily and works fine with limited address space. // Unlimited stack is not a problem as well, because the executable @@ -290,7 +341,7 @@ const char *InitializePlatform() { // TSan doesn't play well with unlimited stack size (as stack // overlaps with shadow memory). If we detect unlimited stack size, // we re-exec the program with limited stack size as a best effort. 
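The hunk below drops the hand-rolled getlim/setlim helpers in favor of the shared sanitizer_common wrappers (StackSizeIsUnlimited, AddressSpaceIsUnlimited, SetAddressSpaceUnlimited). A standalone sketch of the checks those wrappers are assumed to perform, using plain POSIX rlimit calls:

```c++
#include <sys/resource.h>

// True if RLIMIT_STACK is unlimited -- the condition that forces a re-exec
// with a bounded stack, since an unbounded stack overlaps shadow memory.
static bool StackSizeIsUnlimitedSketch() {
  rlimit rlim;
  if (getrlimit(RLIMIT_STACK, &rlim) != 0)
    return false;
  return rlim.rlim_cur == RLIM_INFINITY;
}

// Lift RLIMIT_AS so the large shadow/meta mappings can be reserved.
static void SetAddressSpaceUnlimitedSketch() {
  rlimit rlim;
  rlim.rlim_cur = RLIM_INFINITY;
  rlim.rlim_max = RLIM_INFINITY;
  setrlimit(RLIMIT_AS, &rlim);  // best effort; may fail without privileges
}
```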
- if (getlim(RLIMIT_STACK) == (rlim_t)-1) { + if (StackSizeIsUnlimited()) { const uptr kMaxStackSize = 32 * 1024 * 1024; VReport(1, "Program is run with unlimited stack size, which wouldn't " "work with ThreadSanitizer.\n" @@ -300,11 +351,11 @@ const char *InitializePlatform() { reexec = true; } - if (getlim(RLIMIT_AS) != (rlim_t)-1) { + if (!AddressSpaceIsUnlimited()) { Report("WARNING: Program is run with limited virtual address space," " which wouldn't work with ThreadSanitizer.\n"); Report("Re-execing with unlimited virtual address space.\n"); - setlim(RLIMIT_AS, -1); + SetAddressSpaceUnlimited(); reexec = true; } if (reexec) @@ -316,7 +367,6 @@ const char *InitializePlatform() { InitTlsSize(); InitDataSeg(); #endif - return GetEnv(kTsanOptionsEnv); } bool IsGlobalVar(uptr addr) { @@ -328,6 +378,7 @@ bool IsGlobalVar(uptr addr) { // This is required to properly "close" the fds, because we do not see internal // closes within glibc. The code is a pure hack. int ExtractResolvFDs(void *state, int *fds, int nfd) { +#if SANITIZER_LINUX int cnt = 0; __res_state *statp = (__res_state*)state; for (int i = 0; i < MAXNS && cnt < nfd; i++) { @@ -335,6 +386,9 @@ int ExtractResolvFDs(void *state, int *fds, int nfd) { fds[cnt++] = statp->_u._ext.nssocks[i]; } return cnt; +#else + return 0; +#endif } // Extract file descriptors passed via UNIX domain sockets. diff --git a/libsanitizer/tsan/tsan_platform_mac.cc b/libsanitizer/tsan/tsan_platform_mac.cc index c3d4d905219..95527c79431 100644 --- a/libsanitizer/tsan/tsan_platform_mac.cc +++ b/libsanitizer/tsan/tsan_platform_mac.cc @@ -45,7 +45,7 @@ uptr GetShadowMemoryConsumption() { void FlushShadowMemory() { } -void WriteMemoryProfile(char *buf, uptr buf_size) { +void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { } uptr GetRSS() { @@ -71,18 +71,8 @@ void InitializeShadowMemory() { } #endif -const char *InitializePlatform() { - void *p = 0; - if (sizeof(p) == 8) { - // Disable core dumps, dumping of 16TB usually takes a bit long. - // The following magic is to prevent clang from replacing it with memset. 
- volatile rlimit lim; - lim.rlim_cur = 0; - lim.rlim_max = 0; - setrlimit(RLIMIT_CORE, (rlimit*)&lim); - } - - return GetEnv(kTsanOptionsEnv); +void InitializePlatform() { + DisableCoreDumperIfNecessary(); } void FinalizePlatform() { diff --git a/libsanitizer/tsan/tsan_platform_windows.cc b/libsanitizer/tsan/tsan_platform_windows.cc index f16bebf7f4c..d6e9e6d33f4 100644 --- a/libsanitizer/tsan/tsan_platform_windows.cc +++ b/libsanitizer/tsan/tsan_platform_windows.cc @@ -26,15 +26,14 @@ uptr GetShadowMemoryConsumption() { void FlushShadowMemory() { } -void WriteMemoryProfile(char *buf, uptr buf_size) { +void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) { } uptr GetRSS() { return 0; } -const char *InitializePlatform() { - return GetEnv(kTsanOptionsEnv); +void InitializePlatform() { } void FinalizePlatform() { diff --git a/libsanitizer/tsan/tsan_report.cc b/libsanitizer/tsan/tsan_report.cc index 00a512e0cf8..b14856cd153 100644 --- a/libsanitizer/tsan/tsan_report.cc +++ b/libsanitizer/tsan/tsan_report.cc @@ -15,9 +15,9 @@ namespace __tsan { -class Decorator: private __sanitizer::AnsiColorDecorator { +class Decorator: public __sanitizer::SanitizerCommonDecorator { public: - Decorator() : __sanitizer::AnsiColorDecorator(PrintsToTtyCached()) { } + Decorator() : SanitizerCommonDecorator() { } const char *Warning() { return Red(); } const char *EndWarning() { return Default(); } const char *Access() { return Blue(); } diff --git a/libsanitizer/tsan/tsan_report.h b/libsanitizer/tsan/tsan_report.h index d773e057af0..0bde59b1675 100644 --- a/libsanitizer/tsan/tsan_report.h +++ b/libsanitizer/tsan/tsan_report.h @@ -40,6 +40,7 @@ struct ReportStack { char *file; int line; int col; + bool suppressable; }; struct ReportMopMutex { @@ -78,6 +79,7 @@ struct ReportLocation { char *name; char *file; int line; + bool suppressable; ReportStack *stack; }; diff --git a/libsanitizer/tsan/tsan_rtl.cc b/libsanitizer/tsan/tsan_rtl.cc index 7932a6d9e04..f5942bcaa3e 100644 --- a/libsanitizer/tsan/tsan_rtl.cc +++ b/libsanitizer/tsan/tsan_rtl.cc @@ -23,6 +23,16 @@ #include "tsan_suppressions.h" #include "tsan_symbolize.h" +#ifdef __SSE3__ +// <emmintrin.h> transitively includes <stdlib.h>, +// and it's prohibited to include std headers into tsan runtime. +// So we do this dirty trick. 
+#define _MM_MALLOC_H_INCLUDED +#define __MM_MALLOC_H +#include <emmintrin.h> +typedef __m128i m128; +#endif + volatile int __tsan_resumed = 0; extern "C" void __tsan_resume() { @@ -110,10 +120,7 @@ static void MemoryProfiler(Context *ctx, fd_t fd, int i) { uptr n_running_threads; ctx->thread_registry->GetNumberOfThreads(&n_threads, &n_running_threads); InternalScopedBuffer<char> buf(4096); - internal_snprintf(buf.data(), buf.size(), "%d: nthr=%d nlive=%d\n", - i, n_threads, n_running_threads); - internal_write(fd, buf.data(), internal_strlen(buf.data())); - WriteMemoryProfile(buf.data(), buf.size()); + WriteMemoryProfile(buf.data(), buf.size(), n_threads, n_running_threads); internal_write(fd, buf.data(), internal_strlen(buf.data())); } @@ -129,15 +136,21 @@ static void BackgroundThread(void *arg) { fd_t mprof_fd = kInvalidFd; if (flags()->profile_memory && flags()->profile_memory[0]) { - InternalScopedBuffer<char> filename(4096); - internal_snprintf(filename.data(), filename.size(), "%s.%d", - flags()->profile_memory, (int)internal_getpid()); - uptr openrv = OpenFile(filename.data(), true); - if (internal_iserror(openrv)) { - Printf("ThreadSanitizer: failed to open memory profile file '%s'\n", - &filename[0]); + if (internal_strcmp(flags()->profile_memory, "stdout") == 0) { + mprof_fd = 1; + } else if (internal_strcmp(flags()->profile_memory, "stderr") == 0) { + mprof_fd = 2; } else { - mprof_fd = openrv; + InternalScopedBuffer<char> filename(4096); + internal_snprintf(filename.data(), filename.size(), "%s.%d", + flags()->profile_memory, (int)internal_getpid()); + uptr openrv = OpenFile(filename.data(), true); + if (internal_iserror(openrv)) { + Printf("ThreadSanitizer: failed to open memory profile file '%s'\n", + &filename[0]); + } else { + mprof_fd = openrv; + } } } @@ -152,27 +165,23 @@ static void BackgroundThread(void *arg) { // Flush memory if requested. if (flags()->flush_memory_ms > 0) { if (last_flush + flags()->flush_memory_ms * kMs2Ns < now) { - if (flags()->verbosity > 0) - Printf("ThreadSanitizer: periodic memory flush\n"); + VPrintf(1, "ThreadSanitizer: periodic memory flush\n"); FlushShadowMemory(); last_flush = NanoTime(); } } + // GetRSS can be expensive on huge programs, so don't do it every 100ms. 
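In the memory-limit block that follows, the flush trigger 2 * rss > limit + last_rss is rearranged algebra for (rss - last_rss) > (limit - rss): flush once the RSS growth since the last flush exceeds the remaining headroom under the limit. A toy check with made-up numbers:

```c++
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical values, all in MB for readability.
  uint64_t limit = 1000, last_rss = 400, rss = 750;
  // growth = 750 - 400 = 350; headroom = 1000 - 750 = 250; 350 > 250,
  // and equivalently 2*750 = 1500 > 1000 + 400 = 1400, so flush fires.
  bool flush = 2 * rss > limit + last_rss;
  printf("flush=%d\n", (int)flush);
  return 0;
}
```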
if (flags()->memory_limit_mb > 0) { uptr rss = GetRSS(); uptr limit = uptr(flags()->memory_limit_mb) << 20; - if (flags()->verbosity > 0) { - Printf("ThreadSanitizer: memory flush check" - " RSS=%llu LAST=%llu LIMIT=%llu\n", - (u64)rss>>20, (u64)last_rss>>20, (u64)limit>>20); - } + VPrintf(1, "ThreadSanitizer: memory flush check" + " RSS=%llu LAST=%llu LIMIT=%llu\n", + (u64)rss >> 20, (u64)last_rss >> 20, (u64)limit >> 20); if (2 * rss > limit + last_rss) { - if (flags()->verbosity > 0) - Printf("ThreadSanitizer: flushing memory due to RSS\n"); + VPrintf(1, "ThreadSanitizer: flushing memory due to RSS\n"); FlushShadowMemory(); rss = GetRSS(); - if (flags()->verbosity > 0) - Printf("ThreadSanitizer: memory flushed RSS=%llu\n", (u64)rss>>20); + VPrintf(1, "ThreadSanitizer: memory flushed RSS=%llu\n", (u64)rss>>20); } last_rss = rss; } @@ -201,11 +210,13 @@ static void StartBackgroundThread() { ctx->background_thread = internal_start_thread(&BackgroundThread, 0); } +#ifndef TSAN_GO static void StopBackgroundThread() { atomic_store(&ctx->stop_background_thread, 1, memory_order_relaxed); internal_join_thread(ctx->background_thread); ctx->background_thread = 0; } +#endif void DontNeedShadowFor(uptr addr, uptr size) { uptr shadow_beg = MemToShadow(addr); @@ -218,6 +229,32 @@ void MapShadow(uptr addr, uptr size) { // so we can get away with unaligned mapping. // CHECK_EQ(addr, addr & ~((64 << 10) - 1)); // windows wants 64K alignment MmapFixedNoReserve(MemToShadow(addr), size * kShadowMultiplier); + + // Meta shadow is 2:1, so tread carefully. + static bool data_mapped = false; + static uptr mapped_meta_end = 0; + uptr meta_begin = (uptr)MemToMeta(addr); + uptr meta_end = (uptr)MemToMeta(addr + size); + meta_begin = RoundDownTo(meta_begin, 64 << 10); + meta_end = RoundUpTo(meta_end, 64 << 10); + if (!data_mapped) { + // First call maps data+bss. + data_mapped = true; + MmapFixedNoReserve(meta_begin, meta_end - meta_begin); + } else { + // Mapping continous heap. + // Windows wants 64K alignment. + meta_begin = RoundDownTo(meta_begin, 64 << 10); + meta_end = RoundUpTo(meta_end, 64 << 10); + if (meta_end <= mapped_meta_end) + return; + if (meta_begin < mapped_meta_end) + meta_begin = mapped_meta_end; + MmapFixedNoReserve(meta_begin, meta_end - meta_begin); + mapped_meta_end = meta_end; + } + VPrintf(2, "mapped meta shadow for (%p-%p) at (%p-%p)\n", + addr, addr+size, meta_begin, meta_end); } void MapThreadTrace(uptr addr, uptr size) { @@ -245,34 +282,35 @@ void Initialize(ThreadState *thr) { // Install tool-specific callbacks in sanitizer_common. SetCheckFailedCallback(TsanCheckFailed); + ctx = new(ctx_placeholder) Context; + const char *options = GetEnv(kTsanOptionsEnv); + InitializeFlags(&ctx->flags, options); #ifndef TSAN_GO InitializeAllocator(); #endif InitializeInterceptors(); - const char *env = InitializePlatform(); + InitializePlatform(); InitializeMutex(); InitializeDynamicAnnotations(); - ctx = new(ctx_placeholder) Context; #ifndef TSAN_GO InitializeShadowMemory(); #endif - InitializeFlags(&ctx->flags, env); // Setup correct file descriptor for error reports. 
- __sanitizer_set_report_path(flags()->log_path); + __sanitizer_set_report_path(common_flags()->log_path); InitializeSuppressions(); #ifndef TSAN_GO InitializeLibIgnore(); - Symbolizer::Init(common_flags()->external_symbolizer_path); - Symbolizer::Get()->AddHooks(EnterSymbolizer, ExitSymbolizer); + Symbolizer::GetOrInit()->AddHooks(EnterSymbolizer, ExitSymbolizer); #endif StartBackgroundThread(); +#ifndef TSAN_GO SetSandboxingCallback(StopBackgroundThread); - if (flags()->detect_deadlocks) +#endif + if (common_flags()->detect_deadlocks) ctx->dd = DDetector::Create(flags()); - if (ctx->flags.verbosity) - Printf("***** Running under ThreadSanitizer v2 (pid %d) *****\n", - (int)internal_getpid()); + VPrintf(1, "***** Running under ThreadSanitizer v2 (pid %d) *****\n", + (int)internal_getpid()); // Initialize thread 0. int tid = ThreadCreate(thr, 0, 0, true); @@ -291,7 +329,6 @@ void Initialize(ThreadState *thr) { } int Finalize(ThreadState *thr) { - Context *ctx = __tsan::ctx; bool failed = false; if (flags()->atexit_sleep_ms > 0 && ThreadCount(thr) > 1) @@ -304,7 +341,7 @@ int Finalize(ThreadState *thr) { ctx->report_mtx.Unlock(); #ifndef TSAN_GO - if (ctx->flags.verbosity) + if (common_flags()->verbosity) AllocatorPrintStats(); #endif @@ -325,7 +362,7 @@ int Finalize(ThreadState *thr) { ctx->nmissed_expected); } - if (flags()->print_suppressions) + if (common_flags()->print_suppressions) PrintMatchedSuppressions(); #ifndef TSAN_GO if (flags()->print_benign) @@ -372,16 +409,37 @@ void ForkChildAfter(ThreadState *thr, uptr pc) { } #endif +#ifdef TSAN_GO +NOINLINE +void GrowShadowStack(ThreadState *thr) { + const int sz = thr->shadow_stack_end - thr->shadow_stack; + const int newsz = 2 * sz; + uptr *newstack = (uptr*)internal_alloc(MBlockShadowStack, + newsz * sizeof(uptr)); + internal_memcpy(newstack, thr->shadow_stack, sz * sizeof(uptr)); + internal_free(thr->shadow_stack); + thr->shadow_stack = newstack; + thr->shadow_stack_pos = newstack + sz; + thr->shadow_stack_end = newstack + newsz; +} +#endif + u32 CurrentStackId(ThreadState *thr, uptr pc) { if (thr->shadow_stack_pos == 0) // May happen during bootstrap. 
return 0; - if (pc) { + if (pc != 0) { +#ifndef TSAN_GO + DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end); +#else + if (thr->shadow_stack_pos == thr->shadow_stack_end) + GrowShadowStack(thr); +#endif thr->shadow_stack_pos[0] = pc; thr->shadow_stack_pos++; } u32 id = StackDepotPut(thr->shadow_stack, thr->shadow_stack_pos - thr->shadow_stack); - if (pc) + if (pc != 0) thr->shadow_stack_pos--; return id; } @@ -443,7 +501,8 @@ void StoreIfNotYetStored(u64 *sp, u64 *s) { *s = 0; } -static inline void HandleRace(ThreadState *thr, u64 *shadow_mem, +ALWAYS_INLINE +void HandleRace(ThreadState *thr, u64 *shadow_mem, Shadow cur, Shadow old) { thr->racy_state[0] = cur.raw(); thr->racy_state[1] = old.raw(); @@ -455,16 +514,12 @@ static inline void HandleRace(ThreadState *thr, u64 *shadow_mem, #endif } -static inline bool OldIsInSameSynchEpoch(Shadow old, ThreadState *thr) { - return old.epoch() >= thr->fast_synch_epoch; -} - static inline bool HappensBefore(Shadow old, ThreadState *thr) { return thr->clock.get(old.TidWithIgnore()) >= old.epoch(); } -ALWAYS_INLINE USED -void MemoryAccessImpl(ThreadState *thr, uptr addr, +ALWAYS_INLINE +void MemoryAccessImpl1(ThreadState *thr, uptr addr, int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic, u64 *shadow_mem, Shadow cur) { StatInc(thr, StatMop); @@ -542,13 +597,13 @@ void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, while (size) { int size1 = 1; int kAccessSizeLog = kSizeLog1; - if (size >= 8 && (addr & ~7) == ((addr + 8) & ~7)) { + if (size >= 8 && (addr & ~7) == ((addr + 7) & ~7)) { size1 = 8; kAccessSizeLog = kSizeLog8; - } else if (size >= 4 && (addr & ~7) == ((addr + 4) & ~7)) { + } else if (size >= 4 && (addr & ~7) == ((addr + 3) & ~7)) { size1 = 4; kAccessSizeLog = kSizeLog4; - } else if (size >= 2 && (addr & ~7) == ((addr + 2) & ~7)) { + } else if (size >= 2 && (addr & ~7) == ((addr + 1) & ~7)) { size1 = 2; kAccessSizeLog = kSizeLog2; } @@ -558,6 +613,90 @@ void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, } } +ALWAYS_INLINE +bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) { + Shadow cur(a); + for (uptr i = 0; i < kShadowCnt; i++) { + Shadow old(LoadShadow(&s[i])); + if (Shadow::Addr0AndSizeAreEqual(cur, old) && + old.TidWithIgnore() == cur.TidWithIgnore() && + old.epoch() > sync_epoch && + old.IsAtomic() == cur.IsAtomic() && + old.IsRead() <= cur.IsRead()) + return true; + } + return false; +} + +#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4 +#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \ + _mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \ + (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64)) +ALWAYS_INLINE +bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) { + // This is an optimized version of ContainsSameAccessSlow. 
+ // load current access into access[0:63] + const m128 access = _mm_cvtsi64_si128(a); + // duplicate high part of access in addr0: + // addr0[0:31] = access[32:63] + // addr0[32:63] = access[32:63] + // addr0[64:95] = access[32:63] + // addr0[96:127] = access[32:63] + const m128 addr0 = SHUF(access, access, 1, 1, 1, 1); + // load 4 shadow slots + const m128 shadow0 = _mm_load_si128((__m128i*)s); + const m128 shadow1 = _mm_load_si128((__m128i*)s + 1); + // load high parts of 4 shadow slots into addr_vect: + // addr_vect[0:31] = shadow0[32:63] + // addr_vect[32:63] = shadow0[96:127] + // addr_vect[64:95] = shadow1[32:63] + // addr_vect[96:127] = shadow1[96:127] + m128 addr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3); + if (!is_write) { + // set IsRead bit in addr_vect + const m128 rw_mask1 = _mm_cvtsi64_si128(1<<15); + const m128 rw_mask = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0); + addr_vect = _mm_or_si128(addr_vect, rw_mask); + } + // addr0 == addr_vect? + const m128 addr_res = _mm_cmpeq_epi32(addr0, addr_vect); + // epoch1[0:63] = sync_epoch + const m128 epoch1 = _mm_cvtsi64_si128(sync_epoch); + // epoch[0:31] = sync_epoch[0:31] + // epoch[32:63] = sync_epoch[0:31] + // epoch[64:95] = sync_epoch[0:31] + // epoch[96:127] = sync_epoch[0:31] + const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0); + // load low parts of shadow cell epochs into epoch_vect: + // epoch_vect[0:31] = shadow0[0:31] + // epoch_vect[32:63] = shadow0[64:95] + // epoch_vect[64:95] = shadow1[0:31] + // epoch_vect[96:127] = shadow1[64:95] + const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2); + // epoch_vect >= sync_epoch? + const m128 epoch_res = _mm_cmpgt_epi32(epoch_vect, epoch); + // addr_res & epoch_res + const m128 res = _mm_and_si128(addr_res, epoch_res); + // mask[0] = res[7] + // mask[1] = res[15] + // ... + // mask[15] = res[127] + const int mask = _mm_movemask_epi8(res); + return mask != 0; +} +#endif + +ALWAYS_INLINE +bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) { +#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4 + bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write); + DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write)); + return res; +#else + return ContainsSameAccessSlow(s, a, sync_epoch, is_write); +#endif +} + ALWAYS_INLINE USED void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic) { @@ -579,7 +718,7 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, } #endif - if (*shadow_mem == kShadowRodata) { + if (kCppMode && *shadow_mem == kShadowRodata) { // Access to .rodata section, no races here. // Measurements show that it can be 10-20% of all memory accesses. StatInc(thr, StatMop); @@ -590,14 +729,12 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, } FastState fast_state = thr->fast_state; - if (fast_state.GetIgnoreBit()) + if (fast_state.GetIgnoreBit()) { + StatInc(thr, StatMop); + StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead); + StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog)); + StatInc(thr, StatMopIgnored); return; - if (kCollectHistory) { - fast_state.IncrementEpoch(); - thr->fast_state = fast_state; - // We must not store to the trace if we do not store to the shadow. - // That is, this call must be moved somewhere below. 
- TraceAddEvent(thr, fast_state, EventTypeMop, pc); } Shadow cur(fast_state); @@ -605,7 +742,41 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, cur.SetWrite(kAccessIsWrite); cur.SetAtomic(kIsAtomic); - MemoryAccessImpl(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic, + if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(), + thr->fast_synch_epoch, kAccessIsWrite))) { + StatInc(thr, StatMop); + StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead); + StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog)); + StatInc(thr, StatMopSame); + return; + } + + if (kCollectHistory) { + fast_state.IncrementEpoch(); + thr->fast_state = fast_state; + TraceAddEvent(thr, fast_state, EventTypeMop, pc); + cur.IncrementEpoch(); + } + + MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic, + shadow_mem, cur); +} + +// Called by MemoryAccessRange in tsan_rtl_thread.cc +ALWAYS_INLINE USED +void MemoryAccessImpl(ThreadState *thr, uptr addr, + int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic, + u64 *shadow_mem, Shadow cur) { + if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(), + thr->fast_synch_epoch, kAccessIsWrite))) { + StatInc(thr, StatMop); + StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead); + StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog)); + StatInc(thr, StatMopSame); + return; + } + + MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic, shadow_mem, cur); } @@ -723,17 +894,8 @@ void FuncEntry(ThreadState *thr, uptr pc) { #ifndef TSAN_GO DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end); #else - if (thr->shadow_stack_pos == thr->shadow_stack_end) { - const int sz = thr->shadow_stack_end - thr->shadow_stack; - const int newsz = 2 * sz; - uptr *newstack = (uptr*)internal_alloc(MBlockShadowStack, - newsz * sizeof(uptr)); - internal_memcpy(newstack, thr->shadow_stack, sz * sizeof(uptr)); - internal_free(thr->shadow_stack); - thr->shadow_stack = newstack; - thr->shadow_stack_pos = newstack + sz; - thr->shadow_stack_end = newstack + newsz; - } + if (thr->shadow_stack_pos == thr->shadow_stack_end) + GrowShadowStack(thr); #endif thr->shadow_stack_pos[0] = pc; thr->shadow_stack_pos++; diff --git a/libsanitizer/tsan/tsan_rtl.h b/libsanitizer/tsan/tsan_rtl.h index e6bc8b28595..c7ea94dfbde 100644 --- a/libsanitizer/tsan/tsan_rtl.h +++ b/libsanitizer/tsan/tsan_rtl.h @@ -42,6 +42,7 @@ #include "tsan_platform.h" #include "tsan_mutexset.h" #include "tsan_ignoreset.h" +#include "tsan_stack_trace.h" #if SANITIZER_WORDSIZE != 64 # error "ThreadSanitizer is supported only on 64-bit platforms" @@ -49,87 +50,12 @@ namespace __tsan { -// Descriptor of user's memory block. 
-struct MBlock { - /* - u64 mtx : 1; // must be first - u64 lst : 44; - u64 stk : 31; // on word boundary - u64 tid : kTidBits; - u64 siz : 128 - 1 - 31 - 44 - kTidBits; // 39 - */ - u64 raw[2]; - - void Init(uptr siz, u32 tid, u32 stk) { - raw[0] = raw[1] = 0; - raw[1] |= (u64)siz << ((1 + 44 + 31 + kTidBits) % 64); - raw[1] |= (u64)tid << ((1 + 44 + 31) % 64); - raw[0] |= (u64)stk << (1 + 44); - raw[1] |= (u64)stk >> (64 - 44 - 1); - DCHECK_EQ(Size(), siz); - DCHECK_EQ(Tid(), tid); - DCHECK_EQ(StackId(), stk); - } - - u32 Tid() const { - return GetLsb(raw[1] >> ((1 + 44 + 31) % 64), kTidBits); - } - - uptr Size() const { - return raw[1] >> ((1 + 31 + 44 + kTidBits) % 64); - } - - u32 StackId() const { - return (raw[0] >> (1 + 44)) | GetLsb(raw[1] << (64 - 44 - 1), 31); - } - - SyncVar *ListHead() const { - return (SyncVar*)(GetLsb(raw[0] >> 1, 44) << 3); - } - - void ListPush(SyncVar *v) { - SyncVar *lst = ListHead(); - v->next = lst; - u64 x = (u64)v ^ (u64)lst; - x = (x >> 3) << 1; - raw[0] ^= x; - DCHECK_EQ(ListHead(), v); - } - - SyncVar *ListPop() { - SyncVar *lst = ListHead(); - SyncVar *nxt = lst->next; - lst->next = 0; - u64 x = (u64)lst ^ (u64)nxt; - x = (x >> 3) << 1; - raw[0] ^= x; - DCHECK_EQ(ListHead(), nxt); - return lst; - } - - void ListReset() { - SyncVar *lst = ListHead(); - u64 x = (u64)lst; - x = (x >> 3) << 1; - raw[0] ^= x; - DCHECK_EQ(ListHead(), 0); - } - - void Lock(); - void Unlock(); - typedef GenericScopedLock<MBlock> ScopedLock; -}; - #ifndef TSAN_GO -#if defined(TSAN_COMPAT_SHADOW) && TSAN_COMPAT_SHADOW -const uptr kAllocatorSpace = 0x7d0000000000ULL; -#else const uptr kAllocatorSpace = 0x7d0000000000ULL; -#endif const uptr kAllocatorSize = 0x10000000000ULL; // 1T. struct MapUnmapCallback; -typedef SizeClassAllocator64<kAllocatorSpace, kAllocatorSize, sizeof(MBlock), +typedef SizeClassAllocator64<kAllocatorSpace, kAllocatorSize, 0, DefaultSizeClassMap, MapUnmapCallback> PrimaryAllocator; typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache; typedef LargeMmapAllocator<MapUnmapCallback> SecondaryAllocator; @@ -146,14 +72,14 @@ const u64 kShadowRodata = (u64)-1; // .rodata shadow marker // FastState (from most significant bit): // ignore : 1 // tid : kTidBits -// epoch : kClkBits // unused : - // history_size : 3 +// epoch : kClkBits class FastState { public: FastState(u64 tid, u64 epoch) { x_ = tid << kTidShift; - x_ |= epoch << kClkShift; + x_ |= epoch; DCHECK_EQ(tid, this->tid()); DCHECK_EQ(epoch, this->epoch()); DCHECK_EQ(GetIgnoreBit(), false); @@ -178,13 +104,13 @@ class FastState { } u64 epoch() const { - u64 res = (x_ << (kTidBits + 1)) >> (64 - kClkBits); + u64 res = x_ & ((1ull << kClkBits) - 1); return res; } void IncrementEpoch() { u64 old_epoch = epoch(); - x_ += 1 << kClkShift; + x_ += 1; DCHECK_EQ(old_epoch + 1, epoch()); (void)old_epoch; } @@ -196,17 +122,19 @@ class FastState { void SetHistorySize(int hs) { CHECK_GE(hs, 0); CHECK_LE(hs, 7); - x_ = (x_ & ~7) | hs; + x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift); } + ALWAYS_INLINE int GetHistorySize() const { - return (int)(x_ & 7); + return (int)((x_ >> kHistoryShift) & kHistoryMask); } void ClearHistorySize() { - x_ &= ~7; + SetHistorySize(0); } + ALWAYS_INLINE u64 GetTracePos() const { const int hs = GetHistorySize(); // When hs == 0, the trace consists of 2 parts. 
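The relayout above moves the epoch into the low kClkBits of FastState (it previously sat higher up, at kClkShift), so IncrementEpoch degenerates to x_ += 1 and the 3-bit history size is masked in just above the epoch. A compressed, self-contained sketch of the new packing; the width kClkBits = 42 is an assumption taken from the surrounding runtime, not this hunk:

```c++
#include <cstdint>
#include <cassert>

// Assumed field widths: epoch in the low kClkBits, 3-bit history size
// immediately above it (tid and flag bits, omitted here, sit higher up).
constexpr int kClkBits = 42;
constexpr uint64_t kHistoryShift = kClkBits;
constexpr uint64_t kHistoryMask = 7;

struct FastStateSketch {
  uint64_t x_ = 0;

  uint64_t epoch() const { return x_ & ((1ull << kClkBits) - 1); }
  void IncrementEpoch() { x_ += 1; }  // epoch occupies the low bits

  void SetHistorySize(int hs) {
    x_ = (x_ & ~(kHistoryMask << kHistoryShift)) |
         (uint64_t(hs) << kHistoryShift);
  }
  int GetHistorySize() const {
    return int((x_ >> kHistoryShift) & kHistoryMask);
  }
};

int main() {
  FastStateSketch s;
  s.SetHistorySize(5);
  s.IncrementEpoch();
  assert(s.epoch() == 1 && s.GetHistorySize() == 5);
  return 0;
}
```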
@@ -217,20 +145,21 @@ class FastState { private: friend class Shadow; static const int kTidShift = 64 - kTidBits - 1; - static const int kClkShift = kTidShift - kClkBits; static const u64 kIgnoreBit = 1ull << 63; static const u64 kFreedBit = 1ull << 63; + static const u64 kHistoryShift = kClkBits; + static const u64 kHistoryMask = 7; u64 x_; }; // Shadow (from most significant bit): // freed : 1 // tid : kTidBits -// epoch : kClkBits // is_atomic : 1 // is_read : 1 // size_log : 2 // addr0 : 3 +// epoch : kClkBits class Shadow : public FastState { public: explicit Shadow(u64 x) @@ -243,10 +172,10 @@ class Shadow : public FastState { } void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) { - DCHECK_EQ(x_ & 31, 0); + DCHECK_EQ((x_ >> kClkBits) & 31, 0); DCHECK_LE(addr0, 7); DCHECK_LE(kAccessSizeLog, 3); - x_ |= (kAccessSizeLog << 3) | addr0; + x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits; DCHECK_EQ(kAccessSizeLog, size_log()); DCHECK_EQ(addr0, this->addr0()); } @@ -279,47 +208,34 @@ class Shadow : public FastState { return shifted_xor == 0; } - static inline bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) { - u64 masked_xor = (s1.x_ ^ s2.x_) & 31; + static ALWAYS_INLINE + bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) { + u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31; return masked_xor == 0; } - static inline bool TwoRangesIntersect(Shadow s1, Shadow s2, + static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2, unsigned kS2AccessSize) { bool res = false; u64 diff = s1.addr0() - s2.addr0(); if ((s64)diff < 0) { // s1.addr0 < s2.addr0 // NOLINT // if (s1.addr0() + size1) > s2.addr0()) return true; - if (s1.size() > -diff) res = true; + if (s1.size() > -diff) + res = true; } else { // if (s2.addr0() + kS2AccessSize > s1.addr0()) return true; - if (kS2AccessSize > diff) res = true; + if (kS2AccessSize > diff) + res = true; } - DCHECK_EQ(res, TwoRangesIntersectSLOW(s1, s2)); - DCHECK_EQ(res, TwoRangesIntersectSLOW(s2, s1)); + DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2)); + DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1)); return res; } - // The idea behind the offset is as follows. - // Consider that we have 8 bool's contained within a single 8-byte block - // (mapped to a single shadow "cell"). Now consider that we write to the bools - // from a single thread (which we consider the common case). - // W/o offsetting each access will have to scan 4 shadow values at average - // to find the corresponding shadow value for the bool. - // With offsetting we start scanning shadow with the offset so that - // each access hits necessary shadow straight off (at least in an expected - // optimistic case). - // This logic works seamlessly for any layout of user data. For example, - // if user data is {int, short, char, char}, then accesses to the int are - // offsetted to 0, short - 4, 1st char - 6, 2nd char - 7. Hopefully, accesses - // from a single thread won't need to scan all 8 shadow values. - unsigned ComputeSearchOffset() { - return x_ & 7; - } - u64 addr0() const { return x_ & 7; } - u64 size() const { return 1ull << size_log(); } - bool IsWrite() const { return !IsRead(); } - bool IsRead() const { return x_ & kReadBit; } + u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; } + u64 ALWAYS_INLINE size() const { return 1ull << size_log(); } + bool ALWAYS_INLINE IsWrite() const { return !IsRead(); } + bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; } // The idea behind the freed bit is as follows. 
// When the memory is freed (or otherwise unaccessible) we write to the shadow @@ -344,15 +260,14 @@ class Shadow : public FastState { return res; } - bool IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const { - // analyzes 5-th bit (is_read) and 6-th bit (is_atomic) - bool v = x_ & u64(((kIsWrite ^ 1) << kReadShift) - | (kIsAtomic << kAtomicShift)); + bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const { + bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift) + | (u64(kIsAtomic) << kAtomicShift)); DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic)); return v; } - bool IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const { + bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const { bool v = ((x_ >> kReadShift) & 3) <= u64((kIsWrite ^ 1) | (kIsAtomic << 1)); DCHECK_EQ(v, (IsAtomic() < kIsAtomic) || @@ -360,7 +275,7 @@ class Shadow : public FastState { return v; } - bool IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const { + bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const { bool v = ((x_ >> kReadShift) & 3) >= u64((kIsWrite ^ 1) | (kIsAtomic << 1)); DCHECK_EQ(v, (IsAtomic() > kIsAtomic) || @@ -369,14 +284,14 @@ class Shadow : public FastState { } private: - static const u64 kReadShift = 5; + static const u64 kReadShift = 5 + kClkBits; static const u64 kReadBit = 1ull << kReadShift; - static const u64 kAtomicShift = 6; + static const u64 kAtomicShift = 6 + kClkBits; static const u64 kAtomicBit = 1ull << kAtomicShift; - u64 size_log() const { return (x_ >> 3) & 3; } + u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; } - static bool TwoRangesIntersectSLOW(const Shadow s1, const Shadow s2) { + static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) { if (s1.addr0() == s2.addr0()) return true; if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0()) return true; @@ -391,6 +306,9 @@ struct SignalContext; struct JmpBuf { uptr sp; uptr mangled_sp; + int int_signal_send; + bool in_blocking_func; + uptr in_signal_handler; uptr *shadow_stack_pos; }; @@ -439,7 +357,7 @@ struct ThreadState { const int unique_id; bool in_symbolizer; bool in_ignored_lib; - bool is_alive; + bool is_dead; bool is_freeing; bool is_vptr_access; const uptr stk_addr; @@ -452,9 +370,13 @@ struct ThreadState { DDPhysicalThread *dd_pt; DDLogicalThread *dd_lt; - bool in_signal_handler; + atomic_uintptr_t in_signal_handler; SignalContext *signal_ctx; + DenseSlabAllocCache block_cache; + DenseSlabAllocCache sync_cache; + DenseSlabAllocCache clock_cache; + #ifndef TSAN_GO u32 last_sleep_stack_id; ThreadClock last_sleep_clock; @@ -498,6 +420,7 @@ class ThreadContext : public ThreadContextBase { void OnStarted(void *arg); void OnCreated(void *arg); void OnReset(); + void OnDetached(void *arg); }; struct RacyStacks { @@ -528,7 +451,7 @@ struct Context { bool initialized; bool after_multithreaded_fork; - SyncTab synctab; + MetaMap metamap; Mutex report_mtx; int nreported; @@ -546,6 +469,8 @@ struct Context { InternalMmapVector<FiredSuppression> fired_suppressions; DDetector *dd; + ClockAlloc clock_alloc; + Flags flags; u64 stat[StatCnt]; @@ -574,11 +499,11 @@ class ScopedReport { explicit ScopedReport(ReportType typ); ~ScopedReport(); - void AddStack(const StackTrace *stack); void AddMemoryAccess(uptr addr, Shadow s, const StackTrace *stack, const MutexSet *mset); - void AddThread(const ThreadContext *tctx); - void AddThread(int unique_tid); + void AddStack(const StackTrace *stack, bool suppressable = false); + void 
AddThread(const ThreadContext *tctx, bool suppressable = false); + void AddThread(int unique_tid, bool suppressable = false); void AddUniqueTid(int unique_tid); void AddMutex(const SyncVar *s); u64 AddMutex(u64 id); @@ -626,11 +551,7 @@ void ForkParentAfter(ThreadState *thr, uptr pc); void ForkChildAfter(ThreadState *thr, uptr pc); void ReportRace(ThreadState *thr); -bool OutputReport(Context *ctx, - const ScopedReport &srep, - const ReportStack *suppress_stack1, - const ReportStack *suppress_stack2 = 0, - const ReportLocation *suppress_loc = 0); +bool OutputReport(ThreadState *thr, const ScopedReport &srep); bool IsFiredSuppression(Context *ctx, const ScopedReport &srep, const StackTrace &trace); @@ -659,9 +580,8 @@ void PrintCurrentStackSlow(); // uses libunwind void Initialize(ThreadState *thr); int Finalize(ThreadState *thr); -SyncVar* GetJavaSync(ThreadState *thr, uptr pc, uptr addr, - bool write_lock, bool create); -SyncVar* GetAndRemoveJavaSync(ThreadState *thr, uptr pc, uptr addr); +void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write); +void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write); void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic); diff --git a/libsanitizer/tsan/tsan_rtl_mutex.cc b/libsanitizer/tsan/tsan_rtl_mutex.cc index 3724571cfff..cc183138aba 100644 --- a/libsanitizer/tsan/tsan_rtl_mutex.cc +++ b/libsanitizer/tsan/tsan_rtl_mutex.cc @@ -50,14 +50,18 @@ void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s) { static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ, uptr addr, u64 mid) { + // In Go, these misuses are either impossible, or detected by std lib, + // or false positives (e.g. unlock in a different thread). + if (kGoMode) + return; ThreadRegistryLock l(ctx->thread_registry); ScopedReport rep(typ); rep.AddMutex(mid); StackTrace trace; trace.ObtainCurrent(thr, pc); - rep.AddStack(&trace); + rep.AddStack(&trace, true); rep.AddLocation(addr, 1); - OutputReport(ctx, rep, rep.GetReport()->stacks[0]); + OutputReport(thr, rep); } void MutexCreate(ThreadState *thr, uptr pc, uptr addr, @@ -70,10 +74,12 @@ void MutexCreate(ThreadState *thr, uptr pc, uptr addr, MemoryWrite(thr, pc, addr, kSizeLog1); thr->is_freeing = false; } - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); s->is_rw = rw; s->is_recursive = recursive; s->is_linker_init = linker_init; + if (kCppMode && s->creation_stack_id == 0) + s->creation_stack_id = CurrentStackId(thr, pc); s->mtx.Unlock(); } @@ -86,37 +92,54 @@ void MutexDestroy(ThreadState *thr, uptr pc, uptr addr) { if (IsGlobalVar(addr)) return; #endif - SyncVar *s = ctx->synctab.GetAndRemove(thr, pc, addr); - if (s == 0) - return; - if (flags()->detect_deadlocks) { - Callback cb(thr, pc); - ctx->dd->MutexDestroy(&cb, &s->dd); - } if (IsAppMem(addr)) { CHECK(!thr->is_freeing); thr->is_freeing = true; MemoryWrite(thr, pc, addr, kSizeLog1); thr->is_freeing = false; } + SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr); + if (s == 0) + return; + if (common_flags()->detect_deadlocks) { + Callback cb(thr, pc); + ctx->dd->MutexDestroy(&cb, &s->dd); + ctx->dd->MutexInit(&cb, &s->dd); + } + bool unlock_locked = false; if (flags()->report_destroy_locked && s->owner_tid != SyncVar::kInvalidTid && !s->is_broken) { s->is_broken = true; + unlock_locked = true; + } + u64 mid = s->GetId(); + u32 last_lock = s->last_lock; + if (!unlock_locked) + s->Reset(thr); // must 
not reset it before the report is printed + s->mtx.Unlock(); + if (unlock_locked) { ThreadRegistryLock l(ctx->thread_registry); ScopedReport rep(ReportTypeMutexDestroyLocked); - rep.AddMutex(s); + rep.AddMutex(mid); StackTrace trace; trace.ObtainCurrent(thr, pc); rep.AddStack(&trace); - FastState last(s->last_lock); + FastState last(last_lock); RestoreStack(last.tid(), last.epoch(), &trace, 0); - rep.AddStack(&trace); - rep.AddLocation(s->addr, 1); - OutputReport(ctx, rep, rep.GetReport()->stacks[0]); + rep.AddStack(&trace, true); + rep.AddLocation(addr, 1); + OutputReport(thr, rep); + } + if (unlock_locked) { + SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr); + if (s != 0) { + s->Reset(thr); + s->mtx.Unlock(); + } } - thr->mset.Remove(s->GetId()); - DestroyAndFree(s); + thr->mset.Remove(mid); + // s will be destroyed and freed in MetaMap::FreeBlock. } void MutexLock(ThreadState *thr, uptr pc, uptr addr, int rec, bool try_lock) { @@ -124,7 +147,7 @@ void MutexLock(ThreadState *thr, uptr pc, uptr addr, int rec, bool try_lock) { CHECK_GT(rec, 0); if (IsAppMem(addr)) MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); thr->fast_state.IncrementEpoch(); TraceAddEvent(thr, thr->fast_state, EventTypeLock, s->GetId()); bool report_double_lock = false; @@ -147,7 +170,7 @@ void MutexLock(ThreadState *thr, uptr pc, uptr addr, int rec, bool try_lock) { } s->recursion += rec; thr->mset.Add(s->GetId(), true, thr->fast_state.epoch()); - if (flags()->detect_deadlocks && s->recursion == 1) { + if (common_flags()->detect_deadlocks && (s->recursion - rec) == 0) { Callback cb(thr, pc); if (!try_lock) ctx->dd->MutexBeforeLock(&cb, &s->dd, true); @@ -158,7 +181,7 @@ void MutexLock(ThreadState *thr, uptr pc, uptr addr, int rec, bool try_lock) { // Can't touch s after this point. if (report_double_lock) ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr, mid); - if (flags()->detect_deadlocks) { + if (common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } @@ -168,12 +191,12 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, bool all) { DPrintf("#%d: MutexUnlock %zx all=%d\n", thr->tid, addr, all); if (IsAppMem(addr)) MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); thr->fast_state.IncrementEpoch(); TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId()); int rec = 0; bool report_bad_unlock = false; - if (s->recursion == 0 || s->owner_tid != thr->tid) { + if (kCppMode && (s->recursion == 0 || s->owner_tid != thr->tid)) { if (flags()->report_mutex_bugs && !s->is_broken) { s->is_broken = true; report_bad_unlock = true; @@ -190,7 +213,8 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, bool all) { } } thr->mset.Del(s->GetId(), true); - if (flags()->detect_deadlocks && s->recursion == 0) { + if (common_flags()->detect_deadlocks && s->recursion == 0 && + !report_bad_unlock) { Callback cb(thr, pc); ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true); } @@ -199,7 +223,7 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, bool all) { // Can't touch s after this point. 
if (report_bad_unlock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid); - if (flags()->detect_deadlocks) { + if (common_flags()->detect_deadlocks && !report_bad_unlock) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } @@ -211,7 +235,7 @@ void MutexReadLock(ThreadState *thr, uptr pc, uptr addr, bool trylock) { StatInc(thr, StatMutexReadLock); if (IsAppMem(addr)) MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, false); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false); thr->fast_state.IncrementEpoch(); TraceAddEvent(thr, thr->fast_state, EventTypeRLock, s->GetId()); bool report_bad_lock = false; @@ -224,7 +248,7 @@ void MutexReadLock(ThreadState *thr, uptr pc, uptr addr, bool trylock) { AcquireImpl(thr, pc, &s->clock); s->last_lock = thr->fast_state.raw(); thr->mset.Add(s->GetId(), false, thr->fast_state.epoch()); - if (flags()->detect_deadlocks && s->recursion == 0) { + if (common_flags()->detect_deadlocks && s->recursion == 0) { Callback cb(thr, pc); if (!trylock) ctx->dd->MutexBeforeLock(&cb, &s->dd, false); @@ -235,7 +259,7 @@ void MutexReadLock(ThreadState *thr, uptr pc, uptr addr, bool trylock) { // Can't touch s after this point. if (report_bad_lock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr, mid); - if (flags()->detect_deadlocks) { + if (common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } @@ -246,7 +270,7 @@ void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) { StatInc(thr, StatMutexReadUnlock); if (IsAppMem(addr)) MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); thr->fast_state.IncrementEpoch(); TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId()); bool report_bad_unlock = false; @@ -257,7 +281,7 @@ void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) { } } ReleaseImpl(thr, pc, &s->read_clock); - if (flags()->detect_deadlocks && s->recursion == 0) { + if (common_flags()->detect_deadlocks && s->recursion == 0) { Callback cb(thr, pc); ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false); } @@ -267,7 +291,7 @@ void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) { thr->mset.Del(mid, false); if (report_bad_unlock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadUnlock, addr, mid); - if (flags()->detect_deadlocks) { + if (common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } @@ -277,7 +301,7 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexReadOrWriteUnlock %zx\n", thr->tid, addr); if (IsAppMem(addr)) MemoryReadAtomic(thr, pc, addr, kSizeLog1); - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); bool write = true; bool report_bad_unlock = false; if (s->owner_tid == SyncVar::kInvalidTid) { @@ -305,7 +329,7 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { report_bad_unlock = true; } thr->mset.Del(s->GetId(), write); - if (flags()->detect_deadlocks && s->recursion == 0) { + if (common_flags()->detect_deadlocks && s->recursion == 0) { Callback cb(thr, pc); ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write); } @@ -314,7 +338,7 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { // Can't touch s after this point. 
if (report_bad_unlock) ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid); - if (flags()->detect_deadlocks) { + if (common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } @@ -322,7 +346,7 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { void MutexRepair(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexRepair %zx\n", thr->tid, addr); - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); s->owner_tid = SyncVar::kInvalidTid; s->recursion = 0; s->mtx.Unlock(); @@ -332,7 +356,7 @@ void Acquire(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: Acquire %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, false); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false); AcquireImpl(thr, pc, &s->clock); s->mtx.ReadUnlock(); } @@ -359,7 +383,7 @@ void Release(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: Release %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); @@ -371,7 +395,7 @@ void ReleaseStore(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: ReleaseStore %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true); + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); thr->fast_state.IncrementEpoch(); // Can't increment epoch w/o writing to the trace as well. TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); @@ -404,7 +428,7 @@ void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) { if (thr->ignore_sync) return; thr->clock.set(thr->fast_state.epoch()); - thr->clock.acquire(c); + thr->clock.acquire(&thr->clock_cache, c); StatInc(thr, StatSyncAcquire); } @@ -413,7 +437,7 @@ void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) { return; thr->clock.set(thr->fast_state.epoch()); thr->fast_synch_epoch = thr->fast_state.epoch(); - thr->clock.release(c); + thr->clock.release(&thr->clock_cache, c); StatInc(thr, StatSyncRelease); } @@ -422,7 +446,7 @@ void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) { return; thr->clock.set(thr->fast_state.epoch()); thr->fast_synch_epoch = thr->fast_state.epoch(); - thr->clock.ReleaseStore(c); + thr->clock.ReleaseStore(&thr->clock_cache, c); StatInc(thr, StatSyncRelease); } @@ -431,7 +455,7 @@ void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) { return; thr->clock.set(thr->fast_state.epoch()); thr->fast_synch_epoch = thr->fast_state.epoch(); - thr->clock.acq_rel(c); + thr->clock.acq_rel(&thr->clock_cache, c); StatInc(thr, StatSyncAcquire); StatInc(thr, StatSyncRelease); } @@ -446,7 +470,7 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) { rep.AddUniqueTid((int)r->loop[i].thr_ctx); rep.AddThread((int)r->loop[i].thr_ctx); } - StackTrace stacks[2 * DDReport::kMaxLoopSize]; + InternalScopedBuffer<StackTrace> stacks(2 * DDReport::kMaxLoopSize); uptr dummy_pc = 0x42; for (int i = 0; i < r->n; i++) { uptr size; @@ -460,12 +484,10 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) { // but we should still produce some stack trace in the report. 
stacks[i].Init(&dummy_pc, 1); } - rep.AddStack(&stacks[i]); + rep.AddStack(&stacks[i], true); } } - // FIXME: use all stacks for suppressions, not just the second stack of the - // first edge. - OutputReport(ctx, rep, rep.GetReport()->stacks[0]); + OutputReport(thr, rep); } } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_rtl_report.cc b/libsanitizer/tsan/tsan_rtl_report.cc index d19deb066df..eafd1f4dfcd 100644 --- a/libsanitizer/tsan/tsan_rtl_report.cc +++ b/libsanitizer/tsan/tsan_rtl_report.cc @@ -59,7 +59,7 @@ static void StackStripMain(ReportStack *stack) { ReportStack *last_frame2 = 0; const char *prefix = "__interceptor_"; uptr prefix_len = internal_strlen(prefix); - const char *path_prefix = flags()->strip_path_prefix; + const char *path_prefix = common_flags()->strip_path_prefix; uptr path_prefix_len = internal_strlen(path_prefix); char *pos; for (ReportStack *ent = stack; ent; ent = ent->next) { @@ -160,9 +160,10 @@ ScopedReport::~ScopedReport() { DestroyAndFree(rep_); } -void ScopedReport::AddStack(const StackTrace *stack) { +void ScopedReport::AddStack(const StackTrace *stack, bool suppressable) { ReportStack **rs = rep_->stacks.PushBack(); *rs = SymbolizeStack(*stack); + (*rs)->suppressable = suppressable; } void ScopedReport::AddMemoryAccess(uptr addr, Shadow s, @@ -176,6 +177,8 @@ void ScopedReport::AddMemoryAccess(uptr addr, Shadow s, mop->write = s.IsWrite(); mop->atomic = s.IsAtomic(); mop->stack = SymbolizeStack(*stack); + if (mop->stack) + mop->stack->suppressable = true; for (uptr i = 0; i < mset->Size(); i++) { MutexSet::Desc d = mset->Get(i); u64 mid = this->AddMutex(d.id); @@ -188,7 +191,7 @@ void ScopedReport::AddUniqueTid(int unique_tid) { rep_->unique_tids.PushBack(unique_tid); } -void ScopedReport::AddThread(const ThreadContext *tctx) { +void ScopedReport::AddThread(const ThreadContext *tctx, bool suppressable) { for (uptr i = 0; i < rep_->threads.Size(); i++) { if ((u32)rep_->threads[i]->id == tctx->tid) return; @@ -203,6 +206,8 @@ void ScopedReport::AddThread(const ThreadContext *tctx) { rt->parent_tid = tctx->parent_tid; rt->stack = 0; rt->stack = SymbolizeStackId(tctx->creation_stack_id); + if (rt->stack) + rt->stack->suppressable = suppressable; } #ifndef TSAN_GO @@ -249,9 +254,9 @@ ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack) { } #endif -void ScopedReport::AddThread(int unique_tid) { +void ScopedReport::AddThread(int unique_tid, bool suppressable) { #ifndef TSAN_GO - AddThread(FindThreadByUidLocked(unique_tid)); + AddThread(FindThreadByUidLocked(unique_tid), suppressable); #endif } @@ -273,7 +278,7 @@ u64 ScopedReport::AddMutex(u64 id) { u64 uid = 0; u64 mid = id; uptr addr = SyncVar::SplitId(id, &uid); - SyncVar *s = ctx->synctab.GetIfExistsAndLock(addr, false); + SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr); // Check that the mutex is still alive. // Another mutex can be created at the same address, // so check uid as well. 
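Since the liveness check above hinges on the mutex id layout, here is a hedged sketch of that encoding: GetId mirrors SyncVar::GetId from the tsan_sync.h hunk later in this patch, while SplitId is reconstructed here as its inverse purely for illustration. Only the low 14 bits of the uid survive the round trip, which is all the check needs.

#include <cassert>
#include <cstdint>

typedef uint64_t u64;
typedef uintptr_t uptr;

static u64 GetLsb(u64 v, int bits) { return v & ((1ull << bits) - 1); }

// 47 lsb is addr, then 14 bits is the low part of uid, then 3 zero bits.
static u64 GetId(uptr addr, u64 uid) {
  return GetLsb((u64)addr | (uid << 47), 61);
}

// Illustrative inverse of GetId.
static uptr SplitId(u64 id, u64 *uid) {
  *uid = id >> 47;
  return (uptr)GetLsb(id, 47);
}

int main() {
  const uptr addr = 0x7d0000001234ull;
  u64 id = GetId(addr, /*uid=*/5);
  u64 uid = 0;
  assert(SplitId(id, &uid) == addr && uid == 5);
  // AddMutex only trusts the SyncVar found at addr if its uid still matches,
  // since a new mutex may since have been created at the same address.
  return 0;
}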
@@ -284,7 +289,7 @@ u64 ScopedReport::AddMutex(u64 id) { AddDeadMutex(id); } if (s) - s->mtx.ReadUnlock(); + s->mtx.Unlock(); return mid; } @@ -309,8 +314,7 @@ void ScopedReport::AddLocation(uptr addr, uptr size) { int fd = -1; int creat_tid = -1; u32 creat_stack = 0; - if (FdLocation(addr, &fd, &creat_tid, &creat_stack) - || FdLocation(AlternativeAddress(addr), &fd, &creat_tid, &creat_stack)) { + if (FdLocation(addr, &fd, &creat_tid, &creat_stack)) { void *mem = internal_alloc(MBlockReportLoc, sizeof(ReportLocation)); ReportLocation *loc = new(mem) ReportLocation(); rep_->locs.PushBack(loc); @@ -324,21 +328,26 @@ void ScopedReport::AddLocation(uptr addr, uptr size) { return; } MBlock *b = 0; - if (allocator()->PointerIsMine((void*)addr) - && (b = user_mblock(0, (void*)addr))) { - ThreadContext *tctx = FindThreadByTidLocked(b->Tid()); + Allocator *a = allocator(); + if (a->PointerIsMine((void*)addr)) { + void *block_begin = a->GetBlockBegin((void*)addr); + if (block_begin) + b = ctx->metamap.GetBlock((uptr)block_begin); + } + if (b != 0) { + ThreadContext *tctx = FindThreadByTidLocked(b->tid); void *mem = internal_alloc(MBlockReportLoc, sizeof(ReportLocation)); ReportLocation *loc = new(mem) ReportLocation(); rep_->locs.PushBack(loc); loc->type = ReportLocationHeap; loc->addr = (uptr)allocator()->GetBlockBegin((void*)addr); - loc->size = b->Size(); - loc->tid = tctx ? tctx->tid : b->Tid(); + loc->size = b->siz; + loc->tid = tctx ? tctx->tid : b->tid; loc->name = 0; loc->file = 0; loc->line = 0; loc->stack = 0; - loc->stack = SymbolizeStackId(b->StackId()); + loc->stack = SymbolizeStackId(b->stk); if (tctx) AddThread(tctx); return; @@ -354,6 +363,7 @@ void ScopedReport::AddLocation(uptr addr, uptr size) { } ReportLocation *loc = SymbolizeData(addr); if (loc) { + loc->suppressable = true; rep_->locs.PushBack(loc); return; } @@ -493,25 +503,31 @@ static void AddRacyStacks(ThreadState *thr, const StackTrace (&traces)[2], } } -bool OutputReport(Context *ctx, - const ScopedReport &srep, - const ReportStack *suppress_stack1, - const ReportStack *suppress_stack2, - const ReportLocation *suppress_loc) { +bool OutputReport(ThreadState *thr, const ScopedReport &srep) { atomic_store(&ctx->last_symbolize_time_ns, NanoTime(), memory_order_relaxed); const ReportDesc *rep = srep.GetReport(); Suppression *supp = 0; - uptr suppress_pc = IsSuppressed(rep->typ, suppress_stack1, &supp); - if (suppress_pc == 0) - suppress_pc = IsSuppressed(rep->typ, suppress_stack2, &supp); - if (suppress_pc == 0) - suppress_pc = IsSuppressed(rep->typ, suppress_loc, &supp); + uptr suppress_pc = 0; + for (uptr i = 0; suppress_pc == 0 && i < rep->mops.Size(); i++) + suppress_pc = IsSuppressed(rep->typ, rep->mops[i]->stack, &supp); + for (uptr i = 0; suppress_pc == 0 && i < rep->stacks.Size(); i++) + suppress_pc = IsSuppressed(rep->typ, rep->stacks[i], &supp); + for (uptr i = 0; suppress_pc == 0 && i < rep->threads.Size(); i++) + suppress_pc = IsSuppressed(rep->typ, rep->threads[i]->stack, &supp); + for (uptr i = 0; suppress_pc == 0 && i < rep->locs.Size(); i++) + suppress_pc = IsSuppressed(rep->typ, rep->locs[i], &supp); if (suppress_pc != 0) { FiredSuppression s = {srep.GetReport()->typ, suppress_pc, supp}; ctx->fired_suppressions.push_back(s); } - if (OnReport(rep, suppress_pc != 0)) - return false; + { + bool old_is_freeing = thr->is_freeing; + thr->is_freeing = false; + bool suppressed = OnReport(rep, suppress_pc != 0); + thr->is_freeing = old_is_freeing; + if (suppressed) + return false; + } PrintReport(rep); 
ctx->nreported++; if (flags()->halt_on_error) @@ -560,41 +576,6 @@ bool FrameIsInternal(const ReportStack *frame) { internal_strstr(frame->file, "tsan_interface_")); } -// On programs that use Java we see weird reports like: -// WARNING: ThreadSanitizer: data race (pid=22512) -// Read of size 8 at 0x7d2b00084318 by thread 100: -// #0 memcpy tsan_interceptors.cc:406 (foo+0x00000d8dfae3) -// #1 <null> <null>:0 (0x7f7ad9b40193) -// Previous write of size 8 at 0x7d2b00084318 by thread 105: -// #0 strncpy tsan_interceptors.cc:501 (foo+0x00000d8e0919) -// #1 <null> <null>:0 (0x7f7ad9b42707) -static bool IsJavaNonsense(const ReportDesc *rep) { -#ifndef TSAN_GO - for (uptr i = 0; i < rep->mops.Size(); i++) { - ReportMop *mop = rep->mops[i]; - ReportStack *frame = mop->stack; - if (frame == 0 - || (frame->func == 0 && frame->file == 0 && frame->line == 0 - && frame->module == 0)) { - return true; - } - if (FrameIsInternal(frame)) { - frame = frame->next; - if (frame == 0 - || (frame->func == 0 && frame->file == 0 && frame->line == 0 - && frame->module == 0)) { - if (frame) { - FiredSuppression supp = {rep->typ, frame->pc, 0}; - ctx->fired_suppressions.push_back(supp); - } - return true; - } - } - } -#endif - return false; -} - static bool RaceBetweenAtomicAndFree(ThreadState *thr) { Shadow s0(thr->racy_state[0]); Shadow s1(thr->racy_state[1]); @@ -609,6 +590,8 @@ static bool RaceBetweenAtomicAndFree(ThreadState *thr) { } void ReportRace(ThreadState *thr) { + CheckNoLocks(thr); + // Symbolizer makes lots of intercepted calls. If we try to process them, // at best it will cause deadlocks on internal mutexes. ScopedIgnoreInterceptors ignore; @@ -671,9 +654,6 @@ void ReportRace(ThreadState *thr) { i == 0 ? &thr->mset : mset2.data()); } - if (flags()->suppress_java && IsJavaNonsense(rep.GetReport())) - return; - for (uptr i = 0; i < kMop; i++) { FastState s(thr->racy_state[i]); ThreadContext *tctx = static_cast<ThreadContext*>( @@ -693,11 +673,7 @@ void ReportRace(ThreadState *thr) { } #endif - ReportLocation *suppress_loc = rep.GetReport()->locs.Size() ? - rep.GetReport()->locs[0] : 0; - if (!OutputReport(ctx, rep, rep.GetReport()->mops[0]->stack, - rep.GetReport()->mops[1]->stack, - suppress_loc)) + if (!OutputReport(thr, rep)) return; AddRacyStacks(thr, traces, addr_min, addr_max); diff --git a/libsanitizer/tsan/tsan_rtl_thread.cc b/libsanitizer/tsan/tsan_rtl_thread.cc index 385af7e1fa6..0c0acc2787f 100644 --- a/libsanitizer/tsan/tsan_rtl_thread.cc +++ b/libsanitizer/tsan/tsan_rtl_thread.cc @@ -34,13 +34,13 @@ ThreadContext::~ThreadContext() { #endif void ThreadContext::OnDead() { - sync.Reset(); + CHECK_EQ(sync.size(), 0); } void ThreadContext::OnJoined(void *arg) { ThreadState *caller_thr = static_cast<ThreadState *>(arg); AcquireImpl(caller_thr, 0, &sync); - sync.Reset(); + sync.Reset(&caller_thr->clock_cache); } struct OnCreatedArgs { @@ -63,11 +63,16 @@ void ThreadContext::OnCreated(void *arg) { } void ThreadContext::OnReset() { - sync.Reset(); + CHECK_EQ(sync.size(), 0); FlushUnneededShadowMemory(GetThreadTrace(tid), TraceSize() * sizeof(Event)); //!!! 
FlushUnneededShadowMemory(GetThreadTraceHeader(tid), sizeof(Trace)); } +void ThreadContext::OnDetached(void *arg) { + ThreadState *thr1 = static_cast<ThreadState*>(arg); + sync.Reset(&thr1->clock_cache); +} + struct OnStartedArgs { ThreadState *thr; uptr stk_addr; @@ -100,7 +105,7 @@ void ThreadContext::OnStarted(void *arg) { #ifndef TSAN_GO AllocatorThreadStart(thr); #endif - if (flags()->detect_deadlocks) { + if (common_flags()->detect_deadlocks) { thr->dd_pt = ctx->dd->CreatePhysicalThread(); thr->dd_lt = ctx->dd->CreateLogicalThread(unique_id); } @@ -111,12 +116,11 @@ void ThreadContext::OnStarted(void *arg) { Trace *thr_trace = ThreadTrace(thr->tid); thr_trace->headers[trace].epoch0 = epoch0; StatInc(thr, StatSyncAcquire); - sync.Reset(); + sync.Reset(&thr->clock_cache); DPrintf("#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx " "tls_addr=%zx tls_size=%zx\n", tid, (uptr)epoch0, args->stk_addr, args->stk_size, args->tls_addr, args->tls_size); - thr->is_alive = true; } void ThreadContext::OnFinished() { @@ -128,10 +132,12 @@ void ThreadContext::OnFinished() { } epoch1 = thr->fast_state.epoch(); - if (flags()->detect_deadlocks) { + if (common_flags()->detect_deadlocks) { ctx->dd->DestroyPhysicalThread(thr->dd_pt); ctx->dd->DestroyLogicalThread(thr->dd_lt); } + ctx->clock_alloc.FlushCache(&thr->clock_cache); + ctx->metamap.OnThreadIdle(thr); #ifndef TSAN_GO AllocatorThreadFinish(thr); #endif @@ -203,9 +209,9 @@ void ThreadFinalize(ThreadState *thr) { MaybeReportThreadLeak, &leaks); for (uptr i = 0; i < leaks.Size(); i++) { ScopedReport rep(ReportTypeThreadLeak); - rep.AddThread(leaks[i].tctx); + rep.AddThread(leaks[i].tctx, true); rep.SetCount(leaks[i].count); - OutputReport(ctx, rep, rep.GetReport()->threads[0]->stack); + OutputReport(thr, rep); } #endif } @@ -275,7 +281,7 @@ void ThreadFinish(ThreadState *thr) { DontNeedShadowFor(thr->stk_addr, thr->stk_size); if (thr->tls_addr && thr->tls_size) DontNeedShadowFor(thr->tls_addr, thr->tls_size); - thr->is_alive = false; + thr->is_dead = true; ctx->thread_registry->FinishThread(thr->tid); } @@ -304,7 +310,7 @@ void ThreadJoin(ThreadState *thr, uptr pc, int tid) { void ThreadDetach(ThreadState *thr, uptr pc, int tid) { CHECK_GT(tid, 0); CHECK_LT(tid, kMaxTid); - ctx->thread_registry->DetachThread(tid); + ctx->thread_registry->DetachThread(tid, thr); } void ThreadSetName(ThreadState *thr, const char *name) { diff --git a/libsanitizer/tsan/tsan_stack_trace.cc b/libsanitizer/tsan/tsan_stack_trace.cc new file mode 100644 index 00000000000..45bd2517837 --- /dev/null +++ b/libsanitizer/tsan/tsan_stack_trace.cc @@ -0,0 +1,110 @@ +//===-- tsan_stack_trace.cc -----------------------------------------------===// +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. 
+// +//===----------------------------------------------------------------------===// +//#include "sanitizer_common/sanitizer_placement_new.h" +#include "tsan_stack_trace.h" +#include "tsan_rtl.h" +#include "tsan_mman.h" + +namespace __tsan { + +StackTrace::StackTrace() + : n_() + , s_() + , c_() { +} + +StackTrace::StackTrace(uptr *buf, uptr cnt) + : n_() + , s_(buf) + , c_(cnt) { + CHECK_NE(buf, 0); + CHECK_NE(cnt, 0); +} + +StackTrace::~StackTrace() { + Reset(); +} + +void StackTrace::Reset() { + if (s_ && !c_) { + CHECK_NE(n_, 0); + internal_free(s_); + s_ = 0; + } + n_ = 0; +} + +void StackTrace::Init(const uptr *pcs, uptr cnt) { + Reset(); + if (cnt == 0) + return; + if (c_) { + CHECK_NE(s_, 0); + CHECK_LE(cnt, c_); + } else { + s_ = (uptr*)internal_alloc(MBlockStackTrace, cnt * sizeof(s_[0])); + } + n_ = cnt; + internal_memcpy(s_, pcs, cnt * sizeof(s_[0])); +} + +void StackTrace::ObtainCurrent(ThreadState *thr, uptr toppc) { + Reset(); + n_ = thr->shadow_stack_pos - thr->shadow_stack; + if (n_ + !!toppc == 0) + return; + uptr start = 0; + if (c_) { + CHECK_NE(s_, 0); + if (n_ + !!toppc > c_) { + start = n_ - c_ + !!toppc; + n_ = c_ - !!toppc; + } + } else { + // Cap potentially huge stacks. + if (n_ + !!toppc > kTraceStackSize) { + start = n_ - kTraceStackSize + !!toppc; + n_ = kTraceStackSize - !!toppc; + } + s_ = (uptr*)internal_alloc(MBlockStackTrace, + (n_ + !!toppc) * sizeof(s_[0])); + } + for (uptr i = 0; i < n_; i++) + s_[i] = thr->shadow_stack[start + i]; + if (toppc) { + s_[n_] = toppc; + n_++; + } +} + +void StackTrace::CopyFrom(const StackTrace& other) { + Reset(); + Init(other.Begin(), other.Size()); +} + +bool StackTrace::IsEmpty() const { + return n_ == 0; +} + +uptr StackTrace::Size() const { + return n_; +} + +uptr StackTrace::Get(uptr i) const { + CHECK_LT(i, n_); + return s_[i]; +} + +const uptr *StackTrace::Begin() const { + return s_; +} + +} // namespace __tsan diff --git a/libsanitizer/tsan/tsan_stack_trace.h b/libsanitizer/tsan/tsan_stack_trace.h new file mode 100644 index 00000000000..ce0cb8859d2 --- /dev/null +++ b/libsanitizer/tsan/tsan_stack_trace.h @@ -0,0 +1,52 @@ +//===-- tsan_stack_trace.h --------------------------------------*- C++ -*-===// +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. +// +//===----------------------------------------------------------------------===// +#ifndef TSAN_STACK_TRACE_H +#define TSAN_STACK_TRACE_H + +//#include "sanitizer_common/sanitizer_atomic.h" +//#include "sanitizer_common/sanitizer_common.h" +//#include "sanitizer_common/sanitizer_deadlock_detector_interface.h" +#include "tsan_defs.h" +//#include "tsan_clock.h" +//#include "tsan_mutex.h" +//#include "tsan_dense_alloc.h" + +namespace __tsan { + +class StackTrace { + public: + StackTrace(); + // Initialized the object in "static mode", + // in this mode it never calls malloc/free but uses the provided buffer. 
+ StackTrace(uptr *buf, uptr cnt); + ~StackTrace(); + void Reset(); + + void Init(const uptr *pcs, uptr cnt); + void ObtainCurrent(ThreadState *thr, uptr toppc); + bool IsEmpty() const; + uptr Size() const; + uptr Get(uptr i) const; + const uptr *Begin() const; + void CopyFrom(const StackTrace& other); + + private: + uptr n_; + uptr *s_; + const uptr c_; + + StackTrace(const StackTrace&); + void operator = (const StackTrace&); +}; + +} // namespace __tsan + +#endif // TSAN_STACK_TRACE_H diff --git a/libsanitizer/tsan/tsan_stat.cc b/libsanitizer/tsan/tsan_stat.cc index e8d3a790b1c..cdb48358551 100644 --- a/libsanitizer/tsan/tsan_stat.cc +++ b/libsanitizer/tsan/tsan_stat.cc @@ -35,6 +35,7 @@ void StatOutput(u64 *stat) { name[StatMop4] = " size 4 "; name[StatMop8] = " size 8 "; name[StatMopSame] = " Including same "; + name[StatMopIgnored] = " Including ignored "; name[StatMopRange] = " Including range "; name[StatMopRodata] = " Including .rodata "; name[StatMopRangeRodata] = " Including .rodata range "; diff --git a/libsanitizer/tsan/tsan_stat.h b/libsanitizer/tsan/tsan_stat.h index 5bdd9de8213..132656f035f 100644 --- a/libsanitizer/tsan/tsan_stat.h +++ b/libsanitizer/tsan/tsan_stat.h @@ -24,6 +24,7 @@ enum StatType { StatMop4, StatMop8, StatMopSame, + StatMopIgnored, StatMopRange, StatMopRodata, StatMopRangeRodata, diff --git a/libsanitizer/tsan/tsan_suppressions.cc b/libsanitizer/tsan/tsan_suppressions.cc index ce8d5fe8611..6b42d3a67b8 100644 --- a/libsanitizer/tsan/tsan_suppressions.cc +++ b/libsanitizer/tsan/tsan_suppressions.cc @@ -39,55 +39,16 @@ extern "C" const char *WEAK __tsan_default_suppressions() { namespace __tsan { -static SuppressionContext* g_ctx; - -static char *ReadFile(const char *filename) { - if (filename == 0 || filename[0] == 0) - return 0; - InternalScopedBuffer<char> tmp(4*1024); - if (filename[0] == '/' || GetPwd() == 0) - internal_snprintf(tmp.data(), tmp.size(), "%s", filename); - else - internal_snprintf(tmp.data(), tmp.size(), "%s/%s", GetPwd(), filename); - uptr openrv = OpenFile(tmp.data(), false); - if (internal_iserror(openrv)) { - Printf("ThreadSanitizer: failed to open suppressions file '%s'\n", - tmp.data()); - Die(); - } - fd_t fd = openrv; - const uptr fsize = internal_filesize(fd); - if (fsize == (uptr)-1) { - Printf("ThreadSanitizer: failed to stat suppressions file '%s'\n", - tmp.data()); - Die(); - } - char *buf = (char*)internal_alloc(MBlockSuppression, fsize + 1); - if (fsize != internal_read(fd, buf, fsize)) { - Printf("ThreadSanitizer: failed to read suppressions file '%s'\n", - tmp.data()); - Die(); - } - internal_close(fd); - buf[fsize] = 0; - return buf; -} +static bool suppressions_inited = false; void InitializeSuppressions() { - ALIGNED(64) static char placeholder_[sizeof(SuppressionContext)]; - g_ctx = new(placeholder_) SuppressionContext; - const char *supp = ReadFile(flags()->suppressions); - g_ctx->Parse(supp); + CHECK(!suppressions_inited); + SuppressionContext::InitIfNecessary(); #ifndef TSAN_GO - supp = __tsan_default_suppressions(); - g_ctx->Parse(supp); - g_ctx->Parse(std_suppressions); + SuppressionContext::Get()->Parse(__tsan_default_suppressions()); + SuppressionContext::Get()->Parse(std_suppressions); #endif -} - -SuppressionContext *GetSuppressionContext() { - CHECK_NE(g_ctx, 0); - return g_ctx; + suppressions_inited = true; } SuppressionType conv(ReportType typ) { @@ -120,16 +81,17 @@ SuppressionType conv(ReportType typ) { } uptr IsSuppressed(ReportType typ, const ReportStack *stack, Suppression **sp) { - CHECK(g_ctx); - if 
(!g_ctx->SuppressionCount() || stack == 0) return 0; + if (!SuppressionContext::Get()->SuppressionCount() || stack == 0 || + !stack->suppressable) + return 0; SuppressionType stype = conv(typ); if (stype == SuppressionNone) return 0; Suppression *s; for (const ReportStack *frame = stack; frame; frame = frame->next) { - if (g_ctx->Match(frame->func, stype, &s) || - g_ctx->Match(frame->file, stype, &s) || - g_ctx->Match(frame->module, stype, &s)) { + if (SuppressionContext::Get()->Match(frame->func, stype, &s) || + SuppressionContext::Get()->Match(frame->file, stype, &s) || + SuppressionContext::Get()->Match(frame->module, stype, &s)) { DPrintf("ThreadSanitizer: matched suppression '%s'\n", s->templ); s->hit_count++; *sp = s; @@ -140,17 +102,16 @@ uptr IsSuppressed(ReportType typ, const ReportStack *stack, Suppression **sp) { } uptr IsSuppressed(ReportType typ, const ReportLocation *loc, Suppression **sp) { - CHECK(g_ctx); - if (!g_ctx->SuppressionCount() || loc == 0 || - loc->type != ReportLocationGlobal) + if (!SuppressionContext::Get()->SuppressionCount() || loc == 0 || + loc->type != ReportLocationGlobal || !loc->suppressable) return 0; SuppressionType stype = conv(typ); if (stype == SuppressionNone) return 0; Suppression *s; - if (g_ctx->Match(loc->name, stype, &s) || - g_ctx->Match(loc->file, stype, &s) || - g_ctx->Match(loc->module, stype, &s)) { + if (SuppressionContext::Get()->Match(loc->name, stype, &s) || + SuppressionContext::Get()->Match(loc->file, stype, &s) || + SuppressionContext::Get()->Match(loc->module, stype, &s)) { DPrintf("ThreadSanitizer: matched suppression '%s'\n", s->templ); s->hit_count++; *sp = s; @@ -160,9 +121,8 @@ uptr IsSuppressed(ReportType typ, const ReportLocation *loc, Suppression **sp) { } void PrintMatchedSuppressions() { - CHECK(g_ctx); InternalMmapVector<Suppression *> matched(1); - g_ctx->GetMatched(&matched); + SuppressionContext::Get()->GetMatched(&matched); if (!matched.size()) return; int hit_count = 0; diff --git a/libsanitizer/tsan/tsan_suppressions.h b/libsanitizer/tsan/tsan_suppressions.h index 2939e9a8b9f..e38d81ece85 100644 --- a/libsanitizer/tsan/tsan_suppressions.h +++ b/libsanitizer/tsan/tsan_suppressions.h @@ -20,7 +20,6 @@ void InitializeSuppressions(); void PrintMatchedSuppressions(); uptr IsSuppressed(ReportType typ, const ReportStack *stack, Suppression **sp); uptr IsSuppressed(ReportType typ, const ReportLocation *loc, Suppression **sp); -SuppressionContext *GetSuppressionContext(); } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_symbolize.cc b/libsanitizer/tsan/tsan_symbolize.cc index fa36017846c..49ae3dffa47 100644 --- a/libsanitizer/tsan/tsan_symbolize.cc +++ b/libsanitizer/tsan/tsan_symbolize.cc @@ -109,7 +109,7 @@ ReportStack *SymbolizeCode(uptr addr) { InternalScopedBuffer<AddressInfo> addr_frames(kMaxAddrFrames); for (uptr i = 0; i < kMaxAddrFrames; i++) new(&addr_frames[i]) AddressInfo(); - uptr addr_frames_num = Symbolizer::Get()->SymbolizePC( + uptr addr_frames_num = Symbolizer::GetOrInit()->SymbolizePC( addr, addr_frames.data(), kMaxAddrFrames); if (addr_frames_num == 0) return NewReportStackEntry(addr); @@ -130,7 +130,7 @@ ReportStack *SymbolizeCode(uptr addr) { ReportLocation *SymbolizeData(uptr addr) { DataInfo info; - if (!Symbolizer::Get()->SymbolizeData(addr, &info)) + if (!Symbolizer::GetOrInit()->SymbolizeData(addr, &info)) return 0; ReportLocation *ent = (ReportLocation*)internal_alloc(MBlockReportStack, sizeof(ReportLocation)); @@ -146,7 +146,7 @@ ReportLocation *SymbolizeData(uptr addr) { } void 
SymbolizeFlush() { - Symbolizer::Get()->Flush(); + Symbolizer::GetOrInit()->Flush(); } } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_sync.cc b/libsanitizer/tsan/tsan_sync.cc index f6f2cb731e7..2209199ac48 100644 --- a/libsanitizer/tsan/tsan_sync.cc +++ b/libsanitizer/tsan/tsan_sync.cc @@ -17,293 +17,207 @@ namespace __tsan { void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s); -SyncVar::SyncVar(uptr addr, u64 uid) - : mtx(MutexTypeSyncVar, StatMtxSyncVar) - , addr(addr) - , uid(uid) - , creation_stack_id() - , owner_tid(kInvalidTid) - , last_lock() - , recursion() - , is_rw() - , is_recursive() - , is_broken() - , is_linker_init() { +SyncVar::SyncVar() + : mtx(MutexTypeSyncVar, StatMtxSyncVar) { + Reset(0); +} + +void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid) { + this->addr = addr; + this->uid = uid; + this->next = 0; + + creation_stack_id = 0; + if (kCppMode) // Go does not use them + creation_stack_id = CurrentStackId(thr, pc); + if (common_flags()->detect_deadlocks) + DDMutexInit(thr, pc, this); +} + +void SyncVar::Reset(ThreadState *thr) { + uid = 0; + creation_stack_id = 0; + owner_tid = kInvalidTid; + last_lock = 0; + recursion = 0; + is_rw = 0; + is_recursive = 0; + is_broken = 0; + is_linker_init = 0; + + if (thr == 0) { + CHECK_EQ(clock.size(), 0); + CHECK_EQ(read_clock.size(), 0); + } else { + clock.Reset(&thr->clock_cache); + read_clock.Reset(&thr->clock_cache); + } } -SyncTab::Part::Part() - : mtx(MutexTypeSyncTab, StatMtxSyncTab) - , val() { +MetaMap::MetaMap() { + atomic_store(&uid_gen_, 0, memory_order_relaxed); } -SyncTab::SyncTab() { +void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) { + u32 idx = block_alloc_.Alloc(&thr->block_cache); + MBlock *b = block_alloc_.Map(idx); + b->siz = sz; + b->tid = thr->tid; + b->stk = CurrentStackId(thr, pc); + u32 *meta = MemToMeta(p); + DCHECK_EQ(*meta, 0); + *meta = idx | kFlagBlock; } -SyncTab::~SyncTab() { - for (int i = 0; i < kPartCount; i++) { - while (tab_[i].val) { - SyncVar *tmp = tab_[i].val; - tab_[i].val = tmp->next; - DestroyAndFree(tmp); +uptr MetaMap::FreeBlock(ThreadState *thr, uptr pc, uptr p) { + MBlock* b = GetBlock(p); + if (b == 0) + return 0; + uptr sz = RoundUpTo(b->siz, kMetaShadowCell); + FreeRange(thr, pc, p, sz); + return sz; +} + +void MetaMap::FreeRange(ThreadState *thr, uptr pc, uptr p, uptr sz) { + u32 *meta = MemToMeta(p); + u32 *end = MemToMeta(p + sz); + if (end == meta) + end++; + for (; meta < end; meta++) { + u32 idx = *meta; + *meta = 0; + for (;;) { + if (idx == 0) + break; + if (idx & kFlagBlock) { + block_alloc_.Free(&thr->block_cache, idx & ~kFlagMask); + break; + } else if (idx & kFlagSync) { + DCHECK(idx & kFlagSync); + SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask); + u32 next = s->next; + s->Reset(thr); + sync_alloc_.Free(&thr->sync_cache, idx & ~kFlagMask); + idx = next; + } else { + CHECK(0); + } } } } -SyncVar* SyncTab::GetOrCreateAndLock(ThreadState *thr, uptr pc, - uptr addr, bool write_lock) { - return GetAndLock(thr, pc, addr, write_lock, true); +MBlock* MetaMap::GetBlock(uptr p) { + u32 *meta = MemToMeta(p); + u32 idx = *meta; + for (;;) { + if (idx == 0) + return 0; + if (idx & kFlagBlock) + return block_alloc_.Map(idx & ~kFlagMask); + DCHECK(idx & kFlagSync); + SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask); + idx = s->next; + } } -SyncVar* SyncTab::GetIfExistsAndLock(uptr addr, bool write_lock) { - return GetAndLock(0, 0, addr, write_lock, false); +SyncVar* MetaMap::GetOrCreateAndLock(ThreadState *thr, uptr pc, + uptr addr, 
bool write_lock) { + return GetAndLock(thr, pc, addr, write_lock, true); } -SyncVar* SyncTab::Create(ThreadState *thr, uptr pc, uptr addr) { - StatInc(thr, StatSyncCreated); - void *mem = internal_alloc(MBlockSync, sizeof(SyncVar)); - const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed); - SyncVar *res = new(mem) SyncVar(addr, uid); - res->creation_stack_id = 0; - if (!kGoMode) // Go does not use them - res->creation_stack_id = CurrentStackId(thr, pc); - if (flags()->detect_deadlocks) - DDMutexInit(thr, pc, res); - return res; +SyncVar* MetaMap::GetIfExistsAndLock(uptr addr) { + return GetAndLock(0, 0, addr, true, false); } -SyncVar* SyncTab::GetAndLock(ThreadState *thr, uptr pc, +SyncVar* MetaMap::GetAndLock(ThreadState *thr, uptr pc, uptr addr, bool write_lock, bool create) { -#ifndef TSAN_GO - { // NOLINT - SyncVar *res = GetJavaSync(thr, pc, addr, write_lock, create); - if (res) - return res; - } - - // Here we ask only PrimaryAllocator, because - // SecondaryAllocator::PointerIsMine() is slow and we have fallback on - // the hashmap anyway. - if (PrimaryAllocator::PointerIsMine((void*)addr)) { - MBlock *b = user_mblock(thr, (void*)addr); - CHECK_NE(b, 0); - MBlock::ScopedLock l(b); - SyncVar *res = 0; - for (res = b->ListHead(); res; res = res->next) { - if (res->addr == addr) + u32 *meta = MemToMeta(addr); + u32 idx0 = *meta; + u32 myidx = 0; + SyncVar *mys = 0; + for (;;) { + u32 idx = idx0; + for (;;) { + if (idx == 0) break; - } - if (res == 0) { - if (!create) - return 0; - res = Create(thr, pc, addr); - b->ListPush(res); - } - if (write_lock) - res->mtx.Lock(); - else - res->mtx.ReadLock(); - return res; - } -#endif - - Part *p = &tab_[PartIdx(addr)]; - { - ReadLock l(&p->mtx); - for (SyncVar *res = p->val; res; res = res->next) { - if (res->addr == addr) { + if (idx & kFlagBlock) + break; + DCHECK(idx & kFlagSync); + SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask); + if (s->addr == addr) { + if (myidx != 0) { + mys->Reset(thr); + sync_alloc_.Free(&thr->sync_cache, myidx); + } if (write_lock) - res->mtx.Lock(); + s->mtx.Lock(); else - res->mtx.ReadLock(); - return res; + s->mtx.ReadLock(); + return s; } + idx = s->next; } - } - if (!create) - return 0; - { - Lock l(&p->mtx); - SyncVar *res = p->val; - for (; res; res = res->next) { - if (res->addr == addr) - break; - } - if (res == 0) { - res = Create(thr, pc, addr); - res->next = p->val; - p->val = res; + if (!create) + return 0; + if (*meta != idx0) { + idx0 = *meta; + continue; } - if (write_lock) - res->mtx.Lock(); - else - res->mtx.ReadLock(); - return res; - } -} -SyncVar* SyncTab::GetAndRemove(ThreadState *thr, uptr pc, uptr addr) { -#ifndef TSAN_GO - { // NOLINT - SyncVar *res = GetAndRemoveJavaSync(thr, pc, addr); - if (res) - return res; - } - if (PrimaryAllocator::PointerIsMine((void*)addr)) { - MBlock *b = user_mblock(thr, (void*)addr); - CHECK_NE(b, 0); - SyncVar *res = 0; - { - MBlock::ScopedLock l(b); - res = b->ListHead(); - if (res) { - if (res->addr == addr) { - if (res->is_linker_init) - return 0; - b->ListPop(); - } else { - SyncVar **prev = &res->next; - res = *prev; - while (res) { - if (res->addr == addr) { - if (res->is_linker_init) - return 0; - *prev = res->next; - break; - } - prev = &res->next; - res = *prev; - } - } - if (res) { - StatInc(thr, StatSyncDestroyed); - res->mtx.Lock(); - res->mtx.Unlock(); - } - } + if (myidx == 0) { + const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed); + myidx = sync_alloc_.Alloc(&thr->sync_cache); + mys = sync_alloc_.Map(myidx); + 
mys->Init(thr, pc, addr, uid); } - return res; - } -#endif - - Part *p = &tab_[PartIdx(addr)]; - SyncVar *res = 0; - { - Lock l(&p->mtx); - SyncVar **prev = &p->val; - res = *prev; - while (res) { - if (res->addr == addr) { - if (res->is_linker_init) - return 0; - *prev = res->next; - break; - } - prev = &res->next; - res = *prev; + mys->next = idx0; + if (atomic_compare_exchange_strong((atomic_uint32_t*)meta, &idx0, + myidx | kFlagSync, memory_order_release)) { + if (write_lock) + mys->mtx.Lock(); + else + mys->mtx.ReadLock(); + return mys; } } - if (res) { - StatInc(thr, StatSyncDestroyed); - res->mtx.Lock(); - res->mtx.Unlock(); - } - return res; -} - -int SyncTab::PartIdx(uptr addr) { - return (addr >> 3) % kPartCount; -} - -StackTrace::StackTrace() - : n_() - , s_() - , c_() { } -StackTrace::StackTrace(uptr *buf, uptr cnt) - : n_() - , s_(buf) - , c_(cnt) { - CHECK_NE(buf, 0); - CHECK_NE(cnt, 0); -} - -StackTrace::~StackTrace() { - Reset(); -} - -void StackTrace::Reset() { - if (s_ && !c_) { - CHECK_NE(n_, 0); - internal_free(s_); - s_ = 0; - } - n_ = 0; -} - -void StackTrace::Init(const uptr *pcs, uptr cnt) { - Reset(); - if (cnt == 0) - return; - if (c_) { - CHECK_NE(s_, 0); - CHECK_LE(cnt, c_); - } else { - s_ = (uptr*)internal_alloc(MBlockStackTrace, cnt * sizeof(s_[0])); +void MetaMap::MoveMemory(uptr src, uptr dst, uptr sz) { + // src and dst can overlap, + // there are no concurrent accesses to the regions (e.g. stop-the-world). + CHECK_NE(src, dst); + CHECK_NE(sz, 0); + uptr diff = dst - src; + u32 *src_meta = MemToMeta(src); + u32 *dst_meta = MemToMeta(dst); + u32 *src_meta_end = MemToMeta(src + sz); + uptr inc = 1; + if (dst > src) { + src_meta = MemToMeta(src + sz) - 1; + dst_meta = MemToMeta(dst + sz) - 1; + src_meta_end = MemToMeta(src) - 1; + inc = -1; } - n_ = cnt; - internal_memcpy(s_, pcs, cnt * sizeof(s_[0])); -} - -void StackTrace::ObtainCurrent(ThreadState *thr, uptr toppc) { - Reset(); - n_ = thr->shadow_stack_pos - thr->shadow_stack; - if (n_ + !!toppc == 0) - return; - uptr start = 0; - if (c_) { - CHECK_NE(s_, 0); - if (n_ + !!toppc > c_) { - start = n_ - c_ + !!toppc; - n_ = c_ - !!toppc; - } - } else { - // Cap potentially huge stacks. - if (n_ + !!toppc > kTraceStackSize) { - start = n_ - kTraceStackSize + !!toppc; - n_ = kTraceStackSize - !!toppc; + for (; src_meta != src_meta_end; src_meta += inc, dst_meta += inc) { + CHECK_EQ(*dst_meta, 0); + u32 idx = *src_meta; + *src_meta = 0; + *dst_meta = idx; + // Patch the addresses in sync objects. 
+ while (idx != 0) { + if (idx & kFlagBlock) + break; + CHECK(idx & kFlagSync); + SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask); + s->addr += diff; + idx = s->next; } - s_ = (uptr*)internal_alloc(MBlockStackTrace, - (n_ + !!toppc) * sizeof(s_[0])); - } - for (uptr i = 0; i < n_; i++) - s_[i] = thr->shadow_stack[start + i]; - if (toppc) { - s_[n_] = toppc; - n_++; } } -void StackTrace::CopyFrom(const StackTrace& other) { - Reset(); - Init(other.Begin(), other.Size()); -} - -bool StackTrace::IsEmpty() const { - return n_ == 0; -} - -uptr StackTrace::Size() const { - return n_; -} - -uptr StackTrace::Get(uptr i) const { - CHECK_LT(i, n_); - return s_[i]; -} - -const uptr *StackTrace::Begin() const { - return s_; +void MetaMap::OnThreadIdle(ThreadState *thr) { + block_alloc_.FlushCache(&thr->block_cache); + sync_alloc_.FlushCache(&thr->sync_cache); } } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_sync.h b/libsanitizer/tsan/tsan_sync.h index 3838df91d75..6ed3715ee0a 100644 --- a/libsanitizer/tsan/tsan_sync.h +++ b/libsanitizer/tsan/tsan_sync.h @@ -14,46 +14,21 @@ #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_deadlock_detector_interface.h" -#include "tsan_clock.h" #include "tsan_defs.h" +#include "tsan_clock.h" #include "tsan_mutex.h" +#include "tsan_dense_alloc.h" namespace __tsan { -class StackTrace { - public: - StackTrace(); - // Initialized the object in "static mode", - // in this mode it never calls malloc/free but uses the provided buffer. - StackTrace(uptr *buf, uptr cnt); - ~StackTrace(); - void Reset(); - - void Init(const uptr *pcs, uptr cnt); - void ObtainCurrent(ThreadState *thr, uptr toppc); - bool IsEmpty() const; - uptr Size() const; - uptr Get(uptr i) const; - const uptr *Begin() const; - void CopyFrom(const StackTrace& other); - - private: - uptr n_; - uptr *s_; - const uptr c_; - - StackTrace(const StackTrace&); - void operator = (const StackTrace&); -}; - struct SyncVar { - explicit SyncVar(uptr addr, u64 uid); + SyncVar(); static const int kInvalidTid = -1; + uptr addr; // overwritten by DenseSlabAlloc freelist Mutex mtx; - uptr addr; - const u64 uid; // Globally unique id. + u64 uid; // Globally unique id. u32 creation_stack_id; int owner_tid; // Set only by exclusive owners. u64 last_lock; @@ -62,13 +37,16 @@ struct SyncVar { bool is_recursive; bool is_broken; bool is_linker_init; - SyncVar *next; // In SyncTab hashtable. + u32 next; // in MetaMap DDMutex dd; SyncClock read_clock; // Used for rw mutexes only. // The clock is placed last, so that it is situated on a different cache line // with the mtx. This reduces contention for hot sync objects. SyncClock clock; + void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid); + void Reset(ThreadState *thr); + u64 GetId() const { // 47 lsb is addr, then 14 bits is low part of uid, then 3 zero bits. return GetLsb((u64)addr | (uid << 47), 61); @@ -83,40 +61,39 @@ struct SyncVar { } }; -class SyncTab { +/* MetaMap allows to map arbitrary user pointers onto various descriptors. + Currently it maps pointers to heap block descriptors and sync var descs. + It uses 1/2 direct shadow, see tsan_platform.h. 
+*/ +class MetaMap { public: - SyncTab(); - ~SyncTab(); + MetaMap(); + + void AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz); + uptr FreeBlock(ThreadState *thr, uptr pc, uptr p); + void FreeRange(ThreadState *thr, uptr pc, uptr p, uptr sz); + MBlock* GetBlock(uptr p); SyncVar* GetOrCreateAndLock(ThreadState *thr, uptr pc, uptr addr, bool write_lock); - SyncVar* GetIfExistsAndLock(uptr addr, bool write_lock); + SyncVar* GetIfExistsAndLock(uptr addr); - // If the SyncVar does not exist, returns 0. - SyncVar* GetAndRemove(ThreadState *thr, uptr pc, uptr addr); + void MoveMemory(uptr src, uptr dst, uptr sz); - SyncVar* Create(ThreadState *thr, uptr pc, uptr addr); + void OnThreadIdle(ThreadState *thr); private: - struct Part { - Mutex mtx; - SyncVar *val; - char pad[kCacheLineSize - sizeof(Mutex) - sizeof(SyncVar*)]; // NOLINT - Part(); - }; - - // FIXME: Implement something more sane. - static const int kPartCount = 1009; - Part tab_[kPartCount]; + static const u32 kFlagMask = 3 << 30; + static const u32 kFlagBlock = 1 << 30; + static const u32 kFlagSync = 2 << 30; + typedef DenseSlabAlloc<MBlock, 1<<16, 1<<12> BlockAlloc; + typedef DenseSlabAlloc<SyncVar, 1<<16, 1<<10> SyncAlloc; + BlockAlloc block_alloc_; + SyncAlloc sync_alloc_; atomic_uint64_t uid_gen_; - int PartIdx(uptr addr); - - SyncVar* GetAndLock(ThreadState *thr, uptr pc, - uptr addr, bool write_lock, bool create); - - SyncTab(const SyncTab&); // Not implemented. - void operator = (const SyncTab&); // Not implemented. + SyncVar* GetAndLock(ThreadState *thr, uptr pc, uptr addr, bool write_lock, + bool create); }; } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_trace.h b/libsanitizer/tsan/tsan_trace.h index 93ed8d907e5..af140686f5d 100644 --- a/libsanitizer/tsan/tsan_trace.h +++ b/libsanitizer/tsan/tsan_trace.h @@ -13,7 +13,7 @@ #include "tsan_defs.h" #include "tsan_mutex.h" -#include "tsan_sync.h" +#include "tsan_stack_trace.h" #include "tsan_mutexset.h" namespace __tsan { diff --git a/libsanitizer/tsan/tsan_update_shadow_word_inl.h b/libsanitizer/tsan/tsan_update_shadow_word_inl.h index 42caf80d349..91def7bb1e8 100644 --- a/libsanitizer/tsan/tsan_update_shadow_word_inl.h +++ b/libsanitizer/tsan/tsan_update_shadow_word_inl.h @@ -14,8 +14,7 @@ do { StatInc(thr, StatShadowProcessed); const unsigned kAccessSize = 1 << kAccessSizeLog; - unsigned off = cur.ComputeSearchOffset(); - u64 *sp = &shadow_mem[(idx + off) % kShadowCnt]; + u64 *sp = &shadow_mem[idx]; old = LoadShadow(sp); if (old.IsZero()) { StatInc(thr, StatShadowZero); @@ -31,16 +30,6 @@ do { // same thread? 
if (Shadow::TidsAreEqual(old, cur)) { StatInc(thr, StatShadowSameThread); - if (OldIsInSameSynchEpoch(old, thr)) { - if (old.IsRWNotWeaker(kAccessIsWrite, kIsAtomic)) { - // found a slot that holds effectively the same info - // (that is, same tid, same sync epoch and same size) - StatInc(thr, StatMopSame); - return; - } - StoreIfNotYetStored(sp, &store_word); - break; - } if (old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic)) StoreIfNotYetStored(sp, &store_word); break; diff --git a/libsanitizer/tsan/tsan_vector.h b/libsanitizer/tsan/tsan_vector.h index f65ad2b5560..c0485513ee2 100644 --- a/libsanitizer/tsan/tsan_vector.h +++ b/libsanitizer/tsan/tsan_vector.h @@ -76,6 +76,10 @@ class Vector { } void Resize(uptr size) { + if (size == 0) { + end_ = begin_; + return; + } uptr old_size = Size(); EnsureSize(size); if (old_size < size) { @@ -98,7 +102,7 @@ return; } uptr cap0 = last_ - begin_; - uptr cap = 2 * cap0; + uptr cap = cap0 * 5 / 4; // 25% growth if (cap == 0) cap = 16; if (cap < size)
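The closing tsan_vector.h hunk swaps Vector's doubling growth for 25% growth with a floor of 16 slots, clamped up to the requested size. A small sketch of the resulting capacity schedule, with NextCap standing in for the capacity computation inside the real EnsureSize:

#include <cstdio>

typedef unsigned long uptr;

// Simplified stand-in for Vector::EnsureSize's capacity computation.
static uptr NextCap(uptr cap0, uptr size) {
  uptr cap = cap0 * 5 / 4;  // 25% growth
  if (cap == 0)
    cap = 16;
  if (cap < size)
    cap = size;
  return cap;
}

int main() {
  // Appending one element at a time yields 16, 20, 25, 31, 38, ...
  // versus 16, 32, 64, 128, 256 under the old doubling policy.
  uptr cap = 0;
  for (int i = 0; i < 5; i++) {
    cap = NextCap(cap, cap + 1);
    printf("%lu\n", cap);
  }
  return 0;
}

Growing by 25% roughly triples the number of reallocations relative to doubling (log 2 / log 1.25 is about 3.1) but caps wasted capacity at a quarter of the live size instead of matching it.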