author     Oran Agra <oran@redislabs.com>    2021-10-18 12:45:11 +0300
committer  GitHub <noreply@github.com>       2021-10-18 12:45:11 +0300
commit     c4b4b6c06b0562740d214d0df467b2ba40396ffc (patch)
tree       ccd9a5e688f16311edefdacb63994976f8c6f687 /deps/jemalloc/include/jemalloc
parent     276b460ea9554f79109eb9a234a847a2520cf4c2 (diff)
parent     85737e674552bafefb6beb9a37531645e5d2178b (diff)
Merge pull request #9623 from yoav-steinberg/upgrade_jemalloc_5.2.1
Upgraded to jemalloc 5.2.1 from 5.1.0. Cherry-picked all relevant fixes (found by diffing our 5.1.0 against upstream 5.1.0 and identifying the relevant commits). Details of what was done:

[cherry-picked] fd7d51c 2021-05-03 Resolve nonsense static analysis warnings (Oran Agra)
[cherry-picked] 448c435 2020-09-29 Fix compilation warnings in Lua and jemalloc dependencies (#7785) (YoongHM)
[skipped - already in upstream] 9216b96 2020-09-21 Fix compilation warning in jemalloc's malloc_vsnprintf (#7789) (YoongHM)
[cherry-picked] 88d71f4 2020-05-20 fix a rare active defrag edge case bug leading to stagnation (Oran Agra)
[skipped - already in upstream] 2fec7d9 2019-05-30 Jemalloc: Avoid blocking on background thread lock for stats.
[cherry-picked] 920158e 2018-07-11 Active defrag fixes for 32bit builds (again) (Oran Agra)
[cherry-picked] e8099ca 2018-06-26 add defrag hint support into jemalloc 5 (Oran Agra)
[re-done] 4e729fc 2018-05-24 Generate configure for Jemalloc. (antirez)

Additionally had to do this:
7727cc2 2021-10-10 Fix defrag to support sharded bins in arena (added in v5.2.1) (Yoav Steinberg)

When reviewing, please look at all commits except the first one, which only replaces the 5.1.0 sources with 5.2.1. Also, I think we should merge this without squashing to preserve the changes we made to jemalloc.
Diffstat (limited to 'deps/jemalloc/include/jemalloc')
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/arena_externs.h | 26
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/arena_inlines_b.h | 157
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/arena_stats.h | 64
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/arena_structs_b.h | 9
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/arena_types.h | 12
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/atomic.h | 11
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/atomic_gcc_atomic.h | 6
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/atomic_gcc_sync.h | 18
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/background_thread_externs.h | 1
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/background_thread_inlines.h | 7
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/background_thread_structs.h | 1
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/base_structs.h | 4
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/bin.h | 25
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/bin_stats.h | 3
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/bin_types.h | 17
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/bit_util.h | 74
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/bitmap.h | 6
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/cache_bin.h | 35
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/ctl.h | 9
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/emitter.h | 349
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/extent_externs.h | 12
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/extent_inlines.h | 98
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/extent_structs.h | 59
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/extent_types.h | 10
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/hash.h | 65
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/hook.h | 163
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/hooks.h | 19
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h | 3
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in | 44
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h | 6
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h | 14
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h | 3
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h | 57
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h | 73
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_internal_types.h | 75
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/jemalloc_preamble.h.in | 25
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/large_externs.h | 10
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/malloc_io.h | 2
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/mutex.h | 74
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/mutex_prof.h | 25
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/prof_externs.h | 15
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/prof_inlines_a.h | 14
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/prof_inlines_b.h | 76
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/prof_structs.h | 1
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/quantum.h | 77
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/rtree.h | 70
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/rtree_tsd.h | 2
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/safety_check.h | 26
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/sc.h | 333
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/seq.h | 55
-rwxr-xr-x  deps/jemalloc/include/jemalloc/internal/size_classes.sh | 361
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/stats.h | 3
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/sz.h | 173
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tcache_externs.h | 4
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tcache_inlines.h | 24
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tcache_structs.h | 19
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tcache_types.h | 7
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/test_hooks.h | 19
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/ticker.h | 13
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tsd.h | 155
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tsd_generic.h | 6
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h | 7
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tsd_tls.h | 7
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/witness.h | 31
-rw-r--r--  deps/jemalloc/include/jemalloc/jemalloc_defs.h.in | 3
-rw-r--r--  deps/jemalloc/include/jemalloc/jemalloc_macros.h.in | 7
66 files changed, 2122 insertions, 1057 deletions
diff --git a/deps/jemalloc/include/jemalloc/internal/arena_externs.h b/deps/jemalloc/include/jemalloc/internal/arena_externs.h
index 4b3732b41..a4523ae0c 100644
--- a/deps/jemalloc/include/jemalloc/internal/arena_externs.h
+++ b/deps/jemalloc/include/jemalloc/internal/arena_externs.h
@@ -3,8 +3,8 @@
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/extent_dss.h"
+#include "jemalloc/internal/hook.h"
#include "jemalloc/internal/pages.h"
-#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats.h"
extern ssize_t opt_dirty_decay_ms;
@@ -16,13 +16,17 @@ extern const char *percpu_arena_mode_names[];
extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
extern malloc_mutex_t arenas_lock;
+extern size_t opt_oversize_threshold;
+extern size_t oversize_threshold;
+
void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena,
unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms,
ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy);
void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
- bin_stats_t *bstats, arena_stats_large_t *lstats);
+ bin_stats_t *bstats, arena_stats_large_t *lstats,
+ arena_stats_extents_t *estats);
void arena_extents_dirty_dalloc(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, extent_t *extent);
#ifdef JEMALLOC_JET
@@ -56,16 +60,17 @@ void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size,
szind_t ind, bool zero);
void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize,
size_t alignment, bool zero, tcache_t *tcache);
-void arena_prof_promote(tsdn_t *tsdn, const void *ptr, size_t usize);
+void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize);
void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
bool slow_path);
-void arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena,
- extent_t *extent, void *ptr);
+void arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+ szind_t binind, extent_t *extent, void *ptr);
void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
- size_t extra, bool zero);
+ size_t extra, bool zero, size_t *newsize);
void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
- size_t size, size_t alignment, bool zero, tcache_t *tcache);
+ size_t size, size_t alignment, bool zero, tcache_t *tcache,
+ hook_ralloc_args_t *hook_args);
dss_prec_t arena_dss_prec_get(arena_t *arena);
bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
ssize_t arena_dirty_decay_ms_default_get(void);
@@ -79,7 +84,12 @@ void arena_nthreads_inc(arena_t *arena, bool internal);
void arena_nthreads_dec(arena_t *arena, bool internal);
size_t arena_extent_sn_next(arena_t *arena);
arena_t *arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks);
-void arena_boot(void);
+bool arena_init_huge(void);
+bool arena_is_huge(unsigned arena_ind);
+arena_t *arena_choose_huge(tsd_t *tsd);
+bin_t *arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+ unsigned *binshard);
+void arena_boot(sc_data_t *sc_data);
void arena_prefork0(tsdn_t *tsdn, arena_t *arena);
void arena_prefork1(tsdn_t *tsdn, arena_t *arena);
void arena_prefork2(tsdn_t *tsdn, arena_t *arena);
diff --git a/deps/jemalloc/include/jemalloc/internal/arena_inlines_b.h b/deps/jemalloc/include/jemalloc/internal/arena_inlines_b.h
index 2b7e77e72..dd926575f 100644
--- a/deps/jemalloc/include/jemalloc/internal/arena_inlines_b.h
+++ b/deps/jemalloc/include/jemalloc/internal/arena_inlines_b.h
@@ -4,10 +4,36 @@
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/rtree.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/ticker.h"
+JEMALLOC_ALWAYS_INLINE bool
+arena_has_default_hooks(arena_t *arena) {
+ return (extent_hooks_get(arena) == &extent_hooks_default);
+}
+
+JEMALLOC_ALWAYS_INLINE arena_t *
+arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) {
+ if (arena != NULL) {
+ return arena;
+ }
+
+ /*
+ * For huge allocations, use the dedicated huge arena if both are true:
+ * 1) is using auto arena selection (i.e. arena == NULL), and 2) the
+ * thread is not assigned to a manual arena.
+ */
+ if (unlikely(size >= oversize_threshold)) {
+ arena_t *tsd_arena = tsd_arena_get(tsd);
+ if (tsd_arena == NULL || arena_is_auto(tsd_arena)) {
+ return arena_choose_huge(tsd);
+ }
+ }
+
+ return arena_choose(tsd, NULL);
+}
+
JEMALLOC_ALWAYS_INLINE prof_tctx_t *
arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) {
cassert(config_prof);
@@ -28,7 +54,7 @@ arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) {
}
JEMALLOC_ALWAYS_INLINE void
-arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, UNUSED size_t usize,
+arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, size_t usize,
alloc_ctx_t *alloc_ctx, prof_tctx_t *tctx) {
cassert(config_prof);
assert(ptr != NULL);
@@ -47,7 +73,7 @@ arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, UNUSED size_t usize,
}
static inline void
-arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, UNUSED prof_tctx_t *tctx) {
+arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx) {
cassert(config_prof);
assert(ptr != NULL);
@@ -57,6 +83,32 @@ arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, UNUSED prof_tctx_t *tctx) {
large_prof_tctx_reset(tsdn, extent);
}
+JEMALLOC_ALWAYS_INLINE nstime_t
+arena_prof_alloc_time_get(tsdn_t *tsdn, const void *ptr,
+ alloc_ctx_t *alloc_ctx) {
+ cassert(config_prof);
+ assert(ptr != NULL);
+
+ extent_t *extent = iealloc(tsdn, ptr);
+ /*
+ * Unlike arena_prof_prof_tctx_{get, set}, we only call this once we're
+ * sure we have a sampled allocation.
+ */
+ assert(!extent_slab_get(extent));
+ return large_prof_alloc_time_get(extent);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx,
+ nstime_t t) {
+ cassert(config_prof);
+ assert(ptr != NULL);
+
+ extent_t *extent = iealloc(tsdn, ptr);
+ assert(!extent_slab_get(extent));
+ large_prof_alloc_time_set(extent, t);
+}
+
JEMALLOC_ALWAYS_INLINE void
arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) {
tsd_t *tsd;
@@ -83,14 +135,33 @@ arena_decay_tick(tsdn_t *tsdn, arena_t *arena) {
arena_decay_ticks(tsdn, arena, 1);
}
+/* Purge a single extent to retained / unmapped directly. */
+JEMALLOC_ALWAYS_INLINE void
+arena_decay_extent(tsdn_t *tsdn,arena_t *arena, extent_hooks_t **r_extent_hooks,
+ extent_t *extent) {
+ size_t extent_size = extent_size_get(extent);
+ extent_dalloc_wrapper(tsdn, arena,
+ r_extent_hooks, extent);
+ if (config_stats) {
+ /* Update stats accordingly. */
+ arena_stats_lock(tsdn, &arena->stats);
+ arena_stats_add_u64(tsdn, &arena->stats,
+ &arena->decay_dirty.stats->nmadvise, 1);
+ arena_stats_add_u64(tsdn, &arena->stats,
+ &arena->decay_dirty.stats->purged, extent_size >> LG_PAGE);
+ arena_stats_sub_zu(tsdn, &arena->stats, &arena->stats.mapped,
+ extent_size);
+ arena_stats_unlock(tsdn, &arena->stats);
+ }
+}
+
JEMALLOC_ALWAYS_INLINE void *
arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero,
tcache_t *tcache, bool slow_path) {
assert(!tsdn_null(tsdn) || tcache == NULL);
- assert(size != 0);
if (likely(tcache != NULL)) {
- if (likely(size <= SMALL_MAXCLASS)) {
+ if (likely(size <= SC_SMALL_MAXCLASS)) {
return tcache_alloc_small(tsdn_tsd(tsdn), arena,
tcache, size, ind, zero, slow_path);
}
@@ -119,7 +190,7 @@ arena_salloc(tsdn_t *tsdn, const void *ptr) {
szind_t szind = rtree_szind_read(tsdn, &extents_rtree, rtree_ctx,
(uintptr_t)ptr, true);
- assert(szind != NSIZES);
+ assert(szind != SC_NSIZES);
return sz_index2size(szind);
}
@@ -152,12 +223,22 @@ arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
/* Only slab members should be looked up via interior pointers. */
assert(extent_addr_get(extent) == ptr || extent_slab_get(extent));
- assert(szind != NSIZES);
+ assert(szind != SC_NSIZES);
return sz_index2size(szind);
}
static inline void
+arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind) {
+ if (config_prof && unlikely(szind < SC_NBINS)) {
+ arena_dalloc_promoted(tsdn, ptr, NULL, true);
+ } else {
+ extent_t *extent = iealloc(tsdn, ptr);
+ large_dalloc(tsdn, extent);
+ }
+}
+
+static inline void
arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
assert(ptr != NULL);
@@ -173,7 +254,7 @@ arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
extent_t *extent = rtree_extent_read(tsdn, &extents_rtree,
rtree_ctx, (uintptr_t)ptr, true);
assert(szind == extent_szind_get(extent));
- assert(szind < NSIZES);
+ assert(szind < SC_NSIZES);
assert(slab == extent_slab_get(extent));
}
@@ -181,6 +262,21 @@ arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
/* Small allocation. */
arena_dalloc_small(tsdn, ptr);
} else {
+ arena_dalloc_large_no_tcache(tsdn, ptr, szind);
+ }
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind,
+ bool slow_path) {
+ if (szind < nhbins) {
+ if (config_prof && unlikely(szind < SC_NBINS)) {
+ arena_dalloc_promoted(tsdn, ptr, tcache, slow_path);
+ } else {
+ tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr, szind,
+ slow_path);
+ }
+ } else {
extent_t *extent = iealloc(tsdn, ptr);
large_dalloc(tsdn, extent);
}
@@ -203,7 +299,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
if (alloc_ctx != NULL) {
szind = alloc_ctx->szind;
slab = alloc_ctx->slab;
- assert(szind != NSIZES);
+ assert(szind != SC_NSIZES);
} else {
rtree_ctx = tsd_rtree_ctx(tsdn_tsd(tsdn));
rtree_szind_slab_read(tsdn, &extents_rtree, rtree_ctx,
@@ -215,7 +311,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
extent_t *extent = rtree_extent_read(tsdn, &extents_rtree,
rtree_ctx, (uintptr_t)ptr, true);
assert(szind == extent_szind_get(extent));
- assert(szind < NSIZES);
+ assert(szind < SC_NSIZES);
assert(slab == extent_slab_get(extent));
}
@@ -224,25 +320,14 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr, szind,
slow_path);
} else {
- if (szind < nhbins) {
- if (config_prof && unlikely(szind < NBINS)) {
- arena_dalloc_promoted(tsdn, ptr, tcache,
- slow_path);
- } else {
- tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr,
- szind, slow_path);
- }
- } else {
- extent_t *extent = iealloc(tsdn, ptr);
- large_dalloc(tsdn, extent);
- }
+ arena_dalloc_large(tsdn, ptr, tcache, szind, slow_path);
}
}
static inline void
arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
assert(ptr != NULL);
- assert(size <= LARGE_MAXCLASS);
+ assert(size <= SC_LARGE_MAXCLASS);
szind_t szind;
bool slab;
@@ -252,7 +337,7 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
* object, so base szind and slab on the given size.
*/
szind = sz_size2index(size);
- slab = (szind < NBINS);
+ slab = (szind < SC_NBINS);
}
if ((config_prof && opt_prof) || config_debug) {
@@ -264,7 +349,7 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
(uintptr_t)ptr, true, &szind, &slab);
assert(szind == sz_size2index(size));
- assert((config_prof && opt_prof) || slab == (szind < NBINS));
+ assert((config_prof && opt_prof) || slab == (szind < SC_NBINS));
if (config_debug) {
extent_t *extent = rtree_extent_read(tsdn,
@@ -278,8 +363,7 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
/* Small allocation. */
arena_dalloc_small(tsdn, ptr);
} else {
- extent_t *extent = iealloc(tsdn, ptr);
- large_dalloc(tsdn, extent);
+ arena_dalloc_large_no_tcache(tsdn, ptr, szind);
}
}
@@ -288,7 +372,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
alloc_ctx_t *alloc_ctx, bool slow_path) {
assert(!tsdn_null(tsdn) || tcache == NULL);
assert(ptr != NULL);
- assert(size <= LARGE_MAXCLASS);
+ assert(size <= SC_LARGE_MAXCLASS);
if (unlikely(tcache == NULL)) {
arena_sdalloc_no_tcache(tsdn, ptr, size);
@@ -297,7 +381,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
szind_t szind;
bool slab;
- UNUSED alloc_ctx_t local_ctx;
+ alloc_ctx_t local_ctx;
if (config_prof && opt_prof) {
if (alloc_ctx == NULL) {
/* Uncommon case and should be a static check. */
@@ -318,7 +402,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
* object, so base szind and slab on the given size.
*/
szind = sz_size2index(size);
- slab = (szind < NBINS);
+ slab = (szind < SC_NBINS);
}
if (config_debug) {
@@ -336,18 +420,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr, szind,
slow_path);
} else {
- if (szind < nhbins) {
- if (config_prof && unlikely(szind < NBINS)) {
- arena_dalloc_promoted(tsdn, ptr, tcache,
- slow_path);
- } else {
- tcache_dalloc_large(tsdn_tsd(tsdn),
- tcache, ptr, szind, slow_path);
- }
- } else {
- extent_t *extent = iealloc(tsdn, ptr);
- large_dalloc(tsdn, extent);
- }
+ arena_dalloc_large(tsdn, ptr, tcache, szind, slow_path);
}
}
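
Editor's note: taken together, the hunks in this file consolidate the deallocation fast paths. The "tcache-able large vs. direct large_dalloc" branching that used to be duplicated in arena_dalloc(), arena_sdalloc() and the no-tcache variants now lives in the new arena_dalloc_large() and arena_dalloc_large_no_tcache() helpers. A minimal standalone sketch of the resulting dispatch shape follows; dalloc_dispatch, the free_* stubs and ntcache_max are illustrative stand-ins (ntcache_max plays the role of nhbins), not jemalloc API, and the profiling-promoted path is omitted.

#include <stdio.h>

typedef unsigned szind_t;

static void free_small_via_tcache(void *p, szind_t i) { (void)p; (void)i; puts("small -> tcache small bin"); }
static void free_large_via_tcache(void *p, szind_t i) { (void)p; (void)i; puts("large -> tcache large bin"); }
static void free_large_directly(void *p)              { (void)p;          puts("large -> extent freed directly"); }

static void
dalloc_dispatch(void *ptr, szind_t szind, int slab, szind_t ntcache_max) {
	if (slab) {
		/* Small size class: always returned through the small tcache bin. */
		free_small_via_tcache(ptr, szind);
	} else if (szind < ntcache_max) {
		/* Large but still cacheable (szind < nhbins in jemalloc). */
		free_large_via_tcache(ptr, szind);
	} else {
		/* Beyond the tcache limit: release the extent immediately. */
		free_large_directly(ptr);
	}
}

int main(void) {
	dalloc_dispatch((void *)0, 3, 1, 44);   /* small */
	dalloc_dispatch((void *)0, 40, 0, 44);  /* cacheable large */
	dalloc_dispatch((void *)0, 50, 0, 44);  /* very large */
	return 0;
}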
diff --git a/deps/jemalloc/include/jemalloc/internal/arena_stats.h b/deps/jemalloc/include/jemalloc/internal/arena_stats.h
index 5f3dca8b1..23949ed92 100644
--- a/deps/jemalloc/include/jemalloc/internal/arena_stats.h
+++ b/deps/jemalloc/include/jemalloc/internal/arena_stats.h
@@ -4,7 +4,9 @@
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/mutex_prof.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
+
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
/*
* In those architectures that support 64-bit atomics, we use atomic updates for
@@ -33,6 +35,13 @@ struct arena_stats_large_s {
* periodically merges into this counter.
*/
arena_stats_u64_t nrequests; /* Partially derived. */
+ /*
+ * Number of tcache fills / flushes for large (similarly, periodically
+ * merged). Note that there is no large tcache batch-fill currently
+ * (i.e. only fill 1 at a time); however flush may be batched.
+ */
+ arena_stats_u64_t nfills; /* Partially derived. */
+ arena_stats_u64_t nflushes; /* Partially derived. */
/* Current number of allocations of this size class. */
size_t curlextents; /* Derived. */
@@ -48,6 +57,22 @@ struct arena_stats_decay_s {
arena_stats_u64_t purged;
};
+typedef struct arena_stats_extents_s arena_stats_extents_t;
+struct arena_stats_extents_s {
+ /*
+ * Stats for a given index in the range [0, SC_NPSIZES] in an extents_t.
+ * We track both bytes and # of extents: two extents in the same bucket
+ * may have different sizes if adjacent size classes differ by more than
+ * a page, so bytes cannot always be derived from # of extents.
+ */
+ atomic_zu_t ndirty;
+ atomic_zu_t dirty_bytes;
+ atomic_zu_t nmuzzy;
+ atomic_zu_t muzzy_bytes;
+ atomic_zu_t nretained;
+ atomic_zu_t retained_bytes;
+};
+
/*
* Arena stats. Note that fields marked "derived" are not directly maintained
* within the arena code; rather their values are derived during stats merge
@@ -69,6 +94,9 @@ struct arena_stats_s {
*/
atomic_zu_t retained; /* Derived. */
+ /* Number of extent_t structs allocated by base, but not being used. */
+ atomic_zu_t extent_avail;
+
arena_stats_decay_t decay_dirty;
arena_stats_decay_t decay_muzzy;
@@ -80,22 +108,27 @@ struct arena_stats_s {
atomic_zu_t allocated_large; /* Derived. */
arena_stats_u64_t nmalloc_large; /* Derived. */
arena_stats_u64_t ndalloc_large; /* Derived. */
+ arena_stats_u64_t nfills_large; /* Derived. */
+ arena_stats_u64_t nflushes_large; /* Derived. */
arena_stats_u64_t nrequests_large; /* Derived. */
+ /* VM space had to be leaked (undocumented). Normally 0. */
+ atomic_zu_t abandoned_vm;
+
/* Number of bytes cached in tcache associated with this arena. */
atomic_zu_t tcache_bytes; /* Derived. */
mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes];
/* One element for each large size class. */
- arena_stats_large_t lstats[NSIZES - NBINS];
+ arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
/* Arena uptime. */
nstime_t uptime;
};
static inline bool
-arena_stats_init(UNUSED tsdn_t *tsdn, arena_stats_t *arena_stats) {
+arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) {
if (config_debug) {
for (size_t i = 0; i < sizeof(arena_stats_t); i++) {
assert(((char *)arena_stats)[i] == 0);
@@ -147,11 +180,11 @@ arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
#endif
}
-UNUSED static inline void
+static inline void
arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
arena_stats_u64_t *p, uint64_t x) {
#ifdef JEMALLOC_ATOMIC_U64
- UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED);
+ uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED);
assert(r - x <= r);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
@@ -176,7 +209,8 @@ arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) {
}
static inline size_t
-arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) {
+arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats,
+ atomic_zu_t *p) {
#ifdef JEMALLOC_ATOMIC_U64
return atomic_load_zu(p, ATOMIC_RELAXED);
#else
@@ -186,8 +220,8 @@ arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) {
}
static inline void
-arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
- size_t x) {
+arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats,
+ atomic_zu_t *p, size_t x) {
#ifdef JEMALLOC_ATOMIC_U64
atomic_fetch_add_zu(p, x, ATOMIC_RELAXED);
#else
@@ -198,10 +232,10 @@ arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
}
static inline void
-arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
- size_t x) {
+arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats,
+ atomic_zu_t *p, size_t x) {
#ifdef JEMALLOC_ATOMIC_U64
- UNUSED size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED);
+ size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED);
assert(r - x <= r);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
@@ -218,11 +252,12 @@ arena_stats_accum_zu(atomic_zu_t *dst, size_t src) {
}
static inline void
-arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
+arena_stats_large_flush_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
szind_t szind, uint64_t nrequests) {
arena_stats_lock(tsdn, arena_stats);
- arena_stats_add_u64(tsdn, arena_stats, &arena_stats->lstats[szind -
- NBINS].nrequests, nrequests);
+ arena_stats_large_t *lstats = &arena_stats->lstats[szind - SC_NBINS];
+ arena_stats_add_u64(tsdn, arena_stats, &lstats->nrequests, nrequests);
+ arena_stats_add_u64(tsdn, arena_stats, &lstats->nflushes, 1);
arena_stats_unlock(tsdn, arena_stats);
}
@@ -233,5 +268,4 @@ arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t size) {
arena_stats_unlock(tsdn, arena_stats);
}
-
#endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */
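
Editor's note: the arena_stats_*_u64 helpers above keep two implementations behind one interface. On platforms with native 64-bit atomics (JEMALLOC_ATOMIC_U64) the counters are updated with relaxed atomic read-modify-write operations; otherwise updates happen under the arena stats mutex taken by arena_stats_lock(). A minimal standalone sketch of that dual-path pattern, using C11 atomics and pthreads rather than jemalloc's own wrappers; HAVE_64BIT_ATOMICS is a stand-in for JEMALLOC_ATOMIC_U64, and the per-call lock here is only to keep the sketch self-contained (jemalloc takes the lock once around a whole batch of updates).

#include <stdint.h>
#include <pthread.h>

#ifdef HAVE_64BIT_ATOMICS /* stand-in for JEMALLOC_ATOMIC_U64 */
#include <stdatomic.h>
typedef _Atomic uint64_t stats_u64_t;

static void
stats_add_u64(stats_u64_t *p, uint64_t x, pthread_mutex_t *mtx) {
	(void)mtx; /* No lock needed: a relaxed atomic add suffices. */
	atomic_fetch_add_explicit(p, x, memory_order_relaxed);
}
#else
typedef uint64_t stats_u64_t;

static void
stats_add_u64(stats_u64_t *p, uint64_t x, pthread_mutex_t *mtx) {
	/* Updates are serialized by the caller-supplied stats mutex. */
	pthread_mutex_lock(mtx);
	*p += x;
	pthread_mutex_unlock(mtx);
}
#endif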
diff --git a/deps/jemalloc/include/jemalloc/internal/arena_structs_b.h b/deps/jemalloc/include/jemalloc/internal/arena_structs_b.h
index 38bc95962..eeab57fd6 100644
--- a/deps/jemalloc/include/jemalloc/internal/arena_structs_b.h
+++ b/deps/jemalloc/include/jemalloc/internal/arena_structs_b.h
@@ -10,7 +10,7 @@
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/ql.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/smoothstep.h"
#include "jemalloc/internal/ticker.h"
@@ -90,6 +90,9 @@ struct arena_s {
*/
atomic_u_t nthreads[2];
+ /* Next bin shard for binding new threads. Synchronization: atomic. */
+ atomic_u_t binshard_next;
+
/*
* When percpu_arena is enabled, to amortize the cost of reading /
* updating the current CPU id, track the most recent thread accessing
@@ -113,7 +116,6 @@ struct arena_s {
/* Synchronization: internal. */
prof_accum_t prof_accum;
- uint64_t prof_accumbytes;
/*
* PRNG state for cache index randomization of large allocation base
@@ -196,6 +198,7 @@ struct arena_s {
* Synchronization: extent_avail_mtx.
*/
extent_tree_t extent_avail;
+ atomic_zu_t extent_avail_cnt;
malloc_mutex_t extent_avail_mtx;
/*
@@ -203,7 +206,7 @@ struct arena_s {
*
* Synchronization: internal.
*/
- bin_t bins[NBINS];
+ bins_t bins[SC_NBINS];
/*
* Base allocator, from which arena metadata are allocated.
diff --git a/deps/jemalloc/include/jemalloc/internal/arena_types.h b/deps/jemalloc/include/jemalloc/internal/arena_types.h
index 70001b5f1..624937e4f 100644
--- a/deps/jemalloc/include/jemalloc/internal/arena_types.h
+++ b/deps/jemalloc/include/jemalloc/internal/arena_types.h
@@ -1,13 +1,15 @@
#ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H
#define JEMALLOC_INTERNAL_ARENA_TYPES_H
+#include "jemalloc/internal/sc.h"
+
/* Maximum number of regions in one slab. */
-#define LG_SLAB_MAXREGS (LG_PAGE - LG_TINY_MIN)
+#define LG_SLAB_MAXREGS (LG_PAGE - SC_LG_TINY_MIN)
#define SLAB_MAXREGS (1U << LG_SLAB_MAXREGS)
/* Default decay times in milliseconds. */
#define DIRTY_DECAY_MS_DEFAULT ZD(10 * 1000)
-#define MUZZY_DECAY_MS_DEFAULT ZD(10 * 1000)
+#define MUZZY_DECAY_MS_DEFAULT (0)
/* Number of event ticks between time checks. */
#define DECAY_NTICKS_PER_UPDATE 1000
@@ -40,4 +42,10 @@ typedef enum {
#define PERCPU_ARENA_ENABLED(m) ((m) >= percpu_arena_mode_enabled_base)
#define PERCPU_ARENA_DEFAULT percpu_arena_disabled
+/*
+ * When allocation_size >= oversize_threshold, use the dedicated huge arena
+ * (unless have explicitly spicified arena index). 0 disables the feature.
+ */
+#define OVERSIZE_THRESHOLD_DEFAULT (8 << 20)
+
#endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */
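
Editor's note: OVERSIZE_THRESHOLD_DEFAULT above is 8 << 20 = 8 MiB, and arena_choose_maybe_huge() earlier in this diff only reroutes a request to the dedicated huge arena when the caller did not pick an arena explicitly. A minimal sketch of how this is visible from the application side, assuming a jemalloc 5.2.1 build with the unprefixed public API; the "opt.oversize_threshold" mallctl name is taken from the 5.2.1 manual and should be treated as an assumption for other builds.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int main(void) {
	size_t thresh, sz = sizeof(thresh);
	/* Read the runtime threshold (defaults to OVERSIZE_THRESHOLD_DEFAULT). */
	if (mallctl("opt.oversize_threshold", &thresh, &sz, NULL, 0) == 0) {
		printf("oversize_threshold = %zu bytes\n", thresh);
	}

	/* Auto arena selection: a 16 MiB request is routed to the huge arena. */
	void *big = malloc(16 << 20);

	/* An explicit arena (MALLOCX_ARENA) bypasses the huge-arena routing. */
	void *pinned = mallocx(16 << 20, MALLOCX_ARENA(0));

	if (pinned != NULL) {
		dallocx(pinned, 0);
	}
	free(big);
	return 0;
}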
diff --git a/deps/jemalloc/include/jemalloc/internal/atomic.h b/deps/jemalloc/include/jemalloc/internal/atomic.h
index adadb1a3a..a76f54cee 100644
--- a/deps/jemalloc/include/jemalloc/internal/atomic.h
+++ b/deps/jemalloc/include/jemalloc/internal/atomic.h
@@ -1,12 +1,19 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_H
#define JEMALLOC_INTERNAL_ATOMIC_H
-#define ATOMIC_INLINE static inline
+#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
+#define JEMALLOC_U8_ATOMICS
#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS)
# include "jemalloc/internal/atomic_gcc_atomic.h"
+# if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS)
+# undef JEMALLOC_U8_ATOMICS
+# endif
#elif defined(JEMALLOC_GCC_SYNC_ATOMICS)
# include "jemalloc/internal/atomic_gcc_sync.h"
+# if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS)
+# undef JEMALLOC_U8_ATOMICS
+# endif
#elif defined(_MSC_VER)
# include "jemalloc/internal/atomic_msvc.h"
#elif defined(JEMALLOC_C11_ATOMICS)
@@ -66,6 +73,8 @@ JEMALLOC_GENERATE_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR)
JEMALLOC_GENERATE_INT_ATOMICS(ssize_t, zd, LG_SIZEOF_PTR)
+JEMALLOC_GENERATE_INT_ATOMICS(uint8_t, u8, 0)
+
JEMALLOC_GENERATE_INT_ATOMICS(uint32_t, u32, 2)
#ifdef JEMALLOC_ATOMIC_U64
diff --git a/deps/jemalloc/include/jemalloc/internal/atomic_gcc_atomic.h b/deps/jemalloc/include/jemalloc/internal/atomic_gcc_atomic.h
index 6b73a14f8..471515e82 100644
--- a/deps/jemalloc/include/jemalloc/internal/atomic_gcc_atomic.h
+++ b/deps/jemalloc/include/jemalloc/internal/atomic_gcc_atomic.h
@@ -67,7 +67,8 @@ atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \
\
ATOMIC_INLINE bool \
atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \
- type *expected, type desired, atomic_memory_order_t success_mo, \
+ UNUSED type *expected, type desired, \
+ atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
return __atomic_compare_exchange(&a->repr, expected, &desired, \
true, atomic_enum_to_builtin(success_mo), \
@@ -76,7 +77,8 @@ atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \
\
ATOMIC_INLINE bool \
atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \
- type *expected, type desired, atomic_memory_order_t success_mo, \
+ UNUSED type *expected, type desired, \
+ atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
return __atomic_compare_exchange(&a->repr, expected, &desired, \
false, \
diff --git a/deps/jemalloc/include/jemalloc/internal/atomic_gcc_sync.h b/deps/jemalloc/include/jemalloc/internal/atomic_gcc_sync.h
index 30846e4d2..e02b7cbe3 100644
--- a/deps/jemalloc/include/jemalloc/internal/atomic_gcc_sync.h
+++ b/deps/jemalloc/include/jemalloc/internal/atomic_gcc_sync.h
@@ -27,8 +27,10 @@ atomic_fence(atomic_memory_order_t mo) {
asm volatile("" ::: "memory");
# if defined(__i386__) || defined(__x86_64__)
/* This is implicit on x86. */
-# elif defined(__ppc__)
+# elif defined(__ppc64__)
asm volatile("lwsync");
+# elif defined(__ppc__)
+ asm volatile("sync");
# elif defined(__sparc__) && defined(__arch64__)
if (mo == atomic_memory_order_acquire) {
asm volatile("membar #LoadLoad | #LoadStore");
@@ -113,8 +115,8 @@ atomic_store_##short_type(atomic_##short_type##_t *a, \
} \
\
ATOMIC_INLINE type \
-atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \
- atomic_memory_order_t mo) { \
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \
+ atomic_memory_order_t mo) { \
/* \
* Because of FreeBSD, we care about gcc 4.2, which doesn't have\
* an atomic exchange builtin. We fake it with a CAS loop. \
@@ -129,8 +131,9 @@ atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \
\
ATOMIC_INLINE bool \
atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \
- type *expected, type desired, atomic_memory_order_t success_mo, \
- atomic_memory_order_t failure_mo) { \
+ type *expected, type desired, \
+ atomic_memory_order_t success_mo, \
+ atomic_memory_order_t failure_mo) { \
type prev = __sync_val_compare_and_swap(&a->repr, *expected, \
desired); \
if (prev == *expected) { \
@@ -142,8 +145,9 @@ atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \
} \
ATOMIC_INLINE bool \
atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \
- type *expected, type desired, atomic_memory_order_t success_mo, \
- atomic_memory_order_t failure_mo) { \
+ type *expected, type desired, \
+ atomic_memory_order_t success_mo, \
+ atomic_memory_order_t failure_mo) { \
type prev = __sync_val_compare_and_swap(&a->repr, *expected, \
desired); \
if (prev == *expected) { \
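
Editor's note: the __sync backend above is kept for old compilers (the comment in the exchange hunk mentions gcc 4.2 on FreeBSD), which provide compare-and-swap builtins but no atomic exchange, so exchange is emulated with a CAS retry loop. A standalone illustration of that technique, using the legacy GCC/Clang __sync builtins directly rather than jemalloc's generated wrappers:

/*
 * Emulate atomic exchange with a CAS loop, as the __sync backend does.
 * __sync_bool_compare_and_swap writes 'val' only if *p still equals 'old';
 * on failure another thread raced us, so reload and retry.
 */
static unsigned long
exchange_ulong(unsigned long *p, unsigned long val) {
	while (1) {
		unsigned long old = *p;
		if (__sync_bool_compare_and_swap(p, old, val)) {
			return old;
		}
	}
}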
diff --git a/deps/jemalloc/include/jemalloc/internal/background_thread_externs.h b/deps/jemalloc/include/jemalloc/internal/background_thread_externs.h
index 3209aa49f..0f997e18b 100644
--- a/deps/jemalloc/include/jemalloc/internal/background_thread_externs.h
+++ b/deps/jemalloc/include/jemalloc/internal/background_thread_externs.h
@@ -8,7 +8,6 @@ extern atomic_b_t background_thread_enabled_state;
extern size_t n_background_threads;
extern size_t max_background_threads;
extern background_thread_info_t *background_thread_info;
-extern bool can_enable_background_thread;
bool background_thread_create(tsd_t *tsd, unsigned arena_ind);
bool background_threads_enable(tsd_t *tsd);
diff --git a/deps/jemalloc/include/jemalloc/internal/background_thread_inlines.h b/deps/jemalloc/include/jemalloc/internal/background_thread_inlines.h
index ef50231e8..f85e86fa3 100644
--- a/deps/jemalloc/include/jemalloc/internal/background_thread_inlines.h
+++ b/deps/jemalloc/include/jemalloc/internal/background_thread_inlines.h
@@ -15,7 +15,12 @@ background_thread_enabled_set(tsdn_t *tsdn, bool state) {
JEMALLOC_ALWAYS_INLINE background_thread_info_t *
arena_background_thread_info_get(arena_t *arena) {
unsigned arena_ind = arena_ind_get(arena);
- return &background_thread_info[arena_ind % ncpus];
+ return &background_thread_info[arena_ind % max_background_threads];
+}
+
+JEMALLOC_ALWAYS_INLINE background_thread_info_t *
+background_thread_info_get(size_t ind) {
+ return &background_thread_info[ind % max_background_threads];
}
JEMALLOC_ALWAYS_INLINE uint64_t
diff --git a/deps/jemalloc/include/jemalloc/internal/background_thread_structs.h b/deps/jemalloc/include/jemalloc/internal/background_thread_structs.h
index c1107dfe9..c02aa434c 100644
--- a/deps/jemalloc/include/jemalloc/internal/background_thread_structs.h
+++ b/deps/jemalloc/include/jemalloc/internal/background_thread_structs.h
@@ -9,6 +9,7 @@
#define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX
#define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT
+#define DEFAULT_NUM_BACKGROUND_THREAD 4
typedef enum {
background_thread_stopped,
diff --git a/deps/jemalloc/include/jemalloc/internal/base_structs.h b/deps/jemalloc/include/jemalloc/internal/base_structs.h
index 2102247ac..07f214eb2 100644
--- a/deps/jemalloc/include/jemalloc/internal/base_structs.h
+++ b/deps/jemalloc/include/jemalloc/internal/base_structs.h
@@ -3,7 +3,7 @@
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/mutex.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
/* Embedded at the beginning of every block of base-managed virtual memory. */
struct base_block_s {
@@ -46,7 +46,7 @@ struct base_s {
base_block_t *blocks;
/* Heap of extents that track unused trailing space within blocks. */
- extent_heap_t avail[NSIZES];
+ extent_heap_t avail[SC_NSIZES];
/* Stats, only maintained if config_stats. */
size_t allocated;
diff --git a/deps/jemalloc/include/jemalloc/internal/bin.h b/deps/jemalloc/include/jemalloc/internal/bin.h
index 9b416ada7..8547e8930 100644
--- a/deps/jemalloc/include/jemalloc/internal/bin.h
+++ b/deps/jemalloc/include/jemalloc/internal/bin.h
@@ -1,10 +1,12 @@
#ifndef JEMALLOC_INTERNAL_BIN_H
#define JEMALLOC_INTERNAL_BIN_H
+#include "jemalloc/internal/bin_stats.h"
+#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/extent_types.h"
#include "jemalloc/internal/extent_structs.h"
#include "jemalloc/internal/mutex.h"
-#include "jemalloc/internal/bin_stats.h"
+#include "jemalloc/internal/sc.h"
/*
* A bin contains a set of extents that are currently being used for slab
@@ -41,6 +43,9 @@ struct bin_info_s {
/* Total number of regions in a slab for this bin's size class. */
uint32_t nregs;
+ /* Number of sharded bins in each arena for this size class. */
+ uint32_t n_shards;
+
/*
* Metadata used to manipulate bitmaps for slabs associated with this
* bin.
@@ -48,8 +53,7 @@ struct bin_info_s {
bitmap_info_t bitmap_info;
};
-extern const bin_info_t bin_infos[NBINS];
-
+extern bin_info_t bin_infos[SC_NBINS];
typedef struct bin_s bin_t;
struct bin_s {
@@ -78,6 +82,18 @@ struct bin_s {
bin_stats_t stats;
};
+/* A set of sharded bins of the same size class. */
+typedef struct bins_s bins_t;
+struct bins_s {
+ /* Sharded bins. Dynamically sized. */
+ bin_t *bin_shards;
+};
+
+void bin_shard_sizes_boot(unsigned bin_shards[SC_NBINS]);
+bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size,
+ size_t end_size, size_t nshards);
+void bin_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]);
+
/* Initializes a bin to empty. Returns true on error. */
bool bin_init(bin_t *bin);
@@ -90,7 +106,7 @@ void bin_postfork_child(tsdn_t *tsdn, bin_t *bin);
static inline void
bin_stats_merge(tsdn_t *tsdn, bin_stats_t *dst_bin_stats, bin_t *bin) {
malloc_mutex_lock(tsdn, &bin->lock);
- malloc_mutex_prof_read(tsdn, &dst_bin_stats->mutex_data, &bin->lock);
+ malloc_mutex_prof_accum(tsdn, &dst_bin_stats->mutex_data, &bin->lock);
dst_bin_stats->nmalloc += bin->stats.nmalloc;
dst_bin_stats->ndalloc += bin->stats.ndalloc;
dst_bin_stats->nrequests += bin->stats.nrequests;
@@ -100,6 +116,7 @@ bin_stats_merge(tsdn_t *tsdn, bin_stats_t *dst_bin_stats, bin_t *bin) {
dst_bin_stats->nslabs += bin->stats.nslabs;
dst_bin_stats->reslabs += bin->stats.reslabs;
dst_bin_stats->curslabs += bin->stats.curslabs;
+ dst_bin_stats->nonfull_slabs += bin->stats.nonfull_slabs;
malloc_mutex_unlock(tsdn, &bin->lock);
}
diff --git a/deps/jemalloc/include/jemalloc/internal/bin_stats.h b/deps/jemalloc/include/jemalloc/internal/bin_stats.h
index 86e673ec4..d04519c82 100644
--- a/deps/jemalloc/include/jemalloc/internal/bin_stats.h
+++ b/deps/jemalloc/include/jemalloc/internal/bin_stats.h
@@ -45,6 +45,9 @@ struct bin_stats_s {
/* Current number of slabs in this bin. */
size_t curslabs;
+ /* Current size of nonfull slabs heap in this bin. */
+ size_t nonfull_slabs;
+
mutex_prof_data_t mutex_data;
};
diff --git a/deps/jemalloc/include/jemalloc/internal/bin_types.h b/deps/jemalloc/include/jemalloc/internal/bin_types.h
new file mode 100644
index 000000000..3533606b9
--- /dev/null
+++ b/deps/jemalloc/include/jemalloc/internal/bin_types.h
@@ -0,0 +1,17 @@
+#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H
+#define JEMALLOC_INTERNAL_BIN_TYPES_H
+
+#include "jemalloc/internal/sc.h"
+
+#define BIN_SHARDS_MAX (1 << EXTENT_BITS_BINSHARD_WIDTH)
+#define N_BIN_SHARDS_DEFAULT 1
+
+/* Used in TSD static initializer only. Real init in arena_bind(). */
+#define TSD_BINSHARDS_ZERO_INITIALIZER {{UINT8_MAX}}
+
+typedef struct tsd_binshards_s tsd_binshards_t;
+struct tsd_binshards_s {
+ uint8_t binshard[SC_NBINS];
+};
+
+#endif /* JEMALLOC_INTERNAL_BIN_TYPES_H */
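
Editor's note: bin_types.h and the binshard_next counter added to arena_s earlier in this diff implement bin sharding. Each arena keeps several bin_t instances per small size class, and each thread remembers its shard choice per size class in tsd_binshards_t, so threads sharing an arena no longer all contend on one bin lock. A simplified sketch of that assignment idea, using C11 atomics and a thread-local array; the real logic lives in arena_bind()/arena_bin_choose_lock() and honors a per-size-class shard count (bin_infos[i].n_shards), which this sketch flattens into a single NSHARDS constant.

#include <stdatomic.h>
#include <stdint.h>

#define NBINS   36  /* stand-in for SC_NBINS */
#define NSHARDS  4  /* stand-in for bin_infos[i].n_shards */

static atomic_uint binshard_next;                 /* arena-wide counter */
static _Thread_local uint8_t my_binshard[NBINS];  /* like tsd_binshards_t */

/* Called once when a thread binds to an arena. */
static void
thread_bind_shards(void) {
	unsigned seed = atomic_fetch_add_explicit(&binshard_next, 1,
	    memory_order_relaxed);
	for (unsigned i = 0; i < NBINS; i++) {
		/* Round-robin across shards; UINT8_MAX would mean "unset". */
		my_binshard[i] = (uint8_t)(seed % NSHARDS);
	}
}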
diff --git a/deps/jemalloc/include/jemalloc/internal/bit_util.h b/deps/jemalloc/include/jemalloc/internal/bit_util.h
index 8d078a8a3..c045eb868 100644
--- a/deps/jemalloc/include/jemalloc/internal/bit_util.h
+++ b/deps/jemalloc/include/jemalloc/internal/bit_util.h
@@ -27,6 +27,25 @@ ffs_u(unsigned bitmap) {
return JEMALLOC_INTERNAL_FFS(bitmap);
}
+#ifdef JEMALLOC_INTERNAL_POPCOUNTL
+BIT_UTIL_INLINE unsigned
+popcount_lu(unsigned long bitmap) {
+ return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
+}
+#endif
+
+/*
+ * Clears first unset bit in bitmap, and returns
+ * place of bit. bitmap *must not* be 0.
+ */
+
+BIT_UTIL_INLINE size_t
+cfs_lu(unsigned long* bitmap) {
+ size_t bit = ffs_lu(*bitmap) - 1;
+ *bitmap ^= ZU(1) << bit;
+ return bit;
+}
+
BIT_UTIL_INLINE unsigned
ffs_zu(size_t bitmap) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
@@ -63,6 +82,22 @@ ffs_u32(uint32_t bitmap) {
BIT_UTIL_INLINE uint64_t
pow2_ceil_u64(uint64_t x) {
+#if (defined(__amd64__) || defined(__x86_64__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ))
+ if(unlikely(x <= 1)) {
+ return x;
+ }
+ size_t msb_on_index;
+#if (defined(__amd64__) || defined(__x86_64__))
+ asm ("bsrq %1, %0"
+ : "=r"(msb_on_index) // Outputs.
+ : "r"(x-1) // Inputs.
+ );
+#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
+ msb_on_index = (63 ^ __builtin_clzll(x - 1));
+#endif
+ assert(msb_on_index < 63);
+ return 1ULL << (msb_on_index + 1);
+#else
x--;
x |= x >> 1;
x |= x >> 2;
@@ -72,10 +107,27 @@ pow2_ceil_u64(uint64_t x) {
x |= x >> 32;
x++;
return x;
+#endif
}
BIT_UTIL_INLINE uint32_t
pow2_ceil_u32(uint32_t x) {
+#if ((defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) && (!defined(__s390__)))
+ if(unlikely(x <= 1)) {
+ return x;
+ }
+ size_t msb_on_index;
+#if (defined(__i386__))
+ asm ("bsr %1, %0"
+ : "=r"(msb_on_index) // Outputs.
+ : "r"(x-1) // Inputs.
+ );
+#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
+ msb_on_index = (31 ^ __builtin_clz(x - 1));
+#endif
+ assert(msb_on_index < 31);
+ return 1U << (msb_on_index + 1);
+#else
x--;
x |= x >> 1;
x |= x >> 2;
@@ -84,6 +136,7 @@ pow2_ceil_u32(uint32_t x) {
x |= x >> 16;
x++;
return x;
+#endif
}
/* Compute the smallest power of 2 that is >= x. */
@@ -160,6 +213,27 @@ lg_floor(size_t x) {
}
#endif
+BIT_UTIL_INLINE unsigned
+lg_ceil(size_t x) {
+ return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
+}
+
#undef BIT_UTIL_INLINE
+/* A compile-time version of lg_floor and lg_ceil. */
+#define LG_FLOOR_1(x) 0
+#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
+#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
+#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
+#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
+#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
+#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
+#if LG_SIZEOF_PTR == 2
+# define LG_FLOOR(x) LG_FLOOR_32((x))
+#else
+# define LG_FLOOR(x) LG_FLOOR_64((x))
+#endif
+
+#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
+
#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */
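
Editor's note: pow2_ceil_u64()/pow2_ceil_u32() above now locate the highest set bit of x - 1 (via bsr on x86 or __builtin_clz*) and shift, keeping the old shift-and-or cascade only as the portable fallback; both variants return the smallest power of two >= x. A standalone comparison of the two strategies, assuming a GCC/Clang-style __builtin_clzll; the asserts are just spot checks.

#include <assert.h>
#include <stdint.h>

static uint64_t
pow2_ceil_clz(uint64_t x) {
	if (x <= 1) {
		return x;
	}
	/* Index of the most significant set bit of x - 1. */
	unsigned msb = 63 - (unsigned)__builtin_clzll(x - 1);
	return 1ULL << (msb + 1);
}

static uint64_t
pow2_ceil_shift(uint64_t x) {
	x--;
	x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
	x |= x >> 8;  x |= x >> 16; x |= x >> 32;
	return x + 1;
}

int main(void) {
	assert(pow2_ceil_clz(1) == 1 && pow2_ceil_shift(1) == 1);
	assert(pow2_ceil_clz(3) == 4 && pow2_ceil_shift(3) == 4);
	assert(pow2_ceil_clz(4096) == 4096 && pow2_ceil_shift(4096) == 4096);
	assert(pow2_ceil_clz(4097) == 8192 && pow2_ceil_shift(4097) == 8192);
	return 0;
}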
diff --git a/deps/jemalloc/include/jemalloc/internal/bitmap.h b/deps/jemalloc/include/jemalloc/internal/bitmap.h
index ac990290a..c3f9cb490 100644
--- a/deps/jemalloc/include/jemalloc/internal/bitmap.h
+++ b/deps/jemalloc/include/jemalloc/internal/bitmap.h
@@ -3,18 +3,18 @@
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/bit_util.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
typedef unsigned long bitmap_t;
#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG
/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
-#if LG_SLAB_MAXREGS > LG_CEIL_NSIZES
+#if LG_SLAB_MAXREGS > LG_CEIL(SC_NSIZES)
/* Maximum bitmap bit count is determined by maximum regions per slab. */
# define LG_BITMAP_MAXBITS LG_SLAB_MAXREGS
#else
/* Maximum bitmap bit count is determined by number of extent size classes. */
-# define LG_BITMAP_MAXBITS LG_CEIL_NSIZES
+# define LG_BITMAP_MAXBITS LG_CEIL(SC_NSIZES)
#endif
#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS)
diff --git a/deps/jemalloc/include/jemalloc/internal/cache_bin.h b/deps/jemalloc/include/jemalloc/internal/cache_bin.h
index 12f3ef2dd..d14556a3d 100644
--- a/deps/jemalloc/include/jemalloc/internal/cache_bin.h
+++ b/deps/jemalloc/include/jemalloc/internal/cache_bin.h
@@ -88,11 +88,21 @@ JEMALLOC_ALWAYS_INLINE void *
cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
void *ret;
- if (unlikely(bin->ncached == 0)) {
- bin->low_water = -1;
- *success = false;
- return NULL;
+ bin->ncached--;
+
+ /*
+ * Check for both bin->ncached == 0 and ncached < low_water
+ * in a single branch.
+ */
+ if (unlikely(bin->ncached <= bin->low_water)) {
+ bin->low_water = bin->ncached;
+ if (bin->ncached == -1) {
+ bin->ncached = 0;
+ *success = false;
+ return NULL;
+ }
}
+
/*
* success (instead of ret) should be checked upon the return of this
* function. We avoid checking (ret == NULL) because there is never a
@@ -101,14 +111,21 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
* cacheline).
*/
*success = true;
- ret = *(bin->avail - bin->ncached);
- bin->ncached--;
+ ret = *(bin->avail - (bin->ncached + 1));
- if (unlikely(bin->ncached < bin->low_water)) {
- bin->low_water = bin->ncached;
+ return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_dalloc_easy(cache_bin_t *bin, cache_bin_info_t *bin_info, void *ptr) {
+ if (unlikely(bin->ncached == bin_info->ncached_max)) {
+ return false;
}
+ assert(bin->ncached < bin_info->ncached_max);
+ bin->ncached++;
+ *(bin->avail - bin->ncached) = ptr;
- return ret;
+ return true;
}
#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
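
Editor's note: the rewritten cache_bin_alloc_easy() above decrements ncached before checking anything, then folds the empty-bin check and the low-water update into a single unlikely branch. Because low_water can never exceed ncached, ncached <= low_water after the decrement covers both cases, and ncached == -1 is the empty-bin sentinel. A standalone sketch of the same fast path on a toy LIFO cache; toy_cache_bin_t and its layout are illustrative, and jemalloc's avail pointer and GC interaction are more involved.

#include <stdbool.h>
#include <stddef.h>

typedef struct {
	int ncached;    /* number of cached pointers */
	int low_water;  /* minimum ncached observed since the last GC pass */
	void **avail;   /* points one past the highest stack slot */
} toy_cache_bin_t;

static void *
toy_cache_alloc(toy_cache_bin_t *bin, bool *success) {
	bin->ncached--;
	/* One branch handles both "new low-water mark" and "bin empty". */
	if (bin->ncached <= bin->low_water) {
		bin->low_water = bin->ncached;
		if (bin->ncached == -1) {
			bin->ncached = 0; /* empty: undo the decrement */
			*success = false;
			return NULL;
		}
	}
	*success = true;
	/* The slot to pop sits at avail - (old ncached) = avail - (ncached + 1). */
	return *(bin->avail - (bin->ncached + 1));
}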
diff --git a/deps/jemalloc/include/jemalloc/internal/ctl.h b/deps/jemalloc/include/jemalloc/internal/ctl.h
index d927d9480..1d1aacc6f 100644
--- a/deps/jemalloc/include/jemalloc/internal/ctl.h
+++ b/deps/jemalloc/include/jemalloc/internal/ctl.h
@@ -5,7 +5,7 @@
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/mutex_prof.h"
#include "jemalloc/internal/ql.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/stats.h"
/* Maximum ctl tree depth. */
@@ -39,9 +39,12 @@ typedef struct ctl_arena_stats_s {
uint64_t nmalloc_small;
uint64_t ndalloc_small;
uint64_t nrequests_small;
+ uint64_t nfills_small;
+ uint64_t nflushes_small;
- bin_stats_t bstats[NBINS];
- arena_stats_large_t lstats[NSIZES - NBINS];
+ bin_stats_t bstats[SC_NBINS];
+ arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
+ arena_stats_extents_t estats[SC_NPSIZES];
} ctl_arena_stats_t;
typedef struct ctl_stats_s {
diff --git a/deps/jemalloc/include/jemalloc/internal/emitter.h b/deps/jemalloc/include/jemalloc/internal/emitter.h
index 3a2b2f7f2..542bc79c3 100644
--- a/deps/jemalloc/include/jemalloc/internal/emitter.h
+++ b/deps/jemalloc/include/jemalloc/internal/emitter.h
@@ -45,7 +45,9 @@ struct emitter_col_s {
int int_val;
unsigned unsigned_val;
uint32_t uint32_val;
+ uint32_t uint32_t_val;
uint64_t uint64_val;
+ uint64_t uint64_t_val;
size_t size_val;
ssize_t ssize_val;
const char *str_val;
@@ -60,17 +62,6 @@ struct emitter_row_s {
ql_head(emitter_col_t) cols;
};
-static inline void
-emitter_row_init(emitter_row_t *row) {
- ql_new(&row->cols);
-}
-
-static inline void
-emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
- ql_elm_new(col, link);
- ql_tail_insert(&row->cols, col, link);
-}
-
typedef struct emitter_s emitter_t;
struct emitter_s {
emitter_output_t output;
@@ -80,18 +71,10 @@ struct emitter_s {
int nesting_depth;
/* True if we've already emitted a value at the given depth. */
bool item_at_depth;
+ /* True if we emitted a key and will emit corresponding value next. */
+ bool emitted_key;
};
-static inline void
-emitter_init(emitter_t *emitter, emitter_output_t emitter_output,
- void (*write_cb)(void *, const char *), void *cbopaque) {
- emitter->output = emitter_output;
- emitter->write_cb = write_cb;
- emitter->cbopaque = cbopaque;
- emitter->item_at_depth = false;
- emitter->nesting_depth = 0;
-}
-
/* Internal convenience function. Write to the emitter the given string. */
JEMALLOC_FORMAT_PRINTF(2, 3)
static inline void
@@ -103,22 +86,11 @@ emitter_printf(emitter_t *emitter, const char *format, ...) {
va_end(ap);
}
-/* Write to the emitter the given string, but only in table mode. */
-JEMALLOC_FORMAT_PRINTF(2, 3)
-static inline void
-emitter_table_printf(emitter_t *emitter, const char *format, ...) {
- if (emitter->output == emitter_output_table) {
- va_list ap;
- va_start(ap, format);
- malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
- va_end(ap);
- }
-}
-
-static inline void
+static inline const char * JEMALLOC_FORMAT_ARG(3)
emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier,
emitter_justify_t justify, int width) {
size_t written;
+ fmt_specifier++;
if (justify == emitter_justify_none) {
written = malloc_snprintf(out_fmt, out_size,
"%%%s", fmt_specifier);
@@ -131,6 +103,7 @@ emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier,
}
/* Only happens in case of bad format string, which *we* choose. */
assert(written < out_size);
+ return out_fmt;
}
/*
@@ -156,26 +129,27 @@ emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
char buf[BUF_SIZE];
#define EMIT_SIMPLE(type, format) \
- emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width); \
- emitter_printf(emitter, fmt, *(const type *)value); \
+ emitter_printf(emitter, \
+ emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width), \
+ *(const type *)value);
switch (value_type) {
case emitter_type_bool:
- emitter_gen_fmt(fmt, FMT_SIZE, "s", justify, width);
- emitter_printf(emitter, fmt, *(const bool *)value ?
- "true" : "false");
+ emitter_printf(emitter,
+ emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width),
+ *(const bool *)value ? "true" : "false");
break;
case emitter_type_int:
- EMIT_SIMPLE(int, "d")
+ EMIT_SIMPLE(int, "%d")
break;
case emitter_type_unsigned:
- EMIT_SIMPLE(unsigned, "u")
+ EMIT_SIMPLE(unsigned, "%u")
break;
case emitter_type_ssize:
- EMIT_SIMPLE(ssize_t, "zd")
+ EMIT_SIMPLE(ssize_t, "%zd")
break;
case emitter_type_size:
- EMIT_SIMPLE(size_t, "zu")
+ EMIT_SIMPLE(size_t, "%zu")
break;
case emitter_type_string:
str_written = malloc_snprintf(buf, BUF_SIZE, "\"%s\"",
@@ -185,17 +159,17 @@ emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
* anywhere near the fmt size.
*/
assert(str_written < BUF_SIZE);
- emitter_gen_fmt(fmt, FMT_SIZE, "s", justify, width);
- emitter_printf(emitter, fmt, buf);
+ emitter_printf(emitter,
+ emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width), buf);
break;
case emitter_type_uint32:
- EMIT_SIMPLE(uint32_t, FMTu32)
+ EMIT_SIMPLE(uint32_t, "%" FMTu32)
break;
case emitter_type_uint64:
- EMIT_SIMPLE(uint64_t, FMTu64)
+ EMIT_SIMPLE(uint64_t, "%" FMTu64)
break;
case emitter_type_title:
- EMIT_SIMPLE(char *const, "s");
+ EMIT_SIMPLE(char *const, "%s");
break;
default:
unreachable();
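
Editor's note: the emitter_gen_fmt() change above makes the helper take a complete specifier such as "%zu" (the leading '%' is skipped internally), return the buffer it filled so the call can be nested directly inside emitter_printf(), and carry JEMALLOC_FORMAT_ARG(3) so the compiler can type-check the resulting format string. A standalone sketch of that composition pattern, using plain snprintf/printf instead of jemalloc's malloc_snprintf/emitter_printf:

#include <stdio.h>

/* Build "%<width><spec>" or "%-<width><spec>" into 'out' and return it. */
static const char *
gen_fmt(char *out, size_t out_size, const char *spec, int left_justify,
    int width) {
	spec++; /* 'spec' is a full specifier like "%zu"; drop the '%'. */
	if (width < 0) {
		snprintf(out, out_size, "%%%s", spec);
	} else {
		snprintf(out, out_size, "%%%s%d%s",
		    left_justify ? "-" : "", width, spec);
	}
	return out;
}

int main(void) {
	char fmt[32];
	size_t val = 12345;
	/* Prints the value right-justified in a 12-character column. */
	printf(gen_fmt(fmt, sizeof(fmt), "%zu", 0, 12), val);
	printf("\n");
	return 0;
}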
@@ -235,201 +209,278 @@ emitter_indent(emitter_t *emitter) {
static inline void
emitter_json_key_prefix(emitter_t *emitter) {
+ if (emitter->emitted_key) {
+ emitter->emitted_key = false;
+ return;
+ }
emitter_printf(emitter, "%s\n", emitter->item_at_depth ? "," : "");
emitter_indent(emitter);
}
+/******************************************************************************/
+/* Public functions for emitter_t. */
+
static inline void
-emitter_begin(emitter_t *emitter) {
- if (emitter->output == emitter_output_json) {
- assert(emitter->nesting_depth == 0);
- emitter_printf(emitter, "{");
- emitter_nest_inc(emitter);
- } else {
- // tabular init
- emitter_printf(emitter, "%s", "");
- }
+emitter_init(emitter_t *emitter, emitter_output_t emitter_output,
+ void (*write_cb)(void *, const char *), void *cbopaque) {
+ emitter->output = emitter_output;
+ emitter->write_cb = write_cb;
+ emitter->cbopaque = cbopaque;
+ emitter->item_at_depth = false;
+ emitter->emitted_key = false;
+ emitter->nesting_depth = 0;
}
+/******************************************************************************/
+/* JSON public API. */
+
+/*
+ * Emits a key (e.g. as appears in an object). The next json entity emitted will
+ * be the corresponding value.
+ */
static inline void
-emitter_end(emitter_t *emitter) {
+emitter_json_key(emitter_t *emitter, const char *json_key) {
if (emitter->output == emitter_output_json) {
- assert(emitter->nesting_depth == 1);
- emitter_nest_dec(emitter);
- emitter_printf(emitter, "\n}\n");
+ emitter_json_key_prefix(emitter);
+ emitter_printf(emitter, "\"%s\": ", json_key);
+ emitter->emitted_key = true;
}
}
-/*
- * Note emits a different kv pair as well, but only in table mode. Omits the
- * note if table_note_key is NULL.
- */
static inline void
-emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
- emitter_type_t value_type, const void *value,
- const char *table_note_key, emitter_type_t table_note_value_type,
- const void *table_note_value) {
+emitter_json_value(emitter_t *emitter, emitter_type_t value_type,
+ const void *value) {
if (emitter->output == emitter_output_json) {
- assert(emitter->nesting_depth > 0);
emitter_json_key_prefix(emitter);
- emitter_printf(emitter, "\"%s\": ", json_key);
- emitter_print_value(emitter, emitter_justify_none, -1,
- value_type, value);
- } else {
- emitter_indent(emitter);
- emitter_printf(emitter, "%s: ", table_key);
emitter_print_value(emitter, emitter_justify_none, -1,
value_type, value);
- if (table_note_key != NULL) {
- emitter_printf(emitter, " (%s: ", table_note_key);
- emitter_print_value(emitter, emitter_justify_none, -1,
- table_note_value_type, table_note_value);
- emitter_printf(emitter, ")");
- }
- emitter_printf(emitter, "\n");
+ emitter->item_at_depth = true;
}
- emitter->item_at_depth = true;
}
+/* Shorthand for calling emitter_json_key and then emitter_json_value. */
static inline void
-emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key,
+emitter_json_kv(emitter_t *emitter, const char *json_key,
emitter_type_t value_type, const void *value) {
- emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL,
- emitter_type_bool, NULL);
+ emitter_json_key(emitter, json_key);
+ emitter_json_value(emitter, value_type, value);
}
static inline void
-emitter_json_kv(emitter_t *emitter, const char *json_key,
- emitter_type_t value_type, const void *value) {
+emitter_json_array_begin(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
- emitter_kv(emitter, json_key, NULL, value_type, value);
+ emitter_json_key_prefix(emitter);
+ emitter_printf(emitter, "[");
+ emitter_nest_inc(emitter);
}
}
+/* Shorthand for calling emitter_json_key and then emitter_json_array_begin. */
static inline void
-emitter_table_kv(emitter_t *emitter, const char *table_key,
- emitter_type_t value_type, const void *value) {
- if (emitter->output == emitter_output_table) {
- emitter_kv(emitter, NULL, table_key, value_type, value);
+emitter_json_array_kv_begin(emitter_t *emitter, const char *json_key) {
+ emitter_json_key(emitter, json_key);
+ emitter_json_array_begin(emitter);
+}
+
+static inline void
+emitter_json_array_end(emitter_t *emitter) {
+ if (emitter->output == emitter_output_json) {
+ assert(emitter->nesting_depth > 0);
+ emitter_nest_dec(emitter);
+ emitter_printf(emitter, "\n");
+ emitter_indent(emitter);
+ emitter_printf(emitter, "]");
}
}
static inline void
-emitter_dict_begin(emitter_t *emitter, const char *json_key,
- const char *table_header) {
+emitter_json_object_begin(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
emitter_json_key_prefix(emitter);
- emitter_printf(emitter, "\"%s\": {", json_key);
- emitter_nest_inc(emitter);
- } else {
- emitter_indent(emitter);
- emitter_printf(emitter, "%s\n", table_header);
+ emitter_printf(emitter, "{");
emitter_nest_inc(emitter);
}
}
+/* Shorthand for calling emitter_json_key and then emitter_json_object_begin. */
static inline void
-emitter_dict_end(emitter_t *emitter) {
+emitter_json_object_kv_begin(emitter_t *emitter, const char *json_key) {
+ emitter_json_key(emitter, json_key);
+ emitter_json_object_begin(emitter);
+}
+
+static inline void
+emitter_json_object_end(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
assert(emitter->nesting_depth > 0);
emitter_nest_dec(emitter);
emitter_printf(emitter, "\n");
emitter_indent(emitter);
emitter_printf(emitter, "}");
- } else {
- emitter_nest_dec(emitter);
}
}
+
+/******************************************************************************/
+/* Table public API. */
+
static inline void
-emitter_json_dict_begin(emitter_t *emitter, const char *json_key) {
- if (emitter->output == emitter_output_json) {
- emitter_dict_begin(emitter, json_key, NULL);
+emitter_table_dict_begin(emitter_t *emitter, const char *table_key) {
+ if (emitter->output == emitter_output_table) {
+ emitter_indent(emitter);
+ emitter_printf(emitter, "%s\n", table_key);
+ emitter_nest_inc(emitter);
}
}
static inline void
-emitter_json_dict_end(emitter_t *emitter) {
- if (emitter->output == emitter_output_json) {
- emitter_dict_end(emitter);
+emitter_table_dict_end(emitter_t *emitter) {
+ if (emitter->output == emitter_output_table) {
+ emitter_nest_dec(emitter);
}
}
static inline void
-emitter_table_dict_begin(emitter_t *emitter, const char *table_key) {
+emitter_table_kv_note(emitter_t *emitter, const char *table_key,
+ emitter_type_t value_type, const void *value,
+ const char *table_note_key, emitter_type_t table_note_value_type,
+ const void *table_note_value) {
if (emitter->output == emitter_output_table) {
- emitter_dict_begin(emitter, NULL, table_key);
+ emitter_indent(emitter);
+ emitter_printf(emitter, "%s: ", table_key);
+ emitter_print_value(emitter, emitter_justify_none, -1,
+ value_type, value);
+ if (table_note_key != NULL) {
+ emitter_printf(emitter, " (%s: ", table_note_key);
+ emitter_print_value(emitter, emitter_justify_none, -1,
+ table_note_value_type, table_note_value);
+ emitter_printf(emitter, ")");
+ }
+ emitter_printf(emitter, "\n");
}
+ emitter->item_at_depth = true;
}
static inline void
-emitter_table_dict_end(emitter_t *emitter) {
+emitter_table_kv(emitter_t *emitter, const char *table_key,
+ emitter_type_t value_type, const void *value) {
+ emitter_table_kv_note(emitter, table_key, value_type, value, NULL,
+ emitter_type_bool, NULL);
+}
+
+
+/* Write to the emitter the given string, but only in table mode. */
+JEMALLOC_FORMAT_PRINTF(2, 3)
+static inline void
+emitter_table_printf(emitter_t *emitter, const char *format, ...) {
if (emitter->output == emitter_output_table) {
- emitter_dict_end(emitter);
+ va_list ap;
+ va_start(ap, format);
+ malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
+ va_end(ap);
}
}
static inline void
-emitter_json_arr_begin(emitter_t *emitter, const char *json_key) {
- if (emitter->output == emitter_output_json) {
- emitter_json_key_prefix(emitter);
- emitter_printf(emitter, "\"%s\": [", json_key);
- emitter_nest_inc(emitter);
+emitter_table_row(emitter_t *emitter, emitter_row_t *row) {
+ if (emitter->output != emitter_output_table) {
+ return;
+ }
+ emitter_col_t *col;
+ ql_foreach(col, &row->cols, link) {
+ emitter_print_value(emitter, col->justify, col->width,
+ col->type, (const void *)&col->bool_val);
}
+ emitter_table_printf(emitter, "\n");
+}
+
+static inline void
+emitter_row_init(emitter_row_t *row) {
+ ql_new(&row->cols);
}
static inline void
-emitter_json_arr_end(emitter_t *emitter) {
+emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
+ ql_elm_new(col, link);
+ ql_tail_insert(&row->cols, col, link);
+}
+
+
+/******************************************************************************/
+/*
+ * Generalized public API. Emits using either JSON or table, according to
+ * settings in the emitter_t. */
+
+/*
+ * Note emits a different kv pair as well, but only in table mode. Omits the
+ * note if table_note_key is NULL.
+ */
+static inline void
+emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
+ emitter_type_t value_type, const void *value,
+ const char *table_note_key, emitter_type_t table_note_value_type,
+ const void *table_note_value) {
if (emitter->output == emitter_output_json) {
- assert(emitter->nesting_depth > 0);
- emitter_nest_dec(emitter);
- emitter_printf(emitter, "\n");
- emitter_indent(emitter);
- emitter_printf(emitter, "]");
+ emitter_json_key(emitter, json_key);
+ emitter_json_value(emitter, value_type, value);
+ } else {
+ emitter_table_kv_note(emitter, table_key, value_type, value,
+ table_note_key, table_note_value_type, table_note_value);
}
+ emitter->item_at_depth = true;
}
static inline void
-emitter_json_arr_obj_begin(emitter_t *emitter) {
+emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key,
+ emitter_type_t value_type, const void *value) {
+ emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL,
+ emitter_type_bool, NULL);
+}
+
+static inline void
+emitter_dict_begin(emitter_t *emitter, const char *json_key,
+ const char *table_header) {
if (emitter->output == emitter_output_json) {
- emitter_json_key_prefix(emitter);
- emitter_printf(emitter, "{");
- emitter_nest_inc(emitter);
+ emitter_json_key(emitter, json_key);
+ emitter_json_object_begin(emitter);
+ } else {
+ emitter_table_dict_begin(emitter, table_header);
}
}
static inline void
-emitter_json_arr_obj_end(emitter_t *emitter) {
+emitter_dict_end(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
- assert(emitter->nesting_depth > 0);
- emitter_nest_dec(emitter);
- emitter_printf(emitter, "\n");
- emitter_indent(emitter);
- emitter_printf(emitter, "}");
+ emitter_json_object_end(emitter);
+ } else {
+ emitter_table_dict_end(emitter);
}
}
static inline void
-emitter_json_arr_value(emitter_t *emitter, emitter_type_t value_type,
- const void *value) {
+emitter_begin(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
- emitter_json_key_prefix(emitter);
- emitter_print_value(emitter, emitter_justify_none, -1,
- value_type, value);
+ assert(emitter->nesting_depth == 0);
+ emitter_printf(emitter, "{");
+ emitter_nest_inc(emitter);
+ } else {
+ /*
+ * This guarantees that we always call write_cb at least once.
+ * This is useful if some invariant is established by each call
+ * to write_cb, but doesn't hold initially: e.g., some buffer
+ * holds a null-terminated string.
+ */
+ emitter_printf(emitter, "%s", "");
}
}
static inline void
-emitter_table_row(emitter_t *emitter, emitter_row_t *row) {
- if (emitter->output != emitter_output_table) {
- return;
- }
- emitter_col_t *col;
- ql_foreach(col, &row->cols, link) {
- emitter_print_value(emitter, col->justify, col->width,
- col->type, (const void *)&col->bool_val);
+emitter_end(emitter_t *emitter) {
+ if (emitter->output == emitter_output_json) {
+ assert(emitter->nesting_depth == 1);
+ emitter_nest_dec(emitter);
+ emitter_printf(emitter, "\n}\n");
}
- emitter_table_printf(emitter, "\n");
}
#endif /* JEMALLOC_INTERNAL_EMITTER_H */
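Editor's note: the refactor above splits the old combined helpers into JSON-only (emitter_json_*), table-only (emitter_table_*), and mode-dispatching (emitter_kv, emitter_dict_begin, emitter_begin/end) families. A minimal usage sketch follows; it assumes the emitter_type_* values defined elsewhere in emitter.h (only emitter_type_bool appears in the hunks above), and the keys and values are invented for illustration.

static void
emitter_usage_sketch(emitter_t *emitter) {
	bool background_thread = true;
	unsigned narenas = 4;

	emitter_begin(emitter);

	/* "config": {...} in JSON; an indented "Config" block in table mode. */
	emitter_dict_begin(emitter, "config", "Config");
	emitter_kv(emitter, "background_thread", "background thread",
	    emitter_type_bool, &background_thread);
	emitter_kv(emitter, "narenas", "Arenas", emitter_type_unsigned,
	    &narenas);
	emitter_dict_end(emitter);

	/* JSON-only structure: "bins": [{"narenas": 4}]. */
	emitter_json_array_kv_begin(emitter, "bins");
	emitter_json_object_begin(emitter);
	emitter_json_kv(emitter, "narenas", emitter_type_unsigned, &narenas);
	emitter_json_object_end(emitter);
	emitter_json_array_end(emitter);

	emitter_end(emitter);
}

Because every emitter_json_* call above is a no-op when the output mode is emitter_output_table, the same call sequence produces both output formats.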
diff --git a/deps/jemalloc/include/jemalloc/internal/extent_externs.h b/deps/jemalloc/include/jemalloc/internal/extent_externs.h
index b8a4d026c..8aba57633 100644
--- a/deps/jemalloc/include/jemalloc/internal/extent_externs.h
+++ b/deps/jemalloc/include/jemalloc/internal/extent_externs.h
@@ -24,13 +24,17 @@ size_t extent_size_quantize_floor(size_t size);
size_t extent_size_quantize_ceil(size_t size);
#endif
-rb_proto(, extent_avail_, extent_tree_t, extent_t)
+ph_proto(, extent_avail_, extent_tree_t, extent_t)
ph_proto(, extent_heap_, extent_heap_t, extent_t)
bool extents_init(tsdn_t *tsdn, extents_t *extents, extent_state_t state,
bool delay_coalesce);
extent_state_t extents_state_get(const extents_t *extents);
size_t extents_npages_get(extents_t *extents);
+/* Get the number of extents in the given page size index. */
+size_t extents_nextents_get(extents_t *extents, pszind_t ind);
+/* Get the sum total bytes of the extents in the given page size index. */
+size_t extents_nbytes_get(extents_t *extents, pszind_t ind);
extent_t *extents_alloc(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, extents_t *extents, void *new_addr,
size_t size, size_t pad, size_t alignment, bool slab, szind_t szind,
@@ -70,4 +74,10 @@ bool extent_merge_wrapper(tsdn_t *tsdn, arena_t *arena,
bool extent_boot(void);
+void extent_util_stats_get(tsdn_t *tsdn, const void *ptr,
+ size_t *nfree, size_t *nregs, size_t *size);
+void extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr,
+ size_t *nfree, size_t *nregs, size_t *size,
+ size_t *bin_nfree, size_t *bin_nregs, void **slabcur_addr);
+
#endif /* JEMALLOC_INTERNAL_EXTENT_EXTERNS_H */
diff --git a/deps/jemalloc/include/jemalloc/internal/extent_inlines.h b/deps/jemalloc/include/jemalloc/internal/extent_inlines.h
index 77181df8d..77fa4c4a2 100644
--- a/deps/jemalloc/include/jemalloc/internal/extent_inlines.h
+++ b/deps/jemalloc/include/jemalloc/internal/extent_inlines.h
@@ -6,6 +6,7 @@
#include "jemalloc/internal/pages.h"
#include "jemalloc/internal/prng.h"
#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/sz.h"
static inline void
@@ -34,18 +35,19 @@ extent_unlock2(tsdn_t *tsdn, extent_t *extent1, extent_t *extent2) {
(uintptr_t)extent2);
}
-static inline arena_t *
-extent_arena_get(const extent_t *extent) {
+static inline unsigned
+extent_arena_ind_get(const extent_t *extent) {
unsigned arena_ind = (unsigned)((extent->e_bits &
EXTENT_BITS_ARENA_MASK) >> EXTENT_BITS_ARENA_SHIFT);
- /*
- * The following check is omitted because we should never actually read
- * a NULL arena pointer.
- */
- if (false && arena_ind >= MALLOCX_ARENA_LIMIT) {
- return NULL;
- }
assert(arena_ind < MALLOCX_ARENA_LIMIT);
+
+ return arena_ind;
+}
+
+static inline arena_t *
+extent_arena_get(const extent_t *extent) {
+ unsigned arena_ind = extent_arena_ind_get(extent);
+
return (arena_t *)atomic_load_p(&arenas[arena_ind], ATOMIC_ACQUIRE);
}
@@ -53,14 +55,14 @@ static inline szind_t
extent_szind_get_maybe_invalid(const extent_t *extent) {
szind_t szind = (szind_t)((extent->e_bits & EXTENT_BITS_SZIND_MASK) >>
EXTENT_BITS_SZIND_SHIFT);
- assert(szind <= NSIZES);
+ assert(szind <= SC_NSIZES);
return szind;
}
static inline szind_t
extent_szind_get(const extent_t *extent) {
szind_t szind = extent_szind_get_maybe_invalid(extent);
- assert(szind < NSIZES); /* Never call when "invalid". */
+ assert(szind < SC_NSIZES); /* Never call when "invalid". */
return szind;
}
@@ -69,6 +71,14 @@ extent_usize_get(const extent_t *extent) {
return sz_index2size(extent_szind_get(extent));
}
+static inline unsigned
+extent_binshard_get(const extent_t *extent) {
+ unsigned binshard = (unsigned)((extent->e_bits &
+ EXTENT_BITS_BINSHARD_MASK) >> EXTENT_BITS_BINSHARD_SHIFT);
+ assert(binshard < bin_infos[extent_szind_get(extent)].n_shards);
+ return binshard;
+}
+
static inline size_t
extent_sn_get(const extent_t *extent) {
return (size_t)((extent->e_bits & EXTENT_BITS_SN_MASK) >>
@@ -176,6 +186,11 @@ extent_prof_tctx_get(const extent_t *extent) {
ATOMIC_ACQUIRE);
}
+static inline nstime_t
+extent_prof_alloc_time_get(const extent_t *extent) {
+ return extent->e_alloc_time;
+}
+
static inline void
extent_arena_set(extent_t *extent, arena_t *arena) {
unsigned arena_ind = (arena != NULL) ? arena_ind_get(arena) : ((1U <<
@@ -185,12 +200,20 @@ extent_arena_set(extent_t *extent, arena_t *arena) {
}
static inline void
+extent_binshard_set(extent_t *extent, unsigned binshard) {
+ /* The assertion assumes szind is set already. */
+ assert(binshard < bin_infos[extent_szind_get(extent)].n_shards);
+ extent->e_bits = (extent->e_bits & ~EXTENT_BITS_BINSHARD_MASK) |
+ ((uint64_t)binshard << EXTENT_BITS_BINSHARD_SHIFT);
+}
+
+static inline void
extent_addr_set(extent_t *extent, void *addr) {
extent->e_addr = addr;
}
static inline void
-extent_addr_randomize(UNUSED tsdn_t *tsdn, extent_t *extent, size_t alignment) {
+extent_addr_randomize(tsdn_t *tsdn, extent_t *extent, size_t alignment) {
assert(extent_base_get(extent) == extent_addr_get(extent));
if (alignment < PAGE) {
@@ -234,7 +257,7 @@ extent_bsize_set(extent_t *extent, size_t bsize) {
static inline void
extent_szind_set(extent_t *extent, szind_t szind) {
- assert(szind <= NSIZES); /* NSIZES means "invalid". */
+ assert(szind <= SC_NSIZES); /* SC_NSIZES means "invalid". */
extent->e_bits = (extent->e_bits & ~EXTENT_BITS_SZIND_MASK) |
((uint64_t)szind << EXTENT_BITS_SZIND_SHIFT);
}
@@ -247,6 +270,16 @@ extent_nfree_set(extent_t *extent, unsigned nfree) {
}
static inline void
+extent_nfree_binshard_set(extent_t *extent, unsigned nfree, unsigned binshard) {
+ /* The assertion assumes szind is set already. */
+ assert(binshard < bin_infos[extent_szind_get(extent)].n_shards);
+ extent->e_bits = (extent->e_bits &
+ (~EXTENT_BITS_NFREE_MASK & ~EXTENT_BITS_BINSHARD_MASK)) |
+ ((uint64_t)binshard << EXTENT_BITS_BINSHARD_SHIFT) |
+ ((uint64_t)nfree << EXTENT_BITS_NFREE_SHIFT);
+}
+
+static inline void
extent_nfree_inc(extent_t *extent) {
assert(extent_slab_get(extent));
extent->e_bits += ((uint64_t)1U << EXTENT_BITS_NFREE_SHIFT);
@@ -259,6 +292,12 @@ extent_nfree_dec(extent_t *extent) {
}
static inline void
+extent_nfree_sub(extent_t *extent, uint64_t n) {
+ assert(extent_slab_get(extent));
+ extent->e_bits -= (n << EXTENT_BITS_NFREE_SHIFT);
+}
+
+static inline void
extent_sn_set(extent_t *extent, size_t sn) {
extent->e_bits = (extent->e_bits & ~EXTENT_BITS_SN_MASK) |
((uint64_t)sn << EXTENT_BITS_SN_SHIFT);
@@ -300,9 +339,34 @@ extent_prof_tctx_set(extent_t *extent, prof_tctx_t *tctx) {
}
static inline void
+extent_prof_alloc_time_set(extent_t *extent, nstime_t t) {
+ nstime_copy(&extent->e_alloc_time, &t);
+}
+
+static inline bool
+extent_is_head_get(extent_t *extent) {
+ if (maps_coalesce) {
+ not_reached();
+ }
+
+ return (bool)((extent->e_bits & EXTENT_BITS_IS_HEAD_MASK) >>
+ EXTENT_BITS_IS_HEAD_SHIFT);
+}
+
+static inline void
+extent_is_head_set(extent_t *extent, bool is_head) {
+ if (maps_coalesce) {
+ not_reached();
+ }
+
+ extent->e_bits = (extent->e_bits & ~EXTENT_BITS_IS_HEAD_MASK) |
+ ((uint64_t)is_head << EXTENT_BITS_IS_HEAD_SHIFT);
+}
+
+static inline void
extent_init(extent_t *extent, arena_t *arena, void *addr, size_t size,
bool slab, szind_t szind, size_t sn, extent_state_t state, bool zeroed,
- bool committed, bool dumpable) {
+ bool committed, bool dumpable, extent_head_state_t is_head) {
assert(addr == PAGE_ADDR2BASE(addr) || !slab);
extent_arena_set(extent, arena);
@@ -316,6 +380,10 @@ extent_init(extent_t *extent, arena_t *arena, void *addr, size_t size,
extent_committed_set(extent, committed);
extent_dumpable_set(extent, dumpable);
ql_elm_new(extent, ql_link);
+ if (!maps_coalesce) {
+ extent_is_head_set(extent, (is_head == EXTENT_IS_HEAD) ? true :
+ false);
+ }
if (config_prof) {
extent_prof_tctx_set(extent, NULL);
}
@@ -327,7 +395,7 @@ extent_binit(extent_t *extent, void *addr, size_t bsize, size_t sn) {
extent_addr_set(extent, addr);
extent_bsize_set(extent, bsize);
extent_slab_set(extent, false);
- extent_szind_set(extent, NSIZES);
+ extent_szind_set(extent, SC_NSIZES);
extent_sn_set(extent, sn);
extent_state_set(extent, extent_state_active);
extent_zeroed_set(extent, true);
diff --git a/deps/jemalloc/include/jemalloc/internal/extent_structs.h b/deps/jemalloc/include/jemalloc/internal/extent_structs.h
index 4873b9e9e..767cd8930 100644
--- a/deps/jemalloc/include/jemalloc/internal/extent_structs.h
+++ b/deps/jemalloc/include/jemalloc/internal/extent_structs.h
@@ -2,11 +2,12 @@
#define JEMALLOC_INTERNAL_EXTENT_STRUCTS_H
#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/bitmap.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/ph.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
typedef enum {
extent_state_active = 0,
@@ -28,9 +29,10 @@ struct extent_s {
* t: state
* i: szind
* f: nfree
+ * s: bin_shard
* n: sn
*
- * nnnnnnnn ... nnnnffff ffffffii iiiiiitt zdcbaaaa aaaaaaaa
+ * nnnnnnnn ... nnnnnnss ssssffff ffffffii iiiiiitt zdcbaaaa aaaaaaaa
*
* arena_ind: Arena from which this extent came, or all 1 bits if
* unassociated.
@@ -75,6 +77,8 @@ struct extent_s {
*
* nfree: Number of free regions in slab.
*
+ * bin_shard: the shard of the bin from which this extent came.
+ *
* sn: Serial number (potentially non-unique).
*
* Serial numbers may wrap around if !opt_retain, but as long as
@@ -112,7 +116,7 @@ struct extent_s {
#define EXTENT_BITS_STATE_SHIFT (EXTENT_BITS_ZEROED_WIDTH + EXTENT_BITS_ZEROED_SHIFT)
#define EXTENT_BITS_STATE_MASK MASK(EXTENT_BITS_STATE_WIDTH, EXTENT_BITS_STATE_SHIFT)
-#define EXTENT_BITS_SZIND_WIDTH LG_CEIL_NSIZES
+#define EXTENT_BITS_SZIND_WIDTH LG_CEIL(SC_NSIZES)
#define EXTENT_BITS_SZIND_SHIFT (EXTENT_BITS_STATE_WIDTH + EXTENT_BITS_STATE_SHIFT)
#define EXTENT_BITS_SZIND_MASK MASK(EXTENT_BITS_SZIND_WIDTH, EXTENT_BITS_SZIND_SHIFT)
@@ -120,7 +124,15 @@ struct extent_s {
#define EXTENT_BITS_NFREE_SHIFT (EXTENT_BITS_SZIND_WIDTH + EXTENT_BITS_SZIND_SHIFT)
#define EXTENT_BITS_NFREE_MASK MASK(EXTENT_BITS_NFREE_WIDTH, EXTENT_BITS_NFREE_SHIFT)
-#define EXTENT_BITS_SN_SHIFT (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT)
+#define EXTENT_BITS_BINSHARD_WIDTH 6
+#define EXTENT_BITS_BINSHARD_SHIFT (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT)
+#define EXTENT_BITS_BINSHARD_MASK MASK(EXTENT_BITS_BINSHARD_WIDTH, EXTENT_BITS_BINSHARD_SHIFT)
+
+#define EXTENT_BITS_IS_HEAD_WIDTH 1
+#define EXTENT_BITS_IS_HEAD_SHIFT (EXTENT_BITS_BINSHARD_WIDTH + EXTENT_BITS_BINSHARD_SHIFT)
+#define EXTENT_BITS_IS_HEAD_MASK MASK(EXTENT_BITS_IS_HEAD_WIDTH, EXTENT_BITS_IS_HEAD_SHIFT)
+
+#define EXTENT_BITS_SN_SHIFT (EXTENT_BITS_IS_HEAD_WIDTH + EXTENT_BITS_IS_HEAD_SHIFT)
#define EXTENT_BITS_SN_MASK (UINT64_MAX << EXTENT_BITS_SN_SHIFT)
/* Pointer to the extent that this structure is responsible for. */
@@ -160,11 +172,13 @@ struct extent_s {
/* Small region slab metadata. */
arena_slab_data_t e_slab_data;
- /*
- * Profile counters, used for large objects. Points to a
- * prof_tctx_t.
- */
- atomic_p_t e_prof_tctx;
+ /* Profiling data, used for large objects. */
+ struct {
+ /* Time when this was allocated. */
+ nstime_t e_alloc_time;
+ /* Points to a prof_tctx_t. */
+ atomic_p_t e_prof_tctx;
+ };
};
};
typedef ql_head(extent_t) extent_list_t;
@@ -180,14 +194,16 @@ struct extents_s {
*
* Synchronization: mtx.
*/
- extent_heap_t heaps[NPSIZES+1];
+ extent_heap_t heaps[SC_NPSIZES + 1];
+ atomic_zu_t nextents[SC_NPSIZES + 1];
+ atomic_zu_t nbytes[SC_NPSIZES + 1];
/*
* Bitmap for which set bits correspond to non-empty heaps.
*
* Synchronization: mtx.
*/
- bitmap_t bitmap[BITMAP_GROUPS(NPSIZES+1)];
+ bitmap_t bitmap[BITMAP_GROUPS(SC_NPSIZES + 1)];
/*
* LRU of all extents in heaps.
@@ -216,4 +232,25 @@ struct extents_s {
bool delay_coalesce;
};
+/*
+ * The following two structs are for experimental purposes. See
+ * experimental_utilization_query_ctl and
+ * experimental_utilization_batch_query_ctl in src/ctl.c.
+ */
+
+struct extent_util_stats_s {
+ size_t nfree;
+ size_t nregs;
+ size_t size;
+};
+
+struct extent_util_stats_verbose_s {
+ void *slabcur_addr;
+ size_t nfree;
+ size_t nregs;
+ size_t size;
+ size_t bin_nfree;
+ size_t bin_nregs;
+};
+
#endif /* JEMALLOC_INTERNAL_EXTENT_STRUCTS_H */
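Editor's note: the bit-layout comment above now carries a six-bit bin_shard field between nfree and sn. A standalone sketch of the same mask/shift packing convention may help when reading the accessors in extent_inlines.h; the nfree width below is illustrative (in extent_structs.h it derives from the slab-regions bound), and FIELD_MASK is a stand-in for the header's own MASK macro.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* WIDTH set bits starting at SHIFT, like MASK() in extent_structs.h. */
#define FIELD_MASK(width, shift)	((((uint64_t)1 << (width)) - 1) << (shift))

#define NFREE_WIDTH	10	/* illustrative only */
#define NFREE_SHIFT	0
#define BINSHARD_WIDTH	6	/* matches EXTENT_BITS_BINSHARD_WIDTH */
#define BINSHARD_SHIFT	(NFREE_WIDTH + NFREE_SHIFT)

int
main(void) {
	/* Pack nfree and binshard the way extent_nfree_binshard_set() does. */
	uint64_t bits = ((uint64_t)37 << NFREE_SHIFT) |
	    ((uint64_t)5 << BINSHARD_SHIFT);
	/* Unpack, mirroring extent_binshard_get(). */
	unsigned binshard = (unsigned)((bits &
	    FIELD_MASK(BINSHARD_WIDTH, BINSHARD_SHIFT)) >> BINSHARD_SHIFT);
	unsigned nfree = (unsigned)((bits &
	    FIELD_MASK(NFREE_WIDTH, NFREE_SHIFT)) >> NFREE_SHIFT);
	assert(binshard == 5 && nfree == 37);
	printf("binshard=%u nfree=%u\n", binshard, nfree);
	return 0;
}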
diff --git a/deps/jemalloc/include/jemalloc/internal/extent_types.h b/deps/jemalloc/include/jemalloc/internal/extent_types.h
index c0561d99f..96925cf95 100644
--- a/deps/jemalloc/include/jemalloc/internal/extent_types.h
+++ b/deps/jemalloc/include/jemalloc/internal/extent_types.h
@@ -4,9 +4,10 @@
typedef struct extent_s extent_t;
typedef struct extents_s extents_t;
-#define EXTENT_HOOKS_INITIALIZER NULL
+typedef struct extent_util_stats_s extent_util_stats_t;
+typedef struct extent_util_stats_verbose_s extent_util_stats_verbose_t;
-#define EXTENT_GROW_MAX_PIND (NPSIZES - 1)
+#define EXTENT_HOOKS_INITIALIZER NULL
/*
* When reuse (and split) an active extent, (1U << opt_lg_extent_max_active_fit)
@@ -14,4 +15,9 @@ typedef struct extents_s extents_t;
*/
#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6
+typedef enum {
+ EXTENT_NOT_HEAD,
+ EXTENT_IS_HEAD /* Only relevant for Windows && opt.retain. */
+} extent_head_state_t;
+
#endif /* JEMALLOC_INTERNAL_EXTENT_TYPES_H */
diff --git a/deps/jemalloc/include/jemalloc/internal/hash.h b/deps/jemalloc/include/jemalloc/internal/hash.h
index dcfc992df..0270034e8 100644
--- a/deps/jemalloc/include/jemalloc/internal/hash.h
+++ b/deps/jemalloc/include/jemalloc/internal/hash.h
@@ -104,8 +104,8 @@ hash_x86_32(const void *key, int len, uint32_t seed) {
uint32_t k1 = 0;
switch (len & 3) {
- case 3: k1 ^= tail[2] << 16;
- case 2: k1 ^= tail[1] << 8;
+ case 3: k1 ^= tail[2] << 16; JEMALLOC_FALLTHROUGH
+ case 2: k1 ^= tail[1] << 8; JEMALLOC_FALLTHROUGH
case 1: k1 ^= tail[0]; k1 *= c1; k1 = hash_rotl_32(k1, 15);
k1 *= c2; h1 ^= k1;
}
@@ -119,7 +119,7 @@ hash_x86_32(const void *key, int len, uint32_t seed) {
return h1;
}
-UNUSED static inline void
+static inline void
hash_x86_128(const void *key, const int len, uint32_t seed,
uint64_t r_out[2]) {
const uint8_t * data = (const uint8_t *) key;
@@ -177,28 +177,29 @@ hash_x86_128(const void *key, const int len, uint32_t seed,
uint32_t k4 = 0;
switch (len & 15) {
- case 15: k4 ^= tail[14] << 16;
- case 14: k4 ^= tail[13] << 8;
+ case 15: k4 ^= tail[14] << 16; JEMALLOC_FALLTHROUGH
+ case 14: k4 ^= tail[13] << 8; JEMALLOC_FALLTHROUGH
case 13: k4 ^= tail[12] << 0;
k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4;
-
- case 12: k3 ^= tail[11] << 24;
- case 11: k3 ^= tail[10] << 16;
- case 10: k3 ^= tail[ 9] << 8;
+ JEMALLOC_FALLTHROUGH
+ case 12: k3 ^= tail[11] << 24; JEMALLOC_FALLTHROUGH
+ case 11: k3 ^= tail[10] << 16; JEMALLOC_FALLTHROUGH
+ case 10: k3 ^= tail[ 9] << 8; JEMALLOC_FALLTHROUGH
case 9: k3 ^= tail[ 8] << 0;
k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3;
-
- case 8: k2 ^= tail[ 7] << 24;
- case 7: k2 ^= tail[ 6] << 16;
- case 6: k2 ^= tail[ 5] << 8;
+ JEMALLOC_FALLTHROUGH
+ case 8: k2 ^= tail[ 7] << 24; JEMALLOC_FALLTHROUGH
+ case 7: k2 ^= tail[ 6] << 16; JEMALLOC_FALLTHROUGH
+ case 6: k2 ^= tail[ 5] << 8; JEMALLOC_FALLTHROUGH
case 5: k2 ^= tail[ 4] << 0;
k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2;
-
- case 4: k1 ^= tail[ 3] << 24;
- case 3: k1 ^= tail[ 2] << 16;
- case 2: k1 ^= tail[ 1] << 8;
+ JEMALLOC_FALLTHROUGH
+ case 4: k1 ^= tail[ 3] << 24; JEMALLOC_FALLTHROUGH
+ case 3: k1 ^= tail[ 2] << 16; JEMALLOC_FALLTHROUGH
+ case 2: k1 ^= tail[ 1] << 8; JEMALLOC_FALLTHROUGH
case 1: k1 ^= tail[ 0] << 0;
k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1;
+ JEMALLOC_FALLTHROUGH
}
}
@@ -220,7 +221,7 @@ hash_x86_128(const void *key, const int len, uint32_t seed,
r_out[1] = (((uint64_t) h4) << 32) | h3;
}
-UNUSED static inline void
+static inline void
hash_x64_128(const void *key, const int len, const uint32_t seed,
uint64_t r_out[2]) {
const uint8_t *data = (const uint8_t *) key;
@@ -260,22 +261,22 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,
uint64_t k2 = 0;
switch (len & 15) {
- case 15: k2 ^= ((uint64_t)(tail[14])) << 48; /* falls through */
- case 14: k2 ^= ((uint64_t)(tail[13])) << 40; /* falls through */
- case 13: k2 ^= ((uint64_t)(tail[12])) << 32; /* falls through */
- case 12: k2 ^= ((uint64_t)(tail[11])) << 24; /* falls through */
- case 11: k2 ^= ((uint64_t)(tail[10])) << 16; /* falls through */
- case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; /* falls through */
+ case 15: k2 ^= ((uint64_t)(tail[14])) << 48; JEMALLOC_FALLTHROUGH
+ case 14: k2 ^= ((uint64_t)(tail[13])) << 40; JEMALLOC_FALLTHROUGH
+ case 13: k2 ^= ((uint64_t)(tail[12])) << 32; JEMALLOC_FALLTHROUGH
+ case 12: k2 ^= ((uint64_t)(tail[11])) << 24; JEMALLOC_FALLTHROUGH
+ case 11: k2 ^= ((uint64_t)(tail[10])) << 16; JEMALLOC_FALLTHROUGH
+ case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; JEMALLOC_FALLTHROUGH
case 9: k2 ^= ((uint64_t)(tail[ 8])) << 0;
k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
- /* falls through */
- case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; /* falls through */
- case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; /* falls through */
- case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; /* falls through */
- case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; /* falls through */
- case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; /* falls through */
- case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; /* falls through */
- case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; /* falls through */
+ JEMALLOC_FALLTHROUGH
+ case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; JEMALLOC_FALLTHROUGH
+ case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; JEMALLOC_FALLTHROUGH
+ case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; JEMALLOC_FALLTHROUGH
+ case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; JEMALLOC_FALLTHROUGH
+ case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; JEMALLOC_FALLTHROUGH
+ case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; JEMALLOC_FALLTHROUGH
+ case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; JEMALLOC_FALLTHROUGH
case 1: k1 ^= ((uint64_t)(tail[ 0])) << 0;
k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
}
diff --git a/deps/jemalloc/include/jemalloc/internal/hook.h b/deps/jemalloc/include/jemalloc/internal/hook.h
new file mode 100644
index 000000000..ee246b1e0
--- /dev/null
+++ b/deps/jemalloc/include/jemalloc/internal/hook.h
@@ -0,0 +1,163 @@
+#ifndef JEMALLOC_INTERNAL_HOOK_H
+#define JEMALLOC_INTERNAL_HOOK_H
+
+#include "jemalloc/internal/tsd.h"
+
+/*
+ * This API is *extremely* experimental, and may get ripped out, changed in API-
+ * and ABI-incompatible ways, be insufficiently or incorrectly documented, etc.
+ *
+ * It allows hooking the stateful parts of the API to see changes as they
+ * happen.
+ *
+ * Allocation hooks are called after the allocation is done, free hooks are
+ * called before the free is done, and expand hooks are called after the
+ * allocation is expanded.
+ *
+ * For realloc and rallocx, if the expansion happens in place, the expansion
+ * hook is called. If it is moved, then the alloc hook is called on the new
+ * location, and then the free hook is called on the old location (i.e. both
+ * hooks are invoked in between the alloc and the dalloc).
+ *
+ * If we return NULL from OOM, then usize might not be trustworthy. Calling
+ * realloc(NULL, size) only calls the alloc hook, and calling realloc(ptr, 0)
+ * only calls the free hook. (Calling realloc(NULL, 0) is treated as malloc(0),
+ * and only calls the alloc hook).
+ *
+ * Reentrancy:
+ * Reentrancy is guarded against from within the hook implementation. If you
+ * call allocator functions from within a hook, the hooks will not be invoked
+ * again.
+ * Threading:
+ * The installation of a hook synchronizes with all its uses. If you can
+ * prove the installation of a hook happens-before a jemalloc entry point,
+ * then the hook will get invoked (unless there's a racing removal).
+ *
+ * Hook insertion appears to be atomic at a per-thread level (i.e. if a thread
+ * allocates and has the alloc hook invoked, then a subsequent free on the
+ * same thread will also have the free hook invoked).
+ *
+ * The *removal* of a hook does *not* block until all threads are done with
+ * the hook. Hook authors have to be resilient to this, and need some
+ * out-of-band mechanism for cleaning up any dynamically allocated memory
+ * associated with their hook.
+ * Ordering:
+ * Order of hook execution is unspecified, and may be different than insertion
+ * order.
+ */
+
+#define HOOK_MAX 4
+
+enum hook_alloc_e {
+ hook_alloc_malloc,
+ hook_alloc_posix_memalign,
+ hook_alloc_aligned_alloc,
+ hook_alloc_calloc,
+ hook_alloc_memalign,
+ hook_alloc_valloc,
+ hook_alloc_mallocx,
+
+ /* The reallocating functions have both alloc and dalloc variants */
+ hook_alloc_realloc,
+ hook_alloc_rallocx,
+};
+/*
+ * We put the enum typedef after the enum, since this file may get included by
+ * jemalloc_cpp.cpp, and C++ disallows enum forward declarations.
+ */
+typedef enum hook_alloc_e hook_alloc_t;
+
+enum hook_dalloc_e {
+ hook_dalloc_free,
+ hook_dalloc_dallocx,
+ hook_dalloc_sdallocx,
+
+ /*
+ * The dalloc halves of reallocation (not called if in-place expansion
+ * happens).
+ */
+ hook_dalloc_realloc,
+ hook_dalloc_rallocx,
+};
+typedef enum hook_dalloc_e hook_dalloc_t;
+
+
+enum hook_expand_e {
+ hook_expand_realloc,
+ hook_expand_rallocx,
+ hook_expand_xallocx,
+};
+typedef enum hook_expand_e hook_expand_t;
+
+typedef void (*hook_alloc)(
+ void *extra, hook_alloc_t type, void *result, uintptr_t result_raw,
+ uintptr_t args_raw[3]);
+
+typedef void (*hook_dalloc)(
+ void *extra, hook_dalloc_t type, void *address, uintptr_t args_raw[3]);
+
+typedef void (*hook_expand)(
+ void *extra, hook_expand_t type, void *address, size_t old_usize,
+ size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);
+
+typedef struct hooks_s hooks_t;
+struct hooks_s {
+ hook_alloc alloc_hook;
+ hook_dalloc dalloc_hook;
+ hook_expand expand_hook;
+ void *extra;
+};
+
+/*
+ * Begin implementation details; everything above this point might one day live
+ * in a public API. Everything below this point never will.
+ */
+
+/*
+ * The realloc pathways haven't gotten any refactoring love in a while, and it's
+ * fairly difficult to pass information from the entry point to the hooks. We
+ * put the information the hooks will need into a struct to encapsulate
+ * everything.
+ *
+ * Much of these pathways are force-inlined, so that the compiler can avoid
+ * materializing this struct until we hit an extern arena function. For fairly
+ * goofy reasons, *many* of the realloc paths hit an extern arena function.
+ * These paths are cold enough that it doesn't matter; eventually, we should
+ * rewrite the realloc code to make the expand-in-place and the
+ * free-then-realloc paths more orthogonal, at which point we don't need to
+ * spread the hook logic all over the place.
+ */
+typedef struct hook_ralloc_args_s hook_ralloc_args_t;
+struct hook_ralloc_args_s {
+ /* I.e. as opposed to rallocx. */
+ bool is_realloc;
+ /*
+ * The expand hook takes 4 arguments, even if only 3 are actually used;
+ * we add an extra one in case the user decides to memcpy without
+ * looking too closely at the hooked function.
+ */
+ uintptr_t args[4];
+};
+
+/*
+ * Returns an opaque handle to be used when removing the hook. NULL means that
+ * we couldn't install the hook.
+ */
+bool hook_boot();
+
+void *hook_install(tsdn_t *tsdn, hooks_t *hooks);
+/* Uninstalls the hook with the handle previously returned from hook_install. */
+void hook_remove(tsdn_t *tsdn, void *opaque);
+
+/* Hooks */
+
+void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw,
+ uintptr_t args_raw[3]);
+
+void hook_invoke_dalloc(hook_dalloc_t type, void *address,
+ uintptr_t args_raw[3]);
+
+void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize,
+ size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);
+
+#endif /* JEMALLOC_INTERNAL_HOOK_H */
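Editor's note: a minimal sketch of registering a hook through the API declared above, as code inside jemalloc's own tree (e.g. a test) might do. The callback body and counter are invented, and leaving unused hook members NULL is an assumption for brevity, not something this header guarantees.

#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

static size_t example_alloc_count;

/* Runs after each allocation completes (see the comment block above). */
static void
example_alloc_hook(void *extra, hook_alloc_t type, void *result,
    uintptr_t result_raw, uintptr_t args_raw[3]) {
	(void)extra; (void)type; (void)result_raw; (void)args_raw;
	if (result != NULL) {
		example_alloc_count++;
	}
}

static void
example_install_and_remove(tsd_t *tsd) {
	hooks_t hooks = {
		.alloc_hook = example_alloc_hook,
		.dalloc_hook = NULL,	/* assumed ignorable when NULL */
		.expand_hook = NULL,
		.extra = NULL
	};
	void *handle = hook_install(tsd_tsdn(tsd), &hooks);
	if (handle == NULL) {
		return;	/* couldn't install, e.g. all HOOK_MAX slots in use */
	}
	/* ... allocate/free; example_alloc_count advances ... */
	hook_remove(tsd_tsdn(tsd), handle);
}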
diff --git a/deps/jemalloc/include/jemalloc/internal/hooks.h b/deps/jemalloc/include/jemalloc/internal/hooks.h
deleted file mode 100644
index cd49afcb0..000000000
--- a/deps/jemalloc/include/jemalloc/internal/hooks.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef JEMALLOC_INTERNAL_HOOKS_H
-#define JEMALLOC_INTERNAL_HOOKS_H
-
-extern JEMALLOC_EXPORT void (*hooks_arena_new_hook)();
-extern JEMALLOC_EXPORT void (*hooks_libc_hook)();
-
-#define JEMALLOC_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn)
-
-#define open JEMALLOC_HOOK(open, hooks_libc_hook)
-#define read JEMALLOC_HOOK(read, hooks_libc_hook)
-#define write JEMALLOC_HOOK(write, hooks_libc_hook)
-#define readlink JEMALLOC_HOOK(readlink, hooks_libc_hook)
-#define close JEMALLOC_HOOK(close, hooks_libc_hook)
-#define creat JEMALLOC_HOOK(creat, hooks_libc_hook)
-#define secure_getenv JEMALLOC_HOOK(secure_getenv, hooks_libc_hook)
-/* Note that this is undef'd and re-define'd in src/prof.c. */
-#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, hooks_libc_hook)
-
-#endif /* JEMALLOC_INTERNAL_HOOKS_H */
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h
index be70df510..7d6053e21 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h
@@ -31,6 +31,9 @@
# include <sys/uio.h>
# endif
# include <pthread.h>
+# ifdef __FreeBSD__
+# include <pthread_np.h>
+# endif
# include <signal.h>
# ifdef JEMALLOC_OS_UNFAIR_LOCK
# include <os/lock.h>
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in
index 8dad9a1db..c442a2191 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -48,25 +48,13 @@
/* Defined if GCC __atomic atomics are available. */
#undef JEMALLOC_GCC_ATOMIC_ATOMICS
+/* and the 8-bit variant support. */
+#undef JEMALLOC_GCC_U8_ATOMIC_ATOMICS
/* Defined if GCC __sync atomics are available. */
#undef JEMALLOC_GCC_SYNC_ATOMICS
-
-/*
- * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and
- * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite
- * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the
- * functions are defined in libgcc instead of being inlines).
- */
-#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4
-
-/*
- * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and
- * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite
- * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the
- * functions are defined in libgcc instead of being inlines).
- */
-#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8
+/* and the 8-bit variant support. */
+#undef JEMALLOC_GCC_U8_SYNC_ATOMICS
/*
* Defined if __builtin_clz() and __builtin_clzl() are available.
@@ -78,12 +66,6 @@
*/
#undef JEMALLOC_OS_UNFAIR_LOCK
-/*
- * Defined if OSSpin*() functions are available, as provided by Darwin, and
- * documented in the spinlock(3) manual page.
- */
-#undef JEMALLOC_OSSPIN
-
/* Defined if syscall(2) is usable. */
#undef JEMALLOC_USE_SYSCALL
@@ -153,6 +135,9 @@
/* JEMALLOC_STATS enables statistics calculation. */
#undef JEMALLOC_STATS
+/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */
+#undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API
+
/* JEMALLOC_PROF enables allocation profiling. */
#undef JEMALLOC_PROF
@@ -234,6 +219,12 @@
#undef JEMALLOC_INTERNAL_FFS
/*
+ * popcount*() functions to use for bitmapping.
+ */
+#undef JEMALLOC_INTERNAL_POPCOUNTL
+#undef JEMALLOC_INTERNAL_POPCOUNT
+
+/*
* If defined, explicitly attempt to more uniformly distribute large allocation
* pointer alignments across all cache indices.
*/
@@ -246,6 +237,12 @@
#undef JEMALLOC_LOG
/*
+ * If defined, use readlinkat() (instead of readlink()) to follow
+ * /etc/malloc_conf.
+ */
+#undef JEMALLOC_READLINKAT
+
+/*
* Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
*/
#undef JEMALLOC_ZONE
@@ -363,4 +360,7 @@
*/
#undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE
+/* Performs additional safety checks when defined. */
+#undef JEMALLOC_OPT_SAFETY_CHECKS
+
#endif /* JEMALLOC_INTERNAL_DEFS_H_ */
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h
index e10fb275d..d291170be 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h
@@ -2,7 +2,6 @@
#define JEMALLOC_INTERNAL_EXTERNS_H
#include "jemalloc/internal/atomic.h"
-#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/tsd_types.h"
/* TSD checks this to set thread local slow state accordingly. */
@@ -11,6 +10,7 @@ extern bool malloc_slow;
/* Run-time options. */
extern bool opt_abort;
extern bool opt_abort_conf;
+extern bool opt_confirm_conf;
extern const char *opt_junk;
extern bool opt_junk_alloc;
extern bool opt_junk_free;
@@ -25,6 +25,9 @@ extern unsigned ncpus;
/* Number of arenas used for automatic multiplexing of threads and arenas. */
extern unsigned narenas_auto;
+/* Base index for manual arenas. */
+extern unsigned manual_arena_base;
+
/*
* Arenas that are used to service external requests. Not all elements of the
* arenas array are necessarily used; arenas are created lazily as needed.
@@ -49,5 +52,6 @@ void jemalloc_prefork(void);
void jemalloc_postfork_parent(void);
void jemalloc_postfork_child(void);
bool malloc_initialized(void);
+void je_sdallocx_noflags(void *ptr, size_t size);
#endif /* JEMALLOC_INTERNAL_EXTERNS_H */
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h
index c6a1f7eb2..ddde9b4e6 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h
@@ -4,13 +4,15 @@
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/ticker.h"
JEMALLOC_ALWAYS_INLINE malloc_cpuid_t
malloc_getcpu(void) {
assert(have_percpu_arena);
-#if defined(JEMALLOC_HAVE_SCHED_GETCPU)
+#if defined(_WIN32)
+ return GetCurrentProcessorNumber();
+#elif defined(JEMALLOC_HAVE_SCHED_GETCPU)
return (malloc_cpuid_t)sched_getcpu();
#else
not_reached();
@@ -108,14 +110,14 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) {
JEMALLOC_ALWAYS_INLINE cache_bin_t *
tcache_small_bin_get(tcache_t *tcache, szind_t binind) {
- assert(binind < NBINS);
+ assert(binind < SC_NBINS);
return &tcache->bins_small[binind];
}
JEMALLOC_ALWAYS_INLINE cache_bin_t *
tcache_large_bin_get(tcache_t *tcache, szind_t binind) {
- assert(binind >= NBINS &&binind < nhbins);
- return &tcache->bins_large[binind - NBINS];
+ assert(binind >= SC_NBINS &&binind < nhbins);
+ return &tcache->bins_large[binind - SC_NBINS];
}
JEMALLOC_ALWAYS_INLINE bool
@@ -156,7 +158,7 @@ pre_reentrancy(tsd_t *tsd, arena_t *arena) {
if (fast) {
/* Prepare slow path for reentrancy. */
tsd_slow_update(tsd);
- assert(tsd->state == tsd_state_nominal_slow);
+ assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
}
}
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h
index 2e76e5d8f..70d6e5788 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h
@@ -71,7 +71,8 @@ arena_ichoose(tsd_t *tsd, arena_t *arena) {
static inline bool
arena_is_auto(arena_t *arena) {
assert(narenas_auto > 0);
- return (arena_ind_get(arena) < narenas_auto);
+
+ return (arena_ind_get(arena) < manual_arena_base);
}
JEMALLOC_ALWAYS_INLINE extent_t *
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
index b19a94207..adae014a1 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_INLINES_C_H
#define JEMALLOC_INTERNAL_INLINES_C_H
+#include "jemalloc/internal/hook.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/witness.h"
@@ -42,7 +43,6 @@ iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
bool is_internal, arena_t *arena, bool slow_path) {
void *ret;
- assert(size != 0);
assert(!is_internal || tcache == NULL);
assert(!is_internal || arena == NULL || arena_is_auto(arena));
if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) {
@@ -133,31 +133,20 @@ isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
JEMALLOC_ALWAYS_INLINE void *
iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
- size_t extra, size_t alignment, bool zero, tcache_t *tcache,
- arena_t *arena) {
+ size_t alignment, bool zero, tcache_t *tcache, arena_t *arena,
+ hook_ralloc_args_t *hook_args) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
void *p;
size_t usize, copysize;
- usize = sz_sa2u(size + extra, alignment);
- if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+ usize = sz_sa2u(size, alignment);
+ if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
return NULL;
}
p = ipalloct(tsdn, usize, alignment, zero, tcache, arena);
if (p == NULL) {
- if (extra == 0) {
- return NULL;
- }
- /* Try again, without extra this time. */
- usize = sz_sa2u(size, alignment);
- if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
- return NULL;
- }
- p = ipalloct(tsdn, usize, alignment, zero, tcache, arena);
- if (p == NULL) {
- return NULL;
- }
+ return NULL;
}
/*
* Copy at most size bytes (not size+extra), since the caller has no
@@ -165,13 +154,26 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
*/
copysize = (size < oldsize) ? size : oldsize;
memcpy(p, ptr, copysize);
+ hook_invoke_alloc(hook_args->is_realloc
+ ? hook_alloc_realloc : hook_alloc_rallocx, p, (uintptr_t)p,
+ hook_args->args);
+ hook_invoke_dalloc(hook_args->is_realloc
+ ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args);
isdalloct(tsdn, ptr, oldsize, tcache, NULL, true);
return p;
}
+/*
+ * is_realloc threads through the knowledge of whether or not this call comes
+ * from je_realloc (as opposed to je_rallocx); this ensures that we pass the
+ * correct entry point into any hooks.
+ * Note that these functions are all force-inlined, so no actual bool gets
+ * passed-around anywhere.
+ */
JEMALLOC_ALWAYS_INLINE void *
iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
- bool zero, tcache_t *tcache, arena_t *arena) {
+ bool zero, tcache_t *tcache, arena_t *arena, hook_ralloc_args_t *hook_args)
+{
assert(ptr != NULL);
assert(size != 0);
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
@@ -183,24 +185,24 @@ iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
* Existing object alignment is inadequate; allocate new space
* and copy.
*/
- return iralloct_realign(tsdn, ptr, oldsize, size, 0, alignment,
- zero, tcache, arena);
+ return iralloct_realign(tsdn, ptr, oldsize, size, alignment,
+ zero, tcache, arena, hook_args);
}
return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero,
- tcache);
+ tcache, hook_args);
}
JEMALLOC_ALWAYS_INLINE void *
iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment,
- bool zero) {
+ bool zero, hook_ralloc_args_t *hook_args) {
return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, zero,
- tcache_get(tsd), NULL);
+ tcache_get(tsd), NULL, hook_args);
}
JEMALLOC_ALWAYS_INLINE bool
ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
- size_t alignment, bool zero) {
+ size_t alignment, bool zero, size_t *newsize) {
assert(ptr != NULL);
assert(size != 0);
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
@@ -209,10 +211,12 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
!= 0) {
/* Existing object alignment is inadequate. */
+ *newsize = oldsize;
return true;
}
- return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero);
+ return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero,
+ newsize);
}
JEMALLOC_ALWAYS_INLINE int
@@ -228,7 +232,8 @@ iget_defrag_hint(tsdn_t *tsdn, void* ptr) {
extent_t *slab = iealloc(tsdn, ptr);
arena_t *arena = extent_arena_get(slab);
szind_t binind = extent_szind_get(slab);
- bin_t *bin = &arena->bins[binind];
+ unsigned binshard = extent_binshard_get(slab);
+ bin_t *bin = &arena->bins[binind].bin_shards[binshard];
malloc_mutex_lock(tsdn, &bin->lock);
/* don't bother moving allocations from the slab currently used for new allocations */
if (slab != bin->slabcur) {
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h
index ed75d3768..d8ea06f6d 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h
@@ -30,7 +30,7 @@
# define restrict
#endif
-/* Various function pointers are statick and immutable except during testing. */
+/* Various function pointers are static and immutable except during testing. */
#ifdef JEMALLOC_JET
# define JET_MUTABLE
#else
@@ -40,4 +40,75 @@
#define JEMALLOC_VA_ARGS_HEAD(head, ...) head
#define JEMALLOC_VA_ARGS_TAIL(head, ...) __VA_ARGS__
+#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__clang__) \
+ && defined(JEMALLOC_HAVE_ATTR) && (__GNUC__ >= 7)
+#define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough);
+#else
+#define JEMALLOC_FALLTHROUGH /* falls through */
+#endif
+
+/* Diagnostic suppression macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+# define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push))
+# define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop))
+# define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable:W))
+# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+/* #pragma GCC diagnostic first appeared in gcc 4.6. */
+#elif (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && \
+ (__GNUC_MINOR__ > 5)))) || defined(__clang__)
+/*
+ * The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang
+ * diagnostic suppression macros and should not be used anywhere else.
+ */
+# define JEMALLOC_PRAGMA__(X) _Pragma(#X)
+# define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push)
+# define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop)
+# define JEMALLOC_DIAGNOSTIC_IGNORE(W) \
+ JEMALLOC_PRAGMA__(GCC diagnostic ignored W)
+
+/*
+ * The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and
+ * all clang versions up to version 7 (currently trunk, unreleased). This macro
+ * suppresses the warning for the affected compiler versions only.
+ */
+# if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || \
+ defined(__clang__)
+# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \
+ JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers")
+# else
+# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+# endif
+
+# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS \
+ JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits")
+# define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \
+ JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter")
+# if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7)
+# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN \
+ JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=")
+# else
+# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+# endif
+# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \
+ JEMALLOC_DIAGNOSTIC_PUSH \
+ JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
+#else
+# define JEMALLOC_DIAGNOSTIC_PUSH
+# define JEMALLOC_DIAGNOSTIC_POP
+# define JEMALLOC_DIAGNOSTIC_IGNORE(W)
+# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+#endif
+
+/*
+ * Disables spurious diagnostics for all headers. Since these headers are not
+ * included by users directly, it does not affect their diagnostic settings.
+ */
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
#endif /* JEMALLOC_INTERNAL_MACROS_H */
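Editor's note: JEMALLOC_FALLTHROUGH replaces the bare "falls through" comments (see the hash.h hunks earlier) so that GCC 7+'s -Wimplicit-fallthrough stays quiet, and the diagnostic macros bracket code that trips known-spurious warnings. An invented usage sketch:

static unsigned
tail_mix_sketch(const unsigned char *tail, int len) {
	unsigned k = 0;
	switch (len & 3) {
	case 3: k ^= (unsigned)tail[2] << 16; JEMALLOC_FALLTHROUGH
	case 2: k ^= (unsigned)tail[1] << 8; JEMALLOC_FALLTHROUGH
	case 1: k ^= (unsigned)tail[0];
	}
	return k;
}

/* Suppress one known-spurious warning around a single definition. */
typedef struct { int width; int shift; } example_field_t;
JEMALLOC_DIAGNOSTIC_PUSH
JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
static const example_field_t example_field = {6};	/* .shift left implicit */
JEMALLOC_DIAGNOSTIC_POP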
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_types.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_types.h
index 1b750b122..e296c5a7e 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_types.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_types.h
@@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_TYPES_H
#define JEMALLOC_INTERNAL_TYPES_H
+#include "jemalloc/internal/quantum.h"
+
/* Page size index type. */
typedef unsigned pszind_t;
@@ -50,79 +52,6 @@ typedef int malloc_cpuid_t;
/* Smallest size class to support. */
#define TINY_MIN (1U << LG_TINY_MIN)
-/*
- * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
- * classes).
- */
-#ifndef LG_QUANTUM
-# if (defined(__i386__) || defined(_M_IX86))
-# define LG_QUANTUM 4
-# endif
-# ifdef __ia64__
-# define LG_QUANTUM 4
-# endif
-# ifdef __alpha__
-# define LG_QUANTUM 4
-# endif
-# if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__))
-# define LG_QUANTUM 4
-# endif
-# if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64))
-# define LG_QUANTUM 4
-# endif
-# ifdef __arm__
-# define LG_QUANTUM 3
-# endif
-# ifdef __aarch64__
-# define LG_QUANTUM 4
-# endif
-# ifdef __hppa__
-# define LG_QUANTUM 4
-# endif
-# ifdef __m68k__
-# define LG_QUANTUM 3
-# endif
-# ifdef __mips__
-# define LG_QUANTUM 3
-# endif
-# ifdef __nios2__
-# define LG_QUANTUM 3
-# endif
-# ifdef __or1k__
-# define LG_QUANTUM 3
-# endif
-# ifdef __powerpc__
-# define LG_QUANTUM 4
-# endif
-# if defined(__riscv) || defined(__riscv__)
-# define LG_QUANTUM 4
-# endif
-# ifdef __s390__
-# define LG_QUANTUM 4
-# endif
-# if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \
- defined(__SH4_SINGLE_ONLY__))
-# define LG_QUANTUM 4
-# endif
-# ifdef __tile__
-# define LG_QUANTUM 4
-# endif
-# ifdef __le32__
-# define LG_QUANTUM 4
-# endif
-# ifndef LG_QUANTUM
-# error "Unknown minimum alignment for architecture; specify via "
- "--with-lg-quantum"
-# endif
-#endif
-
-#define QUANTUM ((size_t)(1U << LG_QUANTUM))
-#define QUANTUM_MASK (QUANTUM - 1)
-
-/* Return the smallest quantum multiple that is >= a. */
-#define QUANTUM_CEILING(a) \
- (((a) + QUANTUM_MASK) & ~QUANTUM_MASK)
-
#define LONG ((size_t)(1U << LG_SIZEOF_LONG))
#define LONG_MASK (LONG - 1)
diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_preamble.h.in b/deps/jemalloc/include/jemalloc/internal/jemalloc_preamble.h.in
index e621fbc85..3418cbfa2 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_preamble.h.in
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_preamble.h.in
@@ -21,7 +21,7 @@
# include "../jemalloc@install_suffix@.h"
#endif
-#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN))
+#if defined(JEMALLOC_OSATOMIC)
#include <libkern/OSAtomic.h>
#endif
@@ -45,7 +45,7 @@
# include "jemalloc/internal/private_namespace_jet.h"
# endif
#endif
-#include "jemalloc/internal/hooks.h"
+#include "jemalloc/internal/test_hooks.h"
#ifdef JEMALLOC_DEFINE_MADVISE_FREE
# define JEMALLOC_MADV_FREE 8
@@ -161,7 +161,26 @@ static const bool config_log =
false
#endif
;
-#ifdef JEMALLOC_HAVE_SCHED_GETCPU
+/*
+ * Are extra safety checks enabled; things like checking the size of sized
+ * deallocations, double-frees, etc.
+ */
+static const bool config_opt_safety_checks =
+#ifdef JEMALLOC_OPT_SAFETY_CHECKS
+ true
+#elif defined(JEMALLOC_DEBUG)
+ /*
+ * This lets us only guard safety checks by one flag instead of two; fast
+ * checks can guard solely by config_opt_safety_checks and run in debug mode
+ * too.
+ */
+ true
+#else
+ false
+#endif
+ ;
+
+#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
/* Currently percpu_arena depends on sched_getcpu. */
#define JEMALLOC_PERCPU_ARENA
#endif
diff --git a/deps/jemalloc/include/jemalloc/internal/large_externs.h b/deps/jemalloc/include/jemalloc/internal/large_externs.h
index 3f36282cd..a05019e8a 100644
--- a/deps/jemalloc/include/jemalloc/internal/large_externs.h
+++ b/deps/jemalloc/include/jemalloc/internal/large_externs.h
@@ -1,13 +1,16 @@
#ifndef JEMALLOC_INTERNAL_LARGE_EXTERNS_H
#define JEMALLOC_INTERNAL_LARGE_EXTERNS_H
+#include "jemalloc/internal/hook.h"
+
void *large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero);
void *large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
bool zero);
bool large_ralloc_no_move(tsdn_t *tsdn, extent_t *extent, size_t usize_min,
size_t usize_max, bool zero);
-void *large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize,
- size_t alignment, bool zero, tcache_t *tcache);
+void *large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize,
+ size_t alignment, bool zero, tcache_t *tcache,
+ hook_ralloc_args_t *hook_args);
typedef void (large_dalloc_junk_t)(void *, size_t);
extern large_dalloc_junk_t *JET_MUTABLE large_dalloc_junk;
@@ -23,4 +26,7 @@ prof_tctx_t *large_prof_tctx_get(tsdn_t *tsdn, const extent_t *extent);
void large_prof_tctx_set(tsdn_t *tsdn, extent_t *extent, prof_tctx_t *tctx);
void large_prof_tctx_reset(tsdn_t *tsdn, extent_t *extent);
+nstime_t large_prof_alloc_time_get(const extent_t *extent);
+void large_prof_alloc_time_set(extent_t *extent, nstime_t time);
+
#endif /* JEMALLOC_INTERNAL_LARGE_EXTERNS_H */
diff --git a/deps/jemalloc/include/jemalloc/internal/malloc_io.h b/deps/jemalloc/include/jemalloc/internal/malloc_io.h
index bfe556b52..1d1a414e0 100644
--- a/deps/jemalloc/include/jemalloc/internal/malloc_io.h
+++ b/deps/jemalloc/include/jemalloc/internal/malloc_io.h
@@ -54,7 +54,7 @@ size_t malloc_vsnprintf(char *str, size_t size, const char *format,
size_t malloc_snprintf(char *str, size_t size, const char *format, ...)
JEMALLOC_FORMAT_PRINTF(3, 4);
/*
- * The caller can set write_cb and cbopaque to null to choose to print with the
+ * The caller can set write_cb to null to choose to print with the
* je_malloc_message hook.
*/
void malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque,
diff --git a/deps/jemalloc/include/jemalloc/internal/mutex.h b/deps/jemalloc/include/jemalloc/internal/mutex.h
index 6520c2512..7c24f0725 100644
--- a/deps/jemalloc/include/jemalloc/internal/mutex.h
+++ b/deps/jemalloc/include/jemalloc/internal/mutex.h
@@ -37,14 +37,17 @@ struct malloc_mutex_s {
# endif
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
os_unfair_lock lock;
-#elif (defined(JEMALLOC_OSSPIN))
- OSSpinLock lock;
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
pthread_mutex_t lock;
malloc_mutex_t *postponed_next;
#else
pthread_mutex_t lock;
#endif
+ /*
+ * Hint flag to avoid exclusive cache line contention
+ * during spin waiting
+ */
+ atomic_b_t locked;
};
/*
* We only touch witness when configured w/ debug. However we
@@ -84,10 +87,6 @@ struct malloc_mutex_s {
# define MALLOC_MUTEX_LOCK(m) os_unfair_lock_lock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) os_unfair_lock_unlock(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock))
-#elif (defined(JEMALLOC_OSSPIN))
-# define MALLOC_MUTEX_LOCK(m) OSSpinLockLock(&(m)->lock)
-# define MALLOC_MUTEX_UNLOCK(m) OSSpinLockUnlock(&(m)->lock)
-# define MALLOC_MUTEX_TRYLOCK(m) (!OSSpinLockTry(&(m)->lock))
#else
# define MALLOC_MUTEX_LOCK(m) pthread_mutex_lock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) pthread_mutex_unlock(&(m)->lock)
@@ -101,22 +100,37 @@ struct malloc_mutex_s {
#ifdef _WIN32
# define MALLOC_MUTEX_INITIALIZER
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-# define MALLOC_MUTEX_INITIALIZER \
- {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \
- WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
-#elif (defined(JEMALLOC_OSSPIN))
-# define MALLOC_MUTEX_INITIALIZER \
- {{{LOCK_PROF_DATA_INITIALIZER, 0}}, \
+# if defined(JEMALLOC_DEBUG)
+# define MALLOC_MUTEX_INITIALIZER \
+ {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
+ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+# else
+# define MALLOC_MUTEX_INITIALIZER \
+ {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+# endif
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
-# define MALLOC_MUTEX_INITIALIZER \
- {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \
- WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+# if (defined(JEMALLOC_DEBUG))
+# define MALLOC_MUTEX_INITIALIZER \
+ {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
+ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+# else
+# define MALLOC_MUTEX_INITIALIZER \
+ {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
+ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+# endif
+
#else
# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
+# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
- {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \
- WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+ {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
+ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+# else
+# define MALLOC_MUTEX_INITIALIZER \
+ {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
+ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+# endif
#endif
#ifdef JEMALLOC_LAZY_LOCK
@@ -139,6 +153,7 @@ void malloc_mutex_lock_slow(malloc_mutex_t *mutex);
static inline void
malloc_mutex_lock_final(malloc_mutex_t *mutex) {
MALLOC_MUTEX_LOCK(mutex);
+ atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
}
static inline bool
@@ -164,6 +179,7 @@ malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
if (isthreaded) {
if (malloc_mutex_trylock_final(mutex)) {
+ atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
return true;
}
mutex_owner_stats_update(tsdn, mutex);
@@ -203,6 +219,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
if (isthreaded) {
if (malloc_mutex_trylock_final(mutex)) {
malloc_mutex_lock_slow(mutex);
+ atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
}
mutex_owner_stats_update(tsdn, mutex);
}
@@ -211,6 +228,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
static inline void
malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+ atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED);
witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
if (isthreaded) {
MALLOC_MUTEX_UNLOCK(mutex);
@@ -245,4 +263,26 @@ malloc_mutex_prof_read(tsdn_t *tsdn, mutex_prof_data_t *data,
atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED);
}
+static inline void
+malloc_mutex_prof_accum(tsdn_t *tsdn, mutex_prof_data_t *data,
+ malloc_mutex_t *mutex) {
+ mutex_prof_data_t *source = &mutex->prof_data;
+ /* Can only read holding the mutex. */
+ malloc_mutex_assert_owner(tsdn, mutex);
+
+ nstime_add(&data->tot_wait_time, &source->tot_wait_time);
+ if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) {
+ nstime_copy(&data->max_wait_time, &source->max_wait_time);
+ }
+ data->n_wait_times += source->n_wait_times;
+ data->n_spin_acquired += source->n_spin_acquired;
+ if (data->max_n_thds < source->max_n_thds) {
+ data->max_n_thds = source->max_n_thds;
+ }
+ /* n_wait_thds is not reported. */
+ atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED);
+ data->n_owner_switches += source->n_owner_switches;
+ data->n_lock_ops += source->n_lock_ops;
+}
+
#endif /* JEMALLOC_INTERNAL_MUTEX_H */
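The locked field added above is a contention hint: per the comment in the struct, spin-waiters can poll it with plain relaxed loads instead of repeatedly attempting the underlying lock, which would pull the lock's cache line into exclusive state on every attempt. Below is a minimal standalone sketch of that test-and-test-and-set pattern in C11 atomics and pthreads; the names (hinted_mutex_t and friends) are illustrative and do not correspond to jemalloc internals.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative only; not jemalloc's malloc_mutex_t. */
typedef struct {
	pthread_mutex_t lock;
	atomic_bool locked;	/* contention hint, relaxed loads/stores only */
} hinted_mutex_t;

static void
hinted_mutex_lock(hinted_mutex_t *m) {
	for (int spins = 0; spins < 1000; spins++) {
		/*
		 * Spin on a shared (read-only) load of the hint instead of
		 * hammering pthread_mutex_trylock(), which would bounce the
		 * lock's cache line between cores in exclusive state.
		 */
		if (!atomic_load_explicit(&m->locked, memory_order_relaxed) &&
		    pthread_mutex_trylock(&m->lock) == 0) {
			atomic_store_explicit(&m->locked, true,
			    memory_order_relaxed);
			return;
		}
	}
	/* Give up spinning and block. */
	pthread_mutex_lock(&m->lock);
	atomic_store_explicit(&m->locked, true, memory_order_relaxed);
}

static void
hinted_mutex_unlock(hinted_mutex_t *m) {
	atomic_store_explicit(&m->locked, false, memory_order_relaxed);
	pthread_mutex_unlock(&m->lock);
}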
diff --git a/deps/jemalloc/include/jemalloc/internal/mutex_prof.h b/deps/jemalloc/include/jemalloc/internal/mutex_prof.h
index ce183d335..2cb8fb0cb 100644
--- a/deps/jemalloc/include/jemalloc/internal/mutex_prof.h
+++ b/deps/jemalloc/include/jemalloc/internal/mutex_prof.h
@@ -35,22 +35,31 @@ typedef enum {
mutex_prof_num_arena_mutexes
} mutex_prof_arena_ind_t;
+/*
+ * The fourth parameter is a boolean value that is true for derived rate counters
+ * and false for real ones.
+ */
#define MUTEX_PROF_UINT64_COUNTERS \
- OP(num_ops, uint64_t, "n_lock_ops") \
- OP(num_wait, uint64_t, "n_waiting") \
- OP(num_spin_acq, uint64_t, "n_spin_acq") \
- OP(num_owner_switch, uint64_t, "n_owner_switch") \
- OP(total_wait_time, uint64_t, "total_wait_ns") \
- OP(max_wait_time, uint64_t, "max_wait_ns")
+ OP(num_ops, uint64_t, "n_lock_ops", false, num_ops) \
+ OP(num_ops_ps, uint64_t, "(#/sec)", true, num_ops) \
+ OP(num_wait, uint64_t, "n_waiting", false, num_wait) \
+ OP(num_wait_ps, uint64_t, "(#/sec)", true, num_wait) \
+ OP(num_spin_acq, uint64_t, "n_spin_acq", false, num_spin_acq) \
+ OP(num_spin_acq_ps, uint64_t, "(#/sec)", true, num_spin_acq) \
+ OP(num_owner_switch, uint64_t, "n_owner_switch", false, num_owner_switch) \
+ OP(num_owner_switch_ps, uint64_t, "(#/sec)", true, num_owner_switch) \
+ OP(total_wait_time, uint64_t, "total_wait_ns", false, total_wait_time) \
+ OP(total_wait_time_ps, uint64_t, "(#/sec)", true, total_wait_time) \
+ OP(max_wait_time, uint64_t, "max_wait_ns", false, max_wait_time)
#define MUTEX_PROF_UINT32_COUNTERS \
- OP(max_num_thds, uint32_t, "max_n_thds")
+ OP(max_num_thds, uint32_t, "max_n_thds", false, max_num_thds)
#define MUTEX_PROF_COUNTERS \
MUTEX_PROF_UINT64_COUNTERS \
MUTEX_PROF_UINT32_COUNTERS
-#define OP(counter, type, human) mutex_counter_##counter,
+#define OP(counter, type, human, derived, base_counter) mutex_counter_##counter,
#define COUNTER_ENUM(counter_list, t) \
typedef enum { \
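The two extra OP() parameters above mark whether an entry is a derived per-second rate and which real counter it is based on. A hedged sketch of how such a "(#/sec)" value could be computed from a raw counter and the process uptime; rate_per_sec() is an illustrative helper, not a jemalloc function.

#include <stdint.h>
#include <stdio.h>

/* Illustrative helper: events per second from a raw counter and uptime. */
static uint64_t
rate_per_sec(uint64_t count, uint64_t uptime_ns) {
	if (uptime_ns == 0) {
		return 0;	/* avoid division by zero at startup */
	}
	return (uint64_t)((double)count / ((double)uptime_ns / 1e9));
}

int
main(void) {
	uint64_t n_lock_ops = 123456;		/* real counter */
	uint64_t uptime_ns = 30000000000ULL;	/* 30 seconds of uptime */
	printf("n_lock_ops: %llu (%llu/sec)\n",
	    (unsigned long long)n_lock_ops,
	    (unsigned long long)rate_per_sec(n_lock_ops, uptime_ns));
	return 0;
}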
diff --git a/deps/jemalloc/include/jemalloc/internal/prof_externs.h b/deps/jemalloc/include/jemalloc/internal/prof_externs.h
index 04348696f..094f3e170 100644
--- a/deps/jemalloc/include/jemalloc/internal/prof_externs.h
+++ b/deps/jemalloc/include/jemalloc/internal/prof_externs.h
@@ -14,6 +14,7 @@ extern bool opt_prof_gdump; /* High-water memory dumping. */
extern bool opt_prof_final; /* Final profile dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_accum; /* Report cumulative bytes. */
+extern bool opt_prof_log; /* Turn logging on at boot. */
extern char opt_prof_prefix[
/* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
@@ -45,7 +46,8 @@ extern size_t lg_prof_sample;
void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated);
void prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
prof_tctx_t *tctx);
-void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx);
+void prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize,
+ prof_tctx_t *tctx);
void bt_init(prof_bt_t *bt, void **vec);
void prof_backtrace(prof_bt_t *bt);
prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt);
@@ -89,4 +91,15 @@ void prof_postfork_parent(tsdn_t *tsdn);
void prof_postfork_child(tsdn_t *tsdn);
void prof_sample_threshold_update(prof_tdata_t *tdata);
+bool prof_log_start(tsdn_t *tsdn, const char *filename);
+bool prof_log_stop(tsdn_t *tsdn);
+#ifdef JEMALLOC_JET
+size_t prof_log_bt_count(void);
+size_t prof_log_alloc_count(void);
+size_t prof_log_thr_count(void);
+bool prof_log_is_logging(void);
+bool prof_log_rep_check(void);
+void prof_log_dummy_set(bool new_value);
+#endif
+
#endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */
diff --git a/deps/jemalloc/include/jemalloc/internal/prof_inlines_a.h b/deps/jemalloc/include/jemalloc/internal/prof_inlines_a.h
index a6efb4851..471d9853c 100644
--- a/deps/jemalloc/include/jemalloc/internal/prof_inlines_a.h
+++ b/deps/jemalloc/include/jemalloc/internal/prof_inlines_a.h
@@ -4,7 +4,8 @@
#include "jemalloc/internal/mutex.h"
static inline bool
-prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum, uint64_t accumbytes) {
+prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum,
+ uint64_t accumbytes) {
cassert(config_prof);
bool overflow;
@@ -42,7 +43,8 @@ prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum, uint64_t accumbytes) {
}
static inline void
-prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, size_t usize) {
+prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum,
+ size_t usize) {
cassert(config_prof);
/*
@@ -55,15 +57,15 @@ prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, size_t usize) {
#ifdef JEMALLOC_ATOMIC_U64
a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED);
do {
- a1 = (a0 >= LARGE_MINCLASS - usize) ? a0 - (LARGE_MINCLASS -
- usize) : 0;
+ a1 = (a0 >= SC_LARGE_MINCLASS - usize)
+ ? a0 - (SC_LARGE_MINCLASS - usize) : 0;
} while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0,
a1, ATOMIC_RELAXED, ATOMIC_RELAXED));
#else
malloc_mutex_lock(tsdn, &prof_accum->mtx);
a0 = prof_accum->accumbytes;
- a1 = (a0 >= LARGE_MINCLASS - usize) ? a0 - (LARGE_MINCLASS - usize) :
- 0;
+ a1 = (a0 >= SC_LARGE_MINCLASS - usize)
+ ? a0 - (SC_LARGE_MINCLASS - usize) : 0;
prof_accum->accumbytes = a1;
malloc_mutex_unlock(tsdn, &prof_accum->mtx);
#endif
diff --git a/deps/jemalloc/include/jemalloc/internal/prof_inlines_b.h b/deps/jemalloc/include/jemalloc/internal/prof_inlines_b.h
index 6ff465ad7..8ba8a1e1f 100644
--- a/deps/jemalloc/include/jemalloc/internal/prof_inlines_b.h
+++ b/deps/jemalloc/include/jemalloc/internal/prof_inlines_b.h
@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_PROF_INLINES_B_H
#define JEMALLOC_INTERNAL_PROF_INLINES_B_H
+#include "jemalloc/internal/safety_check.h"
#include "jemalloc/internal/sz.h"
JEMALLOC_ALWAYS_INLINE bool
@@ -61,13 +62,54 @@ prof_tctx_reset(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx) {
arena_prof_tctx_reset(tsdn, ptr, tctx);
}
+JEMALLOC_ALWAYS_INLINE nstime_t
+prof_alloc_time_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) {
+ cassert(config_prof);
+ assert(ptr != NULL);
+
+ return arena_prof_alloc_time_get(tsdn, ptr, alloc_ctx);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx,
+ nstime_t t) {
+ cassert(config_prof);
+ assert(ptr != NULL);
+
+ arena_prof_alloc_time_set(tsdn, ptr, alloc_ctx, t);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_sample_check(tsd_t *tsd, size_t usize, bool update) {
+ ssize_t check = update ? 0 : usize;
+
+ int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd);
+ if (update) {
+ bytes_until_sample -= usize;
+ if (tsd_nominal(tsd)) {
+ tsd_bytes_until_sample_set(tsd, bytes_until_sample);
+ }
+ }
+ if (likely(bytes_until_sample >= check)) {
+ return true;
+ }
+
+ return false;
+}
+
JEMALLOC_ALWAYS_INLINE bool
prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
- prof_tdata_t **tdata_out) {
+ prof_tdata_t **tdata_out) {
prof_tdata_t *tdata;
cassert(config_prof);
+ /* Fastpath: no need to load tdata */
+ if (likely(prof_sample_check(tsd, usize, update))) {
+ return true;
+ }
+
+ bool booted = tsd_prof_tdata_get(tsd);
tdata = prof_tdata_get(tsd, true);
if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) {
tdata = NULL;
@@ -81,21 +123,23 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
return true;
}
- if (likely(tdata->bytes_until_sample >= usize)) {
- if (update) {
- tdata->bytes_until_sample -= usize;
- }
+ /*
+ * If this was the first creation of tdata, then
+ * prof_tdata_get() reset bytes_until_sample, so decrement and
+ * check it again
+ */
+ if (!booted && prof_sample_check(tsd, usize, update)) {
return true;
- } else {
- if (tsd_reentrancy_level_get(tsd) > 0) {
- return true;
- }
- /* Compute new sample threshold. */
- if (update) {
- prof_sample_threshold_update(tdata);
- }
- return !tdata->active;
}
+
+ if (tsd_reentrancy_level_get(tsd) > 0) {
+ return true;
+ }
+ /* Compute new sample threshold. */
+ if (update) {
+ prof_sample_threshold_update(tdata);
+ }
+ return !tdata->active;
}
JEMALLOC_ALWAYS_INLINE prof_tctx_t *
@@ -187,7 +231,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx,
* counters.
*/
if (unlikely(old_sampled)) {
- prof_free_sampled_object(tsd, old_usize, old_tctx);
+ prof_free_sampled_object(tsd, ptr, old_usize, old_tctx);
}
}
@@ -199,7 +243,7 @@ prof_free(tsd_t *tsd, const void *ptr, size_t usize, alloc_ctx_t *alloc_ctx) {
assert(usize == isalloc(tsd_tsdn(tsd), ptr));
if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) {
- prof_free_sampled_object(tsd, usize, tctx);
+ prof_free_sampled_object(tsd, ptr, usize, tctx);
}
}
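The prof_sample_check() fast path above keeps the bytes_until_sample budget in thread-specific data so the common (non-sampled) allocation never has to load tdata. A simplified standalone sketch of the byte-budget idea; the fixed reset value stands in for the geometric threshold that prof_sample_threshold_update() would otherwise draw.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Per-thread sampling budget; in jemalloc this lives in TSD. */
static _Thread_local int64_t bytes_until_sample = 64 * 1024;

/*
 * Charge an allocation of usize bytes against the budget; return true
 * when this allocation should be sampled.
 */
static bool
should_sample(size_t usize) {
	bytes_until_sample -= (int64_t)usize;
	if (bytes_until_sample >= 0) {
		return false;	/* fast path: no tdata access needed */
	}
	/*
	 * Budget exhausted: sample and reset (fixed value here; jemalloc
	 * draws a new geometric threshold from its PRNG).
	 */
	bytes_until_sample = 64 * 1024;
	return true;
}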
diff --git a/deps/jemalloc/include/jemalloc/internal/prof_structs.h b/deps/jemalloc/include/jemalloc/internal/prof_structs.h
index 0d58ae100..34ed4822b 100644
--- a/deps/jemalloc/include/jemalloc/internal/prof_structs.h
+++ b/deps/jemalloc/include/jemalloc/internal/prof_structs.h
@@ -169,7 +169,6 @@ struct prof_tdata_s {
/* Sampling state. */
uint64_t prng_state;
- uint64_t bytes_until_sample;
/* State used to avoid dumping while operating on prof internals. */
bool enq;
diff --git a/deps/jemalloc/include/jemalloc/internal/quantum.h b/deps/jemalloc/include/jemalloc/internal/quantum.h
new file mode 100644
index 000000000..821086e99
--- /dev/null
+++ b/deps/jemalloc/include/jemalloc/internal/quantum.h
@@ -0,0 +1,77 @@
+#ifndef JEMALLOC_INTERNAL_QUANTUM_H
+#define JEMALLOC_INTERNAL_QUANTUM_H
+
+/*
+ * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
+ * classes).
+ */
+#ifndef LG_QUANTUM
+# if (defined(__i386__) || defined(_M_IX86))
+# define LG_QUANTUM 4
+# endif
+# ifdef __ia64__
+# define LG_QUANTUM 4
+# endif
+# ifdef __alpha__
+# define LG_QUANTUM 4
+# endif
+# if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__))
+# define LG_QUANTUM 4
+# endif
+# if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64))
+# define LG_QUANTUM 4
+# endif
+# ifdef __arm__
+# define LG_QUANTUM 3
+# endif
+# ifdef __aarch64__
+# define LG_QUANTUM 4
+# endif
+# ifdef __hppa__
+# define LG_QUANTUM 4
+# endif
+# ifdef __m68k__
+# define LG_QUANTUM 3
+# endif
+# ifdef __mips__
+# define LG_QUANTUM 3
+# endif
+# ifdef __nios2__
+# define LG_QUANTUM 3
+# endif
+# ifdef __or1k__
+# define LG_QUANTUM 3
+# endif
+# ifdef __powerpc__
+# define LG_QUANTUM 4
+# endif
+# if defined(__riscv) || defined(__riscv__)
+# define LG_QUANTUM 4
+# endif
+# ifdef __s390__
+# define LG_QUANTUM 4
+# endif
+# if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \
+ defined(__SH4_SINGLE_ONLY__))
+# define LG_QUANTUM 4
+# endif
+# ifdef __tile__
+# define LG_QUANTUM 4
+# endif
+# ifdef __le32__
+# define LG_QUANTUM 4
+# endif
+# ifndef LG_QUANTUM
+# error "Unknown minimum alignment for architecture; specify via "
+ "--with-lg-quantum"
+# endif
+#endif
+
+#define QUANTUM ((size_t)(1U << LG_QUANTUM))
+#define QUANTUM_MASK (QUANTUM - 1)
+
+/* Return the smallest quantum multiple that is >= a. */
+#define QUANTUM_CEILING(a) \
+ (((a) + QUANTUM_MASK) & ~QUANTUM_MASK)
+
+#endif /* JEMALLOC_INTERNAL_QUANTUM_H */
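As a quick sanity check on the macros above, here is a tiny standalone program (assuming LG_QUANTUM == 4, i.e. a 16-byte quantum as on x86-64) showing how QUANTUM_CEILING rounds request sizes up to the next quantum multiple.

#include <assert.h>
#include <stddef.h>

#define LG_QUANTUM	4
#define QUANTUM		((size_t)(1U << LG_QUANTUM))
#define QUANTUM_MASK	(QUANTUM - 1)
#define QUANTUM_CEILING(a)	(((a) + QUANTUM_MASK) & ~QUANTUM_MASK)

int
main(void) {
	/* 16-byte quantum: sizes round up to the next multiple of 16. */
	assert(QUANTUM_CEILING(1) == 16);
	assert(QUANTUM_CEILING(16) == 16);
	assert(QUANTUM_CEILING(17) == 32);
	return 0;
}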
diff --git a/deps/jemalloc/include/jemalloc/internal/rtree.h b/deps/jemalloc/include/jemalloc/internal/rtree.h
index b59d33a80..16ccbebee 100644
--- a/deps/jemalloc/include/jemalloc/internal/rtree.h
+++ b/deps/jemalloc/include/jemalloc/internal/rtree.h
@@ -4,7 +4,7 @@
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/rtree_tsd.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/tsd.h"
/*
@@ -31,7 +31,7 @@
# error Unsupported number of significant virtual address bits
#endif
/* Use compact leaf representation if virtual address encoding allows. */
-#if RTREE_NHIB >= LG_CEIL_NSIZES
+#if RTREE_NHIB >= LG_CEIL(SC_NSIZES)
# define RTREE_LEAF_COMPACT
#endif
@@ -170,8 +170,8 @@ rtree_subkey(uintptr_t key, unsigned level) {
*/
# ifdef RTREE_LEAF_COMPACT
JEMALLOC_ALWAYS_INLINE uintptr_t
-rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
- bool dependent) {
+rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree,
+ rtree_leaf_elm_t *elm, bool dependent) {
return (uintptr_t)atomic_load_p(&elm->le_bits, dependent
? ATOMIC_RELAXED : ATOMIC_ACQUIRE);
}
@@ -208,7 +208,7 @@ rtree_leaf_elm_bits_slab_get(uintptr_t bits) {
# endif
JEMALLOC_ALWAYS_INLINE extent_t *
-rtree_leaf_elm_extent_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, bool dependent) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
@@ -221,7 +221,7 @@ rtree_leaf_elm_extent_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
}
JEMALLOC_ALWAYS_INLINE szind_t
-rtree_leaf_elm_szind_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, bool dependent) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
@@ -233,7 +233,7 @@ rtree_leaf_elm_szind_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
}
JEMALLOC_ALWAYS_INLINE bool
-rtree_leaf_elm_slab_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_slab_read(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, bool dependent) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
@@ -245,7 +245,7 @@ rtree_leaf_elm_slab_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
}
static inline void
-rtree_leaf_elm_extent_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_extent_write(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, extent_t *extent) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, true);
@@ -259,9 +259,9 @@ rtree_leaf_elm_extent_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
}
static inline void
-rtree_leaf_elm_szind_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_szind_write(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, szind_t szind) {
- assert(szind <= NSIZES);
+ assert(szind <= SC_NSIZES);
#ifdef RTREE_LEAF_COMPACT
uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
@@ -277,7 +277,7 @@ rtree_leaf_elm_szind_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
}
static inline void
-rtree_leaf_elm_slab_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
+rtree_leaf_elm_slab_write(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, bool slab) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
@@ -292,8 +292,8 @@ rtree_leaf_elm_slab_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
}
static inline void
-rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
- extent_t *extent, szind_t szind, bool slab) {
+rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree,
+ rtree_leaf_elm_t *elm, extent_t *extent, szind_t szind, bool slab) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = ((uintptr_t)szind << LG_VADDR) |
((uintptr_t)extent & (((uintptr_t)0x1 << LG_VADDR) - 1)) |
@@ -313,7 +313,7 @@ rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
static inline void
rtree_leaf_elm_szind_slab_update(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, szind_t szind, bool slab) {
- assert(!slab || szind < NBINS);
+ assert(!slab || szind < SC_NBINS);
/*
* The caller implicitly assures that it is the only writer to the szind
@@ -429,7 +429,7 @@ rtree_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key,
dependent);
if (!dependent && elm == NULL) {
- return NSIZES;
+ return SC_NSIZES;
}
return rtree_leaf_elm_szind_read(tsdn, rtree, elm, dependent);
}
@@ -452,6 +452,42 @@ rtree_extent_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
return false;
}
+/*
+ * Try to read szind_slab from the L1 cache. Returns true on a hit,
+ * and fills in r_szind and r_slab. Otherwise returns false.
+ *
+ * Key is allowed to be NULL in order to save an extra branch on the
+ * fast path. Returns false in this case.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+rtree_szind_slab_read_fast(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+ uintptr_t key, szind_t *r_szind, bool *r_slab) {
+ rtree_leaf_elm_t *elm;
+
+ size_t slot = rtree_cache_direct_map(key);
+ uintptr_t leafkey = rtree_leafkey(key);
+ assert(leafkey != RTREE_LEAFKEY_INVALID);
+
+ if (likely(rtree_ctx->cache[slot].leafkey == leafkey)) {
+ rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf;
+ assert(leaf != NULL);
+ uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1);
+ elm = &leaf[subkey];
+
+#ifdef RTREE_LEAF_COMPACT
+ uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree,
+ elm, true);
+ *r_szind = rtree_leaf_elm_bits_szind_get(bits);
+ *r_slab = rtree_leaf_elm_bits_slab_get(bits);
+#else
+ *r_szind = rtree_leaf_elm_szind_read(tsdn, rtree, elm, true);
+ *r_slab = rtree_leaf_elm_slab_read(tsdn, rtree, elm, true);
+#endif
+ return true;
+ } else {
+ return false;
+ }
+}
JEMALLOC_ALWAYS_INLINE bool
rtree_szind_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
uintptr_t key, bool dependent, szind_t *r_szind, bool *r_slab) {
@@ -474,7 +510,7 @@ rtree_szind_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
static inline void
rtree_szind_slab_update(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
uintptr_t key, szind_t szind, bool slab) {
- assert(!slab || szind < NBINS);
+ assert(!slab || szind < SC_NBINS);
rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key, true);
rtree_leaf_elm_szind_slab_update(tsdn, rtree, elm, szind, slab);
@@ -486,7 +522,7 @@ rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key, true);
assert(rtree_leaf_elm_extent_read(tsdn, rtree, elm, false) !=
NULL);
- rtree_leaf_elm_write(tsdn, rtree, elm, NULL, NSIZES, false);
+ rtree_leaf_elm_write(tsdn, rtree, elm, NULL, SC_NSIZES, false);
}
#endif /* JEMALLOC_INTERNAL_RTREE_H */
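rtree_szind_slab_read_fast() above is essentially a direct-mapped software cache: low bits of the key select a slot, and a full leafkey comparison confirms the hit before the cached element is used. A minimal sketch of that lookup pattern in isolation; the slot count, shift, and payload type are illustrative, not jemalloc's.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define CACHE_SLOTS	16	/* must be a power of two */

typedef struct {
	uintptr_t key;		/* full key, compared to confirm a hit */
	uint64_t value;		/* payload cached for that key */
} cache_slot_t;

static cache_slot_t cache[CACHE_SLOTS];

/*
 * Returns true and fills *value on a hit; false means fall back to the
 * slow lookup (which would typically refill this slot).
 */
static bool
cache_lookup_fast(uintptr_t key, uint64_t *value) {
	size_t slot = (key >> 12) & (CACHE_SLOTS - 1);	/* direct map */
	if (cache[slot].key == key) {
		*value = cache[slot].value;
		return true;
	}
	return false;
}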
diff --git a/deps/jemalloc/include/jemalloc/internal/rtree_tsd.h b/deps/jemalloc/include/jemalloc/internal/rtree_tsd.h
index 93a75173a..562e29297 100644
--- a/deps/jemalloc/include/jemalloc/internal/rtree_tsd.h
+++ b/deps/jemalloc/include/jemalloc/internal/rtree_tsd.h
@@ -26,7 +26,7 @@
* Zero initializer required for tsd initialization only. Proper initialization
* done via rtree_ctx_data_init().
*/
-#define RTREE_CTX_ZERO_INITIALIZER {{{0}}, {{0}}}
+#define RTREE_CTX_ZERO_INITIALIZER {{{0, 0}}, {{0, 0}}}
typedef struct rtree_leaf_elm_s rtree_leaf_elm_t;
diff --git a/deps/jemalloc/include/jemalloc/internal/safety_check.h b/deps/jemalloc/include/jemalloc/internal/safety_check.h
new file mode 100644
index 000000000..53339ac12
--- /dev/null
+++ b/deps/jemalloc/include/jemalloc/internal/safety_check.h
@@ -0,0 +1,26 @@
+#ifndef JEMALLOC_INTERNAL_SAFETY_CHECK_H
+#define JEMALLOC_INTERNAL_SAFETY_CHECK_H
+
+void safety_check_fail(const char *format, ...);
+/* Can set to NULL for a default. */
+void safety_check_set_abort(void (*abort_fn)());
+
+JEMALLOC_ALWAYS_INLINE void
+safety_check_set_redzone(void *ptr, size_t usize, size_t bumped_usize) {
+ assert(usize < bumped_usize);
+ for (size_t i = usize; i < bumped_usize && i < usize + 32; ++i) {
+ *((unsigned char *)ptr + i) = 0xBC;
+ }
+}
+
+JEMALLOC_ALWAYS_INLINE void
+safety_check_verify_redzone(const void *ptr, size_t usize, size_t bumped_usize)
+{
+ for (size_t i = usize; i < bumped_usize && i < usize + 32; ++i) {
+ if (unlikely(*((unsigned char *)ptr + i) != 0xBC)) {
+ safety_check_fail("Use after free error\n");
+ }
+ }
+}
+
+#endif /*JEMALLOC_INTERNAL_SAFETY_CHECK_H */
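The helpers above implement a small redzone: up to 32 slack bytes past the requested usable size are filled with 0xBC when the allocation is set up and re-checked later, so out-of-bounds writes or use after free can be reported via safety_check_fail(). A hedged usage sketch with simplified stand-ins for the two inline functions.

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-ins for safety_check_set_redzone()/_verify_redzone(). */
static void
set_redzone(void *ptr, size_t usize, size_t bumped_usize) {
	for (size_t i = usize; i < bumped_usize && i < usize + 32; ++i) {
		((unsigned char *)ptr)[i] = 0xBC;
	}
}

static int
redzone_intact(const void *ptr, size_t usize, size_t bumped_usize) {
	for (size_t i = usize; i < bumped_usize && i < usize + 32; ++i) {
		if (((const unsigned char *)ptr)[i] != 0xBC) {
			return 0;	/* corruption detected */
		}
	}
	return 1;
}

int
main(void) {
	size_t usize = 24, bumped = 32;	/* request bumped to a larger class */
	unsigned char *p = malloc(bumped);
	if (p == NULL) {
		return 1;
	}
	set_redzone(p, usize, bumped);
	memset(p, 0, usize);			/* in-bounds writes are fine */
	assert(redzone_intact(p, usize, bumped));
	p[usize] = 0;				/* simulate a 1-byte overflow */
	assert(!redzone_intact(p, usize, bumped));
	free(p);
	return 0;
}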
diff --git a/deps/jemalloc/include/jemalloc/internal/sc.h b/deps/jemalloc/include/jemalloc/internal/sc.h
new file mode 100644
index 000000000..9a099d8b6
--- /dev/null
+++ b/deps/jemalloc/include/jemalloc/internal/sc.h
@@ -0,0 +1,333 @@
+#ifndef JEMALLOC_INTERNAL_SC_H
+#define JEMALLOC_INTERNAL_SC_H
+
+#include "jemalloc/internal/jemalloc_internal_types.h"
+
+/*
+ * Size class computations:
+ *
+ * These are a little tricky; we'll first start by describing how things
+ * generally work, and then describe some of the details.
+ *
+ * Ignore the first few size classes for a moment. We can then split all the
+ * remaining size classes into groups. The size classes in a group are spaced
+ * such that they cover allocation request sizes in a power-of-2 range. The
+ * power of two is called the base of the group, and the size classes in it
+ * satisfy allocations in the half-open range (base, base * 2]. There are
+ * SC_NGROUP size classes in each group, equally spaced in the range, so that
+ * each one covers allocations for base / SC_NGROUP possible allocation sizes.
+ * We call that value (base / SC_NGROUP) the delta of the group. Each size class
+ * is delta larger than the one before it (including the initial size class in a
+ * group, which is delta larger than base, the largest size class in the
+ * previous group).
+ * To make the math all work out nicely, we require that SC_NGROUP is a power of
+ * two, and define it in terms of SC_LG_NGROUP. We'll often talk in terms of
+ * lg_base and lg_delta. For each of these groups then, we have that
+ * lg_delta == lg_base - SC_LG_NGROUP.
+ * The size classes in a group with a given lg_base and lg_delta (which, recall,
+ * can be computed from lg_base for these groups) are therefore:
+ * base + 1 * delta
+ * which covers allocations in (base, base + 1 * delta]
+ * base + 2 * delta
+ * which covers allocations in (base + 1 * delta, base + 2 * delta].
+ * base + 3 * delta
+ * which covers allocations in (base + 2 * delta, base + 3 * delta].
+ * ...
+ * base + SC_NGROUP * delta ( == 2 * base)
+ * which covers allocations in (base + (SC_NGROUP - 1) * delta, 2 * base].
+ * (Note that currently SC_NGROUP is always 4, so the "..." is empty in
+ * practice.)
+ * Note that the last size class in the group is the next power of two (after
+ * base), so that we've set up the induction correctly for the next group's
+ * selection of delta.
+ *
+ * Now, let's start considering the first few size classes. Two extra constants
+ * come into play here: LG_QUANTUM and SC_LG_TINY_MIN. LG_QUANTUM ensures
+ * correct platform alignment; all objects of size (1 << LG_QUANTUM) or larger
+ * are at least (1 << LG_QUANTUM) aligned; this can be used to ensure that we
+ * never return improperly aligned memory, by making (1 << LG_QUANTUM) equal the
+ * highest required alignment of a platform. For allocation sizes smaller than
+ * (1 << LG_QUANTUM) though, we can be more relaxed (since we don't support
+ * platforms with types with alignment larger than their size). To allow such
+ * allocations (without wasting space unnecessarily), we introduce tiny size
+ * classes; one per power of two, up until we hit the quantum size. There are
+ * therefore LG_QUANTUM - SC_LG_TINY_MIN such size classes.
+ *
+ * Next, we have a size class of size (1 << LG_QUANTUM). This can't be the
+ * start of a group in the sense we described above (covering a power of two
+ * range) since, if we divided into it to pick a value of delta, we'd get a
+ * delta smaller than (1 << LG_QUANTUM) for sizes >= (1 << LG_QUANTUM), which
+ * is against the rules.
+ *
+ * The first base we can divide by SC_NGROUP while still being at least
+ * (1 << LG_QUANTUM) is SC_NGROUP * (1 << LG_QUANTUM). We can get there by
+ * having SC_NGROUP size classes, spaced (1 << LG_QUANTUM) apart. These size
+ * classes are:
+ * 1 * (1 << LG_QUANTUM)
+ * 2 * (1 << LG_QUANTUM)
+ * 3 * (1 << LG_QUANTUM)
+ * ... (although, as above, this "..." is empty in practice)
+ * SC_NGROUP * (1 << LG_QUANTUM).
+ *
+ * There are SC_NGROUP of these size classes, so we can regard it as a sort of
+ * pseudo-group, even though it spans multiple powers of 2, is divided
+ * differently, and both starts and ends on a power of 2 (as opposed to just
+ * ending). SC_NGROUP is itself a power of two, so the first group after the
+ * pseudo-group has the power-of-two base SC_NGROUP * (1 << LG_QUANTUM), for a
+ * lg_base of LG_QUANTUM + SC_LG_NGROUP. We can divide this base into SC_NGROUP
+ * sizes without violating our LG_QUANTUM requirements, so we can safely set
+ * lg_delta = lg_base - SC_LG_NGROUP (== LG_QUANTUM).
+ *
+ * So, in order, the size classes are:
+ *
+ * Tiny size classes:
+ * - Count: LG_QUANTUM - SC_LG_TINY_MIN.
+ * - Sizes:
+ * 1 << SC_LG_TINY_MIN
+ * 1 << (SC_LG_TINY_MIN + 1)
+ * 1 << (SC_LG_TINY_MIN + 2)
+ * ...
+ * 1 << (LG_QUANTUM - 1)
+ *
+ * Initial pseudo-group:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ * 1 * (1 << LG_QUANTUM)
+ * 2 * (1 << LG_QUANTUM)
+ * 3 * (1 << LG_QUANTUM)
+ * ...
+ * SC_NGROUP * (1 << LG_QUANTUM)
+ *
+ * Regular group 0:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP and lg_delta of
+ * lg_base - SC_LG_NGROUP)
+ * (1 << lg_base) + 1 * (1 << lg_delta)
+ * (1 << lg_base) + 2 * (1 << lg_delta)
+ * (1 << lg_base) + 3 * (1 << lg_delta)
+ * ...
+ * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ * Regular group 1:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + 1 and lg_delta of
+ * lg_base - SC_LG_NGROUP)
+ * (1 << lg_base) + 1 * (1 << lg_delta)
+ * (1 << lg_base) + 2 * (1 << lg_delta)
+ * (1 << lg_base) + 3 * (1 << lg_delta)
+ * ...
+ * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ * ...
+ *
+ * Regular group N:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + N and lg_delta of
+ * lg_base - SC_LG_NGROUP)
+ * (1 << lg_base) + 1 * (1 << lg_delta)
+ * (1 << lg_base) + 2 * (1 << lg_delta)
+ * (1 << lg_base) + 3 * (1 << lg_delta)
+ * ...
+ * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ *
+ * Representation of metadata:
+ * To make the math easy, we'll mostly work in lg quantities. We record lg_base,
+ * lg_delta, and ndelta (i.e. number of deltas above the base) on a
+ * per-size-class basis, and maintain the invariant that, across all size
+ * classes, size == (1 << lg_base) + ndelta * (1 << lg_delta).
+ *
+ * For regular groups (i.e. those with lg_base >= LG_QUANTUM + SC_LG_NGROUP),
+ * lg_delta is lg_base - SC_LG_NGROUP, and ndelta goes from 1 to SC_NGROUP.
+ *
+ * For the initial tiny size classes (if any), lg_base is lg(size class size).
+ * lg_delta is lg_base for the first size class, and lg_base - 1 for all
+ * subsequent ones. ndelta is always 0.
+ *
+ * For the pseudo-group, if there are no tiny size classes, then we set
+ * lg_base == LG_QUANTUM, lg_delta == LG_QUANTUM, and have ndelta range from 0
+ * to SC_NGROUP - 1. (Note that delta == base, so base + (SC_NGROUP - 1) * delta
+ * is just SC_NGROUP * base, or (1 << (SC_LG_NGROUP + LG_QUANTUM)), so we do
+ * indeed get a power of two that way). If there *are* tiny size classes, then
+ * the first size class needs to have lg_delta relative to the largest tiny size
+ * class. We therefore set lg_base == LG_QUANTUM - 1,
+ * lg_delta == LG_QUANTUM - 1, and ndelta == 1, keeping the rest of the
+ * pseudo-group the same.
+ *
+ *
+ * Other terminology:
+ * "Small" size classes mean those that are allocated out of bins, which is the
+ * same as those that are slab allocated.
+ * "Large" size classes are those that are not small. The cutoff for counting as
+ * large is page size * group size.
+ */
+
+/*
+ * Size class N + (1 << SC_LG_NGROUP) is twice the size of size class N.
+ */
+#define SC_LG_NGROUP 2
+#define SC_LG_TINY_MIN 3
+
+#if SC_LG_TINY_MIN == 0
+/* The div module doesn't support division by 1, which this would require. */
+#error "Unsupported LG_TINY_MIN"
+#endif
+
+/*
+ * The definitions below are all determined by the above settings and system
+ * characteristics.
+ */
+#define SC_NGROUP (1ULL << SC_LG_NGROUP)
+#define SC_PTR_BITS ((1ULL << LG_SIZEOF_PTR) * 8)
+#define SC_NTINY (LG_QUANTUM - SC_LG_TINY_MIN)
+#define SC_LG_TINY_MAXCLASS (LG_QUANTUM > SC_LG_TINY_MIN ? LG_QUANTUM - 1 : -1)
+#define SC_NPSEUDO SC_NGROUP
+#define SC_LG_FIRST_REGULAR_BASE (LG_QUANTUM + SC_LG_NGROUP)
+/*
+ * We cap allocations to be less than 2 ** (ptr_bits - 1), so the highest base
+ * we need is 2 ** (ptr_bits - 2). (This also means that the last group is 1
+ * size class shorter than the others).
+ * We could probably save some space in arenas by capping this at LG_VADDR size.
+ */
+#define SC_LG_BASE_MAX (SC_PTR_BITS - 2)
+#define SC_NREGULAR (SC_NGROUP * \
+ (SC_LG_BASE_MAX - SC_LG_FIRST_REGULAR_BASE + 1) - 1)
+#define SC_NSIZES (SC_NTINY + SC_NPSEUDO + SC_NREGULAR)
+
+/* The number of size classes that are a multiple of the page size. */
+#define SC_NPSIZES ( \
+ /* Start with all the size classes. */ \
+ SC_NSIZES \
+ /* Subtract out those groups with too small a base. */ \
+ - (LG_PAGE - 1 - SC_LG_FIRST_REGULAR_BASE) * SC_NGROUP \
+ /* And the pseudo-group. */ \
+ - SC_NPSEUDO \
+ /* And the tiny group. */ \
+ - SC_NTINY \
+ /* Sizes where ndelta*delta is not a multiple of the page size. */ \
+ - (SC_LG_NGROUP * SC_NGROUP))
+/*
+ * Note that the last line is computed as the sum of the second column in the
+ * following table:
+ * lg(base) | count of sizes to exclude
+ * ------------------------------|-----------------------------
+ * LG_PAGE - 1 | SC_NGROUP - 1
+ * LG_PAGE | SC_NGROUP - 1
+ * LG_PAGE + 1 | SC_NGROUP - 2
+ * LG_PAGE + 2 | SC_NGROUP - 4
+ * ... | ...
+ * LG_PAGE + (SC_LG_NGROUP - 1) | SC_NGROUP - (SC_NGROUP / 2)
+ */
+
+/*
+ * We declare a size class binnable if size < page size * group. Or, in other
+ * words, lg(size) < lg(page size) + lg(group size).
+ */
+#define SC_NBINS ( \
+ /* Sub-regular size classes. */ \
+ SC_NTINY + SC_NPSEUDO \
+ /* Groups with lg_regular_min_base <= lg_base <= lg_base_max */ \
+ + SC_NGROUP * (LG_PAGE + SC_LG_NGROUP - SC_LG_FIRST_REGULAR_BASE) \
+ /* Last SC of the last group hits the bound exactly; exclude it. */ \
+ - 1)
+
+/*
+ * The size2index_tab lookup table uses uint8_t to encode each bin index, so we
+ * cannot support more than 256 small size classes.
+ */
+#if (SC_NBINS > 256)
+# error "Too many small size classes"
+#endif
+
+/* The largest size class in the lookup table. */
+#define SC_LOOKUP_MAXCLASS ((size_t)1 << 12)
+
+/* Internal, only used for the definition of SC_SMALL_MAXCLASS. */
+#define SC_SMALL_MAX_BASE ((size_t)1 << (LG_PAGE + SC_LG_NGROUP - 1))
+#define SC_SMALL_MAX_DELTA ((size_t)1 << (LG_PAGE - 1))
+
+/* The largest size class allocated out of a slab. */
+#define SC_SMALL_MAXCLASS (SC_SMALL_MAX_BASE \
+ + (SC_NGROUP - 1) * SC_SMALL_MAX_DELTA)
+
+/* The smallest size class not allocated out of a slab. */
+#define SC_LARGE_MINCLASS ((size_t)1ULL << (LG_PAGE + SC_LG_NGROUP))
+#define SC_LG_LARGE_MINCLASS (LG_PAGE + SC_LG_NGROUP)
+
+/* Internal; only used for the definition of SC_LARGE_MAXCLASS. */
+#define SC_MAX_BASE ((size_t)1 << (SC_PTR_BITS - 2))
+#define SC_MAX_DELTA ((size_t)1 << (SC_PTR_BITS - 2 - SC_LG_NGROUP))
+
+/* The largest size class supported. */
+#define SC_LARGE_MAXCLASS (SC_MAX_BASE + (SC_NGROUP - 1) * SC_MAX_DELTA)
+
+typedef struct sc_s sc_t;
+struct sc_s {
+ /* Size class index, or -1 if not a valid size class. */
+ int index;
+ /* Lg group base size (no deltas added). */
+ int lg_base;
+ /* Lg delta to previous size class. */
+ int lg_delta;
+ /* Delta multiplier. size == 1<<lg_base + ndelta<<lg_delta */
+ int ndelta;
+ /*
+ * True if the size class is a multiple of the page size, false
+ * otherwise.
+ */
+ bool psz;
+ /*
+ * True if the size class is a small, bin, size class. False otherwise.
+ */
+ bool bin;
+ /* The slab page count if a small bin size class, 0 otherwise. */
+ int pgs;
+ /* Same as lg_delta if a lookup table size class, 0 otherwise. */
+ int lg_delta_lookup;
+};
+
+typedef struct sc_data_s sc_data_t;
+struct sc_data_s {
+ /* Number of tiny size classes. */
+ unsigned ntiny;
+ /* Number of bins supported by the lookup table. */
+ int nlbins;
+ /* Number of small size class bins. */
+ int nbins;
+ /* Number of size classes. */
+ int nsizes;
+ /* Number of bits required to store NSIZES. */
+ int lg_ceil_nsizes;
+ /* Number of size classes that are a multiple of (1U << LG_PAGE). */
+ unsigned npsizes;
+ /* Lg of maximum tiny size class (or -1, if none). */
+ int lg_tiny_maxclass;
+ /* Maximum size class included in lookup table. */
+ size_t lookup_maxclass;
+ /* Maximum small size class. */
+ size_t small_maxclass;
+ /* Lg of minimum large size class. */
+ int lg_large_minclass;
+ /* The minimum large size class. */
+ size_t large_minclass;
+ /* Maximum (large) size class. */
+ size_t large_maxclass;
+ /* True if the sc_data_t has been initialized (for debugging only). */
+ bool initialized;
+
+ sc_t sc[SC_NSIZES];
+};
+
+void sc_data_init(sc_data_t *data);
+/*
+ * Updates slab sizes in [begin, end] to be pgs pages in length, if possible.
+ * Otherwise, does its best to accommodate the request.
+ */
+void sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end,
+ int pgs);
+void sc_boot(sc_data_t *data);
+
+#endif /* JEMALLOC_INTERNAL_SC_H */
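To make the size-class grouping described in the long comment at the top of sc.h concrete, here is a tiny standalone program (assuming LG_QUANTUM == 4 and SC_LG_NGROUP == 2, the common x86-64 configuration) that evaluates size == (1 << lg_base) + ndelta * (1 << lg_delta) for regular group 0.

#include <stddef.h>
#include <stdio.h>

int
main(void) {
	/*
	 * Regular group 0: lg_base = LG_QUANTUM + SC_LG_NGROUP = 6,
	 * lg_delta = lg_base - SC_LG_NGROUP = 4, ndelta = 1..SC_NGROUP.
	 */
	int lg_base = 6, lg_delta = 4;
	for (int ndelta = 1; ndelta <= 4; ndelta++) {
		size_t size = ((size_t)1 << lg_base) +
		    ((size_t)ndelta << lg_delta);
		printf("ndelta=%d -> %zu bytes\n", ndelta, size);
	}
	/* Prints 80, 96, 112, 128: the group covering requests in (64, 128]. */
	return 0;
}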
diff --git a/deps/jemalloc/include/jemalloc/internal/seq.h b/deps/jemalloc/include/jemalloc/internal/seq.h
new file mode 100644
index 000000000..ef2df4c6e
--- /dev/null
+++ b/deps/jemalloc/include/jemalloc/internal/seq.h
@@ -0,0 +1,55 @@
+#ifndef JEMALLOC_INTERNAL_SEQ_H
+#define JEMALLOC_INTERNAL_SEQ_H
+
+#include "jemalloc/internal/atomic.h"
+
+/*
+ * A simple seqlock implementation.
+ */
+
+#define seq_define(type, short_type) \
+typedef struct { \
+ atomic_zu_t seq; \
+ atomic_zu_t data[ \
+ (sizeof(type) + sizeof(size_t) - 1) / sizeof(size_t)]; \
+} seq_##short_type##_t; \
+ \
+/* \
+ * No internal synchronization -- the caller must ensure that there's \
+ * only a single writer at a time. \
+ */ \
+static inline void \
+seq_store_##short_type(seq_##short_type##_t *dst, type *src) { \
+ size_t buf[sizeof(dst->data) / sizeof(size_t)]; \
+ buf[sizeof(buf) / sizeof(size_t) - 1] = 0; \
+ memcpy(buf, src, sizeof(type)); \
+ size_t old_seq = atomic_load_zu(&dst->seq, ATOMIC_RELAXED); \
+ atomic_store_zu(&dst->seq, old_seq + 1, ATOMIC_RELAXED); \
+ atomic_fence(ATOMIC_RELEASE); \
+ for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) { \
+ atomic_store_zu(&dst->data[i], buf[i], ATOMIC_RELAXED); \
+ } \
+ atomic_store_zu(&dst->seq, old_seq + 2, ATOMIC_RELEASE); \
+} \
+ \
+/* Returns whether or not the read was consistent. */ \
+static inline bool \
+seq_try_load_##short_type(type *dst, seq_##short_type##_t *src) { \
+ size_t buf[sizeof(src->data) / sizeof(size_t)]; \
+ size_t seq1 = atomic_load_zu(&src->seq, ATOMIC_ACQUIRE); \
+ if (seq1 % 2 != 0) { \
+ return false; \
+ } \
+ for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) { \
+ buf[i] = atomic_load_zu(&src->data[i], ATOMIC_RELAXED); \
+ } \
+ atomic_fence(ATOMIC_ACQUIRE); \
+ size_t seq2 = atomic_load_zu(&src->seq, ATOMIC_RELAXED); \
+ if (seq1 != seq2) { \
+ return false; \
+ } \
+ memcpy(dst, buf, sizeof(type)); \
+ return true; \
+}
+
+#endif /* JEMALLOC_INTERNAL_SEQ_H */
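seq_define() above generates a single-writer seqlock: the writer bumps the sequence to an odd value, copies the payload word by word through relaxed atomic stores, then publishes the next even sequence; readers retry whenever they observe an odd or changed sequence. Below is a standalone C11 sketch of the same scheme for one fixed struct; stats_t and the helper names are illustrative, not jemalloc's generated code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

typedef struct {
	size_t nmalloc;
	size_t ndalloc;
} stats_t;

#define NWORDS	((sizeof(stats_t) + sizeof(size_t) - 1) / sizeof(size_t))

typedef struct {
	atomic_size_t seq;
	atomic_size_t data[NWORDS];
} seq_stats_t;

/* Single writer only, mirroring the "no internal synchronization" note. */
static void
seq_store_stats(seq_stats_t *dst, const stats_t *src) {
	size_t buf[NWORDS];
	buf[NWORDS - 1] = 0;	/* zero any tail padding word */
	memcpy(buf, src, sizeof(stats_t));
	size_t old = atomic_load_explicit(&dst->seq, memory_order_relaxed);
	atomic_store_explicit(&dst->seq, old + 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	for (size_t i = 0; i < NWORDS; i++) {
		atomic_store_explicit(&dst->data[i], buf[i],
		    memory_order_relaxed);
	}
	atomic_store_explicit(&dst->seq, old + 2, memory_order_release);
}

/* Returns false if the read raced with a writer; the caller retries. */
static bool
seq_try_load_stats(stats_t *dst, seq_stats_t *src) {
	size_t buf[NWORDS];
	size_t seq1 = atomic_load_explicit(&src->seq, memory_order_acquire);
	if (seq1 % 2 != 0) {
		return false;	/* writer in progress */
	}
	for (size_t i = 0; i < NWORDS; i++) {
		buf[i] = atomic_load_explicit(&src->data[i],
		    memory_order_relaxed);
	}
	atomic_thread_fence(memory_order_acquire);
	if (seq1 != atomic_load_explicit(&src->seq, memory_order_relaxed)) {
		return false;	/* torn read */
	}
	memcpy(dst, buf, sizeof(stats_t));
	return true;
}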
diff --git a/deps/jemalloc/include/jemalloc/internal/size_classes.sh b/deps/jemalloc/include/jemalloc/internal/size_classes.sh
deleted file mode 100755
index 998994d09..000000000
--- a/deps/jemalloc/include/jemalloc/internal/size_classes.sh
+++ /dev/null
@@ -1,361 +0,0 @@
-#!/bin/sh
-#
-# Usage: size_classes.sh <lg_qarr> <lg_tmin> <lg_parr> <lg_g>
-
-# The following limits are chosen such that they cover all supported platforms.
-
-# Pointer sizes.
-lg_zarr="2 3"
-
-# Quanta.
-lg_qarr=$1
-
-# The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)].
-lg_tmin=$2
-
-# Maximum lookup size.
-lg_kmax=12
-
-# Page sizes.
-lg_parr=`echo $3 | tr ',' ' '`
-
-# Size class group size (number of size classes for each size doubling).
-lg_g=$4
-
-pow2() {
- e=$1
- pow2_result=1
- while [ ${e} -gt 0 ] ; do
- pow2_result=$((${pow2_result} + ${pow2_result}))
- e=$((${e} - 1))
- done
-}
-
-lg() {
- x=$1
- lg_result=0
- while [ ${x} -gt 1 ] ; do
- lg_result=$((${lg_result} + 1))
- x=$((${x} / 2))
- done
-}
-
-lg_ceil() {
- y=$1
- lg ${y}; lg_floor=${lg_result}
- pow2 ${lg_floor}; pow2_floor=${pow2_result}
- if [ ${pow2_floor} -lt ${y} ] ; then
- lg_ceil_result=$((${lg_floor} + 1))
- else
- lg_ceil_result=${lg_floor}
- fi
-}
-
-reg_size_compute() {
- lg_grp=$1
- lg_delta=$2
- ndelta=$3
-
- pow2 ${lg_grp}; grp=${pow2_result}
- pow2 ${lg_delta}; delta=${pow2_result}
- reg_size=$((${grp} + ${delta}*${ndelta}))
-}
-
-slab_size() {
- lg_p=$1
- lg_grp=$2
- lg_delta=$3
- ndelta=$4
-
- pow2 ${lg_p}; p=${pow2_result}
- reg_size_compute ${lg_grp} ${lg_delta} ${ndelta}
-
- # Compute smallest slab size that is an integer multiple of reg_size.
- try_slab_size=${p}
- try_nregs=$((${try_slab_size} / ${reg_size}))
- perfect=0
- while [ ${perfect} -eq 0 ] ; do
- perfect_slab_size=${try_slab_size}
- perfect_nregs=${try_nregs}
-
- try_slab_size=$((${try_slab_size} + ${p}))
- try_nregs=$((${try_slab_size} / ${reg_size}))
- if [ ${perfect_slab_size} -eq $((${perfect_nregs} * ${reg_size})) ] ; then
- perfect=1
- fi
- done
-
- slab_size_pgs=$((${perfect_slab_size} / ${p}))
-}
-
-size_class() {
- index=$1
- lg_grp=$2
- lg_delta=$3
- ndelta=$4
- lg_p=$5
- lg_kmax=$6
-
- if [ ${lg_delta} -ge ${lg_p} ] ; then
- psz="yes"
- else
- pow2 ${lg_p}; p=${pow2_result}
- pow2 ${lg_grp}; grp=${pow2_result}
- pow2 ${lg_delta}; delta=${pow2_result}
- sz=$((${grp} + ${delta} * ${ndelta}))
- npgs=$((${sz} / ${p}))
- if [ ${sz} -eq $((${npgs} * ${p})) ] ; then
- psz="yes"
- else
- psz="no"
- fi
- fi
-
- lg ${ndelta}; lg_ndelta=${lg_result}; pow2 ${lg_ndelta}
- if [ ${pow2_result} -lt ${ndelta} ] ; then
- rem="yes"
- else
- rem="no"
- fi
-
- lg_size=${lg_grp}
- if [ $((${lg_delta} + ${lg_ndelta})) -eq ${lg_grp} ] ; then
- lg_size=$((${lg_grp} + 1))
- else
- lg_size=${lg_grp}
- rem="yes"
- fi
-
- if [ ${lg_size} -lt $((${lg_p} + ${lg_g})) ] ; then
- bin="yes"
- slab_size ${lg_p} ${lg_grp} ${lg_delta} ${ndelta}; pgs=${slab_size_pgs}
- else
- bin="no"
- pgs=0
- fi
- if [ ${lg_size} -lt ${lg_kmax} \
- -o ${lg_size} -eq ${lg_kmax} -a ${rem} = "no" ] ; then
- lg_delta_lookup=${lg_delta}
- else
- lg_delta_lookup="no"
- fi
- printf ' SC(%3d, %6d, %8d, %6d, %3s, %3s, %3d, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${psz} ${bin} ${pgs} ${lg_delta_lookup}
- # Defined upon return:
- # - psz ("yes" or "no")
- # - bin ("yes" or "no")
- # - pgs
- # - lg_delta_lookup (${lg_delta} or "no")
-}
-
-sep_line() {
- echo " \\"
-}
-
-size_classes() {
- lg_z=$1
- lg_q=$2
- lg_t=$3
- lg_p=$4
- lg_g=$5
-
- pow2 $((${lg_z} + 3)); ptr_bits=${pow2_result}
- pow2 ${lg_g}; g=${pow2_result}
-
- echo "#define SIZE_CLASSES \\"
- echo " /* index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup */ \\"
-
- ntbins=0
- nlbins=0
- lg_tiny_maxclass='"NA"'
- nbins=0
- npsizes=0
-
- # Tiny size classes.
- ndelta=0
- index=0
- lg_grp=${lg_t}
- lg_delta=${lg_grp}
- while [ ${lg_grp} -lt ${lg_q} ] ; do
- size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
- if [ ${lg_delta_lookup} != "no" ] ; then
- nlbins=$((${index} + 1))
- fi
- if [ ${psz} = "yes" ] ; then
- npsizes=$((${npsizes} + 1))
- fi
- if [ ${bin} != "no" ] ; then
- nbins=$((${index} + 1))
- fi
- ntbins=$((${ntbins} + 1))
- lg_tiny_maxclass=${lg_grp} # Final written value is correct.
- index=$((${index} + 1))
- lg_delta=${lg_grp}
- lg_grp=$((${lg_grp} + 1))
- done
-
- # First non-tiny group.
- if [ ${ntbins} -gt 0 ] ; then
- sep_line
- # The first size class has an unusual encoding, because the size has to be
- # split between grp and delta*ndelta.
- lg_grp=$((${lg_grp} - 1))
- ndelta=1
- size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
- index=$((${index} + 1))
- lg_grp=$((${lg_grp} + 1))
- lg_delta=$((${lg_delta} + 1))
- if [ ${psz} = "yes" ] ; then
- npsizes=$((${npsizes} + 1))
- fi
- fi
- while [ ${ndelta} -lt ${g} ] ; do
- size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
- index=$((${index} + 1))
- ndelta=$((${ndelta} + 1))
- if [ ${psz} = "yes" ] ; then
- npsizes=$((${npsizes} + 1))
- fi
- done
-
- # All remaining groups.
- lg_grp=$((${lg_grp} + ${lg_g}))
- while [ ${lg_grp} -lt $((${ptr_bits} - 1)) ] ; do
- sep_line
- ndelta=1
- if [ ${lg_grp} -eq $((${ptr_bits} - 2)) ] ; then
- ndelta_limit=$((${g} - 1))
- else
- ndelta_limit=${g}
- fi
- while [ ${ndelta} -le ${ndelta_limit} ] ; do
- size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax}
- if [ ${lg_delta_lookup} != "no" ] ; then
- nlbins=$((${index} + 1))
- # Final written value is correct:
- lookup_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
- fi
- if [ ${psz} = "yes" ] ; then
- npsizes=$((${npsizes} + 1))
- fi
- if [ ${bin} != "no" ] ; then
- nbins=$((${index} + 1))
- # Final written value is correct:
- small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
- if [ ${lg_g} -gt 0 ] ; then
- lg_large_minclass=$((${lg_grp} + 1))
- else
- lg_large_minclass=$((${lg_grp} + 2))
- fi
- fi
- # Final written value is correct:
- large_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))"
- index=$((${index} + 1))
- ndelta=$((${ndelta} + 1))
- done
- lg_grp=$((${lg_grp} + 1))
- lg_delta=$((${lg_delta} + 1))
- done
- echo
- nsizes=${index}
- lg_ceil ${nsizes}; lg_ceil_nsizes=${lg_ceil_result}
-
- # Defined upon completion:
- # - ntbins
- # - nlbins
- # - nbins
- # - nsizes
- # - lg_ceil_nsizes
- # - npsizes
- # - lg_tiny_maxclass
- # - lookup_maxclass
- # - small_maxclass
- # - lg_large_minclass
- # - large_maxclass
-}
-
-cat <<EOF
-#ifndef JEMALLOC_INTERNAL_SIZE_CLASSES_H
-#define JEMALLOC_INTERNAL_SIZE_CLASSES_H
-
-/* This file was automatically generated by size_classes.sh. */
-
-#include "jemalloc/internal/jemalloc_internal_types.h"
-
-/*
- * This header file defines:
- *
- * LG_SIZE_CLASS_GROUP: Lg of size class count for each size doubling.
- * LG_TINY_MIN: Lg of minimum size class to support.
- * SIZE_CLASSES: Complete table of SC(index, lg_grp, lg_delta, ndelta, psz,
- * bin, pgs, lg_delta_lookup) tuples.
- * index: Size class index.
- * lg_grp: Lg group base size (no deltas added).
- * lg_delta: Lg delta to previous size class.
- * ndelta: Delta multiplier. size == 1<<lg_grp + ndelta<<lg_delta
- * psz: 'yes' if a multiple of the page size, 'no' otherwise.
- * bin: 'yes' if a small bin size class, 'no' otherwise.
- * pgs: Slab page count if a small bin size class, 0 otherwise.
- * lg_delta_lookup: Same as lg_delta if a lookup table size class, 'no'
- * otherwise.
- * NTBINS: Number of tiny bins.
- * NLBINS: Number of bins supported by the lookup table.
- * NBINS: Number of small size class bins.
- * NSIZES: Number of size classes.
- * LG_CEIL_NSIZES: Number of bits required to store NSIZES.
- * NPSIZES: Number of size classes that are a multiple of (1U << LG_PAGE).
- * LG_TINY_MAXCLASS: Lg of maximum tiny size class.
- * LOOKUP_MAXCLASS: Maximum size class included in lookup table.
- * SMALL_MAXCLASS: Maximum small size class.
- * LG_LARGE_MINCLASS: Lg of minimum large size class.
- * LARGE_MAXCLASS: Maximum (large) size class.
- */
-
-#define LG_SIZE_CLASS_GROUP ${lg_g}
-#define LG_TINY_MIN ${lg_tmin}
-
-EOF
-
-for lg_z in ${lg_zarr} ; do
- for lg_q in ${lg_qarr} ; do
- lg_t=${lg_tmin}
- while [ ${lg_t} -le ${lg_q} ] ; do
- # Iterate through page sizes and compute how many bins there are.
- for lg_p in ${lg_parr} ; do
- echo "#if (LG_SIZEOF_PTR == ${lg_z} && LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && LG_PAGE == ${lg_p})"
- size_classes ${lg_z} ${lg_q} ${lg_t} ${lg_p} ${lg_g}
- echo "#define SIZE_CLASSES_DEFINED"
- echo "#define NTBINS ${ntbins}"
- echo "#define NLBINS ${nlbins}"
- echo "#define NBINS ${nbins}"
- echo "#define NSIZES ${nsizes}"
- echo "#define LG_CEIL_NSIZES ${lg_ceil_nsizes}"
- echo "#define NPSIZES ${npsizes}"
- echo "#define LG_TINY_MAXCLASS ${lg_tiny_maxclass}"
- echo "#define LOOKUP_MAXCLASS ${lookup_maxclass}"
- echo "#define SMALL_MAXCLASS ${small_maxclass}"
- echo "#define LG_LARGE_MINCLASS ${lg_large_minclass}"
- echo "#define LARGE_MINCLASS (ZU(1) << LG_LARGE_MINCLASS)"
- echo "#define LARGE_MAXCLASS ${large_maxclass}"
- echo "#endif"
- echo
- done
- lg_t=$((${lg_t} + 1))
- done
- done
-done
-
-cat <<EOF
-#ifndef SIZE_CLASSES_DEFINED
-# error "No size class definitions match configuration"
-#endif
-#undef SIZE_CLASSES_DEFINED
-/*
- * The size2index_tab lookup table uses uint8_t to encode each bin index, so we
- * cannot support more than 256 small size classes.
- */
-#if (NBINS > 256)
-# error "Too many small size classes"
-#endif
-
-#endif /* JEMALLOC_INTERNAL_SIZE_CLASSES_H */
-EOF
diff --git a/deps/jemalloc/include/jemalloc/internal/stats.h b/deps/jemalloc/include/jemalloc/internal/stats.h
index 852e34269..3b9e0eac1 100644
--- a/deps/jemalloc/include/jemalloc/internal/stats.h
+++ b/deps/jemalloc/include/jemalloc/internal/stats.h
@@ -10,7 +10,8 @@
OPTION('a', unmerged, config_stats, false) \
OPTION('b', bins, true, false) \
OPTION('l', large, true, false) \
- OPTION('x', mutex, true, false)
+ OPTION('x', mutex, true, false) \
+ OPTION('e', extents, true, false)
enum {
#define OPTION(o, v, d, s) stats_print_option_num_##v,
diff --git a/deps/jemalloc/include/jemalloc/internal/sz.h b/deps/jemalloc/include/jemalloc/internal/sz.h
index 979462898..68e558abf 100644
--- a/deps/jemalloc/include/jemalloc/internal/sz.h
+++ b/deps/jemalloc/include/jemalloc/internal/sz.h
@@ -3,7 +3,7 @@
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/pages.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/util.h"
/*
@@ -26,18 +26,18 @@
* sz_pind2sz_tab encodes the same information as could be computed by
* sz_pind2sz_compute().
*/
-extern size_t const sz_pind2sz_tab[NPSIZES+1];
+extern size_t sz_pind2sz_tab[SC_NPSIZES + 1];
/*
* sz_index2size_tab encodes the same information as could be computed (at
* unacceptable cost in some code paths) by sz_index2size_compute().
*/
-extern size_t const sz_index2size_tab[NSIZES];
+extern size_t sz_index2size_tab[SC_NSIZES];
/*
* sz_size2index_tab is a compact lookup table that rounds request sizes up to
* size classes. In order to reduce cache footprint, the table is compressed,
* and all accesses are via sz_size2index().
*/
-extern uint8_t const sz_size2index_tab[];
+extern uint8_t sz_size2index_tab[];
static const size_t sz_large_pad =
#ifdef JEMALLOC_CACHE_OBLIVIOUS
@@ -47,49 +47,47 @@ static const size_t sz_large_pad =
#endif
;
+extern void sz_boot(const sc_data_t *sc_data);
+
JEMALLOC_ALWAYS_INLINE pszind_t
sz_psz2ind(size_t psz) {
- if (unlikely(psz > LARGE_MAXCLASS)) {
- return NPSIZES;
+ if (unlikely(psz > SC_LARGE_MAXCLASS)) {
+ return SC_NPSIZES;
}
- {
- pszind_t x = lg_floor((psz<<1)-1);
- pszind_t shift = (x < LG_SIZE_CLASS_GROUP + LG_PAGE) ? 0 : x -
- (LG_SIZE_CLASS_GROUP + LG_PAGE);
- pszind_t grp = shift << LG_SIZE_CLASS_GROUP;
+ pszind_t x = lg_floor((psz<<1)-1);
+ pszind_t shift = (x < SC_LG_NGROUP + LG_PAGE) ?
+ 0 : x - (SC_LG_NGROUP + LG_PAGE);
+ pszind_t grp = shift << SC_LG_NGROUP;
- pszind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ?
- LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1;
+ pszind_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ?
+ LG_PAGE : x - SC_LG_NGROUP - 1;
- size_t delta_inverse_mask = ZU(-1) << lg_delta;
- pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) &
- ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1);
+ size_t delta_inverse_mask = ZU(-1) << lg_delta;
+ pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) &
+ ((ZU(1) << SC_LG_NGROUP) - 1);
- pszind_t ind = grp + mod;
- return ind;
- }
+ pszind_t ind = grp + mod;
+ return ind;
}
static inline size_t
sz_pind2sz_compute(pszind_t pind) {
- if (unlikely(pind == NPSIZES)) {
- return LARGE_MAXCLASS + PAGE;
+ if (unlikely(pind == SC_NPSIZES)) {
+ return SC_LARGE_MAXCLASS + PAGE;
}
- {
- size_t grp = pind >> LG_SIZE_CLASS_GROUP;
- size_t mod = pind & ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1);
+ size_t grp = pind >> SC_LG_NGROUP;
+ size_t mod = pind & ((ZU(1) << SC_LG_NGROUP) - 1);
- size_t grp_size_mask = ~((!!grp)-1);
- size_t grp_size = ((ZU(1) << (LG_PAGE +
- (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask;
+ size_t grp_size_mask = ~((!!grp)-1);
+ size_t grp_size = ((ZU(1) << (LG_PAGE + (SC_LG_NGROUP-1))) << grp)
+ & grp_size_mask;
- size_t shift = (grp == 0) ? 1 : grp;
- size_t lg_delta = shift + (LG_PAGE-1);
- size_t mod_size = (mod+1) << lg_delta;
+ size_t shift = (grp == 0) ? 1 : grp;
+ size_t lg_delta = shift + (LG_PAGE-1);
+ size_t mod_size = (mod+1) << lg_delta;
- size_t sz = grp_size + mod_size;
- return sz;
- }
+ size_t sz = grp_size + mod_size;
+ return sz;
}
static inline size_t
@@ -101,70 +99,70 @@ sz_pind2sz_lookup(pszind_t pind) {
static inline size_t
sz_pind2sz(pszind_t pind) {
- assert(pind < NPSIZES+1);
+ assert(pind < SC_NPSIZES + 1);
return sz_pind2sz_lookup(pind);
}
static inline size_t
sz_psz2u(size_t psz) {
- if (unlikely(psz > LARGE_MAXCLASS)) {
- return LARGE_MAXCLASS + PAGE;
- }
- {
- size_t x = lg_floor((psz<<1)-1);
- size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ?
- LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1;
- size_t delta = ZU(1) << lg_delta;
- size_t delta_mask = delta - 1;
- size_t usize = (psz + delta_mask) & ~delta_mask;
- return usize;
+ if (unlikely(psz > SC_LARGE_MAXCLASS)) {
+ return SC_LARGE_MAXCLASS + PAGE;
}
+ size_t x = lg_floor((psz<<1)-1);
+ size_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ?
+ LG_PAGE : x - SC_LG_NGROUP - 1;
+ size_t delta = ZU(1) << lg_delta;
+ size_t delta_mask = delta - 1;
+ size_t usize = (psz + delta_mask) & ~delta_mask;
+ return usize;
}
static inline szind_t
sz_size2index_compute(size_t size) {
- if (unlikely(size > LARGE_MAXCLASS)) {
- return NSIZES;
+ if (unlikely(size > SC_LARGE_MAXCLASS)) {
+ return SC_NSIZES;
+ }
+
+ if (size == 0) {
+ return 0;
}
-#if (NTBINS != 0)
- if (size <= (ZU(1) << LG_TINY_MAXCLASS)) {
- szind_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1;
+#if (SC_NTINY != 0)
+ if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) {
+ szind_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1;
szind_t lg_ceil = lg_floor(pow2_ceil_zu(size));
return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin);
}
#endif
{
szind_t x = lg_floor((size<<1)-1);
- szind_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 0 :
- x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM);
- szind_t grp = shift << LG_SIZE_CLASS_GROUP;
+ szind_t shift = (x < SC_LG_NGROUP + LG_QUANTUM) ? 0 :
+ x - (SC_LG_NGROUP + LG_QUANTUM);
+ szind_t grp = shift << SC_LG_NGROUP;
- szind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1)
- ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1;
+ szind_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1)
+ ? LG_QUANTUM : x - SC_LG_NGROUP - 1;
size_t delta_inverse_mask = ZU(-1) << lg_delta;
szind_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) &
- ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1);
+ ((ZU(1) << SC_LG_NGROUP) - 1);
- szind_t index = NTBINS + grp + mod;
+ szind_t index = SC_NTINY + grp + mod;
return index;
}
}
JEMALLOC_ALWAYS_INLINE szind_t
sz_size2index_lookup(size_t size) {
- assert(size <= LOOKUP_MAXCLASS);
- {
- szind_t ret = (sz_size2index_tab[(size-1) >> LG_TINY_MIN]);
- assert(ret == sz_size2index_compute(size));
- return ret;
- }
+ assert(size <= SC_LOOKUP_MAXCLASS);
+ szind_t ret = (sz_size2index_tab[(size + (ZU(1) << SC_LG_TINY_MIN) - 1)
+ >> SC_LG_TINY_MIN]);
+ assert(ret == sz_size2index_compute(size));
+ return ret;
}
JEMALLOC_ALWAYS_INLINE szind_t
sz_size2index(size_t size) {
- assert(size > 0);
- if (likely(size <= LOOKUP_MAXCLASS)) {
+ if (likely(size <= SC_LOOKUP_MAXCLASS)) {
return sz_size2index_lookup(size);
}
return sz_size2index_compute(size);
@@ -172,20 +170,20 @@ sz_size2index(size_t size) {
static inline size_t
sz_index2size_compute(szind_t index) {
-#if (NTBINS > 0)
- if (index < NTBINS) {
- return (ZU(1) << (LG_TINY_MAXCLASS - NTBINS + 1 + index));
+#if (SC_NTINY > 0)
+ if (index < SC_NTINY) {
+ return (ZU(1) << (SC_LG_TINY_MAXCLASS - SC_NTINY + 1 + index));
}
#endif
{
- size_t reduced_index = index - NTBINS;
- size_t grp = reduced_index >> LG_SIZE_CLASS_GROUP;
- size_t mod = reduced_index & ((ZU(1) << LG_SIZE_CLASS_GROUP) -
+ size_t reduced_index = index - SC_NTINY;
+ size_t grp = reduced_index >> SC_LG_NGROUP;
+ size_t mod = reduced_index & ((ZU(1) << SC_LG_NGROUP) -
1);
size_t grp_size_mask = ~((!!grp)-1);
size_t grp_size = ((ZU(1) << (LG_QUANTUM +
- (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask;
+ (SC_LG_NGROUP-1))) << grp) & grp_size_mask;
size_t shift = (grp == 0) ? 1 : grp;
size_t lg_delta = shift + (LG_QUANTUM-1);
@@ -205,18 +203,22 @@ sz_index2size_lookup(szind_t index) {
JEMALLOC_ALWAYS_INLINE size_t
sz_index2size(szind_t index) {
- assert(index < NSIZES);
+ assert(index < SC_NSIZES);
return sz_index2size_lookup(index);
}
JEMALLOC_ALWAYS_INLINE size_t
sz_s2u_compute(size_t size) {
- if (unlikely(size > LARGE_MAXCLASS)) {
+ if (unlikely(size > SC_LARGE_MAXCLASS)) {
return 0;
}
-#if (NTBINS > 0)
- if (size <= (ZU(1) << LG_TINY_MAXCLASS)) {
- size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1;
+
+ if (size == 0) {
+ size++;
+ }
+#if (SC_NTINY > 0)
+ if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) {
+ size_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1;
size_t lg_ceil = lg_floor(pow2_ceil_zu(size));
return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) :
(ZU(1) << lg_ceil));
@@ -224,8 +226,8 @@ sz_s2u_compute(size_t size) {
#endif
{
size_t x = lg_floor((size<<1)-1);
- size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1)
- ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1;
+ size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1)
+ ? LG_QUANTUM : x - SC_LG_NGROUP - 1;
size_t delta = ZU(1) << lg_delta;
size_t delta_mask = delta - 1;
size_t usize = (size + delta_mask) & ~delta_mask;
@@ -247,8 +249,7 @@ sz_s2u_lookup(size_t size) {
*/
JEMALLOC_ALWAYS_INLINE size_t
sz_s2u(size_t size) {
- assert(size > 0);
- if (likely(size <= LOOKUP_MAXCLASS)) {
+ if (likely(size <= SC_LOOKUP_MAXCLASS)) {
return sz_s2u_lookup(size);
}
return sz_s2u_compute(size);
@@ -265,7 +266,7 @@ sz_sa2u(size_t size, size_t alignment) {
assert(alignment != 0 && ((alignment - 1) & alignment) == 0);
/* Try for a small size class. */
- if (size <= SMALL_MAXCLASS && alignment < PAGE) {
+ if (size <= SC_SMALL_MAXCLASS && alignment < PAGE) {
/*
* Round size up to the nearest multiple of alignment.
*
@@ -281,20 +282,20 @@ sz_sa2u(size_t size, size_t alignment) {
* 192 | 11000000 | 64
*/
usize = sz_s2u(ALIGNMENT_CEILING(size, alignment));
- if (usize < LARGE_MINCLASS) {
+ if (usize < SC_LARGE_MINCLASS) {
return usize;
}
}
/* Large size class. Beware of overflow. */
- if (unlikely(alignment > LARGE_MAXCLASS)) {
+ if (unlikely(alignment > SC_LARGE_MAXCLASS)) {
return 0;
}
/* Make sure result is a large size class. */
- if (size <= LARGE_MINCLASS) {
- usize = LARGE_MINCLASS;
+ if (size <= SC_LARGE_MINCLASS) {
+ usize = SC_LARGE_MINCLASS;
} else {
usize = sz_s2u(size);
if (usize < size) {
diff --git a/deps/jemalloc/include/jemalloc/internal/tcache_externs.h b/deps/jemalloc/include/jemalloc/internal/tcache_externs.h
index 790367bd4..d63eafde8 100644
--- a/deps/jemalloc/include/jemalloc/internal/tcache_externs.h
+++ b/deps/jemalloc/include/jemalloc/internal/tcache_externs.h
@@ -1,15 +1,13 @@
#ifndef JEMALLOC_INTERNAL_TCACHE_EXTERNS_H
#define JEMALLOC_INTERNAL_TCACHE_EXTERNS_H
-#include "jemalloc/internal/size_classes.h"
-
extern bool opt_tcache;
extern ssize_t opt_lg_tcache_max;
extern cache_bin_info_t *tcache_bin_info;
/*
- * Number of tcache bins. There are NBINS small-object bins, plus 0 or more
+ * Number of tcache bins. There are SC_NBINS small-object bins, plus 0 or more
* large-object bins.
*/
extern unsigned nhbins;
diff --git a/deps/jemalloc/include/jemalloc/internal/tcache_inlines.h b/deps/jemalloc/include/jemalloc/internal/tcache_inlines.h
index 0f6ab8cb5..5eca20e89 100644
--- a/deps/jemalloc/include/jemalloc/internal/tcache_inlines.h
+++ b/deps/jemalloc/include/jemalloc/internal/tcache_inlines.h
@@ -3,7 +3,7 @@
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/ticker.h"
#include "jemalloc/internal/util.h"
@@ -40,13 +40,13 @@ tcache_event(tsd_t *tsd, tcache_t *tcache) {
JEMALLOC_ALWAYS_INLINE void *
tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
- UNUSED size_t size, szind_t binind, bool zero, bool slow_path) {
+ size_t size, szind_t binind, bool zero, bool slow_path) {
void *ret;
cache_bin_t *bin;
bool tcache_success;
size_t usize JEMALLOC_CC_SILENCE_INIT(0);
- assert(binind < NBINS);
+ assert(binind < SC_NBINS);
bin = tcache_small_bin_get(tcache, binind);
ret = cache_bin_alloc_easy(bin, &tcache_success);
assert(tcache_success == (ret != NULL));
@@ -107,7 +107,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
cache_bin_t *bin;
bool tcache_success;
- assert(binind >= NBINS &&binind < nhbins);
+ assert(binind >= SC_NBINS &&binind < nhbins);
bin = tcache_large_bin_get(tcache, binind);
ret = cache_bin_alloc_easy(bin, &tcache_success);
assert(tcache_success == (ret != NULL));
@@ -166,7 +166,8 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
cache_bin_t *bin;
cache_bin_info_t *bin_info;
- assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SMALL_MAXCLASS);
+ assert(tcache_salloc(tsd_tsdn(tsd), ptr)
+ <= SC_SMALL_MAXCLASS);
if (slow_path && config_fill && unlikely(opt_junk_free)) {
arena_dalloc_junk_small(ptr, &bin_infos[binind]);
@@ -174,13 +175,12 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
bin = tcache_small_bin_get(tcache, binind);
bin_info = &tcache_bin_info[binind];
- if (unlikely(bin->ncached == bin_info->ncached_max)) {
+ if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) {
tcache_bin_flush_small(tsd, tcache, bin, binind,
(bin_info->ncached_max >> 1));
+ bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr);
+ assert(ret);
}
- assert(bin->ncached < bin_info->ncached_max);
- bin->ncached++;
- *(bin->avail - bin->ncached) = ptr;
tcache_event(tsd, tcache);
}
@@ -191,7 +191,8 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
cache_bin_t *bin;
cache_bin_info_t *bin_info;
- assert(tcache_salloc(tsd_tsdn(tsd), ptr) > SMALL_MAXCLASS);
+ assert(tcache_salloc(tsd_tsdn(tsd), ptr)
+ > SC_SMALL_MAXCLASS);
assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= tcache_maxclass);
if (slow_path && config_fill && unlikely(opt_junk_free)) {
@@ -215,6 +216,9 @@ JEMALLOC_ALWAYS_INLINE tcache_t *
tcaches_get(tsd_t *tsd, unsigned ind) {
tcaches_t *elm = &tcaches[ind];
if (unlikely(elm->tcache == NULL)) {
+ malloc_printf("<jemalloc>: invalid tcache id (%u).\n", ind);
+ abort();
+ } else if (unlikely(elm->tcache == TCACHES_ELM_NEED_REINIT)) {
elm->tcache = tcache_create_explicit(tsd);
}
return elm->tcache;
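[Editor's note] The rewritten tcache_dalloc_small() above replaces the open-coded bin push with a try/flush/retry pattern built on cache_bin_dalloc_easy(). The toy sketch below reproduces that control flow with an invented bin type; none of the names are jemalloc's, and the flush merely drops objects instead of returning them to an arena.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
	size_t ncached;
	size_t ncached_max;
	void  *slots[8];
} toy_bin_t;

static int
toy_dalloc_easy(toy_bin_t *bin, void *ptr) {
	if (bin->ncached == bin->ncached_max) {
		return 0;                 /* bin full: caller must flush */
	}
	bin->slots[bin->ncached++] = ptr;
	return 1;
}

static void
toy_flush(toy_bin_t *bin, size_t rem) {
	/* A real flush hands the surplus objects back; the sketch forgets them. */
	bin->ncached = rem;
}

static void
toy_dalloc(toy_bin_t *bin, void *ptr) {
	if (!toy_dalloc_easy(bin, ptr)) {
		/* Same shape as the new tcache_dalloc_small(): flush half the
		 * bin, after which the easy path must succeed. */
		toy_flush(bin, bin->ncached_max >> 1);
		int ok = toy_dalloc_easy(bin, ptr);
		assert(ok);
		(void)ok;
	}
}

int
main(void) {
	toy_bin_t bin = { .ncached = 0, .ncached_max = 8 };
	int objs[10];
	for (int i = 0; i < 10; i++) {
		toy_dalloc(&bin, &objs[i]);
	}
	printf("cached after 10 frees: %zu\n", bin.ncached); /* fills to 8, flushes to 4, ends at 6 */
	return 0;
}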
diff --git a/deps/jemalloc/include/jemalloc/internal/tcache_structs.h b/deps/jemalloc/include/jemalloc/internal/tcache_structs.h
index 07b738705..172ef9040 100644
--- a/deps/jemalloc/include/jemalloc/internal/tcache_structs.h
+++ b/deps/jemalloc/include/jemalloc/internal/tcache_structs.h
@@ -1,10 +1,14 @@
#ifndef JEMALLOC_INTERNAL_TCACHE_STRUCTS_H
#define JEMALLOC_INTERNAL_TCACHE_STRUCTS_H
-#include "jemalloc/internal/ql.h"
-#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/cache_bin.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/ticker.h"
+#include "jemalloc/internal/tsd_types.h"
+
+/* Various uses of this struct need it to be a named type. */
+typedef ql_elm(tsd_t) tsd_link_t;
struct tcache_s {
/*
@@ -21,7 +25,7 @@ struct tcache_s {
* During tcache initialization, the avail pointer in each element of
* tbins is initialized to point to the proper offset within this array.
*/
- cache_bin_t bins_small[NBINS];
+ cache_bin_t bins_small[SC_NBINS];
/*
* This data is less hot; we can be a little less careful with our
@@ -29,6 +33,11 @@ struct tcache_s {
*/
/* Lets us track all the tcaches in an arena. */
ql_elm(tcache_t) link;
+
+ /* Logically scoped to tsd, but put here for cache layout reasons. */
+ ql_elm(tsd_t) tsd_link;
+ bool in_hook;
+
/*
* The descriptor lets the arena find our cache bins without seeing the
* tcache definition. This enables arenas to aggregate stats across
@@ -41,13 +50,13 @@ struct tcache_s {
/* Next bin to GC. */
szind_t next_gc_bin;
/* For small bins, fill (ncached_max >> lg_fill_div). */
- uint8_t lg_fill_div[NBINS];
+ uint8_t lg_fill_div[SC_NBINS];
/*
* We put the cache bins for large size classes at the end of the
* struct, since some of them might not get used. This might end up
* letting us avoid touching an extra page if we don't have to.
*/
- cache_bin_t bins_large[NSIZES-NBINS];
+ cache_bin_t bins_large[SC_NSIZES-SC_NBINS];
};
/* Linkage for list of available (previously used) explicit tcache IDs. */
diff --git a/deps/jemalloc/include/jemalloc/internal/tcache_types.h b/deps/jemalloc/include/jemalloc/internal/tcache_types.h
index e49bc9d79..dce69382e 100644
--- a/deps/jemalloc/include/jemalloc/internal/tcache_types.h
+++ b/deps/jemalloc/include/jemalloc/internal/tcache_types.h
@@ -1,7 +1,7 @@
#ifndef JEMALLOC_INTERNAL_TCACHE_TYPES_H
#define JEMALLOC_INTERNAL_TCACHE_TYPES_H
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
typedef struct tcache_s tcache_t;
typedef struct tcaches_s tcaches_t;
@@ -45,7 +45,7 @@ typedef struct tcaches_s tcaches_t;
/* Number of tcache allocation/deallocation events between incremental GCs. */
#define TCACHE_GC_INCR \
- ((TCACHE_GC_SWEEP / NBINS) + ((TCACHE_GC_SWEEP / NBINS == 0) ? 0 : 1))
+ ((TCACHE_GC_SWEEP / SC_NBINS) + ((TCACHE_GC_SWEEP / SC_NBINS == 0) ? 0 : 1))
/* Used in TSD static initializer only. Real init in tcache_data_init(). */
#define TCACHE_ZERO_INITIALIZER {0}
@@ -53,4 +53,7 @@ typedef struct tcaches_s tcaches_t;
/* Used in TSD static initializer only. Will be initialized to opt_tcache. */
#define TCACHE_ENABLED_ZERO_INITIALIZER false
+/* Used for explicit tcache only. Means flushed but not destroyed. */
+#define TCACHES_ELM_NEED_REINIT ((tcache_t *)(uintptr_t)1)
+
#endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */
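[Editor's note] For a feel of what the renamed TCACHE_GC_INCR works out to, here is the arithmetic under assumed values: TCACHE_GC_SWEEP == 8192 and SC_NBINS == 36 are typical of a 4 KiB-page 64-bit build, not figures taken from this diff.

#define TCACHE_GC_SWEEP 8192   /* assumed sweep interval */
#define SC_NBINS        36     /* assumed small-bin count */
#define TCACHE_GC_INCR \
    ((TCACHE_GC_SWEEP / SC_NBINS) + ((TCACHE_GC_SWEEP / SC_NBINS == 0) ? 0 : 1))

/* 8192 / 36 == 227, so one bin is GC'd roughly every 228 tcache events and a
 * full pass over all bins takes about TCACHE_GC_SWEEP events. */
_Static_assert(TCACHE_GC_INCR == 228, "worked example above is stale");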
diff --git a/deps/jemalloc/include/jemalloc/internal/test_hooks.h b/deps/jemalloc/include/jemalloc/internal/test_hooks.h
new file mode 100644
index 000000000..a6351e59a
--- /dev/null
+++ b/deps/jemalloc/include/jemalloc/internal/test_hooks.h
@@ -0,0 +1,19 @@
+#ifndef JEMALLOC_INTERNAL_TEST_HOOKS_H
+#define JEMALLOC_INTERNAL_TEST_HOOKS_H
+
+extern JEMALLOC_EXPORT void (*test_hooks_arena_new_hook)();
+extern JEMALLOC_EXPORT void (*test_hooks_libc_hook)();
+
+#define JEMALLOC_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn)
+
+#define open JEMALLOC_HOOK(open, test_hooks_libc_hook)
+#define read JEMALLOC_HOOK(read, test_hooks_libc_hook)
+#define write JEMALLOC_HOOK(write, test_hooks_libc_hook)
+#define readlink JEMALLOC_HOOK(readlink, test_hooks_libc_hook)
+#define close JEMALLOC_HOOK(close, test_hooks_libc_hook)
+#define creat JEMALLOC_HOOK(creat, test_hooks_libc_hook)
+#define secure_getenv JEMALLOC_HOOK(secure_getenv, test_hooks_libc_hook)
+/* Note that this is undef'd and re-define'd in src/prof.c. */
+#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, test_hooks_libc_hook)
+
+#endif /* JEMALLOC_INTERNAL_TEST_HOOKS_H */
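[Editor's note] The JEMALLOC_HOOK macro in the new test_hooks.h leans on the comma and short-circuit operators: it calls the hook when one is installed and then evaluates to the wrapped function. A minimal demonstration with made-up names (demo_work, demo_hook) follows; the real header wraps libc symbols such as open and read instead.

#include <stdio.h>

static void (*demo_hook)(void) = NULL;

static void
demo_work(int x) {
	printf("work(%d)\n", x);
}

/* Same shape as JEMALLOC_HOOK: run the hook (if set), then evaluate to fn.
 * An object-like macro is not re-expanded inside its own replacement, so the
 * "demo_work" in the expansion still names the real function above. */
#define DEMO_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn)
#define demo_work DEMO_HOOK(demo_work, demo_hook)

static void
fired(void) {
	printf("hook fired\n");
}

int
main(void) {
	demo_work(1);        /* hook unset: prints only work(1) */
	demo_hook = fired;
	demo_work(2);        /* prints "hook fired", then work(2) */
	return 0;
}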
diff --git a/deps/jemalloc/include/jemalloc/internal/ticker.h b/deps/jemalloc/include/jemalloc/internal/ticker.h
index 4b3604708..52d0db4c8 100644
--- a/deps/jemalloc/include/jemalloc/internal/ticker.h
+++ b/deps/jemalloc/include/jemalloc/internal/ticker.h
@@ -75,4 +75,17 @@ ticker_tick(ticker_t *ticker) {
return ticker_ticks(ticker, 1);
}
+/*
+ * Try to tick. If ticker would fire, return true, but rely on
+ * slowpath to reset ticker.
+ */
+static inline bool
+ticker_trytick(ticker_t *ticker) {
+ --ticker->tick;
+ if (unlikely(ticker->tick < 0)) {
+ return true;
+ }
+ return false;
+}
+
#endif /* JEMALLOC_INTERNAL_TICKER_H */
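[Editor's note] ticker_trytick() deliberately leaves the counter negative and reports "would fire"; resetting is the slow path's job. The sketch below shows that division of labour with a toy ticker type whose names are invented, not jemalloc's API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	int32_t tick;
	int32_t nticks;
} toy_ticker_t;

static bool
toy_trytick(toy_ticker_t *t) {
	return --t->tick < 0;       /* report "would fire"; no reset here */
}

static void
toy_slow_path(toy_ticker_t *t) {
	t->tick = t->nticks;        /* the reset lives on the slow path */
	printf("slow path ran, ticker reset\n");
}

int
main(void) {
	toy_ticker_t t = { .tick = 3, .nticks = 3 };
	for (int i = 0; i < 8; i++) {
		if (toy_trytick(&t)) {
			toy_slow_path(&t);
		}
	}
	return 0;
}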
diff --git a/deps/jemalloc/include/jemalloc/internal/tsd.h b/deps/jemalloc/include/jemalloc/internal/tsd.h
index 0b9841aa7..9ba260045 100644
--- a/deps/jemalloc/include/jemalloc/internal/tsd.h
+++ b/deps/jemalloc/include/jemalloc/internal/tsd.h
@@ -3,6 +3,7 @@
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
@@ -68,17 +69,19 @@ typedef void (*test_callback_t)(int *);
O(offset_state, uint64_t, uint64_t) \
O(thread_allocated, uint64_t, uint64_t) \
O(thread_deallocated, uint64_t, uint64_t) \
+ O(bytes_until_sample, int64_t, int64_t) \
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
O(iarena, arena_t *, arena_t *) \
O(arena, arena_t *, arena_t *) \
O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
+ O(binshards, tsd_binshards_t, tsd_binshards_t)\
O(tcache, tcache_t, tcache_t) \
O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
MALLOC_TEST_TSD
#define TSD_INITIALIZER { \
- tsd_state_uninitialized, \
+ ATOMIC_INIT(tsd_state_uninitialized), \
TCACHE_ENABLED_ZERO_INITIALIZER, \
false, \
0, \
@@ -86,29 +89,97 @@ typedef void (*test_callback_t)(int *);
0, \
0, \
0, \
+ 0, \
NULL, \
RTREE_CTX_ZERO_INITIALIZER, \
NULL, \
NULL, \
NULL, \
+ TSD_BINSHARDS_ZERO_INITIALIZER, \
TCACHE_ZERO_INITIALIZER, \
WITNESS_TSD_INITIALIZER \
MALLOC_TEST_TSD_INITIALIZER \
}
+void *malloc_tsd_malloc(size_t size);
+void malloc_tsd_dalloc(void *wrapper);
+void malloc_tsd_cleanup_register(bool (*f)(void));
+tsd_t *malloc_tsd_boot0(void);
+void malloc_tsd_boot1(void);
+void tsd_cleanup(void *arg);
+tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
+void tsd_state_set(tsd_t *tsd, uint8_t new_state);
+void tsd_slow_update(tsd_t *tsd);
+void tsd_prefork(tsd_t *tsd);
+void tsd_postfork_parent(tsd_t *tsd);
+void tsd_postfork_child(tsd_t *tsd);
+
+/*
+ * Call ..._inc when your module wants to take all threads down the slow paths,
+ * and ..._dec when it no longer needs to.
+ */
+void tsd_global_slow_inc(tsdn_t *tsdn);
+void tsd_global_slow_dec(tsdn_t *tsdn);
+bool tsd_global_slow();
+
enum {
- tsd_state_nominal = 0, /* Common case --> jnz. */
- tsd_state_nominal_slow = 1, /* Initialized but on slow path. */
- /* the above 2 nominal states should be lower values. */
- tsd_state_nominal_max = 1, /* used for comparison only. */
- tsd_state_minimal_initialized = 2,
- tsd_state_purgatory = 3,
- tsd_state_reincarnated = 4,
- tsd_state_uninitialized = 5
+ /* Common case --> jnz. */
+ tsd_state_nominal = 0,
+ /* Initialized but on slow path. */
+ tsd_state_nominal_slow = 1,
+ /*
+ * Some thread has changed global state in such a way that all nominal
+ * threads need to recompute their fast / slow status the next time they
+ * get a chance.
+ *
+ * Any thread can change another thread's status *to* recompute, but
+ * threads are the only ones who can change their status *from*
+ * recompute.
+ */
+ tsd_state_nominal_recompute = 2,
+ /*
+ * The above nominal states should be lower values. We use
+ * tsd_nominal_max to separate nominal states from threads in the
+ * process of being born / dying.
+ */
+ tsd_state_nominal_max = 2,
+
+ /*
+ * A thread might free() during its death as its only allocator action;
+ * in such scenarios, we need tsd, but set up in such a way that no
+ * cleanup is necessary.
+ */
+ tsd_state_minimal_initialized = 3,
+ /* States during which we know we're in thread death. */
+ tsd_state_purgatory = 4,
+ tsd_state_reincarnated = 5,
+ /*
+ * What it says on the tin; tsd that hasn't been initialized. Note
+ * that even when the tsd struct lives in TLS, we need to keep track
+ * of stuff like whether or not our pthread destructors have been
+ * scheduled, so this really truly is different than the nominal state.
+ */
+ tsd_state_uninitialized = 6
};
-/* Manually limit tsd_state_t to a single byte. */
-typedef uint8_t tsd_state_t;
+/*
+ * Some TSD accesses can only be done in a nominal state. To enforce this, we
+ * wrap TSD member access in a function that asserts on TSD state, and mangle
+ * field names to prevent touching them accidentally.
+ */
+#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n
+
+#ifdef JEMALLOC_U8_ATOMICS
+# define tsd_state_t atomic_u8_t
+# define tsd_atomic_load atomic_load_u8
+# define tsd_atomic_store atomic_store_u8
+# define tsd_atomic_exchange atomic_exchange_u8
+#else
+# define tsd_state_t atomic_u32_t
+# define tsd_atomic_load atomic_load_u32
+# define tsd_atomic_store atomic_store_u32
+# define tsd_atomic_exchange atomic_exchange_u32
+#endif
/* The actual tsd. */
struct tsd_s {
@@ -117,13 +188,29 @@ struct tsd_s {
* module. Access any thread-local state through the getters and
* setters below.
*/
- tsd_state_t state;
+
+ /*
+ * We manually limit the state to just a single byte. Unless the 8-bit
+ * atomics are unavailable (which is rare).
+ */
+ tsd_state_t state;
#define O(n, t, nt) \
- t use_a_getter_or_setter_instead_##n;
+ t TSD_MANGLE(n);
MALLOC_TSD
#undef O
};
+JEMALLOC_ALWAYS_INLINE uint8_t
+tsd_state_get(tsd_t *tsd) {
+ /*
+ * This should be atomic. Unfortunately, compilers right now can't tell
+ * that this can be done as a memory comparison, and forces a load into
+ * a register that hurts fast-path performance.
+ */
+ /* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
+ return *(uint8_t *)&tsd->state;
+}
+
/*
* Wrapper around tsd_t that makes it possible to avoid implicit conversion
* between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
@@ -150,15 +237,6 @@ tsdn_tsd(tsdn_t *tsdn) {
return &tsdn->tsd;
}
-void *malloc_tsd_malloc(size_t size);
-void malloc_tsd_dalloc(void *wrapper);
-void malloc_tsd_cleanup_register(bool (*f)(void));
-tsd_t *malloc_tsd_boot0(void);
-void malloc_tsd_boot1(void);
-void tsd_cleanup(void *arg);
-tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
-void tsd_slow_update(tsd_t *tsd);
-
/*
* We put the platform-specific data declarations and inlines into their own
* header files to avoid cluttering this file. They define tsd_boot0,
@@ -182,7 +260,7 @@ void tsd_slow_update(tsd_t *tsd);
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
- return &tsd->use_a_getter_or_setter_instead_##n; \
+ return &tsd->TSD_MANGLE(n); \
}
MALLOC_TSD
#undef O
@@ -191,10 +269,16 @@ MALLOC_TSD
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get(tsd_t *tsd) { \
- assert(tsd->state == tsd_state_nominal || \
- tsd->state == tsd_state_nominal_slow || \
- tsd->state == tsd_state_reincarnated || \
- tsd->state == tsd_state_minimal_initialized); \
+ /* \
+ * Because the state might change asynchronously if it's \
+ * nominal, we need to make sure that we only read it once. \
+ */ \
+ uint8_t state = tsd_state_get(tsd); \
+ assert(state == tsd_state_nominal || \
+ state == tsd_state_nominal_slow || \
+ state == tsd_state_nominal_recompute || \
+ state == tsd_state_reincarnated || \
+ state == tsd_state_minimal_initialized); \
return tsd_##n##p_get_unsafe(tsd); \
}
MALLOC_TSD
@@ -229,8 +313,8 @@ MALLOC_TSD
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE void \
tsd_##n##_set(tsd_t *tsd, t val) { \
- assert(tsd->state != tsd_state_reincarnated && \
- tsd->state != tsd_state_minimal_initialized); \
+ assert(tsd_state_get(tsd) != tsd_state_reincarnated && \
+ tsd_state_get(tsd) != tsd_state_minimal_initialized); \
*tsd_##n##p_get(tsd) = val; \
}
MALLOC_TSD
@@ -238,13 +322,18 @@ MALLOC_TSD
JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
+ /*
+ * Note that our fastness assertion does *not* include global slowness
+ * counters; it's not in general possible to ensure that they won't
+ * change asynchronously from underneath us.
+ */
assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
tsd_reentrancy_level_get(tsd) == 0);
}
JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t *tsd) {
- bool fast = (tsd->state == tsd_state_nominal);
+ bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
if (fast) {
tsd_assert_fast(tsd);
}
@@ -261,7 +350,7 @@ tsd_fetch_impl(bool init, bool minimal) {
}
assert(tsd != NULL);
- if (unlikely(tsd->state != tsd_state_nominal)) {
+ if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
return tsd_fetch_slow(tsd, minimal);
}
assert(tsd_fast(tsd));
@@ -281,7 +370,7 @@ JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void) {
tsd_t *tsd = tsd_fetch_min();
/* Use reincarnated state to prevent full initialization. */
- tsd->state = tsd_state_reincarnated;
+ tsd_state_set(tsd, tsd_state_reincarnated);
return tsd;
}
@@ -293,7 +382,7 @@ tsd_fetch(void) {
static inline bool
tsd_nominal(tsd_t *tsd) {
- return (tsd->state <= tsd_state_nominal_max);
+ return (tsd_state_get(tsd) <= tsd_state_nominal_max);
}
JEMALLOC_ALWAYS_INLINE tsdn_t *
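[Editor's note] The tsd.h hunks above combine two ideas: the state becomes an atomic byte that each check reads exactly once, and raw field access is blocked by TSD_MANGLE so everything funnels through asserting getters. A reduced sketch of that pattern, with demo_ names invented for illustration:

#include <assert.h>
#include <stdint.h>

enum {
	demo_state_nominal           = 0,
	demo_state_nominal_slow      = 1,
	demo_state_nominal_recompute = 2,
	demo_state_nominal_max       = 2
};

/* Mangle the field name so direct accesses fail to compile. */
#define DEMO_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n

typedef struct {
	uint8_t  state;                          /* stands in for the atomic tsd_state_t */
	uint64_t DEMO_MANGLE(thread_allocated);
} demo_tsd_t;

static inline uint64_t *
demo_thread_allocated_p_get(demo_tsd_t *tsd) {
	/*
	 * Read the state once: another thread may flip a nominal thread to
	 * "recompute" at any time. (The real getter also permits the
	 * reincarnated and minimal_initialized states.)
	 */
	uint8_t state = tsd->state;
	assert(state <= demo_state_nominal_max);
	return &tsd->DEMO_MANGLE(thread_allocated);
}

int
main(void) {
	demo_tsd_t tsd = { .state = demo_state_nominal };
	*demo_thread_allocated_p_get(&tsd) += 64;
	return 0;
}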
diff --git a/deps/jemalloc/include/jemalloc/internal/tsd_generic.h b/deps/jemalloc/include/jemalloc/internal/tsd_generic.h
index 1e52ef767..cf73c0c71 100644
--- a/deps/jemalloc/include/jemalloc/internal/tsd_generic.h
+++ b/deps/jemalloc/include/jemalloc/internal/tsd_generic.h
@@ -77,7 +77,10 @@ tsd_wrapper_get(bool init) {
abort();
} else {
wrapper->initialized = false;
+ JEMALLOC_DIAGNOSTIC_PUSH
+ JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
tsd_t initializer = TSD_INITIALIZER;
+ JEMALLOC_DIAGNOSTIC_POP
wrapper->val = initializer;
}
tsd_wrapper_set(wrapper);
@@ -107,7 +110,10 @@ tsd_boot1(void) {
tsd_boot_wrapper.initialized = false;
tsd_cleanup(&tsd_boot_wrapper.val);
wrapper->initialized = false;
+ JEMALLOC_DIAGNOSTIC_PUSH
+ JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
tsd_t initializer = TSD_INITIALIZER;
+ JEMALLOC_DIAGNOSTIC_POP
wrapper->val = initializer;
tsd_wrapper_set(wrapper);
}
diff --git a/deps/jemalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/deps/jemalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
index beb467a67..65852d5c1 100644
--- a/deps/jemalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
+++ b/deps/jemalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
@@ -3,8 +3,10 @@
#endif
#define JEMALLOC_INTERNAL_TSD_MALLOC_THREAD_CLEANUP_H
-extern __thread tsd_t tsd_tls;
-extern __thread bool tsd_initialized;
+#define JEMALLOC_TSD_TYPE_ATTR(type) __thread type JEMALLOC_TLS_MODEL
+
+extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls;
+extern JEMALLOC_TSD_TYPE_ATTR(bool) tsd_initialized;
extern bool tsd_booted;
/* Initialization/cleanup. */
@@ -47,7 +49,6 @@ tsd_get_allocates(void) {
/* Get/set. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_get(bool init) {
- assert(tsd_booted);
return &tsd_tls;
}
JEMALLOC_ALWAYS_INLINE void
diff --git a/deps/jemalloc/include/jemalloc/internal/tsd_tls.h b/deps/jemalloc/include/jemalloc/internal/tsd_tls.h
index 0de64b7b8..7d6c805be 100644
--- a/deps/jemalloc/include/jemalloc/internal/tsd_tls.h
+++ b/deps/jemalloc/include/jemalloc/internal/tsd_tls.h
@@ -3,7 +3,9 @@
#endif
#define JEMALLOC_INTERNAL_TSD_TLS_H
-extern __thread tsd_t tsd_tls;
+#define JEMALLOC_TSD_TYPE_ATTR(type) __thread type JEMALLOC_TLS_MODEL
+
+extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls;
extern pthread_key_t tsd_tsd;
extern bool tsd_booted;
@@ -39,8 +41,7 @@ tsd_get_allocates(void) {
/* Get/set. */
JEMALLOC_ALWAYS_INLINE tsd_t *
-tsd_get(UNUSED bool init) {
- assert(tsd_booted);
+tsd_get(bool init) {
return &tsd_tls;
}
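[Editor's note] Both TLS-based tsd headers now route the storage declaration through JEMALLOC_TSD_TYPE_ATTR so a TLS model attribute can be attached. A sketch of the expansion follows; DEMO_TLS_MODEL stands in for JEMALLOC_TLS_MODEL, whose real value is configure-generated, so initial-exec is only an assumed, typical choice.

#define DEMO_TLS_MODEL __attribute__((tls_model("initial-exec")))
#define DEMO_TSD_TYPE_ATTR(type) __thread type DEMO_TLS_MODEL

/* Header side: declares a thread-local int with the initial-exec TLS model. */
extern DEMO_TSD_TYPE_ATTR(int) demo_counter;

/* Exactly one .c file provides the definition. */
DEMO_TSD_TYPE_ATTR(int) demo_counter = 0;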
diff --git a/deps/jemalloc/include/jemalloc/internal/witness.h b/deps/jemalloc/include/jemalloc/internal/witness.h
index 7ace8ae4a..fff9e98cb 100644
--- a/deps/jemalloc/include/jemalloc/internal/witness.h
+++ b/deps/jemalloc/include/jemalloc/internal/witness.h
@@ -27,9 +27,9 @@
#define WITNESS_RANK_PROF_BT2GCTX 6U
#define WITNESS_RANK_PROF_TDATAS 7U
#define WITNESS_RANK_PROF_TDATA 8U
-#define WITNESS_RANK_PROF_GCTX 9U
-
-#define WITNESS_RANK_BACKGROUND_THREAD 10U
+#define WITNESS_RANK_PROF_LOG 9U
+#define WITNESS_RANK_PROF_GCTX 10U
+#define WITNESS_RANK_BACKGROUND_THREAD 11U
/*
* Used as an argument to witness_assert_depth_to_rank() in order to validate
@@ -37,18 +37,19 @@
* witness_assert_depth_to_rank() is inclusive rather than exclusive, this
* definition can have the same value as the minimally ranked core lock.
*/
-#define WITNESS_RANK_CORE 11U
-
-#define WITNESS_RANK_DECAY 11U
-#define WITNESS_RANK_TCACHE_QL 12U
-#define WITNESS_RANK_EXTENT_GROW 13U
-#define WITNESS_RANK_EXTENTS 14U
-#define WITNESS_RANK_EXTENT_AVAIL 15U
-
-#define WITNESS_RANK_EXTENT_POOL 16U
-#define WITNESS_RANK_RTREE 17U
-#define WITNESS_RANK_BASE 18U
-#define WITNESS_RANK_ARENA_LARGE 19U
+#define WITNESS_RANK_CORE 12U
+
+#define WITNESS_RANK_DECAY 12U
+#define WITNESS_RANK_TCACHE_QL 13U
+#define WITNESS_RANK_EXTENT_GROW 14U
+#define WITNESS_RANK_EXTENTS 15U
+#define WITNESS_RANK_EXTENT_AVAIL 16U
+
+#define WITNESS_RANK_EXTENT_POOL 17U
+#define WITNESS_RANK_RTREE 18U
+#define WITNESS_RANK_BASE 19U
+#define WITNESS_RANK_ARENA_LARGE 20U
+#define WITNESS_RANK_HOOK 21U
#define WITNESS_RANK_LEAF 0xffffffffU
#define WITNESS_RANK_BIN WITNESS_RANK_LEAF
diff --git a/deps/jemalloc/include/jemalloc/jemalloc_defs.h.in b/deps/jemalloc/include/jemalloc/jemalloc_defs.h.in
index 6d89435c2..11c39181b 100644
--- a/deps/jemalloc/include/jemalloc/jemalloc_defs.h.in
+++ b/deps/jemalloc/include/jemalloc/jemalloc_defs.h.in
@@ -4,6 +4,9 @@
/* Defined if alloc_size attribute is supported. */
#undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE
+/* Defined if format_arg(...) attribute is supported. */
+#undef JEMALLOC_HAVE_ATTR_FORMAT_ARG
+
/* Defined if format(gnu_printf, ...) attribute is supported. */
#undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF
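[Editor's note] The new JEMALLOC_HAVE_ATTR_FORMAT_ARG probe feeds the JEMALLOC_FORMAT_ARG macro added further down in jemalloc_macros.h.in. As a reminder of what the underlying GCC/Clang attribute buys, here is a generic example unrelated to jemalloc's own use of it:

#include <stdio.h>

/* format_arg(1) tells the compiler "my return value is a format string
 * derived from argument 1", so -Wformat checking passes through translate(). */
__attribute__((format_arg(1)))
static const char *
translate(const char *fmt) {
	return fmt;   /* a real implementation might look up a localized string */
}

int
main(void) {
	int n = 3;
	printf(translate("%d items\n"), n);
	return 0;
}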
diff --git a/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in b/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in
index daf9e571b..3421321a4 100644
--- a/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in
+++ b/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in
@@ -10,6 +10,7 @@
#define JEMALLOC_VERSION_BUGFIX @jemalloc_version_bugfix@
#define JEMALLOC_VERSION_NREV @jemalloc_version_nrev@
#define JEMALLOC_VERSION_GID "@jemalloc_version_gid@"
+#define JEMALLOC_VERSION_GID_IDENT @jemalloc_version_gid@
#define MALLOCX_LG_ALIGN(la) ((int)(la))
#if LG_SIZEOF_PTR == 2
@@ -68,6 +69,7 @@
# define JEMALLOC_EXPORT __declspec(dllimport)
# endif
# endif
+# define JEMALLOC_FORMAT_ARG(i)
# define JEMALLOC_FORMAT_PRINTF(s, i)
# define JEMALLOC_NOINLINE __declspec(noinline)
# ifdef __cplusplus
@@ -95,6 +97,11 @@
# ifndef JEMALLOC_EXPORT
# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default"))
# endif
+# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG
+# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3))
+# else
+# define JEMALLOC_FORMAT_ARG(i)
+# endif
# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF
# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i))
# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF)