author     antirez <antirez@gmail.com>              2015-10-06 16:18:30 +0200
committer  antirez <antirez@gmail.com>              2015-10-06 16:55:37 +0200
commit     a9951b1b6a326532163e0fe4ee1a26e972258a1e
tree       ca555f37238537175cc1b34aa62a9f873026047f /deps/jemalloc/include/jemalloc/internal
parent     e3ded0273c43986a49ddd9d5fb4a20d187d015de
download   redis-a9951b1b6a326532163e0fe4ee1a26e972258a1e.tar.gz
Jemalloc updated to 4.0.3.
Diffstat (limited to 'deps/jemalloc/include/jemalloc/internal')
32 files changed, 3962 insertions, 2005 deletions
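
Note on the arena.h changes below: the updated chunk-map layout moves binind up to bit 5, adds a decommitted flag, and replaces the old "mapbits & ~PAGE_MASK" size extraction with an explicit encode/decode pair keyed off CHUNK_MAP_SIZE_SHIFT. As a rough standalone illustration of that round trip (not part of the commit), the sketch below copies the new CHUNK_MAP_* constants from the diff and assumes LG_PAGE is 12 (4 KiB pages); the mapbits_size_* helper names and the main() driver are invented for the example.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define LG_PAGE                 12  /* Assumption: 4 KiB pages. */

/* Constants copied from the new arena.h layout in the hunks below. */
#define CHUNK_MAP_ALLOCATED     ((size_t)0x01U)
#define CHUNK_MAP_LARGE         ((size_t)0x02U)
#define CHUNK_MAP_STATE_MASK    ((size_t)0x3U)
#define CHUNK_MAP_DECOMMITTED   ((size_t)0x04U)
#define CHUNK_MAP_UNZEROED      ((size_t)0x08U)
#define CHUNK_MAP_DIRTY         ((size_t)0x10U)
#define CHUNK_MAP_FLAGS_MASK    ((size_t)0x1cU)
#define CHUNK_MAP_BININD_SHIFT  5
#define BININD_INVALID          ((size_t)0xffU)
#define CHUNK_MAP_BININD_MASK   (BININD_INVALID << CHUNK_MAP_BININD_SHIFT)
#define CHUNK_MAP_RUNIND_SHIFT  (CHUNK_MAP_BININD_SHIFT + 8)
#define CHUNK_MAP_SIZE_SHIFT    (CHUNK_MAP_RUNIND_SHIFT - LG_PAGE)
#define CHUNK_MAP_SIZE_MASK \
	(~(CHUNK_MAP_BININD_MASK | CHUNK_MAP_FLAGS_MASK | CHUNK_MAP_STATE_MASK))

/* Pack a page-aligned run size into the high bits of a map word. */
static size_t
mapbits_size_encode(size_t size)
{
	size_t mapbits;

#if CHUNK_MAP_SIZE_SHIFT > 0
	mapbits = size << CHUNK_MAP_SIZE_SHIFT;
#elif CHUNK_MAP_SIZE_SHIFT == 0
	mapbits = size;
#else
	mapbits = size >> -CHUNK_MAP_SIZE_SHIFT;
#endif
	/* The encoded size must not spill into the binind/flag/state bits. */
	assert((mapbits & ~CHUNK_MAP_SIZE_MASK) == 0);
	return (mapbits);
}

/* Recover the run size from a map word, ignoring the low flag bits. */
static size_t
mapbits_size_decode(size_t mapbits)
{

#if CHUNK_MAP_SIZE_SHIFT > 0
	return ((mapbits & CHUNK_MAP_SIZE_MASK) >> CHUNK_MAP_SIZE_SHIFT);
#elif CHUNK_MAP_SIZE_SHIFT == 0
	return (mapbits & CHUNK_MAP_SIZE_MASK);
#else
	return ((mapbits & CHUNK_MAP_SIZE_MASK) << -CHUNK_MAP_SIZE_SHIFT);
#endif
}

int
main(void)
{
	size_t size = (size_t)8 << LG_PAGE;	/* An 8-page large run. */
	size_t bits = mapbits_size_encode(size) | CHUNK_MAP_BININD_INVALID |
	    CHUNK_MAP_DIRTY | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;

	/* The flag and binind bits do not disturb the encoded size. */
	assert(mapbits_size_decode(bits) == size);
	printf("size %zu -> mapbits %#zx -> size %zu\n", size, bits,
	    mapbits_size_decode(bits));
	return (0);
}

#define CHUNK_MAP_BININD_INVALID CHUNK_MAP_BININD_MASK	/* As in the diff. */
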
diff --git a/deps/jemalloc/include/jemalloc/internal/arena.h b/deps/jemalloc/include/jemalloc/internal/arena.h index 9d000c03d..12c617979 100644 --- a/deps/jemalloc/include/jemalloc/internal/arena.h +++ b/deps/jemalloc/include/jemalloc/internal/arena.h @@ -1,30 +1,10 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES -/* - * RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized - * as small as possible such that this setting is still honored, without - * violating other constraints. The goal is to make runs as small as possible - * without exceeding a per run external fragmentation threshold. - * - * We use binary fixed point math for overhead computations, where the binary - * point is implicitly RUN_BFP bits to the left. - * - * Note that it is possible to set RUN_MAX_OVRHD low enough that it cannot be - * honored for some/all object sizes, since when heap profiling is enabled - * there is one pointer of header overhead per object (plus a constant). This - * constraint is relaxed (ignored) for runs that are so small that the - * per-region overhead is greater than: - * - * (RUN_MAX_OVRHD / (reg_interval << (3+RUN_BFP)) - */ -#define RUN_BFP 12 -/* \/ Implicit binary fixed point. */ -#define RUN_MAX_OVRHD 0x0000003dU -#define RUN_MAX_OVRHD_RELAX 0x00001800U +#define LARGE_MINCLASS (ZU(1) << LG_LARGE_MINCLASS) /* Maximum number of regions in one run. */ -#define LG_RUN_MAXREGS 11 +#define LG_RUN_MAXREGS (LG_PAGE - LG_TINY_MIN) #define RUN_MAXREGS (1U << LG_RUN_MAXREGS) /* @@ -36,16 +16,18 @@ /* * The minimum ratio of active:dirty pages per arena is computed as: * - * (nactive >> opt_lg_dirty_mult) >= ndirty + * (nactive >> lg_dirty_mult) >= ndirty * - * So, supposing that opt_lg_dirty_mult is 3, there can be no less than 8 times - * as many active pages as dirty pages. + * So, supposing that lg_dirty_mult is 3, there can be no less than 8 times as + * many active pages as dirty pages. */ #define LG_DIRTY_MULT_DEFAULT 3 -typedef struct arena_chunk_map_s arena_chunk_map_t; -typedef struct arena_chunk_s arena_chunk_t; +typedef struct arena_runs_dirty_link_s arena_runs_dirty_link_t; typedef struct arena_run_s arena_run_t; +typedef struct arena_chunk_map_bits_s arena_chunk_map_bits_t; +typedef struct arena_chunk_map_misc_s arena_chunk_map_misc_t; +typedef struct arena_chunk_s arena_chunk_t; typedef struct arena_bin_info_s arena_bin_info_t; typedef struct arena_bin_s arena_bin_t; typedef struct arena_s arena_t; @@ -54,54 +36,34 @@ typedef struct arena_s arena_t; /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS -/* Each element of the chunk map corresponds to one page within the chunk. */ -struct arena_chunk_map_s { -#ifndef JEMALLOC_PROF - /* - * Overlay prof_ctx in order to allow it to be referenced by dead code. - * Such antics aren't warranted for per arena data structures, but - * chunk map overhead accounts for a percentage of memory, rather than - * being just a fixed cost. - */ - union { -#endif - union { - /* - * Linkage for run trees. There are two disjoint uses: - * - * 1) arena_t's runs_avail tree. - * 2) arena_run_t conceptually uses this linkage for in-use - * non-full runs, rather than directly embedding linkage. - */ - rb_node(arena_chunk_map_t) rb_link; - /* - * List of runs currently in purgatory. 
arena_chunk_purge() - * temporarily allocates runs that contain dirty pages while - * purging, so that other threads cannot use the runs while the - * purging thread is operating without the arena lock held. - */ - ql_elm(arena_chunk_map_t) ql_link; - } u; +#ifdef JEMALLOC_ARENA_STRUCTS_A +struct arena_run_s { + /* Index of bin this run is associated with. */ + szind_t binind; - /* Profile counters, used for large object runs. */ - prof_ctx_t *prof_ctx; -#ifndef JEMALLOC_PROF - }; /* union { ... }; */ -#endif + /* Number of free regions in run. */ + unsigned nfree; + /* Per region allocated/deallocated bitmap. */ + bitmap_t bitmap[BITMAP_GROUPS_MAX]; +}; + +/* Each element of the chunk map corresponds to one page within the chunk. */ +struct arena_chunk_map_bits_s { /* * Run address (or size) and various flags are stored together. The bit * layout looks like (assuming 32-bit system): * - * ???????? ???????? ????nnnn nnnndula + * ???????? ???????? ???nnnnn nnndumla * * ? : Unallocated: Run address for first/last pages, unset for internal * pages. * Small: Run page offset. - * Large: Run size for first page, unset for trailing pages. + * Large: Run page count for first page, unset for trailing pages. * n : binind for small size class, BININD_INVALID for large size class. * d : dirty? * u : unzeroed? + * m : decommitted? * l : large? * a : allocated? * @@ -110,78 +72,109 @@ struct arena_chunk_map_s { * p : run page offset * s : run size * n : binind for size class; large objects set these to BININD_INVALID - * except for promoted allocations (see prof_promote) * x : don't care * - : 0 * + : 1 - * [DULA] : bit set - * [dula] : bit unset + * [DUMLA] : bit set + * [dumla] : bit unset * * Unallocated (clean): - * ssssssss ssssssss ssss++++ ++++du-a - * xxxxxxxx xxxxxxxx xxxxxxxx xxxx-Uxx - * ssssssss ssssssss ssss++++ ++++dU-a + * ssssssss ssssssss sss+++++ +++dum-a + * xxxxxxxx xxxxxxxx xxxxxxxx xxx-Uxxx + * ssssssss ssssssss sss+++++ +++dUm-a * * Unallocated (dirty): - * ssssssss ssssssss ssss++++ ++++D--a + * ssssssss ssssssss sss+++++ +++D-m-a * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx - * ssssssss ssssssss ssss++++ ++++D--a + * ssssssss ssssssss sss+++++ +++D-m-a * * Small: - * pppppppp pppppppp ppppnnnn nnnnd--A - * pppppppp pppppppp ppppnnnn nnnn---A - * pppppppp pppppppp ppppnnnn nnnnd--A + * pppppppp pppppppp pppnnnnn nnnd---A + * pppppppp pppppppp pppnnnnn nnn----A + * pppppppp pppppppp pppnnnnn nnnd---A * * Large: - * ssssssss ssssssss ssss++++ ++++D-LA + * ssssssss ssssssss sss+++++ +++D--LA * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx - * -------- -------- ----++++ ++++D-LA + * -------- -------- ---+++++ +++D--LA * - * Large (sampled, size <= PAGE): - * ssssssss ssssssss ssssnnnn nnnnD-LA + * Large (sampled, size <= LARGE_MINCLASS): + * ssssssss ssssssss sssnnnnn nnnD--LA + * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx + * -------- -------- ---+++++ +++D--LA * - * Large (not sampled, size == PAGE): - * ssssssss ssssssss ssss++++ ++++D-LA + * Large (not sampled, size == LARGE_MINCLASS): + * ssssssss ssssssss sss+++++ +++D--LA + * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx + * -------- -------- ---+++++ +++D--LA */ size_t bits; -#define CHUNK_MAP_BININD_SHIFT 4 +#define CHUNK_MAP_ALLOCATED ((size_t)0x01U) +#define CHUNK_MAP_LARGE ((size_t)0x02U) +#define CHUNK_MAP_STATE_MASK ((size_t)0x3U) + +#define CHUNK_MAP_DECOMMITTED ((size_t)0x04U) +#define CHUNK_MAP_UNZEROED ((size_t)0x08U) +#define CHUNK_MAP_DIRTY ((size_t)0x10U) +#define CHUNK_MAP_FLAGS_MASK ((size_t)0x1cU) + +#define CHUNK_MAP_BININD_SHIFT 5 #define 
BININD_INVALID ((size_t)0xffU) -/* CHUNK_MAP_BININD_MASK == (BININD_INVALID << CHUNK_MAP_BININD_SHIFT) */ -#define CHUNK_MAP_BININD_MASK ((size_t)0xff0U) +#define CHUNK_MAP_BININD_MASK (BININD_INVALID << CHUNK_MAP_BININD_SHIFT) #define CHUNK_MAP_BININD_INVALID CHUNK_MAP_BININD_MASK -#define CHUNK_MAP_FLAGS_MASK ((size_t)0xcU) -#define CHUNK_MAP_DIRTY ((size_t)0x8U) -#define CHUNK_MAP_UNZEROED ((size_t)0x4U) -#define CHUNK_MAP_LARGE ((size_t)0x2U) -#define CHUNK_MAP_ALLOCATED ((size_t)0x1U) -#define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED + +#define CHUNK_MAP_RUNIND_SHIFT (CHUNK_MAP_BININD_SHIFT + 8) +#define CHUNK_MAP_SIZE_SHIFT (CHUNK_MAP_RUNIND_SHIFT - LG_PAGE) +#define CHUNK_MAP_SIZE_MASK \ + (~(CHUNK_MAP_BININD_MASK | CHUNK_MAP_FLAGS_MASK | CHUNK_MAP_STATE_MASK)) }; -typedef rb_tree(arena_chunk_map_t) arena_avail_tree_t; -typedef rb_tree(arena_chunk_map_t) arena_run_tree_t; -typedef ql_head(arena_chunk_map_t) arena_chunk_mapelms_t; -/* Arena chunk header. */ -struct arena_chunk_s { - /* Arena that owns the chunk. */ - arena_t *arena; +struct arena_runs_dirty_link_s { + qr(arena_runs_dirty_link_t) rd_link; +}; - /* Linkage for tree of arena chunks that contain dirty runs. */ - rb_node(arena_chunk_t) dirty_link; +/* + * Each arena_chunk_map_misc_t corresponds to one page within the chunk, just + * like arena_chunk_map_bits_t. Two separate arrays are stored within each + * chunk header in order to improve cache locality. + */ +struct arena_chunk_map_misc_s { + /* + * Linkage for run trees. There are two disjoint uses: + * + * 1) arena_t's runs_avail tree. + * 2) arena_run_t conceptually uses this linkage for in-use non-full + * runs, rather than directly embedding linkage. + */ + rb_node(arena_chunk_map_misc_t) rb_link; - /* Number of dirty pages. */ - size_t ndirty; + union { + /* Linkage for list of dirty runs. */ + arena_runs_dirty_link_t rd; - /* Number of available runs. */ - size_t nruns_avail; + /* Profile counters, used for large object runs. */ + union { + void *prof_tctx_pun; + prof_tctx_t *prof_tctx; + }; + /* Small region run metadata. */ + arena_run_t run; + }; +}; +typedef rb_tree(arena_chunk_map_misc_t) arena_avail_tree_t; +typedef rb_tree(arena_chunk_map_misc_t) arena_run_tree_t; +#endif /* JEMALLOC_ARENA_STRUCTS_A */ + +#ifdef JEMALLOC_ARENA_STRUCTS_B +/* Arena chunk header. */ +struct arena_chunk_s { /* - * Number of available run adjacencies that purging could coalesce. - * Clean and dirty available runs are not coalesced, which causes - * virtual memory fragmentation. The ratio of - * (nruns_avail-nruns_adjac):nruns_adjac is used for tracking this - * fragmentation. + * A pointer to the arena that owns the chunk is stored within the node. + * This field as a whole is used by chunks_rtree to support both + * ivsalloc() and core-based debugging. */ - size_t nruns_adjac; + extent_node_t node; /* * Map of pages within chunk that keeps track of free/large/small. The @@ -189,19 +182,7 @@ struct arena_chunk_s { * need to be tracked in the map. This omission saves a header page * for common chunk sizes (e.g. 4 MiB). */ - arena_chunk_map_t map[1]; /* Dynamically sized. */ -}; -typedef rb_tree(arena_chunk_t) arena_chunk_tree_t; - -struct arena_run_s { - /* Bin this run is associated with. */ - arena_bin_t *bin; - - /* Index of next region that has never been allocated, or nregs. */ - uint32_t nextind; - - /* Number of free regions in run. */ - unsigned nfree; + arena_chunk_map_bits_t map_bits[1]; /* Dynamically sized. 
*/ }; /* @@ -212,12 +193,7 @@ struct arena_run_s { * Each run has the following layout: * * /--------------------\ - * | arena_run_t header | - * | ... | - * bitmap_offset | bitmap | - * | ... | - * ctx0_offset | ctx map | - * | ... | + * | pad? | * |--------------------| * | redzone | * reg0_offset | region 0 | @@ -259,23 +235,11 @@ struct arena_bin_info_s { uint32_t nregs; /* - * Offset of first bitmap_t element in a run header for this bin's size - * class. - */ - uint32_t bitmap_offset; - - /* * Metadata used to manipulate bitmaps for runs associated with this * bin. */ bitmap_info_t bitmap_info; - /* - * Offset of first (prof_ctx_t *) in a run header for this bin's size - * class, or 0 if (config_prof == false || opt_prof == false). - */ - uint32_t ctx0_offset; - /* Offset of first region in a run for this bin's size class. */ uint32_t reg0_offset; }; @@ -321,8 +285,7 @@ struct arena_s { /* * There are three classes of arena operations from a locking * perspective: - * 1) Thread asssignment (modifies nthreads) is protected by - * arenas_lock. + * 1) Thread assignment (modifies nthreads) is protected by arenas_lock. * 2) Bin-related operations are protected by bin locks. * 3) Chunk- and run-related operations are protected by this mutex. */ @@ -331,16 +294,20 @@ struct arena_s { arena_stats_t stats; /* * List of tcaches for extant threads associated with this arena. - * Stats from these are merged incrementally, and at exit. + * Stats from these are merged incrementally, and at exit if + * opt_stats_print is enabled. */ ql_head(tcache_t) tcache_ql; uint64_t prof_accumbytes; - dss_prec_t dss_prec; + /* + * PRNG state for cache index randomization of large allocation base + * pointers. + */ + uint64_t offset_state; - /* Tree of dirty-page-containing chunks this arena manages. */ - arena_chunk_tree_t chunks_dirty; + dss_prec_t dss_prec; /* * In order to avoid rapid chunk allocation/deallocation when an arena @@ -354,7 +321,13 @@ struct arena_s { */ arena_chunk_t *spare; - /* Number of pages in active runs. */ + /* Minimum ratio (log base 2) of nactive:ndirty. */ + ssize_t lg_dirty_mult; + + /* True if a thread is currently executing arena_purge(). */ + bool purging; + + /* Number of pages in active runs and huge regions. */ size_t nactive; /* @@ -366,44 +339,116 @@ struct arena_s { size_t ndirty; /* - * Approximate number of pages being purged. It is possible for - * multiple threads to purge dirty pages concurrently, and they use - * npurgatory to indicate the total number of pages all threads are - * attempting to purge. + * Size/address-ordered tree of this arena's available runs. The tree + * is used for first-best-fit run allocation. */ - size_t npurgatory; + arena_avail_tree_t runs_avail; /* - * Size/address-ordered trees of this arena's available runs. The trees - * are used for first-best-fit run allocation. + * Unused dirty memory this arena manages. Dirty memory is conceptually + * tracked as an arbitrarily interleaved LRU of dirty runs and cached + * chunks, but the list linkage is actually semi-duplicated in order to + * avoid extra arena_chunk_map_misc_t space overhead. + * + * LRU-----------------------------------------------------------MRU + * + * /-- arena ---\ + * | | + * | | + * |------------| /- chunk -\ + * ...->|chunks_cache|<--------------------------->| /----\ |<--... 
+ * |------------| | |node| | + * | | | | | | + * | | /- run -\ /- run -\ | | | | + * | | | | | | | | | | + * | | | | | | | | | | + * |------------| |-------| |-------| | |----| | + * ...->|runs_dirty |<-->|rd |<-->|rd |<---->|rd |<----... + * |------------| |-------| |-------| | |----| | + * | | | | | | | | | | + * | | | | | | | \----/ | + * | | \-------/ \-------/ | | + * | | | | + * | | | | + * \------------/ \---------/ */ - arena_avail_tree_t runs_avail; + arena_runs_dirty_link_t runs_dirty; + extent_node_t chunks_cache; + + /* Extant huge allocations. */ + ql_head(extent_node_t) huge; + /* Synchronizes all huge allocation/update/deallocation. */ + malloc_mutex_t huge_mtx; + + /* + * Trees of chunks that were previously allocated (trees differ only in + * node ordering). These are used when allocating chunks, in an attempt + * to re-use address space. Depending on function, different tree + * orderings are needed, which is why there are two trees with the same + * contents. + */ + extent_tree_t chunks_szad_cached; + extent_tree_t chunks_ad_cached; + extent_tree_t chunks_szad_retained; + extent_tree_t chunks_ad_retained; + + malloc_mutex_t chunks_mtx; + /* Cache of nodes that were allocated via base_alloc(). */ + ql_head(extent_node_t) node_cache; + malloc_mutex_t node_cache_mtx; + + /* User-configurable chunk hook functions. */ + chunk_hooks_t chunk_hooks; /* bins is used to store trees of free regions. */ arena_bin_t bins[NBINS]; }; +#endif /* JEMALLOC_ARENA_STRUCTS_B */ #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -extern ssize_t opt_lg_dirty_mult; -/* - * small_size2bin is a compact lookup table that rounds request sizes up to - * size classes. In order to reduce cache footprint, the table is compressed, - * and all accesses are via the SMALL_SIZE2BIN macro. - */ -extern uint8_t const small_size2bin[]; -#define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN]) +static const size_t large_pad = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + PAGE +#else + 0 +#endif + ; -extern arena_bin_info_t arena_bin_info[NBINS]; +extern ssize_t opt_lg_dirty_mult; -/* Number of large size classes. */ -#define nlclasses (chunk_npages - map_bias) +extern arena_bin_info_t arena_bin_info[NBINS]; +extern size_t map_bias; /* Number of arena chunk header pages. */ +extern size_t map_misc_offset; +extern size_t arena_maxrun; /* Max run size for arenas. */ +extern size_t large_maxclass; /* Max large size class. */ +extern unsigned nlclasses; /* Number of large size classes. */ +extern unsigned nhclasses; /* Number of huge size classes. 
*/ + +void arena_chunk_cache_maybe_insert(arena_t *arena, extent_node_t *node, + bool cache); +void arena_chunk_cache_maybe_remove(arena_t *arena, extent_node_t *node, + bool cache); +extent_node_t *arena_node_alloc(arena_t *arena); +void arena_node_dalloc(arena_t *arena, extent_node_t *node); +void *arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, + bool *zero); +void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize); +void arena_chunk_ralloc_huge_similar(arena_t *arena, void *chunk, + size_t oldsize, size_t usize); +void arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, + size_t oldsize, size_t usize); +bool arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, + size_t oldsize, size_t usize, bool *zero); +ssize_t arena_lg_dirty_mult_get(arena_t *arena); +bool arena_lg_dirty_mult_set(arena_t *arena, ssize_t lg_dirty_mult); +void arena_maybe_purge(arena_t *arena); void arena_purge_all(arena_t *arena); void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, - size_t binind, uint64_t prof_accumbytes); + szind_t binind, uint64_t prof_accumbytes); void arena_alloc_junk_small(void *ptr, arena_bin_info_t *bin_info, bool zero); #ifdef JEMALLOC_JET @@ -418,19 +463,22 @@ void arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info); void arena_quarantine_junk_small(void *ptr, size_t usize); void *arena_malloc_small(arena_t *arena, size_t size, bool zero); void *arena_malloc_large(arena_t *arena, size_t size, bool zero); -void *arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero); +void *arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, + size_t alignment, bool zero, tcache_t *tcache); void arena_prof_promoted(const void *ptr, size_t size); -void arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, - arena_chunk_map_t *mapelm); +void arena_dalloc_bin_junked_locked(arena_t *arena, arena_chunk_t *chunk, + void *ptr, arena_chunk_map_bits_t *bitselm); void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t pageind, arena_chunk_map_t *mapelm); + size_t pageind, arena_chunk_map_bits_t *bitselm); void arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t pageind); #ifdef JEMALLOC_JET typedef void (arena_dalloc_junk_large_t)(void *, size_t); extern arena_dalloc_junk_large_t *arena_dalloc_junk_large; +#else +void arena_dalloc_junk_large(void *ptr, size_t usize); #endif -void arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, +void arena_dalloc_large_junked_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr); void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr); #ifdef JEMALLOC_JET @@ -439,16 +487,18 @@ extern arena_ralloc_junk_large_t *arena_ralloc_junk_large; #endif bool arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); -void *arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, - bool try_tcache_dalloc); +void *arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, + size_t size, size_t alignment, bool zero, tcache_t *tcache); dss_prec_t arena_dss_prec_get(arena_t *arena); -void arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); -void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, - size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, - malloc_large_stats_t *lstats); -bool arena_new(arena_t *arena, unsigned ind); -void arena_boot(void); 
+bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); +ssize_t arena_lg_dirty_mult_default_get(void); +bool arena_lg_dirty_mult_default_set(ssize_t lg_dirty_mult); +void arena_stats_merge(arena_t *arena, const char **dss, + ssize_t *lg_dirty_mult, size_t *nactive, size_t *ndirty, + arena_stats_t *astats, malloc_bin_stats_t *bstats, + malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats); +arena_t *arena_new(unsigned ind); +bool arena_boot(void); void arena_prefork(arena_t *arena); void arena_postfork_parent(arena_t *arena); void arena_postfork_child(arena_t *arena); @@ -458,64 +508,138 @@ void arena_postfork_child(arena_t *arena); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -arena_chunk_map_t *arena_mapp_get(arena_chunk_t *chunk, size_t pageind); +arena_chunk_map_bits_t *arena_bitselm_get(arena_chunk_t *chunk, + size_t pageind); +arena_chunk_map_misc_t *arena_miscelm_get(arena_chunk_t *chunk, + size_t pageind); +size_t arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm); +void *arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm); +arena_chunk_map_misc_t *arena_rd_to_miscelm(arena_runs_dirty_link_t *rd); +arena_chunk_map_misc_t *arena_run_to_miscelm(arena_run_t *run); size_t *arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbitsp_read(size_t *mapbitsp); size_t arena_mapbits_get(arena_chunk_t *chunk, size_t pageind); +size_t arena_mapbits_size_decode(size_t mapbits); size_t arena_mapbits_unallocated_size_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_large_size_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_small_runind_get(arena_chunk_t *chunk, size_t pageind); -size_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind); +szind_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_dirty_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_unzeroed_get(arena_chunk_t *chunk, size_t pageind); +size_t arena_mapbits_decommitted_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_large_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_allocated_get(arena_chunk_t *chunk, size_t pageind); void arena_mapbitsp_write(size_t *mapbitsp, size_t mapbits); +size_t arena_mapbits_size_encode(size_t size); void arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags); void arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind, size_t size); +void arena_mapbits_internal_set(arena_chunk_t *chunk, size_t pageind, + size_t flags); void arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags); void arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind, - size_t binind); + szind_t binind); void arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, - size_t runind, size_t binind, size_t flags); -void arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind, - size_t unzeroed); + size_t runind, szind_t binind, size_t flags); +void arena_metadata_allocated_add(arena_t *arena, size_t size); +void arena_metadata_allocated_sub(arena_t *arena, size_t size); +size_t arena_metadata_allocated_get(arena_t *arena); bool arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes); bool arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes); bool arena_prof_accum(arena_t *arena, uint64_t accumbytes); -size_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits); -size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); +szind_t 
arena_ptr_small_binind_get(const void *ptr, size_t mapbits); +szind_t arena_bin_index(arena_t *arena, arena_bin_t *bin); unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr); -prof_ctx_t *arena_prof_ctx_get(const void *ptr); -void arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx); -void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); +prof_tctx_t *arena_prof_tctx_get(const void *ptr); +void arena_prof_tctx_set(const void *ptr, size_t usize, prof_tctx_t *tctx); +void arena_prof_tctx_reset(const void *ptr, size_t usize, + const void *old_ptr, prof_tctx_t *old_tctx); +void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, + tcache_t *tcache); +arena_t *arena_aalloc(const void *ptr); size_t arena_salloc(const void *ptr, bool demote); -void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr, - bool try_tcache); +void arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); +void arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) # ifdef JEMALLOC_ARENA_INLINE_A -JEMALLOC_ALWAYS_INLINE arena_chunk_map_t * -arena_mapp_get(arena_chunk_t *chunk, size_t pageind) +JEMALLOC_ALWAYS_INLINE arena_chunk_map_bits_t * +arena_bitselm_get(arena_chunk_t *chunk, size_t pageind) { assert(pageind >= map_bias); assert(pageind < chunk_npages); - return (&chunk->map[pageind-map_bias]); + return (&chunk->map_bits[pageind-map_bias]); +} + +JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t * +arena_miscelm_get(arena_chunk_t *chunk, size_t pageind) +{ + + assert(pageind >= map_bias); + assert(pageind < chunk_npages); + + return ((arena_chunk_map_misc_t *)((uintptr_t)chunk + + (uintptr_t)map_misc_offset) + pageind-map_bias); +} + +JEMALLOC_ALWAYS_INLINE size_t +arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm) +{ + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + size_t pageind = ((uintptr_t)miscelm - ((uintptr_t)chunk + + map_misc_offset)) / sizeof(arena_chunk_map_misc_t) + map_bias; + + assert(pageind >= map_bias); + assert(pageind < chunk_npages); + + return (pageind); +} + +JEMALLOC_ALWAYS_INLINE void * +arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm) +{ + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); + + return ((void *)((uintptr_t)chunk + (pageind << LG_PAGE))); +} + +JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t * +arena_rd_to_miscelm(arena_runs_dirty_link_t *rd) +{ + arena_chunk_map_misc_t *miscelm = (arena_chunk_map_misc_t + *)((uintptr_t)rd - offsetof(arena_chunk_map_misc_t, rd)); + + assert(arena_miscelm_to_pageind(miscelm) >= map_bias); + assert(arena_miscelm_to_pageind(miscelm) < chunk_npages); + + return (miscelm); +} + +JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t * +arena_run_to_miscelm(arena_run_t *run) +{ + arena_chunk_map_misc_t *miscelm = (arena_chunk_map_misc_t + *)((uintptr_t)run - offsetof(arena_chunk_map_misc_t, run)); + + assert(arena_miscelm_to_pageind(miscelm) >= map_bias); + assert(arena_miscelm_to_pageind(miscelm) < chunk_npages); + + return (miscelm); } JEMALLOC_ALWAYS_INLINE size_t * arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind) { - return (&arena_mapp_get(chunk, pageind)->bits); + return (&arena_bitselm_get(chunk, pageind)->bits); } JEMALLOC_ALWAYS_INLINE size_t @@ -533,13 +657,29 @@ arena_mapbits_get(arena_chunk_t *chunk, size_t pageind) } JEMALLOC_ALWAYS_INLINE size_t 
+arena_mapbits_size_decode(size_t mapbits) +{ + size_t size; + +#if CHUNK_MAP_SIZE_SHIFT > 0 + size = (mapbits & CHUNK_MAP_SIZE_MASK) >> CHUNK_MAP_SIZE_SHIFT; +#elif CHUNK_MAP_SIZE_SHIFT == 0 + size = mapbits & CHUNK_MAP_SIZE_MASK; +#else + size = (mapbits & CHUNK_MAP_SIZE_MASK) << -CHUNK_MAP_SIZE_SHIFT; +#endif + + return (size); +} + +JEMALLOC_ALWAYS_INLINE size_t arena_mapbits_unallocated_size_get(arena_chunk_t *chunk, size_t pageind) { size_t mapbits; mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0); - return (mapbits & ~PAGE_MASK); + return (arena_mapbits_size_decode(mapbits)); } JEMALLOC_ALWAYS_INLINE size_t @@ -550,7 +690,7 @@ arena_mapbits_large_size_get(arena_chunk_t *chunk, size_t pageind) mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)); - return (mapbits & ~PAGE_MASK); + return (arena_mapbits_size_decode(mapbits)); } JEMALLOC_ALWAYS_INLINE size_t @@ -561,14 +701,14 @@ arena_mapbits_small_runind_get(arena_chunk_t *chunk, size_t pageind) mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == CHUNK_MAP_ALLOCATED); - return (mapbits >> LG_PAGE); + return (mapbits >> CHUNK_MAP_RUNIND_SHIFT); } -JEMALLOC_ALWAYS_INLINE size_t +JEMALLOC_ALWAYS_INLINE szind_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind) { size_t mapbits; - size_t binind; + szind_t binind; mapbits = arena_mapbits_get(chunk, pageind); binind = (mapbits & CHUNK_MAP_BININD_MASK) >> CHUNK_MAP_BININD_SHIFT; @@ -582,6 +722,8 @@ arena_mapbits_dirty_get(arena_chunk_t *chunk, size_t pageind) size_t mapbits; mapbits = arena_mapbits_get(chunk, pageind); + assert((mapbits & CHUNK_MAP_DECOMMITTED) == 0 || (mapbits & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); return (mapbits & CHUNK_MAP_DIRTY); } @@ -591,10 +733,23 @@ arena_mapbits_unzeroed_get(arena_chunk_t *chunk, size_t pageind) size_t mapbits; mapbits = arena_mapbits_get(chunk, pageind); + assert((mapbits & CHUNK_MAP_DECOMMITTED) == 0 || (mapbits & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); return (mapbits & CHUNK_MAP_UNZEROED); } JEMALLOC_ALWAYS_INLINE size_t +arena_mapbits_decommitted_get(arena_chunk_t *chunk, size_t pageind) +{ + size_t mapbits; + + mapbits = arena_mapbits_get(chunk, pageind); + assert((mapbits & CHUNK_MAP_DECOMMITTED) == 0 || (mapbits & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); + return (mapbits & CHUNK_MAP_DECOMMITTED); +} + +JEMALLOC_ALWAYS_INLINE size_t arena_mapbits_large_get(arena_chunk_t *chunk, size_t pageind) { size_t mapbits; @@ -619,6 +774,23 @@ arena_mapbitsp_write(size_t *mapbitsp, size_t mapbits) *mapbitsp = mapbits; } +JEMALLOC_ALWAYS_INLINE size_t +arena_mapbits_size_encode(size_t size) +{ + size_t mapbits; + +#if CHUNK_MAP_SIZE_SHIFT > 0 + mapbits = size << CHUNK_MAP_SIZE_SHIFT; +#elif CHUNK_MAP_SIZE_SHIFT == 0 + mapbits = size; +#else + mapbits = size >> -CHUNK_MAP_SIZE_SHIFT; +#endif + + assert((mapbits & ~CHUNK_MAP_SIZE_MASK) == 0); + return (mapbits); +} + JEMALLOC_ALWAYS_INLINE void arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags) @@ -626,9 +798,11 @@ arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); assert((size & PAGE_MASK) == 0); - assert((flags & ~CHUNK_MAP_FLAGS_MASK) == 0); - assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == flags); - arena_mapbitsp_write(mapbitsp, 
size | CHUNK_MAP_BININD_INVALID | flags); + assert((flags & CHUNK_MAP_FLAGS_MASK) == flags); + assert((flags & CHUNK_MAP_DECOMMITTED) == 0 || (flags & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); + arena_mapbitsp_write(mapbitsp, arena_mapbits_size_encode(size) | + CHUNK_MAP_BININD_INVALID | flags); } JEMALLOC_ALWAYS_INLINE void @@ -640,7 +814,17 @@ arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind, assert((size & PAGE_MASK) == 0); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0); - arena_mapbitsp_write(mapbitsp, size | (mapbits & PAGE_MASK)); + arena_mapbitsp_write(mapbitsp, arena_mapbits_size_encode(size) | + (mapbits & ~CHUNK_MAP_SIZE_MASK)); +} + +JEMALLOC_ALWAYS_INLINE void +arena_mapbits_internal_set(arena_chunk_t *chunk, size_t pageind, size_t flags) +{ + size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); + + assert((flags & CHUNK_MAP_UNZEROED) == flags); + arena_mapbitsp_write(mapbitsp, flags); } JEMALLOC_ALWAYS_INLINE void @@ -648,54 +832,62 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags) { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - size_t mapbits = arena_mapbitsp_read(mapbitsp); - size_t unzeroed; assert((size & PAGE_MASK) == 0); - assert((flags & CHUNK_MAP_DIRTY) == flags); - unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */ - arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags - | unzeroed | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED); + assert((flags & CHUNK_MAP_FLAGS_MASK) == flags); + assert((flags & CHUNK_MAP_DECOMMITTED) == 0 || (flags & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); + arena_mapbitsp_write(mapbitsp, arena_mapbits_size_encode(size) | + CHUNK_MAP_BININD_INVALID | flags | CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED); } JEMALLOC_ALWAYS_INLINE void arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind, - size_t binind) + szind_t binind) { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); size_t mapbits = arena_mapbitsp_read(mapbitsp); assert(binind <= BININD_INVALID); - assert(arena_mapbits_large_size_get(chunk, pageind) == PAGE); + assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS + + large_pad); arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_BININD_MASK) | (binind << CHUNK_MAP_BININD_SHIFT)); } JEMALLOC_ALWAYS_INLINE void arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind, - size_t binind, size_t flags) + szind_t binind, size_t flags) { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - size_t mapbits = arena_mapbitsp_read(mapbitsp); - size_t unzeroed; assert(binind < BININD_INVALID); assert(pageind - runind >= map_bias); - assert((flags & CHUNK_MAP_DIRTY) == flags); - unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. 
*/ - arena_mapbitsp_write(mapbitsp, (runind << LG_PAGE) | (binind << - CHUNK_MAP_BININD_SHIFT) | flags | unzeroed | CHUNK_MAP_ALLOCATED); + assert((flags & CHUNK_MAP_UNZEROED) == flags); + arena_mapbitsp_write(mapbitsp, (runind << CHUNK_MAP_RUNIND_SHIFT) | + (binind << CHUNK_MAP_BININD_SHIFT) | flags | CHUNK_MAP_ALLOCATED); } -JEMALLOC_ALWAYS_INLINE void -arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind, - size_t unzeroed) +JEMALLOC_INLINE void +arena_metadata_allocated_add(arena_t *arena, size_t size) +{ + + atomic_add_z(&arena->stats.metadata_allocated, size); +} + +JEMALLOC_INLINE void +arena_metadata_allocated_sub(arena_t *arena, size_t size) { - size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - size_t mapbits = arena_mapbitsp_read(mapbitsp); - arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_UNZEROED) | - unzeroed); + atomic_sub_z(&arena->stats.metadata_allocated, size); +} + +JEMALLOC_INLINE size_t +arena_metadata_allocated_get(arena_t *arena) +{ + + return (atomic_read_z(&arena->stats.metadata_allocated)); } JEMALLOC_INLINE bool @@ -719,7 +911,7 @@ arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes) cassert(config_prof); - if (prof_interval == 0) + if (likely(prof_interval == 0)) return (false); return (arena_prof_accum_impl(arena, accumbytes)); } @@ -730,7 +922,7 @@ arena_prof_accum(arena_t *arena, uint64_t accumbytes) cassert(config_prof); - if (prof_interval == 0) + if (likely(prof_interval == 0)) return (false); { @@ -743,10 +935,10 @@ arena_prof_accum(arena_t *arena, uint64_t accumbytes) } } -JEMALLOC_ALWAYS_INLINE size_t +JEMALLOC_ALWAYS_INLINE szind_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits) { - size_t binind; + szind_t binind; binind = (mapbits & CHUNK_MAP_BININD_MASK) >> CHUNK_MAP_BININD_SHIFT; @@ -755,27 +947,34 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) arena_t *arena; size_t pageind; size_t actual_mapbits; + size_t rpages_ind; arena_run_t *run; arena_bin_t *bin; - size_t actual_binind; + szind_t run_binind, actual_binind; arena_bin_info_t *bin_info; + arena_chunk_map_misc_t *miscelm; + void *rpages; assert(binind != BININD_INVALID); assert(binind < NBINS); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - arena = chunk->arena; + arena = extent_node_arena_get(&chunk->node); pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; actual_mapbits = arena_mapbits_get(chunk, pageind); assert(mapbits == actual_mapbits); assert(arena_mapbits_large_get(chunk, pageind) == 0); assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - - (actual_mapbits >> LG_PAGE)) << LG_PAGE)); - bin = run->bin; + rpages_ind = pageind - arena_mapbits_small_runind_get(chunk, + pageind); + miscelm = arena_miscelm_get(chunk, rpages_ind); + run = &miscelm->run; + run_binind = run->binind; + bin = &arena->bins[run_binind]; actual_binind = bin - arena->bins; - assert(binind == actual_binind); + assert(run_binind == actual_binind); bin_info = &arena_bin_info[actual_binind]; - assert(((uintptr_t)ptr - ((uintptr_t)run + + rpages = arena_miscelm_to_rpages(miscelm); + assert(((uintptr_t)ptr - ((uintptr_t)rpages + (uintptr_t)bin_info->reg0_offset)) % bin_info->reg_interval == 0); } @@ -785,10 +984,10 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) # endif /* JEMALLOC_ARENA_INLINE_A */ # ifdef JEMALLOC_ARENA_INLINE_B -JEMALLOC_INLINE size_t +JEMALLOC_INLINE szind_t arena_bin_index(arena_t *arena, arena_bin_t *bin) { - size_t binind = bin - arena->bins; + 
szind_t binind = bin - arena->bins; assert(binind < NBINS); return (binind); } @@ -798,24 +997,26 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) { unsigned shift, diff, regind; size_t interval; + arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run); + void *rpages = arena_miscelm_to_rpages(miscelm); /* * Freeing a pointer lower than region zero can cause assertion * failure. */ - assert((uintptr_t)ptr >= (uintptr_t)run + + assert((uintptr_t)ptr >= (uintptr_t)rpages + (uintptr_t)bin_info->reg0_offset); /* * Avoid doing division with a variable divisor if possible. Using * actual division here can reduce allocator throughput by over 20%! */ - diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - + diff = (unsigned)((uintptr_t)ptr - (uintptr_t)rpages - bin_info->reg0_offset); /* Rescale (factor powers of 2 out of the numerator and denominator). */ interval = bin_info->reg_interval; - shift = ffs(interval) - 1; + shift = jemalloc_ffs(interval) - 1; diff >>= shift; interval >>= shift; @@ -850,8 +1051,8 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31) }; - if (interval <= ((sizeof(interval_invs) / sizeof(unsigned)) + - 2)) { + if (likely(interval <= ((sizeof(interval_invs) / + sizeof(unsigned)) + 2))) { regind = (diff * interval_invs[interval - 3]) >> SIZE_INV_SHIFT; } else @@ -865,113 +1066,138 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) return (regind); } -JEMALLOC_INLINE prof_ctx_t * -arena_prof_ctx_get(const void *ptr) +JEMALLOC_INLINE prof_tctx_t * +arena_prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; arena_chunk_t *chunk; - size_t pageind, mapbits; cassert(config_prof); assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - mapbits = arena_mapbits_get(chunk, pageind); - assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) { - if (prof_promote) - ret = (prof_ctx_t *)(uintptr_t)1U; + if (likely(chunk != ptr)) { + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + size_t mapbits = arena_mapbits_get(chunk, pageind); + assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); + if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) + ret = (prof_tctx_t *)(uintptr_t)1U; else { - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)((pageind - (mapbits >> LG_PAGE)) << - LG_PAGE)); - size_t binind = arena_ptr_small_binind_get(ptr, - mapbits); - arena_bin_info_t *bin_info = &arena_bin_info[binind]; - unsigned regind; - - regind = arena_run_regind(run, bin_info, ptr); - ret = *(prof_ctx_t **)((uintptr_t)run + - bin_info->ctx0_offset + (regind * - sizeof(prof_ctx_t *))); + arena_chunk_map_misc_t *elm = arena_miscelm_get(chunk, + pageind); + ret = atomic_read_p(&elm->prof_tctx_pun); } } else - ret = arena_mapp_get(chunk, pageind)->prof_ctx; + ret = huge_prof_tctx_get(ptr); return (ret); } JEMALLOC_INLINE void -arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx) +arena_prof_tctx_set(const void *ptr, size_t usize, prof_tctx_t *tctx) { arena_chunk_t *chunk; - size_t pageind; cassert(config_prof); assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - - if (usize > 
SMALL_MAXCLASS || (prof_promote && - ((uintptr_t)ctx != (uintptr_t)1U || arena_mapbits_large_get(chunk, - pageind) != 0))) { - assert(arena_mapbits_large_get(chunk, pageind) != 0); - arena_mapp_get(chunk, pageind)->prof_ctx = ctx; - } else { - assert(arena_mapbits_large_get(chunk, pageind) == 0); - if (prof_promote == false) { - size_t mapbits = arena_mapbits_get(chunk, pageind); - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)((pageind - (mapbits >> LG_PAGE)) << - LG_PAGE)); - size_t binind; - arena_bin_info_t *bin_info; - unsigned regind; - - binind = arena_ptr_small_binind_get(ptr, mapbits); - bin_info = &arena_bin_info[binind]; - regind = arena_run_regind(run, bin_info, ptr); - - *((prof_ctx_t **)((uintptr_t)run + - bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t - *)))) = ctx; + if (likely(chunk != ptr)) { + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + + if (unlikely(usize > SMALL_MAXCLASS || (uintptr_t)tctx > + (uintptr_t)1U)) { + arena_chunk_map_misc_t *elm; + + assert(arena_mapbits_large_get(chunk, pageind) != 0); + + elm = arena_miscelm_get(chunk, pageind); + atomic_write_p(&elm->prof_tctx_pun, tctx); + } else { + /* + * tctx must always be initialized for large runs. + * Assert that the surrounding conditional logic is + * equivalent to checking whether ptr refers to a large + * run. + */ + assert(arena_mapbits_large_get(chunk, pageind) == 0); } + } else + huge_prof_tctx_set(ptr, tctx); +} + +JEMALLOC_INLINE void +arena_prof_tctx_reset(const void *ptr, size_t usize, const void *old_ptr, + prof_tctx_t *old_tctx) +{ + + cassert(config_prof); + assert(ptr != NULL); + + if (unlikely(usize > SMALL_MAXCLASS || (ptr == old_ptr && + (uintptr_t)old_tctx > (uintptr_t)1U))) { + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) { + size_t pageind; + arena_chunk_map_misc_t *elm; + + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> + LG_PAGE; + assert(arena_mapbits_allocated_get(chunk, pageind) != + 0); + assert(arena_mapbits_large_get(chunk, pageind) != 0); + + elm = arena_miscelm_get(chunk, pageind); + atomic_write_p(&elm->prof_tctx_pun, + (prof_tctx_t *)(uintptr_t)1U); + } else + huge_prof_tctx_reset(ptr); } } JEMALLOC_ALWAYS_INLINE void * -arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache) +arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, + tcache_t *tcache) { - tcache_t *tcache; assert(size != 0); - assert(size <= arena_maxclass); - if (size <= SMALL_MAXCLASS) { - if (try_tcache && (tcache = tcache_get(true)) != NULL) - return (tcache_alloc_small(tcache, size, zero)); - else { - return (arena_malloc_small(choose_arena(arena), size, + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) + return (NULL); + + if (likely(size <= SMALL_MAXCLASS)) { + if (likely(tcache != NULL)) { + return (tcache_alloc_small(tsd, arena, tcache, size, zero)); - } - } else { + } else + return (arena_malloc_small(arena, size, zero)); + } else if (likely(size <= large_maxclass)) { /* * Initialize tcache after checking size in order to avoid * infinite recursion during tcache initialization. 
*/ - if (try_tcache && size <= tcache_maxclass && (tcache = - tcache_get(true)) != NULL) - return (tcache_alloc_large(tcache, size, zero)); - else { - return (arena_malloc_large(choose_arena(arena), size, + if (likely(tcache != NULL) && size <= tcache_maxclass) { + return (tcache_alloc_large(tsd, arena, tcache, size, zero)); - } - } + } else + return (arena_malloc_large(arena, size, zero)); + } else + return (huge_malloc(tsd, arena, size, zero, tcache)); +} + +JEMALLOC_ALWAYS_INLINE arena_t * +arena_aalloc(const void *ptr) +{ + arena_chunk_t *chunk; + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) + return (extent_node_arena_get(&chunk->node)); + else + return (huge_aalloc(ptr)); } /* Return the size of the allocation pointed to by ptr. */ @@ -980,81 +1206,139 @@ arena_salloc(const void *ptr, bool demote) { size_t ret; arena_chunk_t *chunk; - size_t pageind, binind; + size_t pageind; + szind_t binind; assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - binind = arena_mapbits_binind_get(chunk, pageind); - if (binind == BININD_INVALID || (config_prof && demote == false && - prof_promote && arena_mapbits_large_get(chunk, pageind) != 0)) { - /* - * Large allocation. In the common case (demote == true), and - * as this is an inline function, most callers will only end up - * looking at binind to determine that ptr is a small - * allocation. - */ - assert(((uintptr_t)ptr & PAGE_MASK) == 0); - ret = arena_mapbits_large_size_get(chunk, pageind); - assert(ret != 0); - assert(pageind + (ret>>LG_PAGE) <= chunk_npages); - assert(ret == PAGE || arena_mapbits_large_size_get(chunk, - pageind+(ret>>LG_PAGE)-1) == 0); - assert(binind == arena_mapbits_binind_get(chunk, - pageind+(ret>>LG_PAGE)-1)); - assert(arena_mapbits_dirty_get(chunk, pageind) == - arena_mapbits_dirty_get(chunk, pageind+(ret>>LG_PAGE)-1)); - } else { - /* - * Small allocation (possibly promoted to a large object due to - * prof_promote). - */ - assert(arena_mapbits_large_get(chunk, pageind) != 0 || - arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, - pageind)) == binind); - ret = arena_bin_info[binind].reg_size; - } + if (likely(chunk != ptr)) { + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + binind = arena_mapbits_binind_get(chunk, pageind); + if (unlikely(binind == BININD_INVALID || (config_prof && !demote + && arena_mapbits_large_get(chunk, pageind) != 0))) { + /* + * Large allocation. In the common case (demote), and + * as this is an inline function, most callers will only + * end up looking at binind to determine that ptr is a + * small allocation. + */ + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); + ret = arena_mapbits_large_size_get(chunk, pageind) - + large_pad; + assert(ret != 0); + assert(pageind + ((ret+large_pad)>>LG_PAGE) <= + chunk_npages); + assert(arena_mapbits_dirty_get(chunk, pageind) == + arena_mapbits_dirty_get(chunk, + pageind+((ret+large_pad)>>LG_PAGE)-1)); + } else { + /* + * Small allocation (possibly promoted to a large + * object). 
+ */ + assert(arena_mapbits_large_get(chunk, pageind) != 0 || + arena_ptr_small_binind_get(ptr, + arena_mapbits_get(chunk, pageind)) == binind); + ret = index2size(binind); + } + } else + ret = huge_salloc(ptr); return (ret); } JEMALLOC_ALWAYS_INLINE void -arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr, bool try_tcache) +arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) { + arena_chunk_t *chunk; size_t pageind, mapbits; - tcache_t *tcache; - assert(arena != NULL); - assert(chunk->arena == arena); assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - mapbits = arena_mapbits_get(chunk, pageind); - assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) { - /* Small allocation. */ - if (try_tcache && (tcache = tcache_get(false)) != NULL) { - size_t binind; - - binind = arena_ptr_small_binind_get(ptr, mapbits); - tcache_dalloc_small(tcache, ptr, binind); - } else - arena_dalloc_small(arena, chunk, ptr, pageind); - } else { - size_t size = arena_mapbits_large_size_get(chunk, pageind); + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) { + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + mapbits = arena_mapbits_get(chunk, pageind); + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) { + /* Small allocation. */ + if (likely(tcache != NULL)) { + szind_t binind = arena_ptr_small_binind_get(ptr, + mapbits); + tcache_dalloc_small(tsd, tcache, ptr, binind); + } else { + arena_dalloc_small(extent_node_arena_get( + &chunk->node), chunk, ptr, pageind); + } + } else { + size_t size = arena_mapbits_large_size_get(chunk, + pageind); + + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); + + if (likely(tcache != NULL) && size - large_pad <= + tcache_maxclass) { + tcache_dalloc_large(tsd, tcache, ptr, size - + large_pad); + } else { + arena_dalloc_large(extent_node_arena_get( + &chunk->node), chunk, ptr); + } + } + } else + huge_dalloc(tsd, ptr, tcache); +} - assert(((uintptr_t)ptr & PAGE_MASK) == 0); +JEMALLOC_ALWAYS_INLINE void +arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) +{ + arena_chunk_t *chunk; - if (try_tcache && size <= tcache_maxclass && (tcache = - tcache_get(false)) != NULL) { - tcache_dalloc_large(tcache, ptr, size); - } else - arena_dalloc_large(arena, chunk, ptr); - } + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) { + if (config_prof && opt_prof) { + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> + LG_PAGE; + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + if (arena_mapbits_large_get(chunk, pageind) != 0) { + /* + * Make sure to use promoted size, not request + * size. + */ + size = arena_mapbits_large_size_get(chunk, + pageind) - large_pad; + } + } + assert(s2u(size) == s2u(arena_salloc(ptr, false))); + + if (likely(size <= SMALL_MAXCLASS)) { + /* Small allocation. 
*/ + if (likely(tcache != NULL)) { + szind_t binind = size2index(size); + tcache_dalloc_small(tsd, tcache, ptr, binind); + } else { + size_t pageind = ((uintptr_t)ptr - + (uintptr_t)chunk) >> LG_PAGE; + arena_dalloc_small(extent_node_arena_get( + &chunk->node), chunk, ptr, pageind); + } + } else { + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); + + if (likely(tcache != NULL) && size <= tcache_maxclass) + tcache_dalloc_large(tsd, tcache, ptr, size); + else { + arena_dalloc_large(extent_node_arena_get( + &chunk->node), chunk, ptr); + } + } + } else + huge_dalloc(tsd, ptr, tcache); } # endif /* JEMALLOC_ARENA_INLINE_B */ #endif diff --git a/deps/jemalloc/include/jemalloc/internal/atomic.h b/deps/jemalloc/include/jemalloc/internal/atomic.h index 11a7b47fe..a9aad35d1 100644 --- a/deps/jemalloc/include/jemalloc/internal/atomic.h +++ b/deps/jemalloc/include/jemalloc/internal/atomic.h @@ -11,6 +11,7 @@ #define atomic_read_uint64(p) atomic_add_uint64(p, 0) #define atomic_read_uint32(p) atomic_add_uint32(p, 0) +#define atomic_read_p(p) atomic_add_p(p, NULL) #define atomic_read_z(p) atomic_add_z(p, 0) #define atomic_read_u(p) atomic_add_u(p, 0) @@ -18,113 +19,244 @@ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +/* + * All arithmetic functions return the arithmetic result of the atomic + * operation. Some atomic operation APIs return the value prior to mutation, in + * which case the following functions must redundantly compute the result so + * that it can be returned. These functions are normally inlined, so the extra + * operations can be optimized away if the return values aren't used by the + * callers. + * + * <t> atomic_read_<t>(<t> *p) { return (*p); } + * <t> atomic_add_<t>(<t> *p, <t> x) { return (*p + x); } + * <t> atomic_sub_<t>(<t> *p, <t> x) { return (*p - x); } + * bool atomic_cas_<t>(<t> *p, <t> c, <t> s) + * { + * if (*p != c) + * return (true); + * *p = s; + * return (false); + * } + * void atomic_write_<t>(<t> *p, <t> x) { *p = x; } + */ + #ifndef JEMALLOC_ENABLE_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x); uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x); +bool atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s); +void atomic_write_uint64(uint64_t *p, uint64_t x); uint32_t atomic_add_uint32(uint32_t *p, uint32_t x); uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x); +bool atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s); +void atomic_write_uint32(uint32_t *p, uint32_t x); +void *atomic_add_p(void **p, void *x); +void *atomic_sub_p(void **p, void *x); +bool atomic_cas_p(void **p, void *c, void *s); +void atomic_write_p(void **p, const void *x); size_t atomic_add_z(size_t *p, size_t x); size_t atomic_sub_z(size_t *p, size_t x); +bool atomic_cas_z(size_t *p, size_t c, size_t s); +void atomic_write_z(size_t *p, size_t x); unsigned atomic_add_u(unsigned *p, unsigned x); unsigned atomic_sub_u(unsigned *p, unsigned x); +bool atomic_cas_u(unsigned *p, unsigned c, unsigned s); +void atomic_write_u(unsigned *p, unsigned x); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_)) /******************************************************************************/ /* 64-bit operations. 
*/ #if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3) -# ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 +# if (defined(__amd64__) || defined(__x86_64__)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { + uint64_t t = x; - return (__sync_add_and_fetch(p, x)); + asm volatile ( + "lock; xaddq %0, %1;" + : "+r" (t), "=m" (*p) /* Outputs. */ + : "m" (*p) /* Inputs. */ + ); + + return (t + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { + uint64_t t; - return (__sync_sub_and_fetch(p, x)); + x = (uint64_t)(-(int64_t)x); + t = x; + asm volatile ( + "lock; xaddq %0, %1;" + : "+r" (t), "=m" (*p) /* Outputs. */ + : "m" (*p) /* Inputs. */ + ); + + return (t + x); } -#elif (defined(_MSC_VER)) + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + uint8_t success; + + asm volatile ( + "lock; cmpxchgq %4, %0;" + "sete %1;" + : "=m" (*p), "=a" (success) /* Outputs. */ + : "m" (*p), "a" (c), "r" (s) /* Inputs. */ + : "memory" /* Clobbers. */ + ); + + return (!(bool)success); +} + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + + asm volatile ( + "xchgq %1, %0;" /* Lock is implied by xchgq. */ + : "=m" (*p), "+r" (x) /* Outputs. */ + : "m" (*p) /* Inputs. */ + : "memory" /* Clobbers. */ + ); +} +# elif (defined(JEMALLOC_C11ATOMICS)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { - - return (InterlockedExchangeAdd64(p, x)); + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + return (atomic_fetch_add(a, x) + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + return (atomic_fetch_sub(a, x) - x); +} + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + return (!atomic_compare_exchange_strong(a, &c, s)); +} - return (InterlockedExchangeAdd64(p, -((int64_t)x))); +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + atomic_store(a, x); } -#elif (defined(JEMALLOC_OSATOMIC)) +# elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { - return (OSAtomicAdd64((int64_t)x, (int64_t *)p)); + /* + * atomic_fetchadd_64() doesn't exist, but we only ever use this + * function on LP64 systems, so atomic_fetchadd_long() will do. + */ + assert(sizeof(uint64_t) == sizeof(unsigned long)); + + return (atomic_fetchadd_long(p, (unsigned long)x) + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { - return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p)); + assert(sizeof(uint64_t) == sizeof(unsigned long)); + + return (atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x); +} + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + + assert(sizeof(uint64_t) == sizeof(unsigned long)); + + return (!atomic_cmpset_long(p, (unsigned long)c, (unsigned long)s)); +} + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + + assert(sizeof(uint64_t) == sizeof(unsigned long)); + + atomic_store_rel_long(p, x); } -# elif (defined(__amd64__) || defined(__x86_64__)) +# elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { - asm volatile ( - "lock; xaddq %0, %1;" - : "+r" (x), "=m" (*p) /* Outputs. */ - : "m" (*p) /* Inputs. 
*/ - ); - - return (x); + return (OSAtomicAdd64((int64_t)x, (int64_t *)p)); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { - x = (uint64_t)(-(int64_t)x); - asm volatile ( - "lock; xaddq %0, %1;" - : "+r" (x), "=m" (*p) /* Outputs. */ - : "m" (*p) /* Inputs. */ - ); + return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p)); +} + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ - return (x); + return (!OSAtomicCompareAndSwap64(c, s, (int64_t *)p)); } -# elif (defined(JEMALLOC_ATOMIC9)) + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + uint64_t o; + + /*The documented OSAtomic*() API does not expose an atomic exchange. */ + do { + o = atomic_read_uint64(p); + } while (atomic_cas_uint64(p, o, x)); +} +# elif (defined(_MSC_VER)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { - /* - * atomic_fetchadd_64() doesn't exist, but we only ever use this - * function on LP64 systems, so atomic_fetchadd_long() will do. - */ - assert(sizeof(uint64_t) == sizeof(unsigned long)); - - return (atomic_fetchadd_long(p, (unsigned long)x) + x); + return (InterlockedExchangeAdd64(p, x) + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { - assert(sizeof(uint64_t) == sizeof(unsigned long)); + return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x); +} - return (atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x); +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + uint64_t o; + + o = InterlockedCompareExchange64(p, s, c); + return (o != c); } -# elif (defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8)) + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + + InterlockedExchange64(p, x); +} +# elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || \ + defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { @@ -138,6 +270,20 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (__sync_sub_and_fetch(p, x)); } + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + + return (!__sync_bool_compare_and_swap(p, c, s)); +} + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + + __sync_lock_test_and_set(p, x); +} # else # error "Missing implementation for 64-bit atomic operations" # endif @@ -145,90 +291,184 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) /******************************************************************************/ /* 32-bit operations. */ -#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 +#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { + uint32_t t = x; - return (__sync_add_and_fetch(p, x)); + asm volatile ( + "lock; xaddl %0, %1;" + : "+r" (t), "=m" (*p) /* Outputs. */ + : "m" (*p) /* Inputs. */ + ); + + return (t + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { + uint32_t t; - return (__sync_sub_and_fetch(p, x)); + x = (uint32_t)(-(int32_t)x); + t = x; + asm volatile ( + "lock; xaddl %0, %1;" + : "+r" (t), "=m" (*p) /* Outputs. */ + : "m" (*p) /* Inputs. */ + ); + + return (t + x); } -#elif (defined(_MSC_VER)) + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + uint8_t success; + + asm volatile ( + "lock; cmpxchgl %4, %0;" + "sete %1;" + : "=m" (*p), "=a" (success) /* Outputs. */ + : "m" (*p), "a" (c), "r" (s) /* Inputs. 
*/ + : "memory" + ); + + return (!(bool)success); +} + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + + asm volatile ( + "xchgl %1, %0;" /* Lock is implied by xchgl. */ + : "=m" (*p), "+r" (x) /* Outputs. */ + : "m" (*p) /* Inputs. */ + : "memory" /* Clobbers. */ + ); +} +# elif (defined(JEMALLOC_C11ATOMICS)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { - - return (InterlockedExchangeAdd(p, x)); + volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; + return (atomic_fetch_add(a, x) + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { + volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; + return (atomic_fetch_sub(a, x) - x); +} + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; + return (!atomic_compare_exchange_strong(a, &c, s)); +} - return (InterlockedExchangeAdd(p, -((int32_t)x))); +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; + atomic_store(a, x); } -#elif (defined(JEMALLOC_OSATOMIC)) +#elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { - return (OSAtomicAdd32((int32_t)x, (int32_t *)p)); + return (atomic_fetchadd_32(p, x) + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { - return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p)); + return (atomic_fetchadd_32(p, (uint32_t)(-(int32_t)x)) - x); +} + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + + return (!atomic_cmpset_32(p, c, s)); +} + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + + atomic_store_rel_32(p, x); } -#elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__)) +#elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { - asm volatile ( - "lock; xaddl %0, %1;" - : "+r" (x), "=m" (*p) /* Outputs. */ - : "m" (*p) /* Inputs. */ - ); - - return (x); + return (OSAtomicAdd32((int32_t)x, (int32_t *)p)); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { - x = (uint32_t)(-(int32_t)x); - asm volatile ( - "lock; xaddl %0, %1;" - : "+r" (x), "=m" (*p) /* Outputs. */ - : "m" (*p) /* Inputs. */ - ); + return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p)); +} - return (x); +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + + return (!OSAtomicCompareAndSwap32(c, s, (int32_t *)p)); } -#elif (defined(JEMALLOC_ATOMIC9)) + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + uint32_t o; + + /*The documented OSAtomic*() API does not expose an atomic exchange. 
*/ + do { + o = atomic_read_uint32(p); + } while (atomic_cas_uint32(p, o, x)); +} +#elif (defined(_MSC_VER)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { - return (atomic_fetchadd_32(p, x) + x); + return (InterlockedExchangeAdd(p, x) + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { - return (atomic_fetchadd_32(p, (uint32_t)(-(int32_t)x)) - x); + return (InterlockedExchangeAdd(p, -((int32_t)x)) - x); +} + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + uint32_t o; + + o = InterlockedCompareExchange(p, s, c); + return (o != c); } -#elif (defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4)) + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + + InterlockedExchange(p, x); +} +#elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || \ + defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { @@ -242,11 +482,73 @@ atomic_sub_uint32(uint32_t *p, uint32_t x) return (__sync_sub_and_fetch(p, x)); } + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + + return (!__sync_bool_compare_and_swap(p, c, s)); +} + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + + __sync_lock_test_and_set(p, x); +} #else # error "Missing implementation for 32-bit atomic operations" #endif /******************************************************************************/ +/* Pointer operations. */ +JEMALLOC_INLINE void * +atomic_add_p(void **p, void *x) +{ + +#if (LG_SIZEOF_PTR == 3) + return ((void *)atomic_add_uint64((uint64_t *)p, (uint64_t)x)); +#elif (LG_SIZEOF_PTR == 2) + return ((void *)atomic_add_uint32((uint32_t *)p, (uint32_t)x)); +#endif +} + +JEMALLOC_INLINE void * +atomic_sub_p(void **p, void *x) +{ + +#if (LG_SIZEOF_PTR == 3) + return ((void *)atomic_add_uint64((uint64_t *)p, + (uint64_t)-((int64_t)x))); +#elif (LG_SIZEOF_PTR == 2) + return ((void *)atomic_add_uint32((uint32_t *)p, + (uint32_t)-((int32_t)x))); +#endif +} + +JEMALLOC_INLINE bool +atomic_cas_p(void **p, void *c, void *s) +{ + +#if (LG_SIZEOF_PTR == 3) + return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s)); +#elif (LG_SIZEOF_PTR == 2) + return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s)); +#endif +} + +JEMALLOC_INLINE void +atomic_write_p(void **p, const void *x) +{ + +#if (LG_SIZEOF_PTR == 3) + atomic_write_uint64((uint64_t *)p, (uint64_t)x); +#elif (LG_SIZEOF_PTR == 2) + atomic_write_uint32((uint32_t *)p, (uint32_t)x); +#endif +} + +/******************************************************************************/ /* size_t operations. */ JEMALLOC_INLINE size_t atomic_add_z(size_t *p, size_t x) @@ -272,6 +574,28 @@ atomic_sub_z(size_t *p, size_t x) #endif } +JEMALLOC_INLINE bool +atomic_cas_z(size_t *p, size_t c, size_t s) +{ + +#if (LG_SIZEOF_PTR == 3) + return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s)); +#elif (LG_SIZEOF_PTR == 2) + return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s)); +#endif +} + +JEMALLOC_INLINE void +atomic_write_z(size_t *p, size_t x) +{ + +#if (LG_SIZEOF_PTR == 3) + atomic_write_uint64((uint64_t *)p, (uint64_t)x); +#elif (LG_SIZEOF_PTR == 2) + atomic_write_uint32((uint32_t *)p, (uint32_t)x); +#endif +} + /******************************************************************************/ /* unsigned operations. 
*/ JEMALLOC_INLINE unsigned @@ -297,6 +621,29 @@ atomic_sub_u(unsigned *p, unsigned x) (uint32_t)-((int32_t)x))); #endif } + +JEMALLOC_INLINE bool +atomic_cas_u(unsigned *p, unsigned c, unsigned s) +{ + +#if (LG_SIZEOF_INT == 3) + return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s)); +#elif (LG_SIZEOF_INT == 2) + return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s)); +#endif +} + +JEMALLOC_INLINE void +atomic_write_u(unsigned *p, unsigned x) +{ + +#if (LG_SIZEOF_INT == 3) + atomic_write_uint64((uint64_t *)p, (uint64_t)x); +#elif (LG_SIZEOF_INT == 2) + atomic_write_uint32((uint32_t *)p, (uint32_t)x); +#endif +} + /******************************************************************************/ #endif diff --git a/deps/jemalloc/include/jemalloc/internal/base.h b/deps/jemalloc/include/jemalloc/internal/base.h index 9cf75ffb0..39e46ee44 100644 --- a/deps/jemalloc/include/jemalloc/internal/base.h +++ b/deps/jemalloc/include/jemalloc/internal/base.h @@ -10,9 +10,7 @@ #ifdef JEMALLOC_H_EXTERNS void *base_alloc(size_t size); -void *base_calloc(size_t number, size_t size); -extent_node_t *base_node_alloc(void); -void base_node_dealloc(extent_node_t *node); +void base_stats_get(size_t *allocated, size_t *resident, size_t *mapped); bool base_boot(void); void base_prefork(void); void base_postfork_parent(void); diff --git a/deps/jemalloc/include/jemalloc/internal/bitmap.h b/deps/jemalloc/include/jemalloc/internal/bitmap.h index 605ebac58..fcc6005c7 100644 --- a/deps/jemalloc/include/jemalloc/internal/bitmap.h +++ b/deps/jemalloc/include/jemalloc/internal/bitmap.h @@ -3,6 +3,7 @@ /* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ #define LG_BITMAP_MAXBITS LG_RUN_MAXREGS +#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS) typedef struct bitmap_level_s bitmap_level_t; typedef struct bitmap_info_s bitmap_info_t; @@ -14,6 +15,51 @@ typedef unsigned long bitmap_t; #define BITMAP_GROUP_NBITS (ZU(1) << LG_BITMAP_GROUP_NBITS) #define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1) +/* Number of groups required to store a given number of bits. */ +#define BITMAP_BITS2GROUPS(nbits) \ + ((nbits + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS) + +/* + * Number of groups required at a particular level for a given number of bits. + */ +#define BITMAP_GROUPS_L0(nbits) \ + BITMAP_BITS2GROUPS(nbits) +#define BITMAP_GROUPS_L1(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits)) +#define BITMAP_GROUPS_L2(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits)))) +#define BITMAP_GROUPS_L3(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \ + BITMAP_BITS2GROUPS((nbits))))) + +/* + * Assuming the number of levels, number of groups required for a given number + * of bits. + */ +#define BITMAP_GROUPS_1_LEVEL(nbits) \ + BITMAP_GROUPS_L0(nbits) +#define BITMAP_GROUPS_2_LEVEL(nbits) \ + (BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits)) +#define BITMAP_GROUPS_3_LEVEL(nbits) \ + (BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits)) +#define BITMAP_GROUPS_4_LEVEL(nbits) \ + (BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits)) + +/* + * Maximum number of groups required to support LG_BITMAP_MAXBITS. 
+ */ +#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2 +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3 +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4 +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS) +#else +# error "Unsupported bitmap size" +#endif + /* Maximum number of levels possible. */ #define BITMAP_MAX_LEVELS \ (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \ @@ -93,7 +139,7 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) bitmap_t g; assert(bit < binfo->nbits); - assert(bitmap_get(bitmap, binfo, bit) == false); + assert(!bitmap_get(bitmap, binfo, bit)); goff = bit >> LG_BITMAP_GROUP_NBITS; gp = &bitmap[goff]; g = *gp; @@ -126,15 +172,15 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) bitmap_t g; unsigned i; - assert(bitmap_full(bitmap, binfo) == false); + assert(!bitmap_full(bitmap, binfo)); i = binfo->nlevels - 1; g = bitmap[binfo->levels[i].group_offset]; - bit = ffsl(g) - 1; + bit = jemalloc_ffsl(g) - 1; while (i > 0) { i--; g = bitmap[binfo->levels[i].group_offset + bit]; - bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1); + bit = (bit << LG_BITMAP_GROUP_NBITS) + (jemalloc_ffsl(g) - 1); } bitmap_set(bitmap, binfo, bit); @@ -158,7 +204,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0); g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); *gp = g; - assert(bitmap_get(bitmap, binfo, bit) == false); + assert(!bitmap_get(bitmap, binfo, bit)); /* Propagate group state transitions up the tree. */ if (propagate) { unsigned i; @@ -172,7 +218,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) == 0); g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); *gp = g; - if (propagate == false) + if (!propagate) break; } } diff --git a/deps/jemalloc/include/jemalloc/internal/chunk.h b/deps/jemalloc/include/jemalloc/internal/chunk.h index 87d8700da..5d1938353 100644 --- a/deps/jemalloc/include/jemalloc/internal/chunk.h +++ b/deps/jemalloc/include/jemalloc/internal/chunk.h @@ -5,7 +5,7 @@ * Size and alignment of memory chunks that are allocated by the OS's virtual * memory system. */ -#define LG_CHUNK_DEFAULT 22 +#define LG_CHUNK_DEFAULT 21 /* Return the chunk address for allocation address a. */ #define CHUNK_ADDR2BASE(a) \ @@ -19,6 +19,16 @@ #define CHUNK_CEILING(s) \ (((s) + chunksize_mask) & ~chunksize_mask) +#define CHUNK_HOOKS_INITIALIZER { \ + NULL, \ + NULL, \ + NULL, \ + NULL, \ + NULL, \ + NULL, \ + NULL \ +} + #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS @@ -30,23 +40,36 @@ extern size_t opt_lg_chunk; extern const char *opt_dss; -/* Protects stats_chunks; currently not used for any other purpose. */ -extern malloc_mutex_t chunks_mtx; -/* Chunk statistics. */ -extern chunk_stats_t stats_chunks; - -extern rtree_t *chunks_rtree; +extern rtree_t chunks_rtree; extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ extern size_t chunk_npages; -extern size_t map_bias; /* Number of arena chunk header pages. */ -extern size_t arena_maxclass; /* Max size class for arenas. 
*/ -void *chunk_alloc(size_t size, size_t alignment, bool base, bool *zero, - dss_prec_t dss_prec); -void chunk_unmap(void *chunk, size_t size); -void chunk_dealloc(void *chunk, size_t size, bool unmap); +extern const chunk_hooks_t chunk_hooks_default; + +chunk_hooks_t chunk_hooks_get(arena_t *arena); +chunk_hooks_t chunk_hooks_set(arena_t *arena, + const chunk_hooks_t *chunk_hooks); + +bool chunk_register(const void *chunk, const extent_node_t *node); +void chunk_deregister(const void *chunk, const extent_node_t *node); +void *chunk_alloc_base(size_t size); +void *chunk_alloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *new_addr, size_t size, size_t alignment, bool *zero, + bool dalloc_node); +void *chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit); +void chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *chunk, size_t size, bool committed); +void chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *chunk, size_t size, bool zeroed, bool committed); +void chunk_dalloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *chunk, size_t size, bool committed); +bool chunk_purge_arena(arena_t *arena, void *chunk, size_t offset, + size_t length); +bool chunk_purge_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *chunk, size_t size, size_t offset, size_t length); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); @@ -56,6 +79,19 @@ void chunk_postfork_child(void); /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +extent_node_t *chunk_lookup(const void *chunk, bool dependent); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_CHUNK_C_)) +JEMALLOC_INLINE extent_node_t * +chunk_lookup(const void *ptr, bool dependent) +{ + + return (rtree_get(&chunks_rtree, (uintptr_t)ptr, dependent)); +} +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ diff --git a/deps/jemalloc/include/jemalloc/internal/chunk_dss.h b/deps/jemalloc/include/jemalloc/internal/chunk_dss.h index 4535ce09c..388f46be0 100644 --- a/deps/jemalloc/include/jemalloc/internal/chunk_dss.h +++ b/deps/jemalloc/include/jemalloc/internal/chunk_dss.h @@ -23,7 +23,8 @@ extern const char *dss_prec_names[]; dss_prec_t chunk_dss_prec_get(void); bool chunk_dss_prec_set(dss_prec_t dss_prec); -void *chunk_alloc_dss(size_t size, size_t alignment, bool *zero); +void *chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit); bool chunk_in_dss(void *chunk); bool chunk_dss_boot(void); void chunk_dss_prefork(void); diff --git a/deps/jemalloc/include/jemalloc/internal/chunk_mmap.h b/deps/jemalloc/include/jemalloc/internal/chunk_mmap.h index f24abac75..7d8014c58 100644 --- a/deps/jemalloc/include/jemalloc/internal/chunk_mmap.h +++ b/deps/jemalloc/include/jemalloc/internal/chunk_mmap.h @@ -9,10 +9,9 @@ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -bool pages_purge(void *addr, size_t length); - -void *chunk_alloc_mmap(size_t size, size_t alignment, bool *zero); -bool chunk_dealloc_mmap(void *chunk, size_t size); +void *chunk_alloc_mmap(size_t size, size_t alignment, bool *zero, + bool *commit); +bool chunk_dalloc_mmap(void *chunk, size_t size); #endif /* JEMALLOC_H_EXTERNS */ 
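As a side note on the chunk.h hunk above: with the new default of LG_CHUNK_DEFAULT == 21, chunks are 2 MiB, and CHUNK_CEILING() simply rounds a size up to the next multiple of that. The standalone sketch below is not part of the diff; the demo_* names are invented, and it only reproduces the same mask arithmetic so the rounding can be checked in isolation:

#include <stdio.h>
#include <stddef.h>

/* Assumed: the new default chunk size, 1 << 21 == 2 MiB. */
#define DEMO_LG_CHUNK		21
#define DEMO_CHUNKSIZE		((size_t)1 << DEMO_LG_CHUNK)
#define DEMO_CHUNKSIZE_MASK	(DEMO_CHUNKSIZE - 1)
/* Same arithmetic as CHUNK_CEILING(s) in chunk.h. */
#define DEMO_CHUNK_CEILING(s)	(((s) + DEMO_CHUNKSIZE_MASK) & ~DEMO_CHUNKSIZE_MASK)

int
main(void)
{
	/* 3 MiB is not a chunk multiple, so it rounds up to 4 MiB. */
	printf("%zu\n", DEMO_CHUNK_CEILING((size_t)3 << 20));
	/* An exact multiple of the chunk size is returned unchanged. */
	printf("%zu\n", DEMO_CHUNK_CEILING(DEMO_CHUNKSIZE));
	return (0);
}

Under those assumptions the program prints 4194304 and 2097152.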
/******************************************************************************/ diff --git a/deps/jemalloc/include/jemalloc/internal/ckh.h b/deps/jemalloc/include/jemalloc/internal/ckh.h index 58712a6a7..75c1c979f 100644 --- a/deps/jemalloc/include/jemalloc/internal/ckh.h +++ b/deps/jemalloc/include/jemalloc/internal/ckh.h @@ -66,13 +66,13 @@ struct ckh_s { /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -bool ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, +bool ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp); -void ckh_delete(ckh_t *ckh); +void ckh_delete(tsd_t *tsd, ckh_t *ckh); size_t ckh_count(ckh_t *ckh); bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data); -bool ckh_insert(ckh_t *ckh, const void *key, const void *data); -bool ckh_remove(ckh_t *ckh, const void *searchkey, void **key, +bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data); +bool ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, void **data); bool ckh_search(ckh_t *ckh, const void *seachkey, void **key, void **data); void ckh_string_hash(const void *key, size_t r_hash[2]); diff --git a/deps/jemalloc/include/jemalloc/internal/ctl.h b/deps/jemalloc/include/jemalloc/internal/ctl.h index 0ffecc5f2..751c14b5b 100644 --- a/deps/jemalloc/include/jemalloc/internal/ctl.h +++ b/deps/jemalloc/include/jemalloc/internal/ctl.h @@ -34,6 +34,7 @@ struct ctl_arena_stats_s { bool initialized; unsigned nthreads; const char *dss; + ssize_t lg_dirty_mult; size_t pactive; size_t pdirty; arena_stats_t astats; @@ -46,22 +47,15 @@ struct ctl_arena_stats_s { malloc_bin_stats_t bstats[NBINS]; malloc_large_stats_t *lstats; /* nlclasses elements. */ + malloc_huge_stats_t *hstats; /* nhclasses elements. */ }; struct ctl_stats_s { size_t allocated; size_t active; + size_t metadata; + size_t resident; size_t mapped; - struct { - size_t current; /* stats_chunks.curchunks */ - uint64_t total; /* stats_chunks.nchunks */ - size_t high; /* stats_chunks.highchunks */ - } chunks; - struct { - size_t allocated; /* huge_allocated */ - uint64_t nmalloc; /* huge_nmalloc */ - uint64_t ndalloc; /* huge_ndalloc */ - } huge; unsigned narenas; ctl_arena_stats_t *arenas; /* (narenas + 1) elements. */ }; diff --git a/deps/jemalloc/include/jemalloc/internal/extent.h b/deps/jemalloc/include/jemalloc/internal/extent.h index ba95ca816..386d50ef4 100644 --- a/deps/jemalloc/include/jemalloc/internal/extent.h +++ b/deps/jemalloc/include/jemalloc/internal/extent.h @@ -7,25 +7,53 @@ typedef struct extent_node_s extent_node_t; /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS -/* Tree of extents. */ +/* Tree of extents. Use accessor functions for en_* fields. */ struct extent_node_s { - /* Linkage for the size/address-ordered tree. */ - rb_node(extent_node_t) link_szad; + /* Arena from which this extent came, if any. */ + arena_t *en_arena; - /* Linkage for the address-ordered tree. */ - rb_node(extent_node_t) link_ad; + /* Pointer to the extent that this tree node is responsible for. */ + void *en_addr; + + /* Total region size. */ + size_t en_size; + + /* + * The zeroed flag is used by chunk recycling code to track whether + * memory is zero-filled. 
+ */ + bool en_zeroed; + + /* + * True if physical memory is committed to the extent, whether + * explicitly or implicitly as on a system that overcommits and + * satisfies physical memory needs on demand via soft page faults. + */ + bool en_committed; + + /* + * The achunk flag is used to validate that huge allocation lookups + * don't return arena chunks. + */ + bool en_achunk; /* Profile counters, used for huge objects. */ - prof_ctx_t *prof_ctx; + prof_tctx_t *en_prof_tctx; - /* Pointer to the extent that this tree node is responsible for. */ - void *addr; + /* Linkage for arena's runs_dirty and chunks_cache rings. */ + arena_runs_dirty_link_t rd; + qr(extent_node_t) cc_link; - /* Total region size. */ - size_t size; + union { + /* Linkage for the size/address-ordered tree. */ + rb_node(extent_node_t) szad_link; + + /* Linkage for arena's huge and node_cache lists. */ + ql_elm(extent_node_t) ql_link; + }; - /* True if zero-filled; used by chunk recycling code. */ - bool zeroed; + /* Linkage for the address-ordered tree. */ + rb_node(extent_node_t) ad_link; }; typedef rb_tree(extent_node_t) extent_tree_t; @@ -41,6 +69,171 @@ rb_proto(, extent_tree_ad_, extent_tree_t, extent_node_t) /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +arena_t *extent_node_arena_get(const extent_node_t *node); +void *extent_node_addr_get(const extent_node_t *node); +size_t extent_node_size_get(const extent_node_t *node); +bool extent_node_zeroed_get(const extent_node_t *node); +bool extent_node_committed_get(const extent_node_t *node); +bool extent_node_achunk_get(const extent_node_t *node); +prof_tctx_t *extent_node_prof_tctx_get(const extent_node_t *node); +void extent_node_arena_set(extent_node_t *node, arena_t *arena); +void extent_node_addr_set(extent_node_t *node, void *addr); +void extent_node_size_set(extent_node_t *node, size_t size); +void extent_node_zeroed_set(extent_node_t *node, bool zeroed); +void extent_node_committed_set(extent_node_t *node, bool committed); +void extent_node_achunk_set(extent_node_t *node, bool achunk); +void extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx); +void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, + size_t size, bool zeroed, bool committed); +void extent_node_dirty_linkage_init(extent_node_t *node); +void extent_node_dirty_insert(extent_node_t *node, + arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty); +void extent_node_dirty_remove(extent_node_t *node); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_EXTENT_C_)) +JEMALLOC_INLINE arena_t * +extent_node_arena_get(const extent_node_t *node) +{ + + return (node->en_arena); +} + +JEMALLOC_INLINE void * +extent_node_addr_get(const extent_node_t *node) +{ + + return (node->en_addr); +} + +JEMALLOC_INLINE size_t +extent_node_size_get(const extent_node_t *node) +{ + + return (node->en_size); +} + +JEMALLOC_INLINE bool +extent_node_zeroed_get(const extent_node_t *node) +{ + + return (node->en_zeroed); +} + +JEMALLOC_INLINE bool +extent_node_committed_get(const extent_node_t *node) +{ + + assert(!node->en_achunk); + return (node->en_committed); +} + +JEMALLOC_INLINE bool +extent_node_achunk_get(const extent_node_t *node) +{ + + return (node->en_achunk); +} + +JEMALLOC_INLINE prof_tctx_t * +extent_node_prof_tctx_get(const extent_node_t *node) +{ + + return (node->en_prof_tctx); +} + +JEMALLOC_INLINE void +extent_node_arena_set(extent_node_t *node, arena_t *arena) 
+{ + + node->en_arena = arena; +} + +JEMALLOC_INLINE void +extent_node_addr_set(extent_node_t *node, void *addr) +{ + + node->en_addr = addr; +} + +JEMALLOC_INLINE void +extent_node_size_set(extent_node_t *node, size_t size) +{ + + node->en_size = size; +} + +JEMALLOC_INLINE void +extent_node_zeroed_set(extent_node_t *node, bool zeroed) +{ + + node->en_zeroed = zeroed; +} + +JEMALLOC_INLINE void +extent_node_committed_set(extent_node_t *node, bool committed) +{ + + node->en_committed = committed; +} + +JEMALLOC_INLINE void +extent_node_achunk_set(extent_node_t *node, bool achunk) +{ + + node->en_achunk = achunk; +} + +JEMALLOC_INLINE void +extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx) +{ + + node->en_prof_tctx = tctx; +} + +JEMALLOC_INLINE void +extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size, + bool zeroed, bool committed) +{ + + extent_node_arena_set(node, arena); + extent_node_addr_set(node, addr); + extent_node_size_set(node, size); + extent_node_zeroed_set(node, zeroed); + extent_node_committed_set(node, committed); + extent_node_achunk_set(node, false); + if (config_prof) + extent_node_prof_tctx_set(node, NULL); +} + +JEMALLOC_INLINE void +extent_node_dirty_linkage_init(extent_node_t *node) +{ + + qr_new(&node->rd, rd_link); + qr_new(node, cc_link); +} + +JEMALLOC_INLINE void +extent_node_dirty_insert(extent_node_t *node, + arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty) +{ + + qr_meld(runs_dirty, &node->rd, rd_link); + qr_meld(chunks_dirty, node, cc_link); +} + +JEMALLOC_INLINE void +extent_node_dirty_remove(extent_node_t *node) +{ + + qr_remove(&node->rd, rd_link); + qr_remove(node, cc_link); +} + +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ diff --git a/deps/jemalloc/include/jemalloc/internal/hash.h b/deps/jemalloc/include/jemalloc/internal/hash.h index c7183ede8..bcead337a 100644 --- a/deps/jemalloc/include/jemalloc/internal/hash.h +++ b/deps/jemalloc/include/jemalloc/internal/hash.h @@ -35,13 +35,14 @@ JEMALLOC_INLINE uint32_t hash_rotl_32(uint32_t x, int8_t r) { - return (x << r) | (x >> (32 - r)); + return ((x << r) | (x >> (32 - r))); } JEMALLOC_INLINE uint64_t hash_rotl_64(uint64_t x, int8_t r) { - return (x << r) | (x >> (64 - r)); + + return ((x << r) | (x >> (64 - r))); } JEMALLOC_INLINE uint32_t @@ -76,9 +77,9 @@ hash_fmix_64(uint64_t k) { k ^= k >> 33; - k *= QU(0xff51afd7ed558ccdLLU); + k *= KQU(0xff51afd7ed558ccd); k ^= k >> 33; - k *= QU(0xc4ceb9fe1a85ec53LLU); + k *= KQU(0xc4ceb9fe1a85ec53); k ^= k >> 33; return (k); @@ -247,8 +248,8 @@ hash_x64_128(const void *key, const int len, const uint32_t seed, uint64_t h1 = seed; uint64_t h2 = seed; - const uint64_t c1 = QU(0x87c37b91114253d5LLU); - const uint64_t c2 = QU(0x4cf5ad432745937fLLU); + const uint64_t c1 = KQU(0x87c37b91114253d5); + const uint64_t c2 = KQU(0x4cf5ad432745937f); /* body */ { diff --git a/deps/jemalloc/include/jemalloc/internal/huge.h b/deps/jemalloc/include/jemalloc/internal/huge.h index a2b9c7791..ece7af980 100644 --- a/deps/jemalloc/include/jemalloc/internal/huge.h +++ b/deps/jemalloc/include/jemalloc/internal/huge.h @@ -9,34 +9,24 @@ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -/* Huge allocation statistics. */ -extern uint64_t huge_nmalloc; -extern uint64_t huge_ndalloc; -extern size_t huge_allocated; - -/* Protects chunk-related data structures. 
*/ -extern malloc_mutex_t huge_mtx; - -void *huge_malloc(size_t size, bool zero, dss_prec_t dss_prec); -void *huge_palloc(size_t size, size_t alignment, bool zero, - dss_prec_t dss_prec); -bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, - size_t extra); -void *huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, - size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec); +void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, + tcache_t *tcache); +void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, + bool zero, tcache_t *tcache); +bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t usize_min, + size_t usize_max, bool zero); +void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, + size_t usize, size_t alignment, bool zero, tcache_t *tcache); #ifdef JEMALLOC_JET typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif -void huge_dalloc(void *ptr, bool unmap); +void huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); +arena_t *huge_aalloc(const void *ptr); size_t huge_salloc(const void *ptr); -dss_prec_t huge_dss_prec_get(arena_t *arena); -prof_ctx_t *huge_prof_ctx_get(const void *ptr); -void huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); -bool huge_boot(void); -void huge_prefork(void); -void huge_postfork_parent(void); -void huge_postfork_child(void); +prof_tctx_t *huge_prof_tctx_get(const void *ptr); +void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); +void huge_prof_tctx_reset(const void *ptr); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index df266abb7..8536a3eda 100644 --- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -1,70 +1,13 @@ #ifndef JEMALLOC_INTERNAL_H #define JEMALLOC_INTERNAL_H -#include <math.h> -#ifdef _WIN32 -# include <windows.h> -# define ENOENT ERROR_PATH_NOT_FOUND -# define EINVAL ERROR_BAD_ARGUMENTS -# define EAGAIN ERROR_OUTOFMEMORY -# define EPERM ERROR_WRITE_FAULT -# define EFAULT ERROR_INVALID_ADDRESS -# define ENOMEM ERROR_NOT_ENOUGH_MEMORY -# undef ERANGE -# define ERANGE ERROR_INVALID_DATA -#else -# include <sys/param.h> -# include <sys/mman.h> -# include <sys/syscall.h> -# if !defined(SYS_write) && defined(__NR_write) -# define SYS_write __NR_write -# endif -# include <sys/uio.h> -# include <pthread.h> -# include <errno.h> -#endif -#include <sys/types.h> - -#include <limits.h> -#ifndef SIZE_T_MAX -# define SIZE_T_MAX SIZE_MAX -#endif -#include <stdarg.h> -#include <stdbool.h> -#include <stdio.h> -#include <stdlib.h> -#include <stdint.h> -#include <stddef.h> -#ifndef offsetof -# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) -#endif -#include <inttypes.h> -#include <string.h> -#include <strings.h> -#include <ctype.h> -#ifdef _MSC_VER -# include <io.h> -typedef intptr_t ssize_t; -# define PATH_MAX 1024 -# define STDERR_FILENO 2 -# define __func__ __FUNCTION__ -/* Disable warnings about deprecated system functions */ -# pragma warning(disable: 4996) -#else -# include <unistd.h> -#endif -#include <fcntl.h> #include "jemalloc_internal_defs.h" +#include "jemalloc/internal/jemalloc_internal_decls.h" #ifdef JEMALLOC_UTRACE #include <sys/ktrace.h> #endif -#ifdef JEMALLOC_VALGRIND 
-#include <valgrind/valgrind.h> -#include <valgrind/memcheck.h> -#endif - #define JEMALLOC_NO_DEMANGLE #ifdef JEMALLOC_JET # define JEMALLOC_N(n) jet_##n @@ -85,7 +28,7 @@ static const bool config_debug = false #endif ; -static const bool config_dss = +static const bool have_dss = #ifdef JEMALLOC_DSS true #else @@ -127,8 +70,8 @@ static const bool config_prof_libunwind = false #endif ; -static const bool config_mremap = -#ifdef JEMALLOC_MREMAP +static const bool maps_coalesce = +#ifdef JEMALLOC_MAPS_COALESCE true #else false @@ -190,6 +133,17 @@ static const bool config_ivsalloc = false #endif ; +static const bool config_cache_oblivious = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + true +#else + false +#endif + ; + +#ifdef JEMALLOC_C11ATOMICS +#include <stdatomic.h> +#endif #ifdef JEMALLOC_ATOMIC9 #include <machine/atomic.h> @@ -229,20 +183,48 @@ static const bool config_ivsalloc = #include "jemalloc/internal/jemalloc_internal_macros.h" +/* Size class index type. */ +typedef unsigned szind_t; + +/* + * Flags bits: + * + * a: arena + * t: tcache + * 0: unused + * z: zero + * n: alignment + * + * aaaaaaaa aaaatttt tttttttt 0znnnnnn + */ +#define MALLOCX_ARENA_MASK ((int)~0xfffff) +#define MALLOCX_ARENA_MAX 0xffe +#define MALLOCX_TCACHE_MASK ((int)~0xfff000ffU) +#define MALLOCX_TCACHE_MAX 0xffd #define MALLOCX_LG_ALIGN_MASK ((int)0x3f) -#define ALLOCM_LG_ALIGN_MASK ((int)0x3f) +/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */ +#define MALLOCX_ALIGN_GET_SPECIFIED(flags) \ + (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)) +#define MALLOCX_ALIGN_GET(flags) \ + (MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX-1)) +#define MALLOCX_ZERO_GET(flags) \ + ((bool)(flags & MALLOCX_ZERO)) + +#define MALLOCX_TCACHE_GET(flags) \ + (((unsigned)((flags & MALLOCX_TCACHE_MASK) >> 8)) - 2) +#define MALLOCX_ARENA_GET(flags) \ + (((unsigned)(((unsigned)flags) >> 20)) - 1) /* Smallest size class to support. */ -#define LG_TINY_MIN 3 #define TINY_MIN (1U << LG_TINY_MIN) /* - * Minimum alignment of allocations is 2^LG_QUANTUM bytes (ignoring tiny size + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size * classes). */ #ifndef LG_QUANTUM # if (defined(__i386__) || defined(_M_IX86)) -# define LG_QUANTUM 3 +# define LG_QUANTUM 4 # endif # ifdef __ia64__ # define LG_QUANTUM 4 @@ -250,11 +232,11 @@ static const bool config_ivsalloc = # ifdef __alpha__ # define LG_QUANTUM 4 # endif -# ifdef __sparc64__ +# if (defined(__sparc64__) || defined(__sparcv9)) # define LG_QUANTUM 4 # endif # if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64)) -# define LG_QUANTUM 3 +# define LG_QUANTUM 4 # endif # ifdef __arm__ # define LG_QUANTUM 3 @@ -268,6 +250,9 @@ static const bool config_ivsalloc = # ifdef __mips__ # define LG_QUANTUM 3 # endif +# ifdef __or1k__ +# define LG_QUANTUM 3 +# endif # ifdef __powerpc__ # define LG_QUANTUM 4 # endif @@ -280,8 +265,12 @@ static const bool config_ivsalloc = # ifdef __tile__ # define LG_QUANTUM 4 # endif +# ifdef __le32__ +# define LG_QUANTUM 4 +# endif # ifndef LG_QUANTUM -# error "No LG_QUANTUM definition for architecture; specify via CPPFLAGS" +# error "Unknown minimum alignment for architecture; specify via " + "--with-lg-quantum" # endif #endif @@ -321,12 +310,11 @@ static const bool config_ivsalloc = #define CACHELINE_CEILING(s) \ (((s) + CACHELINE_MASK) & ~CACHELINE_MASK) -/* Page size. STATIC_PAGE_SHIFT is determined by the configure script. */ +/* Page size. LG_PAGE is determined by the configure script. 
*/ #ifdef PAGE_MASK # undef PAGE_MASK #endif -#define LG_PAGE STATIC_PAGE_SHIFT -#define PAGE ((size_t)(1U << STATIC_PAGE_SHIFT)) +#define PAGE ((size_t)(1U << LG_PAGE)) #define PAGE_MASK ((size_t)(PAGE - 1)) /* Return the smallest pagesize multiple that is >= s. */ @@ -345,7 +333,7 @@ static const bool config_ivsalloc = #define ALIGNMENT_CEILING(s, alignment) \ (((s) + (alignment - 1)) & (-(alignment))) -/* Declare a variable length array */ +/* Declare a variable-length array. */ #if __STDC_VERSION__ < 199901L # ifdef _MSC_VER # include <malloc.h> @@ -358,86 +346,12 @@ static const bool config_ivsalloc = # endif # endif # define VARIABLE_ARRAY(type, name, count) \ - type *name = alloca(sizeof(type) * count) -#else -# define VARIABLE_ARRAY(type, name, count) type name[count] -#endif - -#ifdef JEMALLOC_VALGRIND -/* - * The JEMALLOC_VALGRIND_*() macros must be macros rather than functions - * so that when Valgrind reports errors, there are no extra stack frames - * in the backtraces. - * - * The size that is reported to valgrind must be consistent through a chain of - * malloc..realloc..realloc calls. Request size isn't recorded anywhere in - * jemalloc, so it is critical that all callers of these macros provide usize - * rather than request size. As a result, buffer overflow detection is - * technically weakened for the standard API, though it is generally accepted - * practice to consider any extra bytes reported by malloc_usable_size() as - * usable space. - */ -#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do { \ - if (config_valgrind && opt_valgrind && cond) \ - VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, p2rz(ptr), zero); \ -} while (0) -#define JEMALLOC_VALGRIND_REALLOC(ptr, usize, old_ptr, old_usize, \ - old_rzsize, zero) do { \ - if (config_valgrind && opt_valgrind) { \ - size_t rzsize = p2rz(ptr); \ - \ - if (ptr == old_ptr) { \ - VALGRIND_RESIZEINPLACE_BLOCK(ptr, old_usize, \ - usize, rzsize); \ - if (zero && old_usize < usize) { \ - VALGRIND_MAKE_MEM_DEFINED( \ - (void *)((uintptr_t)ptr + \ - old_usize), usize - old_usize); \ - } \ - } else { \ - if (old_ptr != NULL) { \ - VALGRIND_FREELIKE_BLOCK(old_ptr, \ - old_rzsize); \ - } \ - if (ptr != NULL) { \ - size_t copy_size = (old_usize < usize) \ - ? 
old_usize : usize; \ - size_t tail_size = usize - copy_size; \ - VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, \ - rzsize, false); \ - if (copy_size > 0) { \ - VALGRIND_MAKE_MEM_DEFINED(ptr, \ - copy_size); \ - } \ - if (zero && tail_size > 0) { \ - VALGRIND_MAKE_MEM_DEFINED( \ - (void *)((uintptr_t)ptr + \ - copy_size), tail_size); \ - } \ - } \ - } \ - } \ -} while (0) -#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do { \ - if (config_valgrind && opt_valgrind) \ - VALGRIND_FREELIKE_BLOCK(ptr, rzsize); \ -} while (0) + type *name = alloca(sizeof(type) * (count)) #else -#define RUNNING_ON_VALGRIND ((unsigned)0) -#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \ - do {} while (0) -#define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB) \ - do {} while (0) -#define VALGRIND_FREELIKE_BLOCK(addr, rzB) do {} while (0) -#define VALGRIND_MAKE_MEM_NOACCESS(_qzz_addr, _qzz_len) do {} while (0) -#define VALGRIND_MAKE_MEM_UNDEFINED(_qzz_addr, _qzz_len) do {} while (0) -#define VALGRIND_MAKE_MEM_DEFINED(_qzz_addr, _qzz_len) do {} while (0) -#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do {} while (0) -#define JEMALLOC_VALGRIND_REALLOC(ptr, usize, old_ptr, old_usize, \ - old_rzsize, zero) do {} while (0) -#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do {} while (0) +# define VARIABLE_ARRAY(type, name, count) type name[(count)] #endif +#include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prng.h" @@ -452,9 +366,10 @@ static const bool config_ivsalloc = #include "jemalloc/internal/arena.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/pages.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" -#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" @@ -464,6 +379,7 @@ static const bool config_ivsalloc = /******************************************************************************/ #define JEMALLOC_H_STRUCTS +#include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prng.h" @@ -472,68 +388,83 @@ static const bool config_ivsalloc = #include "jemalloc/internal/stats.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" -#include "jemalloc/internal/tsd.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/bitmap.h" +#define JEMALLOC_ARENA_STRUCTS_A +#include "jemalloc/internal/arena.h" +#undef JEMALLOC_ARENA_STRUCTS_A #include "jemalloc/internal/extent.h" +#define JEMALLOC_ARENA_STRUCTS_B #include "jemalloc/internal/arena.h" +#undef JEMALLOC_ARENA_STRUCTS_B #include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/pages.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" -#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" #include "jemalloc/internal/prof.h" -typedef struct { - uint64_t allocated; - uint64_t deallocated; -} thread_allocated_t; -/* - * The JEMALLOC_ARG_CONCAT() wrapper is necessary to pass {0, 0} via a cpp macro - * argument. 
- */ -#define THREAD_ALLOCATED_INITIALIZER JEMALLOC_ARG_CONCAT({0, 0}) +#include "jemalloc/internal/tsd.h" #undef JEMALLOC_H_STRUCTS /******************************************************************************/ #define JEMALLOC_H_EXTERNS extern bool opt_abort; -extern bool opt_junk; +extern const char *opt_junk; +extern bool opt_junk_alloc; +extern bool opt_junk_free; extern size_t opt_quarantine; extern bool opt_redzone; extern bool opt_utrace; -extern bool opt_valgrind; extern bool opt_xmalloc; extern bool opt_zero; extern size_t opt_narenas; +extern bool in_valgrind; + /* Number of CPUs. */ extern unsigned ncpus; -/* Protects arenas initialization (arenas, arenas_total). */ -extern malloc_mutex_t arenas_lock; /* - * Arenas that are used to service external requests. Not all elements of the - * arenas array are necessarily used; arenas are created lazily as needed. - * - * arenas[0..narenas_auto) are used for automatic multiplexing of threads and - * arenas. arenas[narenas_auto..narenas_total) are only used if the application - * takes some action to create them and allocate from them. + * index2size_tab encodes the same information as could be computed (at + * unacceptable cost in some code paths) by index2size_compute(). */ -extern arena_t **arenas; -extern unsigned narenas_total; -extern unsigned narenas_auto; /* Read-only after initialization. */ - +extern size_t const index2size_tab[NSIZES]; +/* + * size2index_tab is a compact lookup table that rounds request sizes up to + * size classes. In order to reduce cache footprint, the table is compressed, + * and all accesses are via size2index(). + */ +extern uint8_t const size2index_tab[]; + +arena_t *a0get(void); +void *a0malloc(size_t size); +void a0dalloc(void *ptr); +void *bootstrap_malloc(size_t size); +void *bootstrap_calloc(size_t num, size_t size); +void bootstrap_free(void *ptr); arena_t *arenas_extend(unsigned ind); -void arenas_cleanup(void *arg); -arena_t *choose_arena_hard(void); +arena_t *arena_init(unsigned ind); +unsigned narenas_total_get(void); +arena_t *arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing); +arena_t *arena_choose_hard(tsd_t *tsd); +void arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind); +unsigned arena_nbound(unsigned ind); +void thread_allocated_cleanup(tsd_t *tsd); +void thread_deallocated_cleanup(tsd_t *tsd); +void arena_cleanup(tsd_t *tsd); +void arenas_cache_cleanup(tsd_t *tsd); +void narenas_cache_cleanup(tsd_t *tsd); +void arenas_cache_bypass_cleanup(tsd_t *tsd); void jemalloc_prefork(void); void jemalloc_postfork_parent(void); void jemalloc_postfork_child(void); +#include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prng.h" @@ -542,24 +473,26 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/stats.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" -#include "jemalloc/internal/tsd.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/pages.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" -#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" #include "jemalloc/internal/prof.h" +#include "jemalloc/internal/tsd.h" #undef 
JEMALLOC_H_EXTERNS /******************************************************************************/ #define JEMALLOC_H_INLINES +#include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prng.h" @@ -572,26 +505,158 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/mb.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/pages.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" #ifndef JEMALLOC_ENABLE_INLINE -malloc_tsd_protos(JEMALLOC_ATTR(unused), arenas, arena_t *) - +szind_t size2index_compute(size_t size); +szind_t size2index_lookup(size_t size); +szind_t size2index(size_t size); +size_t index2size_compute(szind_t index); +size_t index2size_lookup(szind_t index); +size_t index2size(szind_t index); +size_t s2u_compute(size_t size); +size_t s2u_lookup(size_t size); size_t s2u(size_t size); size_t sa2u(size_t size, size_t alignment); -unsigned narenas_total_get(void); -arena_t *choose_arena(arena_t *arena); +arena_t *arena_choose(tsd_t *tsd, arena_t *arena); +arena_t *arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, + bool refresh_if_missing); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) -/* - * Map of pthread_self() --> arenas[???], used for selecting an arena to use - * for allocations. - */ -malloc_tsd_externs(arenas, arena_t *) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, arenas, arena_t *, NULL, - arenas_cleanup) +JEMALLOC_INLINE szind_t +size2index_compute(size_t size) +{ + +#if (NTBINS != 0) + if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { + size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; + size_t lg_ceil = lg_floor(pow2_ceil(size)); + return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin); + } +#endif + { + size_t x = unlikely(ZI(size) < 0) ? ((size<<1) ? + (ZU(1)<<(LG_SIZEOF_PTR+3)) : ((ZU(1)<<(LG_SIZEOF_PTR+3))-1)) + : lg_floor((size<<1)-1); + size_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 0 : + x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM); + size_t grp = shift << LG_SIZE_CLASS_GROUP; + + size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; + + size_t delta_inverse_mask = ZI(-1) << lg_delta; + size_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & + ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); + + size_t index = NTBINS + grp + mod; + return (index); + } +} + +JEMALLOC_ALWAYS_INLINE szind_t +size2index_lookup(size_t size) +{ + + assert(size <= LOOKUP_MAXCLASS); + { + size_t ret = ((size_t)(size2index_tab[(size-1) >> + LG_TINY_MIN])); + assert(ret == size2index_compute(size)); + return (ret); + } +} + +JEMALLOC_ALWAYS_INLINE szind_t +size2index(size_t size) +{ + + assert(size > 0); + if (likely(size <= LOOKUP_MAXCLASS)) + return (size2index_lookup(size)); + return (size2index_compute(size)); +} + +JEMALLOC_INLINE size_t +index2size_compute(szind_t index) +{ + +#if (NTBINS > 0) + if (index < NTBINS) + return (ZU(1) << (LG_TINY_MAXCLASS - NTBINS + 1 + index)); +#endif + { + size_t reduced_index = index - NTBINS; + size_t grp = reduced_index >> LG_SIZE_CLASS_GROUP; + size_t mod = reduced_index & ((ZU(1) << LG_SIZE_CLASS_GROUP) - + 1); + + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_QUANTUM + + (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask; + + size_t shift = (grp == 0) ? 
1 : grp; + size_t lg_delta = shift + (LG_QUANTUM-1); + size_t mod_size = (mod+1) << lg_delta; + + size_t usize = grp_size + mod_size; + return (usize); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +index2size_lookup(szind_t index) +{ + size_t ret = (size_t)index2size_tab[index]; + assert(ret == index2size_compute(index)); + return (ret); +} + +JEMALLOC_ALWAYS_INLINE size_t +index2size(szind_t index) +{ + + assert(index < NSIZES); + return (index2size_lookup(index)); +} + +JEMALLOC_ALWAYS_INLINE size_t +s2u_compute(size_t size) +{ + +#if (NTBINS > 0) + if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { + size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; + size_t lg_ceil = lg_floor(pow2_ceil(size)); + return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : + (ZU(1) << lg_ceil)); + } +#endif + { + size_t x = unlikely(ZI(size) < 0) ? ((size<<1) ? + (ZU(1)<<(LG_SIZEOF_PTR+3)) : ((ZU(1)<<(LG_SIZEOF_PTR+3))-1)) + : lg_floor((size<<1)-1); + size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (size + delta_mask) & ~delta_mask; + return (usize); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +s2u_lookup(size_t size) +{ + size_t ret = index2size_lookup(size2index_lookup(size)); + + assert(ret == s2u_compute(size)); + return (ret); +} /* * Compute usable size that would result from allocating an object with the @@ -601,11 +666,10 @@ JEMALLOC_ALWAYS_INLINE size_t s2u(size_t size) { - if (size <= SMALL_MAXCLASS) - return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size); - if (size <= arena_maxclass) - return (PAGE_CEILING(size)); - return (CHUNK_CEILING(size)); + assert(size > 0); + if (likely(size <= LOOKUP_MAXCLASS)) + return (s2u_lookup(size)); + return (s2u_compute(size)); } /* @@ -619,108 +683,128 @@ sa2u(size_t size, size_t alignment) assert(alignment != 0 && ((alignment - 1) & alignment) == 0); - /* - * Round size up to the nearest multiple of alignment. - * - * This done, we can take advantage of the fact that for each small - * size class, every object is aligned at the smallest power of two - * that is non-zero in the base two representation of the size. For - * example: - * - * Size | Base 2 | Minimum alignment - * -----+----------+------------------ - * 96 | 1100000 | 32 - * 144 | 10100000 | 32 - * 192 | 11000000 | 64 - */ - usize = ALIGNMENT_CEILING(size, alignment); - /* - * (usize < size) protects against the combination of maximal - * alignment and size greater than maximal alignment. - */ - if (usize < size) { - /* size_t overflow. */ - return (0); + /* Try for a small size class. */ + if (size <= SMALL_MAXCLASS && alignment < PAGE) { + /* + * Round size up to the nearest multiple of alignment. + * + * This done, we can take advantage of the fact that for each + * small size class, every object is aligned at the smallest + * power of two that is non-zero in the base two representation + * of the size. For example: + * + * Size | Base 2 | Minimum alignment + * -----+----------+------------------ + * 96 | 1100000 | 32 + * 144 | 10100000 | 32 + * 192 | 11000000 | 64 + */ + usize = s2u(ALIGNMENT_CEILING(size, alignment)); + if (usize < LARGE_MINCLASS) + return (usize); } - if (usize <= arena_maxclass && alignment <= PAGE) { - if (usize <= SMALL_MAXCLASS) - return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size); - return (PAGE_CEILING(usize)); - } else { - size_t run_size; - + /* Try for a large size class. 
*/ + if (likely(size <= large_maxclass) && likely(alignment < chunksize)) { /* * We can't achieve subpage alignment, so round up alignment - * permanently; it makes later calculations simpler. + * to the minimum that can actually be supported. */ alignment = PAGE_CEILING(alignment); - usize = PAGE_CEILING(size); - /* - * (usize < size) protects against very large sizes within - * PAGE of SIZE_T_MAX. - * - * (usize + alignment < usize) protects against the - * combination of maximal alignment and usize large enough - * to cause overflow. This is similar to the first overflow - * check above, but it needs to be repeated due to the new - * usize value, which may now be *equal* to maximal - * alignment, whereas before we only detected overflow if the - * original size was *greater* than maximal alignment. - */ - if (usize < size || usize + alignment < usize) { - /* size_t overflow. */ - return (0); - } + + /* Make sure result is a large size class. */ + usize = (size <= LARGE_MINCLASS) ? LARGE_MINCLASS : s2u(size); /* * Calculate the size of the over-size run that arena_palloc() * would need to allocate in order to guarantee the alignment. - * If the run wouldn't fit within a chunk, round up to a huge - * allocation size. */ - run_size = usize + alignment - PAGE; - if (run_size <= arena_maxclass) - return (PAGE_CEILING(usize)); - return (CHUNK_CEILING(usize)); + if (usize + large_pad + alignment - PAGE <= arena_maxrun) + return (usize); } -} -JEMALLOC_INLINE unsigned -narenas_total_get(void) -{ - unsigned narenas; + /* Huge size class. Beware of size_t overflow. */ - malloc_mutex_lock(&arenas_lock); - narenas = narenas_total; - malloc_mutex_unlock(&arenas_lock); + /* + * We can't achieve subchunk alignment, so round up alignment to the + * minimum that can actually be supported. + */ + alignment = CHUNK_CEILING(alignment); + if (alignment == 0) { + /* size_t overflow. */ + return (0); + } + + /* Make sure result is a huge size class. */ + if (size <= chunksize) + usize = chunksize; + else { + usize = s2u(size); + if (usize < size) { + /* size_t overflow. */ + return (0); + } + } - return (narenas); + /* + * Calculate the multi-chunk mapping that huge_palloc() would need in + * order to guarantee the alignment. + */ + if (usize + alignment - PAGE < usize) { + /* size_t overflow. */ + return (0); + } + return (usize); } /* Choose an arena based on a per-thread value. */ JEMALLOC_INLINE arena_t * -choose_arena(arena_t *arena) +arena_choose(tsd_t *tsd, arena_t *arena) { arena_t *ret; if (arena != NULL) return (arena); - if ((ret = *arenas_tsd_get()) == NULL) { - ret = choose_arena_hard(); - assert(ret != NULL); - } + if (unlikely((ret = tsd_arena_get(tsd)) == NULL)) + ret = arena_choose_hard(tsd); return (ret); } + +JEMALLOC_INLINE arena_t * +arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, + bool refresh_if_missing) +{ + arena_t *arena; + arena_t **arenas_cache = tsd_arenas_cache_get(tsd); + + /* init_if_missing requires refresh_if_missing. */ + assert(!init_if_missing || refresh_if_missing); + + if (unlikely(arenas_cache == NULL)) { + /* arenas_cache hasn't been initialized yet. */ + return (arena_get_hard(tsd, ind, init_if_missing)); + } + if (unlikely(ind >= tsd_narenas_cache_get(tsd))) { + /* + * ind is invalid, cache is old (too small), or arena to be + * initialized. + */ + return (refresh_if_missing ? 
arena_get_hard(tsd, ind, + init_if_missing) : NULL); + } + arena = arenas_cache[ind]; + if (likely(arena != NULL) || !refresh_if_missing) + return (arena); + return (arena_get_hard(tsd, ind, init_if_missing)); +} #endif #include "jemalloc/internal/bitmap.h" -#include "jemalloc/internal/rtree.h" /* - * Include arena.h twice in order to resolve circular dependencies with - * tcache.h. + * Include portions of arena.h interleaved with tcache.h in order to resolve + * circular dependencies. */ #define JEMALLOC_ARENA_INLINE_A #include "jemalloc/internal/arena.h" @@ -733,133 +817,155 @@ choose_arena(arena_t *arena) #include "jemalloc/internal/quarantine.h" #ifndef JEMALLOC_ENABLE_INLINE -void *imalloct(size_t size, bool try_tcache, arena_t *arena); -void *imalloc(size_t size); -void *icalloct(size_t size, bool try_tcache, arena_t *arena); -void *icalloc(size_t size); -void *ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena); -void *ipalloc(size_t usize, size_t alignment, bool zero); +arena_t *iaalloc(const void *ptr); size_t isalloc(const void *ptr, bool demote); +void *iallocztm(tsd_t *tsd, size_t size, bool zero, tcache_t *tcache, + bool is_metadata, arena_t *arena); +void *imalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena); +void *imalloc(tsd_t *tsd, size_t size); +void *icalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena); +void *icalloc(tsd_t *tsd, size_t size); +void *ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + tcache_t *tcache, bool is_metadata, arena_t *arena); +void *ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + tcache_t *tcache, arena_t *arena); +void *ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero); size_t ivsalloc(const void *ptr, bool demote); size_t u2rz(size_t usize); size_t p2rz(const void *ptr); -void idalloct(void *ptr, bool try_tcache); -void idalloc(void *ptr); -void iqalloct(void *ptr, bool try_tcache); -void iqalloc(void *ptr); -void *iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, - size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, +void idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata); +void idalloct(tsd_t *tsd, void *ptr, tcache_t *tcache); +void idalloc(tsd_t *tsd, void *ptr); +void iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); +void isdalloct(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); +void isqalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); +void *iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t extra, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena); -void *iralloct(void *ptr, size_t size, size_t extra, size_t alignment, - bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); -void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment, - bool zero); -bool ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, - bool zero); -malloc_tsd_protos(JEMALLOC_ATTR(unused), thread_allocated, thread_allocated_t) +void *iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t alignment, bool zero, tcache_t *tcache, arena_t *arena); +void *iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t alignment, bool zero); +bool ixalloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +JEMALLOC_ALWAYS_INLINE arena_t * +iaalloc(const void *ptr) +{ + + 
assert(ptr != NULL); + + return (arena_aalloc(ptr)); +} + +/* + * Typical usage: + * void *ptr = [...] + * size_t sz = isalloc(ptr, config_prof); + */ +JEMALLOC_ALWAYS_INLINE size_t +isalloc(const void *ptr, bool demote) +{ + + assert(ptr != NULL); + /* Demotion only makes sense if config_prof is true. */ + assert(config_prof || !demote); + + return (arena_salloc(ptr, demote)); +} + JEMALLOC_ALWAYS_INLINE void * -imalloct(size_t size, bool try_tcache, arena_t *arena) +iallocztm(tsd_t *tsd, size_t size, bool zero, tcache_t *tcache, bool is_metadata, + arena_t *arena) { + void *ret; assert(size != 0); - if (size <= arena_maxclass) - return (arena_malloc(arena, size, false, try_tcache)); - else - return (huge_malloc(size, false, huge_dss_prec_get(arena))); + ret = arena_malloc(tsd, arena, size, zero, tcache); + if (config_stats && is_metadata && likely(ret != NULL)) { + arena_metadata_allocated_add(iaalloc(ret), isalloc(ret, + config_prof)); + } + return (ret); } JEMALLOC_ALWAYS_INLINE void * -imalloc(size_t size) +imalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena) { - return (imalloct(size, true, NULL)); + return (iallocztm(tsd, size, false, tcache, false, arena)); } JEMALLOC_ALWAYS_INLINE void * -icalloct(size_t size, bool try_tcache, arena_t *arena) +imalloc(tsd_t *tsd, size_t size) { - if (size <= arena_maxclass) - return (arena_malloc(arena, size, true, try_tcache)); - else - return (huge_malloc(size, true, huge_dss_prec_get(arena))); + return (iallocztm(tsd, size, false, tcache_get(tsd, true), false, NULL)); } JEMALLOC_ALWAYS_INLINE void * -icalloc(size_t size) +icalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena) { - return (icalloct(size, true, NULL)); + return (iallocztm(tsd, size, true, tcache, false, arena)); } JEMALLOC_ALWAYS_INLINE void * -ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena) +icalloc(tsd_t *tsd, size_t size) +{ + + return (iallocztm(tsd, size, true, tcache_get(tsd, true), false, NULL)); +} + +JEMALLOC_ALWAYS_INLINE void * +ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + tcache_t *tcache, bool is_metadata, arena_t *arena) { void *ret; assert(usize != 0); assert(usize == sa2u(usize, alignment)); - if (usize <= arena_maxclass && alignment <= PAGE) - ret = arena_malloc(arena, usize, zero, try_tcache); - else { - if (usize <= arena_maxclass) { - ret = arena_palloc(choose_arena(arena), usize, - alignment, zero); - } else if (alignment <= chunksize) - ret = huge_malloc(usize, zero, huge_dss_prec_get(arena)); - else - ret = huge_palloc(usize, alignment, zero, huge_dss_prec_get(arena)); - } - + ret = arena_palloc(tsd, arena, usize, alignment, zero, tcache); assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); + if (config_stats && is_metadata && likely(ret != NULL)) { + arena_metadata_allocated_add(iaalloc(ret), isalloc(ret, + config_prof)); + } return (ret); } JEMALLOC_ALWAYS_INLINE void * -ipalloc(size_t usize, size_t alignment, bool zero) +ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + tcache_t *tcache, arena_t *arena) { - return (ipalloct(usize, alignment, zero, true, NULL)); + return (ipallocztm(tsd, usize, alignment, zero, tcache, false, arena)); } -/* - * Typical usage: - * void *ptr = [...] 
- * size_t sz = isalloc(ptr, config_prof); - */ -JEMALLOC_ALWAYS_INLINE size_t -isalloc(const void *ptr, bool demote) +JEMALLOC_ALWAYS_INLINE void * +ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) { - size_t ret; - arena_chunk_t *chunk; - - assert(ptr != NULL); - /* Demotion only makes sense if config_prof is true. */ - assert(config_prof || demote == false); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) - ret = arena_salloc(ptr, demote); - else - ret = huge_salloc(ptr); - - return (ret); + return (ipallocztm(tsd, usize, alignment, zero, tcache_get(tsd, + NULL), false, NULL)); } JEMALLOC_ALWAYS_INLINE size_t ivsalloc(const void *ptr, bool demote) { + extent_node_t *node; /* Return 0 if ptr is not within a chunk managed by jemalloc. */ - if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == 0) + node = chunk_lookup(ptr, false); + if (node == NULL) return (0); + /* Only arena chunks should be looked up via interior pointers. */ + assert(extent_node_addr_get(node) == ptr || + extent_node_achunk_get(node)); return (isalloc(ptr, demote)); } @@ -870,7 +976,7 @@ u2rz(size_t usize) size_t ret; if (usize <= SMALL_MAXCLASS) { - size_t binind = SMALL_SIZE2BIN(usize); + szind_t binind = size2index(usize); ret = arena_bin_info[binind].redzone_size; } else ret = 0; @@ -887,47 +993,62 @@ p2rz(const void *ptr) } JEMALLOC_ALWAYS_INLINE void -idalloct(void *ptr, bool try_tcache) +idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata) { - arena_chunk_t *chunk; assert(ptr != NULL); + if (config_stats && is_metadata) { + arena_metadata_allocated_sub(iaalloc(ptr), isalloc(ptr, + config_prof)); + } - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) - arena_dalloc(chunk->arena, chunk, ptr, try_tcache); - else - huge_dalloc(ptr, true); + arena_dalloc(tsd, ptr, tcache); +} + +JEMALLOC_ALWAYS_INLINE void +idalloct(tsd_t *tsd, void *ptr, tcache_t *tcache) +{ + + idalloctm(tsd, ptr, tcache, false); } JEMALLOC_ALWAYS_INLINE void -idalloc(void *ptr) +idalloc(tsd_t *tsd, void *ptr) { - idalloct(ptr, true); + idalloctm(tsd, ptr, tcache_get(tsd, false), false); } JEMALLOC_ALWAYS_INLINE void -iqalloct(void *ptr, bool try_tcache) +iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) { - if (config_fill && opt_quarantine) - quarantine(ptr); + if (config_fill && unlikely(opt_quarantine)) + quarantine(tsd, ptr); else - idalloct(ptr, try_tcache); + idalloctm(tsd, ptr, tcache, false); +} + +JEMALLOC_ALWAYS_INLINE void +isdalloct(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) +{ + + arena_sdalloc(tsd, ptr, size, tcache); } JEMALLOC_ALWAYS_INLINE void -iqalloc(void *ptr) +isqalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) { - iqalloct(ptr, true); + if (config_fill && unlikely(opt_quarantine)) + quarantine(tsd, ptr); + else + isdalloct(tsd, ptr, size, tcache); } JEMALLOC_ALWAYS_INLINE void * -iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, - size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena) +iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t extra, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena) { void *p; size_t usize, copysize; @@ -935,7 +1056,7 @@ iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, usize = sa2u(size + extra, alignment); if (usize == 0) return (NULL); - p = ipalloct(usize, alignment, zero, try_tcache_alloc, arena); + p = ipalloct(tsd, usize, alignment, zero, tcache, arena); if (p == NULL) { if 
(extra == 0) return (NULL); @@ -943,7 +1064,7 @@ iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, usize = sa2u(size, alignment); if (usize == 0) return (NULL); - p = ipalloct(usize, alignment, zero, try_tcache_alloc, arena); + p = ipalloct(tsd, usize, alignment, zero, tcache, arena); if (p == NULL) return (NULL); } @@ -953,72 +1074,57 @@ iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, */ copysize = (size < oldsize) ? size : oldsize; memcpy(p, ptr, copysize); - iqalloct(ptr, try_tcache_dalloc); + isqalloc(tsd, ptr, oldsize, tcache); return (p); } JEMALLOC_ALWAYS_INLINE void * -iralloct(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, - bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena) +iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, + bool zero, tcache_t *tcache, arena_t *arena) { - size_t oldsize; assert(ptr != NULL); assert(size != 0); - oldsize = isalloc(ptr, config_prof); - if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) != 0) { /* * Existing object alignment is inadequate; allocate new space * and copy. */ - return (iralloct_realign(ptr, oldsize, size, extra, alignment, - zero, try_tcache_alloc, try_tcache_dalloc, arena)); + return (iralloct_realign(tsd, ptr, oldsize, size, 0, alignment, + zero, tcache, arena)); } - if (size + extra <= arena_maxclass) { - return (arena_ralloc(arena, ptr, oldsize, size, extra, - alignment, zero, try_tcache_alloc, - try_tcache_dalloc)); - } else { - return (huge_ralloc(ptr, oldsize, size, extra, - alignment, zero, try_tcache_dalloc, huge_dss_prec_get(arena))); - } + return (arena_ralloc(tsd, arena, ptr, oldsize, size, alignment, zero, + tcache)); } JEMALLOC_ALWAYS_INLINE void * -iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero) +iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, + bool zero) { - return (iralloct(ptr, size, extra, alignment, zero, true, true, NULL)); + return (iralloct(tsd, ptr, oldsize, size, alignment, zero, + tcache_get(tsd, true), NULL)); } JEMALLOC_ALWAYS_INLINE bool -ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero) +ixalloc(void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, + bool zero) { - size_t oldsize; assert(ptr != NULL); assert(size != 0); - oldsize = isalloc(ptr, config_prof); if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) != 0) { /* Existing object alignment is inadequate. 
*/ return (true); } - if (size <= arena_maxclass) - return (arena_ralloc_no_move(ptr, oldsize, size, extra, zero)); - else - return (huge_ralloc_no_move(ptr, oldsize, size, extra)); + return (arena_ralloc_no_move(ptr, oldsize, size, extra, zero)); } - -malloc_tsd_externs(thread_allocated, thread_allocated_t) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, thread_allocated, thread_allocated_t, - THREAD_ALLOCATED_INITIALIZER, malloc_tsd_no_cleanup) #endif #include "jemalloc/internal/prof.h" diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h new file mode 100644 index 000000000..a601d6ebb --- /dev/null +++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h @@ -0,0 +1,64 @@ +#ifndef JEMALLOC_INTERNAL_DECLS_H +#define JEMALLOC_INTERNAL_DECLS_H + +#include <math.h> +#ifdef _WIN32 +# include <windows.h> +# include "msvc_compat/windows_extra.h" + +#else +# include <sys/param.h> +# include <sys/mman.h> +# if !defined(__pnacl__) && !defined(__native_client__) +# include <sys/syscall.h> +# if !defined(SYS_write) && defined(__NR_write) +# define SYS_write __NR_write +# endif +# include <sys/uio.h> +# endif +# include <pthread.h> +# include <errno.h> +#endif +#include <sys/types.h> + +#include <limits.h> +#ifndef SIZE_T_MAX +# define SIZE_T_MAX SIZE_MAX +#endif +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stddef.h> +#ifndef offsetof +# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) +#endif +#include <string.h> +#include <strings.h> +#include <ctype.h> +#ifdef _MSC_VER +# include <io.h> +typedef intptr_t ssize_t; +# define PATH_MAX 1024 +# define STDERR_FILENO 2 +# define __func__ __FUNCTION__ +# ifdef JEMALLOC_HAS_RESTRICT +# define restrict __restrict +# endif +/* Disable warnings about deprecated system functions. */ +# pragma warning(disable: 4996) +#if _MSC_VER < 1800 +static int +isblank(int c) +{ + + return (c == '\t' || c == ' '); +} +#endif +#else +# include <unistd.h> +#endif +#include <fcntl.h> + +#endif /* JEMALLOC_INTERNAL_H */ diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in index c166fbd9e..b0f8caaf8 100644 --- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -22,6 +22,9 @@ */ #undef CPU_SPINWAIT +/* Defined if C11 atomics are available. */ +#undef JEMALLOC_C11ATOMICS + /* Defined if the equivalent of FreeBSD's atomic(9) functions are available. */ #undef JEMALLOC_ATOMIC9 @@ -35,7 +38,7 @@ * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the - * functions are defined in libgcc instead of being inlines) + * functions are defined in libgcc instead of being inlines). */ #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4 @@ -43,17 +46,37 @@ * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the - * functions are defined in libgcc instead of being inlines) + * functions are defined in libgcc instead of being inlines). */ #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8 /* + * Defined if __builtin_clz() and __builtin_clzl() are available. 
+ */ +#undef JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if madvise(2) is available. + */ +#undef JEMALLOC_HAVE_MADVISE + +/* * Defined if OSSpin*() functions are available, as provided by Darwin, and * documented in the spinlock(3) manual page. */ #undef JEMALLOC_OSSPIN /* + * Defined if secure_getenv(3) is available. + */ +#undef JEMALLOC_HAVE_SECURE_GETENV + +/* + * Defined if issetugid(2) is available. + */ +#undef JEMALLOC_HAVE_ISSETUGID + +/* * Defined if _malloc_thread_cleanup() exists. At least in the case of * FreeBSD, pthread_key_create() allocates, which if used during malloc * bootstrapping will cause recursion into the pthreads library. Therefore, if @@ -76,9 +99,6 @@ */ #undef JEMALLOC_MUTEX_INIT_CB -/* Defined if sbrk() is supported. */ -#undef JEMALLOC_HAVE_SBRK - /* Non-empty if the tls_model attribute is supported. */ #undef JEMALLOC_TLS_MODEL @@ -137,8 +157,26 @@ /* Support lazy locking (avoid locking unless a second thread is launched). */ #undef JEMALLOC_LAZY_LOCK -/* One page is 2^STATIC_PAGE_SHIFT bytes. */ -#undef STATIC_PAGE_SHIFT +/* Minimum size class to support is 2^LG_TINY_MIN bytes. */ +#undef LG_TINY_MIN + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +#undef LG_QUANTUM + +/* One page is 2^LG_PAGE bytes. */ +#undef LG_PAGE + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#undef JEMALLOC_MAPS_COALESCE /* * If defined, use munmap() to unmap freed chunks, rather than storing them for @@ -147,23 +185,29 @@ */ #undef JEMALLOC_MUNMAP -/* - * If defined, use mremap(...MREMAP_FIXED...) for huge realloc(). This is - * disabled by default because it is Linux-specific and it will cause virtual - * memory map holes, much like munmap(2) does. - */ -#undef JEMALLOC_MREMAP - /* TLS is used to map arenas and magazine caches to threads. */ #undef JEMALLOC_TLS /* + * ffs()/ffsl() functions to use for bitmapping. Don't use these directly; + * instead, use jemalloc_ffs() or jemalloc_ffsl() from util.h. + */ +#undef JEMALLOC_INTERNAL_FFSL +#undef JEMALLOC_INTERNAL_FFS + +/* * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside * within jemalloc-owned chunks before dereferencing them. */ #undef JEMALLOC_IVSALLOC /* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#undef JEMALLOC_CACHE_OBLIVIOUS + +/* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ #undef JEMALLOC_ZONE @@ -182,9 +226,7 @@ #undef JEMALLOC_PURGE_MADVISE_DONTNEED #undef JEMALLOC_PURGE_MADVISE_FREE -/* - * Define if operating system has alloca.h header. - */ +/* Define if operating system has alloca.h header. */ #undef JEMALLOC_HAS_ALLOCA_H /* C99 restrict keyword supported. */ @@ -202,4 +244,19 @@ /* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ #undef LG_SIZEOF_INTMAX_T +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#undef JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#undef JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* Adaptive mutex support in pthreads. 
*/ +#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +#undef JEMALLOC_EXPORT + #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h index 4e2392302..a08ba772e 100644 --- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h @@ -39,9 +39,15 @@ #endif #define ZU(z) ((size_t)z) +#define ZI(z) ((ssize_t)z) #define QU(q) ((uint64_t)q) #define QI(q) ((int64_t)q) +#define KZU(z) ZU(z##ULL) +#define KZI(z) ZI(z##LL) +#define KQU(q) QU(q##ULL) +#define KQI(q) QI(q##LL) + #ifndef __DECONST # define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) #endif diff --git a/deps/jemalloc/include/jemalloc/internal/mutex.h b/deps/jemalloc/include/jemalloc/internal/mutex.h index de44e1435..f051f2917 100644 --- a/deps/jemalloc/include/jemalloc/internal/mutex.h +++ b/deps/jemalloc/include/jemalloc/internal/mutex.h @@ -10,7 +10,7 @@ typedef struct malloc_mutex_s malloc_mutex_t; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) # define MALLOC_MUTEX_INITIALIZER {PTHREAD_MUTEX_INITIALIZER, NULL} #else -# if (defined(PTHREAD_MUTEX_ADAPTIVE_NP) && \ +# if (defined(JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP) && \ defined(PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP)) # define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_ADAPTIVE_NP # define MALLOC_MUTEX_INITIALIZER {PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP} @@ -26,7 +26,11 @@ typedef struct malloc_mutex_s malloc_mutex_t; struct malloc_mutex_s { #ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + SRWLOCK lock; +# else CRITICAL_SECTION lock; +# endif #elif (defined(JEMALLOC_OSSPIN)) OSSpinLock lock; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) @@ -70,7 +74,11 @@ malloc_mutex_lock(malloc_mutex_t *mutex) if (isthreaded) { #ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + AcquireSRWLockExclusive(&mutex->lock); +# else EnterCriticalSection(&mutex->lock); +# endif #elif (defined(JEMALLOC_OSSPIN)) OSSpinLockLock(&mutex->lock); #else @@ -85,7 +93,11 @@ malloc_mutex_unlock(malloc_mutex_t *mutex) if (isthreaded) { #ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + ReleaseSRWLockExclusive(&mutex->lock); +# else LeaveCriticalSection(&mutex->lock); +# endif #elif (defined(JEMALLOC_OSSPIN)) OSSpinLockUnlock(&mutex->lock); #else diff --git a/deps/jemalloc/include/jemalloc/internal/pages.h b/deps/jemalloc/include/jemalloc/internal/pages.h new file mode 100644 index 000000000..da7eb9686 --- /dev/null +++ b/deps/jemalloc/include/jemalloc/internal/pages.h @@ -0,0 +1,26 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +void *pages_map(void *addr, size_t size); +void pages_unmap(void *addr, size_t size); +void *pages_trim(void *addr, size_t alloc_size, size_t leadsize, + size_t size); +bool pages_commit(void *addr, size_t size); +bool pages_decommit(void *addr, size_t size); +bool pages_purge(void *addr, size_t size); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + 
+#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ + diff --git a/deps/jemalloc/include/jemalloc/internal/private_symbols.txt b/deps/jemalloc/include/jemalloc/internal/private_symbols.txt index 93516d242..a90021aa6 100644 --- a/deps/jemalloc/include/jemalloc/internal/private_symbols.txt +++ b/deps/jemalloc/include/jemalloc/internal/private_symbols.txt @@ -1,44 +1,76 @@ -a0calloc -a0free +a0dalloc +a0get a0malloc +arena_aalloc arena_alloc_junk_small arena_bin_index arena_bin_info +arena_bitselm_get arena_boot +arena_choose +arena_choose_hard +arena_chunk_alloc_huge +arena_chunk_cache_maybe_insert +arena_chunk_cache_maybe_remove +arena_chunk_dalloc_huge +arena_chunk_ralloc_huge_expand +arena_chunk_ralloc_huge_shrink +arena_chunk_ralloc_huge_similar +arena_cleanup arena_dalloc arena_dalloc_bin -arena_dalloc_bin_locked +arena_dalloc_bin_junked_locked arena_dalloc_junk_large arena_dalloc_junk_small arena_dalloc_large -arena_dalloc_large_locked +arena_dalloc_large_junked_locked arena_dalloc_small arena_dss_prec_get arena_dss_prec_set +arena_get +arena_get_hard +arena_init +arena_lg_dirty_mult_default_get +arena_lg_dirty_mult_default_set +arena_lg_dirty_mult_get +arena_lg_dirty_mult_set arena_malloc arena_malloc_large arena_malloc_small arena_mapbits_allocated_get arena_mapbits_binind_get +arena_mapbits_decommitted_get arena_mapbits_dirty_get arena_mapbits_get +arena_mapbits_internal_set arena_mapbits_large_binind_set arena_mapbits_large_get arena_mapbits_large_set arena_mapbits_large_size_get +arena_mapbitsp_get +arena_mapbitsp_read +arena_mapbitsp_write +arena_mapbits_size_decode +arena_mapbits_size_encode arena_mapbits_small_runind_get arena_mapbits_small_set arena_mapbits_unallocated_set arena_mapbits_unallocated_size_get arena_mapbits_unallocated_size_set arena_mapbits_unzeroed_get -arena_mapbits_unzeroed_set -arena_mapbitsp_get -arena_mapbitsp_read -arena_mapbitsp_write -arena_mapp_get -arena_maxclass +arena_maxrun +arena_maybe_purge +arena_metadata_allocated_add +arena_metadata_allocated_get +arena_metadata_allocated_sub +arena_migrate +arena_miscelm_get +arena_miscelm_to_pageind +arena_miscelm_to_rpages +arena_nbound arena_new +arena_node_alloc +arena_node_dalloc arena_palloc arena_postfork_child arena_postfork_parent @@ -46,50 +78,47 @@ arena_prefork arena_prof_accum arena_prof_accum_impl arena_prof_accum_locked -arena_prof_ctx_get -arena_prof_ctx_set arena_prof_promoted +arena_prof_tctx_get +arena_prof_tctx_reset +arena_prof_tctx_set arena_ptr_small_binind_get arena_purge_all arena_quarantine_junk_small arena_ralloc arena_ralloc_junk_large arena_ralloc_no_move +arena_rd_to_miscelm arena_redzone_corruption arena_run_regind +arena_run_to_miscelm arena_salloc +arenas_cache_bypass_cleanup +arenas_cache_cleanup +arena_sdalloc arena_stats_merge arena_tcache_fill_small -arenas -arenas_booted -arenas_cleanup -arenas_extend -arenas_initialized -arenas_lock -arenas_tls -arenas_tsd -arenas_tsd_boot -arenas_tsd_cleanup_wrapper -arenas_tsd_get -arenas_tsd_get_wrapper -arenas_tsd_init_head -arenas_tsd_set +atomic_add_p atomic_add_u atomic_add_uint32 atomic_add_uint64 atomic_add_z +atomic_cas_p +atomic_cas_u +atomic_cas_uint32 +atomic_cas_uint64 +atomic_cas_z +atomic_sub_p atomic_sub_u atomic_sub_uint32 atomic_sub_uint64 atomic_sub_z base_alloc base_boot -base_calloc -base_node_alloc -base_node_dealloc base_postfork_child base_postfork_parent base_prefork +base_stats_get bitmap_full bitmap_get bitmap_info_init @@ -99,49 +128,54 @@ 
bitmap_set bitmap_sfu bitmap_size bitmap_unset +bootstrap_calloc +bootstrap_free +bootstrap_malloc bt_init buferror -choose_arena -choose_arena_hard -chunk_alloc +chunk_alloc_base +chunk_alloc_cache chunk_alloc_dss chunk_alloc_mmap +chunk_alloc_wrapper chunk_boot -chunk_dealloc -chunk_dealloc_mmap +chunk_dalloc_arena +chunk_dalloc_cache +chunk_dalloc_mmap +chunk_dalloc_wrapper +chunk_deregister chunk_dss_boot chunk_dss_postfork_child chunk_dss_postfork_parent chunk_dss_prec_get chunk_dss_prec_set chunk_dss_prefork +chunk_hooks_default +chunk_hooks_get +chunk_hooks_set chunk_in_dss +chunk_lookup chunk_npages chunk_postfork_child chunk_postfork_parent chunk_prefork -chunk_unmap -chunks_mtx -chunks_rtree +chunk_purge_arena +chunk_purge_wrapper +chunk_register chunksize chunksize_mask -ckh_bucket_search +chunks_rtree ckh_count ckh_delete -ckh_evict_reloc_insert ckh_insert -ckh_isearch ckh_iter ckh_new ckh_pointer_hash ckh_pointer_keycomp -ckh_rebuild ckh_remove ckh_search ckh_string_hash ckh_string_keycomp -ckh_try_bucket_insert -ckh_try_insert ctl_boot ctl_bymib ctl_byname @@ -150,6 +184,23 @@ ctl_postfork_child ctl_postfork_parent ctl_prefork dss_prec_names +extent_node_achunk_get +extent_node_achunk_set +extent_node_addr_get +extent_node_addr_set +extent_node_arena_get +extent_node_arena_set +extent_node_dirty_insert +extent_node_dirty_linkage_init +extent_node_dirty_remove +extent_node_init +extent_node_prof_tctx_get +extent_node_prof_tctx_set +extent_node_size_get +extent_node_size_set +extent_node_zeroed_get +extent_node_zeroed_set +extent_tree_ad_empty extent_tree_ad_first extent_tree_ad_insert extent_tree_ad_iter @@ -166,6 +217,7 @@ extent_tree_ad_reverse_iter extent_tree_ad_reverse_iter_recurse extent_tree_ad_reverse_iter_start extent_tree_ad_search +extent_tree_szad_empty extent_tree_szad_first extent_tree_szad_insert extent_tree_szad_iter @@ -193,45 +245,49 @@ hash_rotl_64 hash_x64_128 hash_x86_128 hash_x86_32 -huge_allocated -huge_boot +huge_aalloc huge_dalloc huge_dalloc_junk -huge_dss_prec_get huge_malloc -huge_mtx -huge_ndalloc -huge_nmalloc huge_palloc -huge_postfork_child -huge_postfork_parent -huge_prefork -huge_prof_ctx_get -huge_prof_ctx_set +huge_prof_tctx_get +huge_prof_tctx_reset +huge_prof_tctx_set huge_ralloc huge_ralloc_no_move huge_salloc -iallocm +iaalloc +iallocztm icalloc icalloct idalloc idalloct +idalloctm imalloc imalloct +index2size +index2size_compute +index2size_lookup +index2size_tab +in_valgrind ipalloc ipalloct +ipallocztm iqalloc -iqalloct iralloc iralloct iralloct_realign isalloc +isdalloct +isqalloc isthreaded ivsalloc ixalloc jemalloc_postfork_child jemalloc_postfork_parent jemalloc_prefork +large_maxclass +lg_floor malloc_cprintf malloc_mutex_init malloc_mutex_lock @@ -242,7 +298,8 @@ malloc_mutex_unlock malloc_printf malloc_snprintf malloc_strtoumax -malloc_tsd_boot +malloc_tsd_boot0 +malloc_tsd_boot1 malloc_tsd_cleanup_register malloc_tsd_dalloc malloc_tsd_malloc @@ -251,16 +308,18 @@ malloc_vcprintf malloc_vsnprintf malloc_write map_bias +map_misc_offset mb_write mutex_boot -narenas_auto -narenas_total +narenas_cache_cleanup narenas_total_get ncpus nhbins opt_abort opt_dss opt_junk +opt_junk_alloc +opt_junk_free opt_lg_chunk opt_lg_dirty_mult opt_lg_prof_interval @@ -274,84 +333,99 @@ opt_prof_final opt_prof_gdump opt_prof_leak opt_prof_prefix +opt_prof_thread_active_init opt_quarantine opt_redzone opt_stats_print opt_tcache opt_utrace -opt_valgrind opt_xmalloc opt_zero p2rz +pages_commit +pages_decommit +pages_map pages_purge +pages_trim 
+pages_unmap pow2_ceil +prof_active_get +prof_active_get_unlocked +prof_active_set +prof_alloc_prep +prof_alloc_rollback prof_backtrace prof_boot0 prof_boot1 prof_boot2 -prof_bt_count -prof_ctx_get -prof_ctx_set +prof_dump_header prof_dump_open prof_free +prof_free_sampled_object prof_gdump +prof_gdump_get +prof_gdump_get_unlocked +prof_gdump_set +prof_gdump_val prof_idump prof_interval prof_lookup prof_malloc +prof_malloc_sample_object prof_mdump prof_postfork_child prof_postfork_parent prof_prefork -prof_promote prof_realloc +prof_reset prof_sample_accum_update prof_sample_threshold_update -prof_tdata_booted +prof_tctx_get +prof_tctx_reset +prof_tctx_set prof_tdata_cleanup prof_tdata_get prof_tdata_init -prof_tdata_initialized -prof_tdata_tls -prof_tdata_tsd -prof_tdata_tsd_boot -prof_tdata_tsd_cleanup_wrapper -prof_tdata_tsd_get -prof_tdata_tsd_get_wrapper -prof_tdata_tsd_init_head -prof_tdata_tsd_set +prof_tdata_reinit +prof_thread_active_get +prof_thread_active_init_get +prof_thread_active_init_set +prof_thread_active_set +prof_thread_name_get +prof_thread_name_set quarantine quarantine_alloc_hook -quarantine_boot -quarantine_booted +quarantine_alloc_hook_work quarantine_cleanup -quarantine_init -quarantine_tls -quarantine_tsd -quarantine_tsd_boot -quarantine_tsd_cleanup_wrapper -quarantine_tsd_get -quarantine_tsd_get_wrapper -quarantine_tsd_init_head -quarantine_tsd_set register_zone +rtree_child_read +rtree_child_read_hard +rtree_child_tryread rtree_delete rtree_get -rtree_get_locked rtree_new -rtree_postfork_child -rtree_postfork_parent -rtree_prefork +rtree_node_valid rtree_set +rtree_start_level +rtree_subkey +rtree_subtree_read +rtree_subtree_read_hard +rtree_subtree_tryread +rtree_val_read +rtree_val_write s2u +s2u_compute +s2u_lookup sa2u set_errno -small_size2bin +size2index +size2index_compute +size2index_lookup +size2index_tab stats_cactive stats_cactive_add stats_cactive_get stats_cactive_sub -stats_chunks stats_print tcache_alloc_easy tcache_alloc_large @@ -359,55 +433,67 @@ tcache_alloc_small tcache_alloc_small_hard tcache_arena_associate tcache_arena_dissociate +tcache_arena_reassociate tcache_bin_flush_large tcache_bin_flush_small tcache_bin_info -tcache_boot0 -tcache_boot1 -tcache_booted +tcache_boot +tcache_cleanup tcache_create tcache_dalloc_large tcache_dalloc_small -tcache_destroy -tcache_enabled_booted +tcache_enabled_cleanup tcache_enabled_get -tcache_enabled_initialized tcache_enabled_set -tcache_enabled_tls -tcache_enabled_tsd -tcache_enabled_tsd_boot -tcache_enabled_tsd_cleanup_wrapper -tcache_enabled_tsd_get -tcache_enabled_tsd_get_wrapper -tcache_enabled_tsd_init_head -tcache_enabled_tsd_set tcache_event tcache_event_hard tcache_flush tcache_get -tcache_initialized +tcache_get_hard tcache_maxclass +tcaches tcache_salloc +tcaches_create +tcaches_destroy +tcaches_flush +tcaches_get tcache_stats_merge -tcache_thread_cleanup -tcache_tls -tcache_tsd -tcache_tsd_boot -tcache_tsd_cleanup_wrapper -tcache_tsd_get -tcache_tsd_get_wrapper -tcache_tsd_init_head -tcache_tsd_set -thread_allocated_booted -thread_allocated_initialized -thread_allocated_tls -thread_allocated_tsd -thread_allocated_tsd_boot -thread_allocated_tsd_cleanup_wrapper -thread_allocated_tsd_get -thread_allocated_tsd_get_wrapper -thread_allocated_tsd_init_head -thread_allocated_tsd_set +thread_allocated_cleanup +thread_deallocated_cleanup +tsd_arena_get +tsd_arena_set +tsd_boot +tsd_boot0 +tsd_boot1 +tsd_booted +tsd_cleanup +tsd_cleanup_wrapper +tsd_fetch +tsd_get +tsd_wrapper_get +tsd_wrapper_set 
+tsd_initialized tsd_init_check_recursion tsd_init_finish +tsd_init_head +tsd_nominal +tsd_quarantine_get +tsd_quarantine_set +tsd_set +tsd_tcache_enabled_get +tsd_tcache_enabled_set +tsd_tcache_get +tsd_tcache_set +tsd_tls +tsd_tsd +tsd_prof_tdata_get +tsd_prof_tdata_set +tsd_thread_allocated_get +tsd_thread_allocated_set +tsd_thread_deallocated_get +tsd_thread_deallocated_set u2rz +valgrind_freelike_block +valgrind_make_mem_defined +valgrind_make_mem_noaccess +valgrind_make_mem_undefined diff --git a/deps/jemalloc/include/jemalloc/internal/prng.h b/deps/jemalloc/include/jemalloc/internal/prng.h index 7b2b06512..216d0ef47 100644 --- a/deps/jemalloc/include/jemalloc/internal/prng.h +++ b/deps/jemalloc/include/jemalloc/internal/prng.h @@ -15,7 +15,7 @@ * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints. * * This choice of m has the disadvantage that the quality of the bits is - * proportional to bit position. For example. the lowest bit has a cycle of 2, + * proportional to bit position. For example, the lowest bit has a cycle of 2, * the next has a cycle of 4, etc. For this reason, we prefer to use the upper * bits. * @@ -26,22 +26,22 @@ * const uint32_t a, c : See above discussion. */ #define prng32(r, lg_range, state, a, c) do { \ - assert(lg_range > 0); \ - assert(lg_range <= 32); \ + assert((lg_range) > 0); \ + assert((lg_range) <= 32); \ \ r = (state * (a)) + (c); \ state = r; \ - r >>= (32 - lg_range); \ + r >>= (32 - (lg_range)); \ } while (false) /* Same as prng32(), but 64 bits of pseudo-randomness, using uint64_t. */ #define prng64(r, lg_range, state, a, c) do { \ - assert(lg_range > 0); \ - assert(lg_range <= 64); \ + assert((lg_range) > 0); \ + assert((lg_range) <= 64); \ \ r = (state * (a)) + (c); \ state = r; \ - r >>= (64 - lg_range); \ + r >>= (64 - (lg_range)); \ } while (false) #endif /* JEMALLOC_H_TYPES */ diff --git a/deps/jemalloc/include/jemalloc/internal/prof.h b/deps/jemalloc/include/jemalloc/internal/prof.h index 6f162d21e..e5198c3e8 100644 --- a/deps/jemalloc/include/jemalloc/internal/prof.h +++ b/deps/jemalloc/include/jemalloc/internal/prof.h @@ -3,8 +3,8 @@ typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; -typedef struct prof_thr_cnt_s prof_thr_cnt_t; -typedef struct prof_ctx_s prof_ctx_t; +typedef struct prof_tctx_s prof_tctx_t; +typedef struct prof_gctx_s prof_gctx_t; typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ @@ -23,9 +23,6 @@ typedef struct prof_tdata_s prof_tdata_t; */ #define PROF_BT_MAX 128 -/* Maximum number of backtraces to store in each per thread LRU cache. */ -#define PROF_TCMAX 1024 - /* Initial hash table size. */ #define PROF_CKH_MINITEMS 64 @@ -36,12 +33,18 @@ typedef struct prof_tdata_s prof_tdata_t; #define PROF_PRINTF_BUFSIZE 128 /* - * Number of mutexes shared among all ctx's. No space is allocated for these + * Number of mutexes shared among all gctx's. No space is allocated for these * unless profiling is enabled, so it's okay to over-provision. */ #define PROF_NCTX_LOCKS 1024 /* + * Number of mutexes shared among all tdata's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. + */ +#define PROF_NTDATA_LOCKS 256 + +/* * prof_tdata pointers close to NULL are used to encode state information that * is used for cleaning up during thread shutdown. */ @@ -63,141 +66,186 @@ struct prof_bt_s { /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. 
*/ typedef struct { prof_bt_t *bt; - unsigned nignore; unsigned max; } prof_unwind_data_t; #endif struct prof_cnt_s { - /* - * Profiling counters. An allocation/deallocation pair can operate on - * different prof_thr_cnt_t objects that are linked into the same - * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go - * negative. In principle it is possible for the *bytes counters to - * overflow/underflow, but a general solution would require something - * like 128-bit counters; this implementation doesn't bother to solve - * that problem. - */ - int64_t curobjs; - int64_t curbytes; + /* Profiling counters. */ + uint64_t curobjs; + uint64_t curbytes; uint64_t accumobjs; uint64_t accumbytes; }; -struct prof_thr_cnt_s { - /* Linkage into prof_ctx_t's cnts_ql. */ - ql_elm(prof_thr_cnt_t) cnts_link; +typedef enum { + prof_tctx_state_initializing, + prof_tctx_state_nominal, + prof_tctx_state_dumping, + prof_tctx_state_purgatory /* Dumper must finish destroying. */ +} prof_tctx_state_t; - /* Linkage into thread's LRU. */ - ql_elm(prof_thr_cnt_t) lru_link; +struct prof_tctx_s { + /* Thread data for thread that performed the allocation. */ + prof_tdata_t *tdata; /* - * Associated context. If a thread frees an object that it did not - * allocate, it is possible that the context is not cached in the - * thread's hash table, in which case it must be able to look up the - * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's cnts_ql. + * Copy of tdata->thr_{uid,discrim}, necessary because tdata may be + * defunct during teardown. */ - prof_ctx_t *ctx; + uint64_t thr_uid; + uint64_t thr_discrim; + + /* Profiling counters, protected by tdata->lock. */ + prof_cnt_t cnts; + + /* Associated global context. */ + prof_gctx_t *gctx; /* - * Threads use memory barriers to update the counters. Since there is - * only ever one writer, the only challenge is for the reader to get a - * consistent read of the counters. - * - * The writer uses this series of operations: - * - * 1) Increment epoch to an odd number. - * 2) Update counters. - * 3) Increment epoch to an even number. - * - * The reader must assure 1) that the epoch is even while it reads the - * counters, and 2) that the epoch doesn't change between the time it - * starts and finishes reading the counters. + * UID that distinguishes multiple tctx's created by the same thread, + * but coexisting in gctx->tctxs. There are two ways that such + * coexistence can occur: + * - A dumper thread can cause a tctx to be retained in the purgatory + * state. + * - Although a single "producer" thread must create all tctx's which + * share the same thr_uid, multiple "consumers" can each concurrently + * execute portions of prof_tctx_destroy(). prof_tctx_destroy() only + * gets called once each time cnts.cur{objs,bytes} drop to 0, but this + * threshold can be hit again before the first consumer finishes + * executing prof_tctx_destroy(). */ - unsigned epoch; + uint64_t tctx_uid; - /* Profiling counters. */ - prof_cnt_t cnts; -}; + /* Linkage into gctx's tctxs. */ + rb_node(prof_tctx_t) tctx_link; -struct prof_ctx_s { - /* Associated backtrace. */ - prof_bt_t *bt; + /* + * True during prof_alloc_prep()..prof_malloc_sample_object(), prevents + * sample vs destroy race. + */ + bool prepared; + + /* Current dump-related state, protected by gctx->lock. */ + prof_tctx_state_t state; + + /* + * Copy of cnts snapshotted during early dump phase, protected by + * dump_mtx. 
+ */ + prof_cnt_t dump_cnts; +}; +typedef rb_tree(prof_tctx_t) prof_tctx_tree_t; - /* Protects nlimbo, cnt_merged, and cnts_ql. */ +struct prof_gctx_s { + /* Protects nlimbo, cnt_summed, and tctxs. */ malloc_mutex_t *lock; /* - * Number of threads that currently cause this ctx to be in a state of + * Number of threads that currently cause this gctx to be in a state of * limbo due to one of: - * - Initializing per thread counters associated with this ctx. - * - Preparing to destroy this ctx. - * - Dumping a heap profile that includes this ctx. + * - Initializing this gctx. + * - Initializing per thread counters associated with this gctx. + * - Preparing to destroy this gctx. + * - Dumping a heap profile that includes this gctx. * nlimbo must be 1 (single destroyer) in order to safely destroy the - * ctx. + * gctx. */ unsigned nlimbo; - /* Temporary storage for summation during dump. */ - prof_cnt_t cnt_summed; - - /* When threads exit, they merge their stats into cnt_merged. */ - prof_cnt_t cnt_merged; - /* - * List of profile counters, one for each thread that has allocated in + * Tree of profile counters, one for each thread that has allocated in * this context. */ - ql_head(prof_thr_cnt_t) cnts_ql; + prof_tctx_tree_t tctxs; + + /* Linkage for tree of contexts to be dumped. */ + rb_node(prof_gctx_t) dump_link; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + + /* Associated backtrace. */ + prof_bt_t bt; - /* Linkage for list of contexts to be dumped. */ - ql_elm(prof_ctx_t) dump_link; + /* Backtrace vector, variable size, referred to by bt. */ + void *vec[1]; }; -typedef ql_head(prof_ctx_t) prof_ctx_list_t; +typedef rb_tree(prof_gctx_t) prof_gctx_tree_t; struct prof_tdata_s { + malloc_mutex_t *lock; + + /* Monotonically increasing unique thread identifier. */ + uint64_t thr_uid; + /* - * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a - * cache of backtraces, with associated thread-specific prof_thr_cnt_t - * objects. Other threads may read the prof_thr_cnt_t contents, but no - * others will ever write them. - * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t - * counter data into the associated prof_ctx_t objects, and unlink/free - * the prof_thr_cnt_t objects. + * Monotonically increasing discriminator among tdata structures + * associated with the same thr_uid. */ - ckh_t bt2cnt; + uint64_t thr_discrim; - /* LRU for contents of bt2cnt. */ - ql_head(prof_thr_cnt_t) lru_ql; + /* Included in heap profile dumps if non-NULL. */ + char *thread_name; - /* Backtrace vector, used for calls to prof_backtrace(). */ - void **vec; + bool attached; + bool expired; + + rb_node(prof_tdata_t) tdata_link; + + /* + * Counter used to initialize prof_tctx_t's tctx_uid. No locking is + * necessary when incrementing this field, because only one thread ever + * does so. + */ + uint64_t tctx_uid_next; + + /* + * Hash of (prof_bt_t *)-->(prof_tctx_t *). Each thread tracks + * backtraces for which it has non-zero allocation/deallocation counters + * associated with thread-specific prof_tctx_t objects. Other threads + * may write to prof_tctx_t contents when freeing associated objects. + */ + ckh_t bt2tctx; /* Sampling state. */ uint64_t prng_state; - uint64_t threshold; - uint64_t accum; + uint64_t bytes_until_sample; /* State used to avoid dumping while operating on prof internals. */ bool enq; bool enq_idump; bool enq_gdump; + + /* + * Set to true during an early dump phase for tdata's which are + * currently being dumped. 
New threads' tdata's have this initialized + * to false so that they aren't accidentally included in later dump + * phases. + */ + bool dumping; + + /* + * True if profiling is active for this tdata's thread + * (thread.prof.active mallctl). + */ + bool active; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + + /* Backtrace vector, used for calls to prof_backtrace(). */ + void *vec[PROF_BT_MAX]; }; +typedef rb_tree(prof_tdata_t) prof_tdata_tree_t; #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS extern bool opt_prof; -/* - * Even if opt_prof is true, sampling can be temporarily disabled by setting - * opt_prof_active to false. No locking is used when updating opt_prof_active, - * so there are no guarantees regarding how long it will take for all threads - * to notice state changes. - */ extern bool opt_prof_active; +extern bool opt_prof_thread_active_init; extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ extern bool opt_prof_gdump; /* High-water memory dumping. */ @@ -211,6 +259,12 @@ extern char opt_prof_prefix[ #endif 1]; +/* Accessed via prof_active_[gs]et{_unlocked,}(). */ +extern bool prof_active; + +/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */ +extern bool prof_gdump_val; + /* * Profile dump interval, measured in bytes allocated. Each arena triggers a * profile dump when it reaches this threshold. The effect is that the @@ -221,391 +275,269 @@ extern char opt_prof_prefix[ extern uint64_t prof_interval; /* - * If true, promote small sampled objects to large objects, since small run - * headers do not have embedded profile context pointers. + * Initialized as opt_lg_prof_sample, and potentially modified during profiling + * resets. 
*/ -extern bool prof_promote; +extern size_t lg_prof_sample; +void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated); +void prof_malloc_sample_object(const void *ptr, size_t usize, + prof_tctx_t *tctx); +void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx); void bt_init(prof_bt_t *bt, void **vec); -void prof_backtrace(prof_bt_t *bt, unsigned nignore); -prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); +void prof_backtrace(prof_bt_t *bt); +prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt); #ifdef JEMALLOC_JET +size_t prof_tdata_count(void); size_t prof_bt_count(void); +const prof_cnt_t *prof_cnt_all(void); typedef int (prof_dump_open_t)(bool, const char *); extern prof_dump_open_t *prof_dump_open; +typedef bool (prof_dump_header_t)(bool, const prof_cnt_t *); +extern prof_dump_header_t *prof_dump_header; #endif void prof_idump(void); bool prof_mdump(const char *filename); void prof_gdump(void); -prof_tdata_t *prof_tdata_init(void); -void prof_tdata_cleanup(void *arg); +prof_tdata_t *prof_tdata_init(tsd_t *tsd); +prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata); +void prof_reset(tsd_t *tsd, size_t lg_sample); +void prof_tdata_cleanup(tsd_t *tsd); +const char *prof_thread_name_get(void); +bool prof_active_get(void); +bool prof_active_set(bool active); +int prof_thread_name_set(tsd_t *tsd, const char *thread_name); +bool prof_thread_active_get(void); +bool prof_thread_active_set(bool active); +bool prof_thread_active_init_get(void); +bool prof_thread_active_init_set(bool active_init); +bool prof_gdump_get(void); +bool prof_gdump_set(bool active); void prof_boot0(void); void prof_boot1(void); bool prof_boot2(void); void prof_prefork(void); void prof_postfork_parent(void); void prof_postfork_child(void); +void prof_sample_threshold_update(prof_tdata_t *tdata); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES -#define PROF_ALLOC_PREP(nignore, size, ret) do { \ - prof_tdata_t *prof_tdata; \ - prof_bt_t bt; \ - \ - assert(size == s2u(size)); \ - \ - prof_tdata = prof_tdata_get(true); \ - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { \ - if (prof_tdata != NULL) \ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ - else \ - ret = NULL; \ - break; \ - } \ - \ - if (opt_prof_active == false) { \ - /* Sampling is currently inactive, so avoid sampling. */\ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ - } else if (opt_lg_prof_sample == 0) { \ - /* Don't bother with sampling logic, since sampling */\ - /* interval is 1. */\ - bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt, nignore); \ - ret = prof_lookup(&bt); \ - } else { \ - if (prof_tdata->threshold == 0) { \ - /* Initialize. Seed the prng differently for */\ - /* each thread. */\ - prof_tdata->prng_state = \ - (uint64_t)(uintptr_t)&size; \ - prof_sample_threshold_update(prof_tdata); \ - } \ - \ - /* Determine whether to capture a backtrace based on */\ - /* whether size is enough for prof_accum to reach */\ - /* prof_tdata->threshold. However, delay updating */\ - /* these variables until prof_{m,re}alloc(), because */\ - /* we don't know for sure that the allocation will */\ - /* succeed. */\ - /* */\ - /* Use subtraction rather than addition to avoid */\ - /* potential integer overflow. 
*/\ - if (size >= prof_tdata->threshold - \ - prof_tdata->accum) { \ - bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt, nignore); \ - ret = prof_lookup(&bt); \ - } else \ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ - } \ -} while (0) - #ifndef JEMALLOC_ENABLE_INLINE -malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) - -prof_tdata_t *prof_tdata_get(bool create); -void prof_sample_threshold_update(prof_tdata_t *prof_tdata); -prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx); -bool prof_sample_accum_update(size_t size); -void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, - size_t old_usize, prof_ctx_t *old_ctx); -void prof_free(const void *ptr, size_t size); +bool prof_active_get_unlocked(void); +bool prof_gdump_get_unlocked(void); +prof_tdata_t *prof_tdata_get(tsd_t *tsd, bool create); +bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool commit, + prof_tdata_t **tdata_out); +prof_tctx_t *prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active, + bool update); +prof_tctx_t *prof_tctx_get(const void *ptr); +void prof_tctx_set(const void *ptr, size_t usize, prof_tctx_t *tctx); +void prof_tctx_reset(const void *ptr, size_t usize, const void *old_ptr, + prof_tctx_t *tctx); +void prof_malloc_sample_object(const void *ptr, size_t usize, + prof_tctx_t *tctx); +void prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx); +void prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, + prof_tctx_t *tctx, bool prof_active, bool updated, const void *old_ptr, + size_t old_usize, prof_tctx_t *old_tctx); +void prof_free(tsd_t *tsd, const void *ptr, size_t usize); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) -/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ -malloc_tsd_externs(prof_tdata, prof_tdata_t *) -malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, - prof_tdata_cleanup) +JEMALLOC_ALWAYS_INLINE bool +prof_active_get_unlocked(void) +{ + + /* + * Even if opt_prof is true, sampling can be temporarily disabled by + * setting prof_active to false. No locking is used when reading + * prof_active in the fast path, so there are no guarantees regarding + * how long it will take for all threads to notice state changes. + */ + return (prof_active); +} -JEMALLOC_INLINE prof_tdata_t * -prof_tdata_get(bool create) +JEMALLOC_ALWAYS_INLINE bool +prof_gdump_get_unlocked(void) { - prof_tdata_t *prof_tdata; + + /* + * No locking is used when reading prof_gdump_val in the fast path, so + * there are no guarantees regarding how long it will take for all + * threads to notice state changes. 
+ */ + return (prof_gdump_val); +} + +JEMALLOC_ALWAYS_INLINE prof_tdata_t * +prof_tdata_get(tsd_t *tsd, bool create) +{ + prof_tdata_t *tdata; cassert(config_prof); - prof_tdata = *prof_tdata_tsd_get(); - if (create && prof_tdata == NULL) - prof_tdata = prof_tdata_init(); + tdata = tsd_prof_tdata_get(tsd); + if (create) { + if (unlikely(tdata == NULL)) { + if (tsd_nominal(tsd)) { + tdata = prof_tdata_init(tsd); + tsd_prof_tdata_set(tsd, tdata); + } + } else if (unlikely(tdata->expired)) { + tdata = prof_tdata_reinit(tsd, tdata); + tsd_prof_tdata_set(tsd, tdata); + } + assert(tdata == NULL || tdata->attached); + } - return (prof_tdata); + return (tdata); } -JEMALLOC_INLINE void -prof_sample_threshold_update(prof_tdata_t *prof_tdata) +JEMALLOC_ALWAYS_INLINE prof_tctx_t * +prof_tctx_get(const void *ptr) { - /* - * The body of this function is compiled out unless heap profiling is - * enabled, so that it is possible to compile jemalloc with floating - * point support completely disabled. Avoiding floating point code is - * important on memory-constrained systems, but it also enables a - * workaround for versions of glibc that don't properly save/restore - * floating point registers during dynamic lazy symbol loading (which - * internally calls into whatever malloc implementation happens to be - * integrated into the application). Note that some compilers (e.g. - * gcc 4.8) may use floating point registers for fast memory moves, so - * jemalloc must be compiled with such optimizations disabled (e.g. - * -mno-sse) in order for the workaround to be complete. - */ -#ifdef JEMALLOC_PROF - uint64_t r; - double u; cassert(config_prof); + assert(ptr != NULL); - /* - * Compute sample threshold as a geometrically distributed random - * variable with mean (2^opt_lg_prof_sample). - * - * __ __ - * | log(u) | 1 - * prof_tdata->threshold = | -------- |, where p = ------------------- - * | log(1-p) | opt_lg_prof_sample - * 2 - * - * For more information on the math, see: - * - * Non-Uniform Random Variate Generation - * Luc Devroye - * Springer-Verlag, New York, 1986 - * pp 500 - * (http://luc.devroye.org/rnbookindex.html) - */ - prng64(r, 53, prof_tdata->prng_state, - UINT64_C(6364136223846793005), UINT64_C(1442695040888963407)); - u = (double)r * (1.0/9007199254740992.0L); - prof_tdata->threshold = (uint64_t)(log(u) / - log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) - + (uint64_t)1U; -#endif + return (arena_prof_tctx_get(ptr)); } -JEMALLOC_INLINE prof_ctx_t * -prof_ctx_get(const void *ptr) +JEMALLOC_ALWAYS_INLINE void +prof_tctx_set(const void *ptr, size_t usize, prof_tctx_t *tctx) { - prof_ctx_t *ret; - arena_chunk_t *chunk; cassert(config_prof); assert(ptr != NULL); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { - /* Region. */ - ret = arena_prof_ctx_get(ptr); - } else - ret = huge_prof_ctx_get(ptr); - - return (ret); + arena_prof_tctx_set(ptr, usize, tctx); } -JEMALLOC_INLINE void -prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx) +JEMALLOC_ALWAYS_INLINE void +prof_tctx_reset(const void *ptr, size_t usize, const void *old_ptr, + prof_tctx_t *old_tctx) { - arena_chunk_t *chunk; cassert(config_prof); assert(ptr != NULL); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { - /* Region. 
*/ - arena_prof_ctx_set(ptr, usize, ctx); - } else - huge_prof_ctx_set(ptr, ctx); + arena_prof_tctx_reset(ptr, usize, old_ptr, old_tctx); } -JEMALLOC_INLINE bool -prof_sample_accum_update(size_t size) +JEMALLOC_ALWAYS_INLINE bool +prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, + prof_tdata_t **tdata_out) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); - /* Sampling logic is unnecessary if the interval is 1. */ - assert(opt_lg_prof_sample != 0); - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(tsd, true); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = NULL; + + if (tdata_out != NULL) + *tdata_out = tdata; + + if (tdata == NULL) return (true); - /* Take care to avoid integer overflow. */ - if (size >= prof_tdata->threshold - prof_tdata->accum) { - prof_tdata->accum -= (prof_tdata->threshold - size); - /* Compute new sample threshold. */ - prof_sample_threshold_update(prof_tdata); - while (prof_tdata->accum >= prof_tdata->threshold) { - prof_tdata->accum -= prof_tdata->threshold; - prof_sample_threshold_update(prof_tdata); - } - return (false); - } else { - prof_tdata->accum += size; + if (tdata->bytes_until_sample >= usize) { + if (update) + tdata->bytes_until_sample -= usize; return (true); + } else { + /* Compute new sample threshold. */ + if (update) + prof_sample_threshold_update(tdata); + return (!tdata->active); } } -JEMALLOC_INLINE void -prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) +JEMALLOC_ALWAYS_INLINE prof_tctx_t * +prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active, bool update) +{ + prof_tctx_t *ret; + prof_tdata_t *tdata; + prof_bt_t bt; + + assert(usize == s2u(usize)); + + if (!prof_active || likely(prof_sample_accum_update(tsd, usize, update, + &tdata))) + ret = (prof_tctx_t *)(uintptr_t)1U; + else { + bt_init(&bt, tdata->vec); + prof_backtrace(&bt); + ret = prof_lookup(tsd, &bt); + } + + return (ret); +} + +JEMALLOC_ALWAYS_INLINE void +prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) { cassert(config_prof); assert(ptr != NULL); assert(usize == isalloc(ptr, true)); - if (opt_lg_prof_sample != 0) { - if (prof_sample_accum_update(usize)) { - /* - * Don't sample. For malloc()-like allocation, it is - * always possible to tell in advance how large an - * object's usable size will be, so there should never - * be a difference between the usize passed to - * PROF_ALLOC_PREP() and prof_malloc(). 
- */ - assert((uintptr_t)cnt == (uintptr_t)1U); - } - } - - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, usize, cnt->ctx); - - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; - } - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else - prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U); + if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) + prof_malloc_sample_object(ptr, usize, tctx); + else + prof_tctx_set(ptr, usize, (prof_tctx_t *)(uintptr_t)1U); } -JEMALLOC_INLINE void -prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, - size_t old_usize, prof_ctx_t *old_ctx) +JEMALLOC_ALWAYS_INLINE void +prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx, + bool prof_active, bool updated, const void *old_ptr, size_t old_usize, + prof_tctx_t *old_tctx) { - prof_thr_cnt_t *told_cnt; + bool sampled, old_sampled; cassert(config_prof); - assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); + assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U); - if (ptr != NULL) { + if (prof_active && !updated && ptr != NULL) { assert(usize == isalloc(ptr, true)); - if (opt_lg_prof_sample != 0) { - if (prof_sample_accum_update(usize)) { - /* - * Don't sample. The usize passed to - * PROF_ALLOC_PREP() was larger than what - * actually got allocated, so a backtrace was - * captured for this allocation, even though - * its actual usize was insufficient to cross - * the sample threshold. - */ - cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - } - } - } - - if ((uintptr_t)old_ctx > (uintptr_t)1U) { - told_cnt = prof_lookup(old_ctx->bt); - if (told_cnt == NULL) { + if (prof_sample_accum_update(tsd, usize, true, NULL)) { /* - * It's too late to propagate OOM for this realloc(), - * so operate directly on old_cnt->ctx->cnt_merged. + * Don't sample. The usize passed to prof_alloc_prep() + * was larger than what actually got allocated, so a + * backtrace was captured for this allocation, even + * though its actual usize was insufficient to cross the + * sample threshold. */ - malloc_mutex_lock(old_ctx->lock); - old_ctx->cnt_merged.curobjs--; - old_ctx->cnt_merged.curbytes -= old_usize; - malloc_mutex_unlock(old_ctx->lock); - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + tctx = (prof_tctx_t *)(uintptr_t)1U; } - } else - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, usize, cnt->ctx); - cnt->epoch++; - } else if (ptr != NULL) - prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U); - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) { - told_cnt->cnts.curobjs--; - told_cnt->cnts.curbytes -= old_usize; } - if ((uintptr_t)cnt > (uintptr_t)1U) { - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; - } - } - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) - cnt->epoch++; - /*********/ - mb_write(); /* Not strictly necessary. 
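
For context, the new prof_alloc_prep()/prof_malloc() pair above is meant to bracket an allocation: the sample decision is taken before the allocation and the attribution is recorded after it. A hedged sketch of that call protocol (not the actual jemalloc_internal.h allocation path; some_alloc() and the unconditional prof_active argument are placeholders) would be:

/*
 * Assumes usize == s2u(usize) and that some_alloc() returns exactly usize
 * usable bytes, matching the asserts in prof_alloc_prep()/prof_malloc().
 * (The real allocation paths also unwind the sample on failure.)
 */
static void *
prof_wrapped_alloc(tsd_t *tsd, size_t usize)
{
	prof_tctx_t *tctx;
	void *p;

	tctx = prof_alloc_prep(tsd, usize, true, true);
	p = some_alloc(usize);		/* hypothetical allocator */
	if (p == NULL)
		return (NULL);
	prof_malloc(p, usize, tctx);
	return (p);
}
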
*/ + + sampled = ((uintptr_t)tctx > (uintptr_t)1U); + old_sampled = ((uintptr_t)old_tctx > (uintptr_t)1U); + + if (unlikely(sampled)) + prof_malloc_sample_object(ptr, usize, tctx); + else + prof_tctx_reset(ptr, usize, old_ptr, old_tctx); + + if (unlikely(old_sampled)) + prof_free_sampled_object(tsd, old_usize, old_tctx); } -JEMALLOC_INLINE void -prof_free(const void *ptr, size_t size) +JEMALLOC_ALWAYS_INLINE void +prof_free(tsd_t *tsd, const void *ptr, size_t usize) { - prof_ctx_t *ctx = prof_ctx_get(ptr); + prof_tctx_t *tctx = prof_tctx_get(ptr); cassert(config_prof); + assert(usize == isalloc(ptr, true)); - if ((uintptr_t)ctx > (uintptr_t)1) { - prof_thr_cnt_t *tcnt; - assert(size == isalloc(ptr, true)); - tcnt = prof_lookup(ctx->bt); - - if (tcnt != NULL) { - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - tcnt->cnts.curobjs--; - tcnt->cnts.curbytes -= size; - /*********/ - mb_write(); - /*********/ - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else { - /* - * OOM during free() cannot be propagated, so operate - * directly on cnt->ctx->cnt_merged. - */ - malloc_mutex_lock(ctx->lock); - ctx->cnt_merged.curobjs--; - ctx->cnt_merged.curbytes -= size; - malloc_mutex_unlock(ctx->lock); - } - } + if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) + prof_free_sampled_object(tsd, usize, tctx); } #endif diff --git a/deps/jemalloc/include/jemalloc/internal/ql.h b/deps/jemalloc/include/jemalloc/internal/ql.h index f70c5f6f3..1834bb855 100644 --- a/deps/jemalloc/include/jemalloc/internal/ql.h +++ b/deps/jemalloc/include/jemalloc/internal/ql.h @@ -1,6 +1,4 @@ -/* - * List definitions. - */ +/* List definitions. */ #define ql_head(a_type) \ struct { \ a_type *qlh_first; \ diff --git a/deps/jemalloc/include/jemalloc/internal/qr.h b/deps/jemalloc/include/jemalloc/internal/qr.h index 602944b9b..0fbaec25e 100644 --- a/deps/jemalloc/include/jemalloc/internal/qr.h +++ b/deps/jemalloc/include/jemalloc/internal/qr.h @@ -40,8 +40,10 @@ struct { \ (a_qr_b)->a_field.qre_prev = t; \ } while (0) -/* qr_meld() and qr_split() are functionally equivalent, so there's no need to - * have two copies of the code. */ +/* + * qr_meld() and qr_split() are functionally equivalent, so there's no need to + * have two copies of the code. 
+ */ #define qr_split(a_qr_a, a_qr_b, a_field) \ qr_meld((a_qr_a), (a_qr_b), a_field) diff --git a/deps/jemalloc/include/jemalloc/internal/quarantine.h b/deps/jemalloc/include/jemalloc/internal/quarantine.h index 16f677f73..ae607399f 100644 --- a/deps/jemalloc/include/jemalloc/internal/quarantine.h +++ b/deps/jemalloc/include/jemalloc/internal/quarantine.h @@ -29,36 +29,29 @@ struct quarantine_s { /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -quarantine_t *quarantine_init(size_t lg_maxobjs); -void quarantine(void *ptr); -void quarantine_cleanup(void *arg); -bool quarantine_boot(void); +void quarantine_alloc_hook_work(tsd_t *tsd); +void quarantine(tsd_t *tsd, void *ptr); +void quarantine_cleanup(tsd_t *tsd); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -malloc_tsd_protos(JEMALLOC_ATTR(unused), quarantine, quarantine_t *) - void quarantine_alloc_hook(void); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_QUARANTINE_C_)) -malloc_tsd_externs(quarantine, quarantine_t *) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, quarantine, quarantine_t *, NULL, - quarantine_cleanup) - JEMALLOC_ALWAYS_INLINE void quarantine_alloc_hook(void) { - quarantine_t *quarantine; + tsd_t *tsd; assert(config_fill && opt_quarantine); - quarantine = *quarantine_tsd_get(); - if (quarantine == NULL) - quarantine_init(LG_MAXOBJS_INIT); + tsd = tsd_fetch(); + if (tsd_quarantine_get(tsd) == NULL) + quarantine_alloc_hook_work(tsd); } #endif diff --git a/deps/jemalloc/include/jemalloc/internal/rb.h b/deps/jemalloc/include/jemalloc/internal/rb.h index 423802eb2..2ca8e5933 100644 --- a/deps/jemalloc/include/jemalloc/internal/rb.h +++ b/deps/jemalloc/include/jemalloc/internal/rb.h @@ -158,6 +158,8 @@ struct { \ #define rb_proto(a_attr, a_prefix, a_rbt_type, a_type) \ a_attr void \ a_prefix##new(a_rbt_type *rbtree); \ +a_attr bool \ +a_prefix##empty(a_rbt_type *rbtree); \ a_attr a_type * \ a_prefix##first(a_rbt_type *rbtree); \ a_attr a_type * \ @@ -198,7 +200,7 @@ a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ * int (a_cmp *)(a_type *a_node, a_type *a_other); * ^^^^^^ * or a_key - * Interpretation of comparision function return values: + * Interpretation of comparison function return values: * -1 : a_node < a_other * 0 : a_node == a_other * 1 : a_node > a_other @@ -224,6 +226,13 @@ a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ * Args: * tree: Pointer to an uninitialized red-black tree object. * + * static bool + * ex_empty(ex_t *tree); + * Description: Determine whether tree is empty. + * Args: + * tree: Pointer to an initialized red-black tree object. + * Ret: True if tree is empty, false otherwise. + * * static ex_node_t * * ex_first(ex_t *tree); * static ex_node_t * @@ -309,6 +318,10 @@ a_attr void \ a_prefix##new(a_rbt_type *rbtree) { \ rb_new(a_type, a_field, rbtree); \ } \ +a_attr bool \ +a_prefix##empty(a_rbt_type *rbtree) { \ + return (rbtree->rbt_root == &rbtree->rbt_nil); \ +} \ a_attr a_type * \ a_prefix##first(a_rbt_type *rbtree) { \ a_type *ret; \ @@ -580,7 +593,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ if (left != &rbtree->rbt_nil) { \ /* node has no successor, but it has a left child. */\ /* Splice node out, without losing the left child. 
*/\ - assert(rbtn_red_get(a_type, a_field, node) == false); \ + assert(!rbtn_red_get(a_type, a_field, node)); \ assert(rbtn_red_get(a_type, a_field, left)); \ rbtn_black_set(a_type, a_field, left); \ if (pathp == path) { \ @@ -616,8 +629,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ if (pathp->cmp < 0) { \ rbtn_left_set(a_type, a_field, pathp->node, \ pathp[1].node); \ - assert(rbtn_red_get(a_type, a_field, pathp[1].node) \ - == false); \ + assert(!rbtn_red_get(a_type, a_field, pathp[1].node)); \ if (rbtn_red_get(a_type, a_field, pathp->node)) { \ a_type *right = rbtn_right_get(a_type, a_field, \ pathp->node); \ @@ -681,7 +693,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ rbtn_rotate_left(a_type, a_field, pathp->node, \ tnode); \ /* Balance restored, but rotation modified */\ - /* subree root, which may actually be the tree */\ + /* subtree root, which may actually be the tree */\ /* root. */\ if (pathp == path) { \ /* Set root. */ \ @@ -849,7 +861,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ } \ /* Set root. */ \ rbtree->rbt_root = path->node; \ - assert(rbtn_red_get(a_type, a_field, rbtree->rbt_root) == false); \ + assert(!rbtn_red_get(a_type, a_field, rbtree->rbt_root)); \ } \ a_attr a_type * \ a_prefix##iter_recurse(a_rbt_type *rbtree, a_type *node, \ diff --git a/deps/jemalloc/include/jemalloc/internal/rtree.h b/deps/jemalloc/include/jemalloc/internal/rtree.h index bc74769f5..28ae9d1dd 100644 --- a/deps/jemalloc/include/jemalloc/internal/rtree.h +++ b/deps/jemalloc/include/jemalloc/internal/rtree.h @@ -1,170 +1,292 @@ /* * This radix tree implementation is tailored to the singular purpose of - * tracking which chunks are currently owned by jemalloc. This functionality - * is mandatory for OS X, where jemalloc must be able to respond to object - * ownership queries. + * associating metadata with chunks that are currently owned by jemalloc. * ******************************************************************************* */ #ifdef JEMALLOC_H_TYPES +typedef struct rtree_node_elm_s rtree_node_elm_t; +typedef struct rtree_level_s rtree_level_t; typedef struct rtree_s rtree_t; /* - * Size of each radix tree node (must be a power of 2). This impacts tree - * depth. + * RTREE_BITS_PER_LEVEL must be a power of two that is no larger than the + * machine address width. */ -#define RTREE_NODESIZE (1U << 16) +#define LG_RTREE_BITS_PER_LEVEL 4 +#define RTREE_BITS_PER_LEVEL (ZU(1) << LG_RTREE_BITS_PER_LEVEL) +#define RTREE_HEIGHT_MAX \ + ((ZU(1) << (LG_SIZEOF_PTR+3)) / RTREE_BITS_PER_LEVEL) -typedef void *(rtree_alloc_t)(size_t); -typedef void (rtree_dalloc_t)(void *); +/* Used for two-stage lock-free node initialization. */ +#define RTREE_NODE_INITIALIZING ((rtree_node_elm_t *)0x1) + +/* + * The node allocation callback function's argument is the number of contiguous + * rtree_node_elm_t structures to allocate, and the resulting memory must be + * zeroed. + */ +typedef rtree_node_elm_t *(rtree_node_alloc_t)(size_t); +typedef void (rtree_node_dalloc_t)(rtree_node_elm_t *); #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS +struct rtree_node_elm_s { + union { + void *pun; + rtree_node_elm_t *child; + extent_node_t *val; + }; +}; + +struct rtree_level_s { + /* + * A non-NULL subtree points to a subtree rooted along the hypothetical + * path to the leaf node corresponding to key 0. 
Depending on what keys + * have been used to store to the tree, an arbitrary combination of + * subtree pointers may remain NULL. + * + * Suppose keys comprise 48 bits, and LG_RTREE_BITS_PER_LEVEL is 4. + * This results in a 3-level tree, and the leftmost leaf can be directly + * accessed via subtrees[2], the subtree prefixed by 0x0000 (excluding + * 0x00000000) can be accessed via subtrees[1], and the remainder of the + * tree can be accessed via subtrees[0]. + * + * levels[0] : [<unused> | 0x0001******** | 0x0002******** | ...] + * + * levels[1] : [<unused> | 0x00000001**** | 0x00000002**** | ... ] + * + * levels[2] : [val(0x000000000000) | val(0x000000000001) | ...] + * + * This has practical implications on x64, which currently uses only the + * lower 47 bits of virtual address space in userland, thus leaving + * subtrees[0] unused and avoiding a level of tree traversal. + */ + union { + void *subtree_pun; + rtree_node_elm_t *subtree; + }; + /* Number of key bits distinguished by this level. */ + unsigned bits; + /* + * Cumulative number of key bits distinguished by traversing to + * corresponding tree level. + */ + unsigned cumbits; +}; + struct rtree_s { - rtree_alloc_t *alloc; - rtree_dalloc_t *dalloc; - malloc_mutex_t mutex; - void **root; - unsigned height; - unsigned level2bits[1]; /* Dynamically sized. */ + rtree_node_alloc_t *alloc; + rtree_node_dalloc_t *dalloc; + unsigned height; + /* + * Precomputed table used to convert from the number of leading 0 key + * bits to which subtree level to start at. + */ + unsigned start_level[RTREE_HEIGHT_MAX]; + rtree_level_t levels[RTREE_HEIGHT_MAX]; }; #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -rtree_t *rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc); +bool rtree_new(rtree_t *rtree, unsigned bits, rtree_node_alloc_t *alloc, + rtree_node_dalloc_t *dalloc); void rtree_delete(rtree_t *rtree); -void rtree_prefork(rtree_t *rtree); -void rtree_postfork_parent(rtree_t *rtree); -void rtree_postfork_child(rtree_t *rtree); +rtree_node_elm_t *rtree_subtree_read_hard(rtree_t *rtree, + unsigned level); +rtree_node_elm_t *rtree_child_read_hard(rtree_t *rtree, + rtree_node_elm_t *elm, unsigned level); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -#ifdef JEMALLOC_DEBUG -uint8_t rtree_get_locked(rtree_t *rtree, uintptr_t key); -#endif -uint8_t rtree_get(rtree_t *rtree, uintptr_t key); -bool rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val); +unsigned rtree_start_level(rtree_t *rtree, uintptr_t key); +uintptr_t rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level); + +bool rtree_node_valid(rtree_node_elm_t *node); +rtree_node_elm_t *rtree_child_tryread(rtree_node_elm_t *elm); +rtree_node_elm_t *rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, + unsigned level); +extent_node_t *rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm, + bool dependent); +void rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, + const extent_node_t *val); +rtree_node_elm_t *rtree_subtree_tryread(rtree_t *rtree, unsigned level); +rtree_node_elm_t *rtree_subtree_read(rtree_t *rtree, unsigned level); + +extent_node_t *rtree_get(rtree_t *rtree, uintptr_t key, bool dependent); +bool rtree_set(rtree_t *rtree, uintptr_t key, const extent_node_t *val); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_)) 
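
To make the level/subkey layout described in that comment concrete, here is a small self-contained sketch (illustrative constants only, not the in-tree rtree_subkey() helper) that splits a 48-bit key into the three 16-bit subkeys a 3-level tree with RTREE_BITS_PER_LEVEL == 16 would consume:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uintptr_t key = (uintptr_t)0x00007f1234500000;	/* 48 significant bits */
	unsigned bits_per_level = 16, height = 3;

	for (unsigned i = 0; i < height; i++) {
		unsigned shift = bits_per_level * (height - 1 - i);
		uintptr_t subkey = (key >> shift) &
		    (((uintptr_t)1 << bits_per_level) - 1);
		printf("level %u subkey: 0x%04zx\n", i, (size_t)subkey);
	}
	return (0);
}

This prints 0x7f12, 0x3450, 0x0000: the most significant bits select the outermost level, exactly as the levels[0..2] diagram above describes.
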
-#define RTREE_GET_GENERATE(f) \ -/* The least significant bits of the key are ignored. */ \ -JEMALLOC_INLINE uint8_t \ -f(rtree_t *rtree, uintptr_t key) \ -{ \ - uint8_t ret; \ - uintptr_t subkey; \ - unsigned i, lshift, height, bits; \ - void **node, **child; \ - \ - RTREE_LOCK(&rtree->mutex); \ - for (i = lshift = 0, height = rtree->height, node = rtree->root;\ - i < height - 1; \ - i++, lshift += bits, node = child) { \ - bits = rtree->level2bits[i]; \ - subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \ - 3)) - bits); \ - child = (void**)node[subkey]; \ - if (child == NULL) { \ - RTREE_UNLOCK(&rtree->mutex); \ - return (0); \ - } \ - } \ - \ - /* \ - * node is a leaf, so it contains values rather than node \ - * pointers. \ - */ \ - bits = rtree->level2bits[i]; \ - subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - \ - bits); \ - { \ - uint8_t *leaf = (uint8_t *)node; \ - ret = leaf[subkey]; \ - } \ - RTREE_UNLOCK(&rtree->mutex); \ - \ - RTREE_GET_VALIDATE \ - return (ret); \ +JEMALLOC_INLINE unsigned +rtree_start_level(rtree_t *rtree, uintptr_t key) +{ + unsigned start_level; + + if (unlikely(key == 0)) + return (rtree->height - 1); + + start_level = rtree->start_level[lg_floor(key) >> + LG_RTREE_BITS_PER_LEVEL]; + assert(start_level < rtree->height); + return (start_level); } -#ifdef JEMALLOC_DEBUG -# define RTREE_LOCK(l) malloc_mutex_lock(l) -# define RTREE_UNLOCK(l) malloc_mutex_unlock(l) -# define RTREE_GET_VALIDATE -RTREE_GET_GENERATE(rtree_get_locked) -# undef RTREE_LOCK -# undef RTREE_UNLOCK -# undef RTREE_GET_VALIDATE -#endif +JEMALLOC_INLINE uintptr_t +rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level) +{ -#define RTREE_LOCK(l) -#define RTREE_UNLOCK(l) -#ifdef JEMALLOC_DEBUG - /* - * Suppose that it were possible for a jemalloc-allocated chunk to be - * munmap()ped, followed by a different allocator in another thread re-using - * overlapping virtual memory, all without invalidating the cached rtree - * value. The result would be a false positive (the rtree would claim that - * jemalloc owns memory that it had actually discarded). This scenario - * seems impossible, but the following assertion is a prudent sanity check. - */ -# define RTREE_GET_VALIDATE \ - assert(rtree_get_locked(rtree, key) == ret); -#else -# define RTREE_GET_VALIDATE -#endif -RTREE_GET_GENERATE(rtree_get) -#undef RTREE_LOCK -#undef RTREE_UNLOCK -#undef RTREE_GET_VALIDATE + return ((key >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - + rtree->levels[level].cumbits)) & ((ZU(1) << + rtree->levels[level].bits) - 1)); +} JEMALLOC_INLINE bool -rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val) +rtree_node_valid(rtree_node_elm_t *node) +{ + + return ((uintptr_t)node > (uintptr_t)RTREE_NODE_INITIALIZING); +} + +JEMALLOC_INLINE rtree_node_elm_t * +rtree_child_tryread(rtree_node_elm_t *elm) +{ + rtree_node_elm_t *child; + + /* Double-checked read (first read may be stale. 
*/ + child = elm->child; + if (!rtree_node_valid(child)) + child = atomic_read_p(&elm->pun); + return (child); +} + +JEMALLOC_INLINE rtree_node_elm_t * +rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level) +{ + rtree_node_elm_t *child; + + child = rtree_child_tryread(elm); + if (unlikely(!rtree_node_valid(child))) + child = rtree_child_read_hard(rtree, elm, level); + return (child); +} + +JEMALLOC_INLINE extent_node_t * +rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm, bool dependent) +{ + + if (dependent) { + /* + * Reading a val on behalf of a pointer to a valid allocation is + * guaranteed to be a clean read even without synchronization, + * because the rtree update became visible in memory before the + * pointer came into existence. + */ + return (elm->val); + } else { + /* + * An arbitrary read, e.g. on behalf of ivsalloc(), may not be + * dependent on a previous rtree write, which means a stale read + * could result if synchronization were omitted here. + */ + return (atomic_read_p(&elm->pun)); + } +} + +JEMALLOC_INLINE void +rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, const extent_node_t *val) +{ + + atomic_write_p(&elm->pun, val); +} + +JEMALLOC_INLINE rtree_node_elm_t * +rtree_subtree_tryread(rtree_t *rtree, unsigned level) +{ + rtree_node_elm_t *subtree; + + /* Double-checked read (first read may be stale. */ + subtree = rtree->levels[level].subtree; + if (!rtree_node_valid(subtree)) + subtree = atomic_read_p(&rtree->levels[level].subtree_pun); + return (subtree); +} + +JEMALLOC_INLINE rtree_node_elm_t * +rtree_subtree_read(rtree_t *rtree, unsigned level) +{ + rtree_node_elm_t *subtree; + + subtree = rtree_subtree_tryread(rtree, level); + if (unlikely(!rtree_node_valid(subtree))) + subtree = rtree_subtree_read_hard(rtree, level); + return (subtree); +} + +JEMALLOC_INLINE extent_node_t * +rtree_get(rtree_t *rtree, uintptr_t key, bool dependent) { uintptr_t subkey; - unsigned i, lshift, height, bits; - void **node, **child; - - malloc_mutex_lock(&rtree->mutex); - for (i = lshift = 0, height = rtree->height, node = rtree->root; - i < height - 1; - i++, lshift += bits, node = child) { - bits = rtree->level2bits[i]; - subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - - bits); - child = (void**)node[subkey]; - if (child == NULL) { - size_t size = ((i + 1 < height - 1) ? sizeof(void *) - : (sizeof(uint8_t))) << rtree->level2bits[i+1]; - child = (void**)rtree->alloc(size); - if (child == NULL) { - malloc_mutex_unlock(&rtree->mutex); - return (true); - } - memset(child, 0, size); - node[subkey] = child; + unsigned i, start_level; + rtree_node_elm_t *node, *child; + + start_level = rtree_start_level(rtree, key); + + for (i = start_level, node = rtree_subtree_tryread(rtree, start_level); + /**/; i++, node = child) { + if (!dependent && unlikely(!rtree_node_valid(node))) + return (NULL); + subkey = rtree_subkey(rtree, key, i); + if (i == rtree->height - 1) { + /* + * node is a leaf, so it contains values rather than + * child pointers. + */ + return (rtree_val_read(rtree, &node[subkey], + dependent)); } + assert(i < rtree->height - 1); + child = rtree_child_tryread(&node[subkey]); } + not_reached(); +} - /* node is a leaf, so it contains values rather than node pointers. 
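
Putting the pieces above together, a hedged usage sketch (assuming the "bits" argument to rtree_new() is the number of significant key bits, and using a calloc-backed node allocator to satisfy the zeroed-memory contract stated earlier) might look like:

#include <stdlib.h>

static rtree_node_elm_t *
node_alloc(size_t nelms)
{
	/* Callback contract: zeroed, contiguous rtree_node_elm_t array. */
	return ((rtree_node_elm_t *)calloc(nelms, sizeof(rtree_node_elm_t)));
}

static void
node_dalloc(rtree_node_elm_t *node)
{
	free(node);
}

static rtree_t chunk_rtree;

static bool
chunk_rtree_boot(unsigned lg_chunk)
{
	unsigned key_bits = (unsigned)(sizeof(void *) * 8) - lg_chunk;

	return (rtree_new(&chunk_rtree, key_bits, node_alloc, node_dalloc));
}

/*
 * Registration and lookup would then be, e.g.:
 *   rtree_set(&chunk_rtree, (uintptr_t)chunk, node);
 *   node = rtree_get(&chunk_rtree, (uintptr_t)ptr, true);
 * where the final argument indicates the lookup is on behalf of a pointer
 * known to be a live allocation (see rtree_val_read() above).
 */
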
*/ - bits = rtree->level2bits[i]; - subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits); - { - uint8_t *leaf = (uint8_t *)node; - leaf[subkey] = val; - } - malloc_mutex_unlock(&rtree->mutex); +JEMALLOC_INLINE bool +rtree_set(rtree_t *rtree, uintptr_t key, const extent_node_t *val) +{ + uintptr_t subkey; + unsigned i, start_level; + rtree_node_elm_t *node, *child; - return (false); + start_level = rtree_start_level(rtree, key); + + node = rtree_subtree_read(rtree, start_level); + if (node == NULL) + return (true); + for (i = start_level; /**/; i++, node = child) { + subkey = rtree_subkey(rtree, key, i); + if (i == rtree->height - 1) { + /* + * node is a leaf, so it contains values rather than + * child pointers. + */ + rtree_val_write(rtree, &node[subkey], val); + return (false); + } + assert(i + 1 < rtree->height); + child = rtree_child_read(rtree, &node[subkey], i); + if (child == NULL) + return (true); + } + not_reached(); } #endif diff --git a/deps/jemalloc/include/jemalloc/internal/size_classes.sh b/deps/jemalloc/include/jemalloc/internal/size_classes.sh index 29c80c1fb..fc82036d3 100755 --- a/deps/jemalloc/include/jemalloc/internal/size_classes.sh +++ b/deps/jemalloc/include/jemalloc/internal/size_classes.sh @@ -1,17 +1,26 @@ #!/bin/sh +# +# Usage: size_classes.sh <lg_qarr> <lg_tmin> <lg_parr> <lg_g> # The following limits are chosen such that they cover all supported platforms. -# Range of quanta. -lg_qmin=3 -lg_qmax=4 +# Pointer sizes. +lg_zarr="2 3" + +# Quanta. +lg_qarr=$1 # The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)]. -lg_tmin=3 +lg_tmin=$2 + +# Maximum lookup size. +lg_kmax=12 + +# Page sizes. +lg_parr=`echo $3 | tr ',' ' '` -# Range of page sizes. -lg_pmin=12 -lg_pmax=16 +# Size class group size (number of size classes for each size doubling). +lg_g=$4 pow2() { e=$1 @@ -22,68 +31,224 @@ pow2() { done } +lg() { + x=$1 + lg_result=0 + while [ ${x} -gt 1 ] ; do + lg_result=$((${lg_result} + 1)) + x=$((${x} / 2)) + done +} + +size_class() { + index=$1 + lg_grp=$2 + lg_delta=$3 + ndelta=$4 + lg_p=$5 + lg_kmax=$6 + + lg ${ndelta}; lg_ndelta=${lg_result}; pow2 ${lg_ndelta} + if [ ${pow2_result} -lt ${ndelta} ] ; then + rem="yes" + else + rem="no" + fi + + lg_size=${lg_grp} + if [ $((${lg_delta} + ${lg_ndelta})) -eq ${lg_grp} ] ; then + lg_size=$((${lg_grp} + 1)) + else + lg_size=${lg_grp} + rem="yes" + fi + + if [ ${lg_size} -lt $((${lg_p} + ${lg_g})) ] ; then + bin="yes" + else + bin="no" + fi + if [ ${lg_size} -lt ${lg_kmax} \ + -o ${lg_size} -eq ${lg_kmax} -a ${rem} = "no" ] ; then + lg_delta_lookup=${lg_delta} + else + lg_delta_lookup="no" + fi + printf ' SC(%3d, %6d, %8d, %6d, %3s, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${bin} ${lg_delta_lookup} + # Defined upon return: + # - lg_delta_lookup (${lg_delta} or "no") + # - bin ("yes" or "no") +} + +sep_line() { + echo " \\" +} + +size_classes() { + lg_z=$1 + lg_q=$2 + lg_t=$3 + lg_p=$4 + lg_g=$5 + + pow2 $((${lg_z} + 3)); ptr_bits=${pow2_result} + pow2 ${lg_g}; g=${pow2_result} + + echo "#define SIZE_CLASSES \\" + echo " /* index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup */ \\" + + ntbins=0 + nlbins=0 + lg_tiny_maxclass='"NA"' + nbins=0 + + # Tiny size classes. 
+ ndelta=0 + index=0 + lg_grp=${lg_t} + lg_delta=${lg_grp} + while [ ${lg_grp} -lt ${lg_q} ] ; do + size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} + if [ ${lg_delta_lookup} != "no" ] ; then + nlbins=$((${index} + 1)) + fi + if [ ${bin} != "no" ] ; then + nbins=$((${index} + 1)) + fi + ntbins=$((${ntbins} + 1)) + lg_tiny_maxclass=${lg_grp} # Final written value is correct. + index=$((${index} + 1)) + lg_delta=${lg_grp} + lg_grp=$((${lg_grp} + 1)) + done + + # First non-tiny group. + if [ ${ntbins} -gt 0 ] ; then + sep_line + # The first size class has an unusual encoding, because the size has to be + # split between grp and delta*ndelta. + lg_grp=$((${lg_grp} - 1)) + ndelta=1 + size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} + index=$((${index} + 1)) + lg_grp=$((${lg_grp} + 1)) + lg_delta=$((${lg_delta} + 1)) + fi + while [ ${ndelta} -lt ${g} ] ; do + size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} + index=$((${index} + 1)) + ndelta=$((${ndelta} + 1)) + done + + # All remaining groups. + lg_grp=$((${lg_grp} + ${lg_g})) + while [ ${lg_grp} -lt ${ptr_bits} ] ; do + sep_line + ndelta=1 + if [ ${lg_grp} -eq $((${ptr_bits} - 1)) ] ; then + ndelta_limit=$((${g} - 1)) + else + ndelta_limit=${g} + fi + while [ ${ndelta} -le ${ndelta_limit} ] ; do + size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} + if [ ${lg_delta_lookup} != "no" ] ; then + nlbins=$((${index} + 1)) + # Final written value is correct: + lookup_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" + fi + if [ ${bin} != "no" ] ; then + nbins=$((${index} + 1)) + # Final written value is correct: + small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" + if [ ${lg_g} -gt 0 ] ; then + lg_large_minclass=$((${lg_grp} + 1)) + else + lg_large_minclass=$((${lg_grp} + 2)) + fi + fi + # Final written value is correct: + huge_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" + index=$((${index} + 1)) + ndelta=$((${ndelta} + 1)) + done + lg_grp=$((${lg_grp} + 1)) + lg_delta=$((${lg_delta} + 1)) + done + echo + nsizes=${index} + + # Defined upon completion: + # - ntbins + # - nlbins + # - nbins + # - nsizes + # - lg_tiny_maxclass + # - lookup_maxclass + # - small_maxclass + # - lg_large_minclass + # - huge_maxclass +} + cat <<EOF /* This file was automatically generated by size_classes.sh. */ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +/* + * This header requires LG_SIZEOF_PTR, LG_TINY_MIN, LG_QUANTUM, and LG_PAGE to + * be defined prior to inclusion, and it in turn defines: + * + * LG_SIZE_CLASS_GROUP: Lg of size class count for each size doubling. + * SIZE_CLASSES: Complete table of + * SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) + * tuples. + * index: Size class index. + * lg_grp: Lg group base size (no deltas added). + * lg_delta: Lg delta to previous size class. + * ndelta: Delta multiplier. size == 1<<lg_grp + ndelta<<lg_delta + * bin: 'yes' if a small bin size class, 'no' otherwise. + * lg_delta_lookup: Same as lg_delta if a lookup table size class, 'no' + * otherwise. + * NTBINS: Number of tiny bins. + * NLBINS: Number of bins supported by the lookup table. + * NBINS: Number of small size class bins. + * NSIZES: Number of size classes. + * LG_TINY_MAXCLASS: Lg of maximum tiny size class. + * LOOKUP_MAXCLASS: Maximum size class included in lookup table. 
+ * SMALL_MAXCLASS: Maximum small size class. + * LG_LARGE_MINCLASS: Lg of minimum large size class. + * HUGE_MAXCLASS: Maximum (huge) size class. + */ + +#define LG_SIZE_CLASS_GROUP ${lg_g} + EOF -lg_q=${lg_qmin} -while [ ${lg_q} -le ${lg_qmax} ] ; do - lg_t=${lg_tmin} - while [ ${lg_t} -le ${lg_q} ] ; do - lg_p=${lg_pmin} - while [ ${lg_p} -le ${lg_pmax} ] ; do - echo "#if (LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && LG_PAGE == ${lg_p})" - echo "#define SIZE_CLASSES_DEFINED" - pow2 ${lg_q}; q=${pow2_result} - pow2 ${lg_t}; t=${pow2_result} - pow2 ${lg_p}; p=${pow2_result} - bin=0 - psz=0 - sz=${t} - delta=$((${sz} - ${psz})) - echo "/* SIZE_CLASS(bin, delta, sz) */" - echo "#define SIZE_CLASSES \\" - - # Tiny size classes. - while [ ${sz} -lt ${q} ] ; do - echo " SIZE_CLASS(${bin}, ${delta}, ${sz}) \\" - bin=$((${bin} + 1)) - psz=${sz} - sz=$((${sz} + ${sz})) - delta=$((${sz} - ${psz})) - done - # Quantum-multiple size classes. For each doubling of sz, as many as 4 - # size classes exist. Their spacing is the greater of: - # - q - # - sz/4, where sz is a power of 2 - while [ ${sz} -lt ${p} ] ; do - if [ ${sz} -ge $((${q} * 4)) ] ; then - i=$((${sz} / 4)) - else - i=${q} - fi - next_2pow=$((${sz} * 2)) - while [ ${sz} -lt $next_2pow ] ; do - echo " SIZE_CLASS(${bin}, ${delta}, ${sz}) \\" - bin=$((${bin} + 1)) - psz=${sz} - sz=$((${sz} + ${i})) - delta=$((${sz} - ${psz})) - done +for lg_z in ${lg_zarr} ; do + for lg_q in ${lg_qarr} ; do + lg_t=${lg_tmin} + while [ ${lg_t} -le ${lg_q} ] ; do + # Iterate through page sizes and compute how many bins there are. + for lg_p in ${lg_parr} ; do + echo "#if (LG_SIZEOF_PTR == ${lg_z} && LG_TINY_MIN == ${lg_t} && LG_QUANTUM == ${lg_q} && LG_PAGE == ${lg_p})" + size_classes ${lg_z} ${lg_q} ${lg_t} ${lg_p} ${lg_g} + echo "#define SIZE_CLASSES_DEFINED" + echo "#define NTBINS ${ntbins}" + echo "#define NLBINS ${nlbins}" + echo "#define NBINS ${nbins}" + echo "#define NSIZES ${nsizes}" + echo "#define LG_TINY_MAXCLASS ${lg_tiny_maxclass}" + echo "#define LOOKUP_MAXCLASS ${lookup_maxclass}" + echo "#define SMALL_MAXCLASS ${small_maxclass}" + echo "#define LG_LARGE_MINCLASS ${lg_large_minclass}" + echo "#define HUGE_MAXCLASS ${huge_maxclass}" + echo "#endif" + echo done - echo - echo "#define NBINS ${bin}" - echo "#define SMALL_MAXCLASS ${psz}" - echo "#endif" - echo - lg_p=$((${lg_p} + 1)) + lg_t=$((${lg_t} + 1)) done - lg_t=$((${lg_t} + 1)) done - lg_q=$((${lg_q} + 1)) done cat <<EOF @@ -92,11 +257,10 @@ cat <<EOF #endif #undef SIZE_CLASSES_DEFINED /* - * The small_size2bin lookup table uses uint8_t to encode each bin index, so we + * The size2index_tab lookup table uses uint8_t to encode each bin index, so we * cannot support more than 256 small size classes. Further constrain NBINS to - * 255 to support prof_promote, since all small size classes, plus a "not - * small" size class must be stored in 8 bits of arena_chunk_map_t's bits - * field. + * 255 since all small size classes, plus a "not small" size class must be + * stored in 8 bits of arena_chunk_map_bits_t's bits field. 
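
The (lg_grp, lg_delta, ndelta) encoding documented above reduces to simple arithmetic; the following hedged helper (not part of the generated header) shows how a size class' byte size falls out of one SC() tuple:

#include <stddef.h>
#include <stdio.h>

/* size == (1 << lg_grp) + (ndelta << lg_delta), per the SC() tuple docs. */
static size_t
sc_size(unsigned lg_grp, unsigned lg_delta, unsigned ndelta)
{
	return (((size_t)1 << lg_grp) + ((size_t)ndelta << lg_delta));
}

int
main(void)
{
	/* e.g. the group based at 64 bytes with 16-byte deltas: 80, 96, 112, 128. */
	for (unsigned ndelta = 1; ndelta <= 4; ndelta++)
		printf("%zu\n", sc_size(6, 4, ndelta));
	return (0);
}
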
*/ #if (NBINS > 255) # error "Too many small size classes" diff --git a/deps/jemalloc/include/jemalloc/internal/stats.h b/deps/jemalloc/include/jemalloc/internal/stats.h index 27f68e368..c91dba99d 100644 --- a/deps/jemalloc/include/jemalloc/internal/stats.h +++ b/deps/jemalloc/include/jemalloc/internal/stats.h @@ -4,6 +4,7 @@ typedef struct tcache_bin_stats_s tcache_bin_stats_t; typedef struct malloc_bin_stats_s malloc_bin_stats_t; typedef struct malloc_large_stats_s malloc_large_stats_t; +typedef struct malloc_huge_stats_s malloc_huge_stats_t; typedef struct arena_stats_s arena_stats_t; typedef struct chunk_stats_s chunk_stats_t; @@ -21,12 +22,6 @@ struct tcache_bin_stats_s { struct malloc_bin_stats_s { /* - * Current number of bytes allocated, including objects currently - * cached by tcache. - */ - size_t allocated; - - /* * Total number of allocation/deallocation requests served directly by * the bin. Note that tcache may allocate an object, then recycle it * many times, resulting many increments to nrequests, but only one @@ -42,6 +37,12 @@ struct malloc_bin_stats_s { */ uint64_t nrequests; + /* + * Current number of regions of this size class, including regions + * currently cached by tcache. + */ + size_t curregs; + /* Number of tcache fills from this bin. */ uint64_t nfills; @@ -78,10 +79,25 @@ struct malloc_large_stats_s { */ uint64_t nrequests; - /* Current number of runs of this size class. */ + /* + * Current number of runs of this size class, including runs currently + * cached by tcache. + */ size_t curruns; }; +struct malloc_huge_stats_s { + /* + * Total number of allocation/deallocation requests served directly by + * the arena. + */ + uint64_t nmalloc; + uint64_t ndalloc; + + /* Current number of (multi-)chunk allocations of this size class. */ + size_t curhchunks; +}; + struct arena_stats_s { /* Number of bytes currently mapped. */ size_t mapped; @@ -95,34 +111,28 @@ struct arena_stats_s { uint64_t nmadvise; uint64_t purged; + /* + * Number of bytes currently mapped purely for metadata purposes, and + * number of bytes currently allocated for internal metadata. + */ + size_t metadata_mapped; + size_t metadata_allocated; /* Protected via atomic_*_z(). */ + /* Per-size-category statistics. */ size_t allocated_large; uint64_t nmalloc_large; uint64_t ndalloc_large; uint64_t nrequests_large; - /* - * One element for each possible size class, including sizes that - * overlap with bin size classes. This is necessary because ipalloc() - * sometimes has to use such large objects in order to assure proper - * alignment. - */ - malloc_large_stats_t *lstats; -}; - -struct chunk_stats_s { - /* Number of chunks that were allocated. */ - uint64_t nchunks; + size_t allocated_huge; + uint64_t nmalloc_huge; + uint64_t ndalloc_huge; - /* High-water mark for number of chunks allocated. */ - size_t highchunks; + /* One element for each large size class. */ + malloc_large_stats_t *lstats; - /* - * Current number of chunks allocated. This value isn't maintained for - * any other purpose, so keep track of it in order to be able to set - * highchunks. - */ - size_t curchunks; + /* One element for each huge size class. 
*/ + malloc_huge_stats_t *hstats; }; #endif /* JEMALLOC_H_STRUCTS */ diff --git a/deps/jemalloc/include/jemalloc/internal/tcache.h b/deps/jemalloc/include/jemalloc/internal/tcache.h index c3d4b58d4..5079cd266 100644 --- a/deps/jemalloc/include/jemalloc/internal/tcache.h +++ b/deps/jemalloc/include/jemalloc/internal/tcache.h @@ -4,6 +4,7 @@ typedef struct tcache_bin_info_s tcache_bin_info_t; typedef struct tcache_bin_s tcache_bin_t; typedef struct tcache_s tcache_t; +typedef struct tcaches_s tcaches_t; /* * tcache pointers close to NULL are used to encode state information that is @@ -16,6 +17,11 @@ typedef struct tcache_s tcache_t; #define TCACHE_STATE_MAX TCACHE_STATE_PURGATORY /* + * Absolute minimum number of cache slots for each small bin. + */ +#define TCACHE_NSLOTS_SMALL_MIN 20 + +/* * Absolute maximum number of cache slots for each small bin in the thread * cache. This is an additional constraint beyond that imposed as: twice the * number of regions per run for this size class. @@ -69,10 +75,9 @@ struct tcache_bin_s { struct tcache_s { ql_elm(tcache_t) link; /* Used for aggregating stats. */ - uint64_t prof_accumbytes;/* Cleared after arena_prof_accum() */ - arena_t *arena; /* This thread's arena. */ + uint64_t prof_accumbytes;/* Cleared after arena_prof_accum(). */ unsigned ev_cnt; /* Event count since incremental GC. */ - unsigned next_gc_bin; /* Next bin to GC. */ + szind_t next_gc_bin; /* Next bin to GC. */ tcache_bin_t tbins[1]; /* Dynamically sized. */ /* * The pointer stacks associated with tbins follow as a contiguous @@ -82,6 +87,14 @@ struct tcache_s { */ }; +/* Linkage for list of available (previously used) explicit tcache IDs. */ +struct tcaches_s { + union { + tcache_t *tcache; + tcaches_t *next; + }; +}; + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS @@ -95,84 +108,90 @@ extern tcache_bin_info_t *tcache_bin_info; * Number of tcache bins. There are NBINS small-object bins, plus 0 or more * large-object bins. */ -extern size_t nhbins; +extern size_t nhbins; /* Maximum cached size class. */ -extern size_t tcache_maxclass; +extern size_t tcache_maxclass; + +/* + * Explicit tcaches, managed via the tcache.{create,flush,destroy} mallctls and + * usable via the MALLOCX_TCACHE() flag. The automatic per thread tcaches are + * completely disjoint from this data structure. tcaches starts off as a sparse + * array, so it has no physical memory footprint until individual pages are + * touched. This allows the entire array to be allocated the first time an + * explicit tcache is created without a disproportionate impact on memory usage. 
+ */ +extern tcaches_t *tcaches; size_t tcache_salloc(const void *ptr); -void tcache_event_hard(tcache_t *tcache); -void *tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, - size_t binind); -void tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem, - tcache_t *tcache); -void tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem, - tcache_t *tcache); +void tcache_event_hard(tsd_t *tsd, tcache_t *tcache); +void *tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache, + tcache_bin_t *tbin, szind_t binind); +void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, + szind_t binind, unsigned rem); +void tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind, + unsigned rem, tcache_t *tcache); void tcache_arena_associate(tcache_t *tcache, arena_t *arena); -void tcache_arena_dissociate(tcache_t *tcache); -tcache_t *tcache_create(arena_t *arena); -void tcache_destroy(tcache_t *tcache); -void tcache_thread_cleanup(void *arg); +void tcache_arena_reassociate(tcache_t *tcache, arena_t *oldarena, + arena_t *newarena); +void tcache_arena_dissociate(tcache_t *tcache, arena_t *arena); +tcache_t *tcache_get_hard(tsd_t *tsd); +tcache_t *tcache_create(tsd_t *tsd, arena_t *arena); +void tcache_cleanup(tsd_t *tsd); +void tcache_enabled_cleanup(tsd_t *tsd); void tcache_stats_merge(tcache_t *tcache, arena_t *arena); -bool tcache_boot0(void); -bool tcache_boot1(void); +bool tcaches_create(tsd_t *tsd, unsigned *r_ind); +void tcaches_flush(tsd_t *tsd, unsigned ind); +void tcaches_destroy(tsd_t *tsd, unsigned ind); +bool tcache_boot(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache, tcache_t *) -malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache_enabled, tcache_enabled_t) - -void tcache_event(tcache_t *tcache); +void tcache_event(tsd_t *tsd, tcache_t *tcache); void tcache_flush(void); bool tcache_enabled_get(void); -tcache_t *tcache_get(bool create); +tcache_t *tcache_get(tsd_t *tsd, bool create); void tcache_enabled_set(bool enabled); void *tcache_alloc_easy(tcache_bin_t *tbin); -void *tcache_alloc_small(tcache_t *tcache, size_t size, bool zero); -void *tcache_alloc_large(tcache_t *tcache, size_t size, bool zero); -void tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind); -void tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size); +void *tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, + size_t size, bool zero); +void *tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, + size_t size, bool zero); +void tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, + szind_t binind); +void tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, + size_t size); +tcache_t *tcaches_get(tsd_t *tsd, unsigned ind); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_)) -/* Map of thread-specific caches. */ -malloc_tsd_externs(tcache, tcache_t *) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, tcache, tcache_t *, NULL, - tcache_thread_cleanup) -/* Per thread flag that allows thread caches to be disabled. 
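
From the application side, the explicit tcaches described above are driven through the public API; a hedged example (mirroring the tcache.create/destroy mallctl names mentioned in the comment, with error handling elided) is:

#include <jemalloc/jemalloc.h>

int
main(void)
{
	unsigned tc;
	size_t sz = sizeof(tc);
	void *p;

	/* Reserve an explicit tcache and get back its index. */
	mallctl("tcache.create", &tc, &sz, NULL, 0);

	/* Route allocation and deallocation through that tcache. */
	p = mallocx(4096, MALLOCX_TCACHE(tc));
	dallocx(p, MALLOCX_TCACHE(tc));

	/* Flush the tcache and release its index once it is no longer needed. */
	mallctl("tcache.destroy", NULL, NULL, &tc, sizeof(tc));
	return (0);
}
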
*/ -malloc_tsd_externs(tcache_enabled, tcache_enabled_t) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, tcache_enabled, tcache_enabled_t, - tcache_enabled_default, malloc_tsd_no_cleanup) - JEMALLOC_INLINE void tcache_flush(void) { - tcache_t *tcache; + tsd_t *tsd; cassert(config_tcache); - tcache = *tcache_tsd_get(); - if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) - return; - tcache_destroy(tcache); - tcache = NULL; - tcache_tsd_set(&tcache); + tsd = tsd_fetch(); + tcache_cleanup(tsd); } JEMALLOC_INLINE bool tcache_enabled_get(void) { + tsd_t *tsd; tcache_enabled_t tcache_enabled; cassert(config_tcache); - tcache_enabled = *tcache_enabled_tsd_get(); + tsd = tsd_fetch(); + tcache_enabled = tsd_tcache_enabled_get(tsd); if (tcache_enabled == tcache_enabled_default) { tcache_enabled = (tcache_enabled_t)opt_tcache; - tcache_enabled_tsd_set(&tcache_enabled); + tsd_tcache_enabled_set(tsd, tcache_enabled); } return ((bool)tcache_enabled); @@ -181,85 +200,41 @@ tcache_enabled_get(void) JEMALLOC_INLINE void tcache_enabled_set(bool enabled) { + tsd_t *tsd; tcache_enabled_t tcache_enabled; - tcache_t *tcache; cassert(config_tcache); + tsd = tsd_fetch(); + tcache_enabled = (tcache_enabled_t)enabled; - tcache_enabled_tsd_set(&tcache_enabled); - tcache = *tcache_tsd_get(); - if (enabled) { - if (tcache == TCACHE_STATE_DISABLED) { - tcache = NULL; - tcache_tsd_set(&tcache); - } - } else /* disabled */ { - if (tcache > TCACHE_STATE_MAX) { - tcache_destroy(tcache); - tcache = NULL; - } - if (tcache == NULL) { - tcache = TCACHE_STATE_DISABLED; - tcache_tsd_set(&tcache); - } - } + tsd_tcache_enabled_set(tsd, tcache_enabled); + + if (!enabled) + tcache_cleanup(tsd); } JEMALLOC_ALWAYS_INLINE tcache_t * -tcache_get(bool create) +tcache_get(tsd_t *tsd, bool create) { tcache_t *tcache; - if (config_tcache == false) - return (NULL); - if (config_lazy_lock && isthreaded == false) + if (!config_tcache) return (NULL); - tcache = *tcache_tsd_get(); - if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) { - if (tcache == TCACHE_STATE_DISABLED) - return (NULL); - if (tcache == NULL) { - if (create == false) { - /* - * Creating a tcache here would cause - * allocation as a side effect of free(). - * Ordinarily that would be okay since - * tcache_create() failure is a soft failure - * that doesn't propagate. However, if TLS - * data are freed via free() as in glibc, - * subtle corruption could result from setting - * a TLS variable after its backing memory is - * freed. - */ - return (NULL); - } - if (tcache_enabled_get() == false) { - tcache_enabled_set(false); /* Memoize. */ - return (NULL); - } - return (tcache_create(choose_arena(NULL))); - } - if (tcache == TCACHE_STATE_PURGATORY) { - /* - * Make a note that an allocator function was called - * after tcache_thread_cleanup() was called. 
- */ - tcache = TCACHE_STATE_REINCARNATED; - tcache_tsd_set(&tcache); - return (NULL); - } - if (tcache == TCACHE_STATE_REINCARNATED) - return (NULL); - not_reached(); + tcache = tsd_tcache_get(tsd); + if (!create) + return (tcache); + if (unlikely(tcache == NULL) && tsd_nominal(tsd)) { + tcache = tcache_get_hard(tsd); + tsd_tcache_set(tsd, tcache); } return (tcache); } JEMALLOC_ALWAYS_INLINE void -tcache_event(tcache_t *tcache) +tcache_event(tsd_t *tsd, tcache_t *tcache) { if (TCACHE_GC_INCR == 0) @@ -267,8 +242,8 @@ tcache_event(tcache_t *tcache) tcache->ev_cnt++; assert(tcache->ev_cnt <= TCACHE_GC_INCR); - if (tcache->ev_cnt == TCACHE_GC_INCR) - tcache_event_hard(tcache); + if (unlikely(tcache->ev_cnt == TCACHE_GC_INCR)) + tcache_event_hard(tsd, tcache); } JEMALLOC_ALWAYS_INLINE void * @@ -276,85 +251,87 @@ tcache_alloc_easy(tcache_bin_t *tbin) { void *ret; - if (tbin->ncached == 0) { + if (unlikely(tbin->ncached == 0)) { tbin->low_water = -1; return (NULL); } tbin->ncached--; - if ((int)tbin->ncached < tbin->low_water) + if (unlikely((int)tbin->ncached < tbin->low_water)) tbin->low_water = tbin->ncached; ret = tbin->avail[tbin->ncached]; return (ret); } JEMALLOC_ALWAYS_INLINE void * -tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) +tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, + bool zero) { void *ret; - size_t binind; + szind_t binind; + size_t usize; tcache_bin_t *tbin; - binind = SMALL_SIZE2BIN(size); + binind = size2index(size); assert(binind < NBINS); tbin = &tcache->tbins[binind]; - size = arena_bin_info[binind].reg_size; + usize = index2size(binind); ret = tcache_alloc_easy(tbin); - if (ret == NULL) { - ret = tcache_alloc_small_hard(tcache, tbin, binind); + if (unlikely(ret == NULL)) { + ret = tcache_alloc_small_hard(tsd, arena, tcache, tbin, binind); if (ret == NULL) return (NULL); } - assert(tcache_salloc(ret) == arena_bin_info[binind].reg_size); + assert(tcache_salloc(ret) == usize); - if (zero == false) { + if (likely(!zero)) { if (config_fill) { - if (opt_junk) { + if (unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], false); - } else if (opt_zero) - memset(ret, 0, size); + } else if (unlikely(opt_zero)) + memset(ret, 0, usize); } - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); } else { - if (config_fill && opt_junk) { + if (config_fill && unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); - memset(ret, 0, size); + memset(ret, 0, usize); } if (config_stats) tbin->tstats.nrequests++; if (config_prof) - tcache->prof_accumbytes += arena_bin_info[binind].reg_size; - tcache_event(tcache); + tcache->prof_accumbytes += usize; + tcache_event(tsd, tcache); return (ret); } JEMALLOC_ALWAYS_INLINE void * -tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) +tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, + bool zero) { void *ret; - size_t binind; + szind_t binind; + size_t usize; tcache_bin_t *tbin; - size = PAGE_CEILING(size); - assert(size <= tcache_maxclass); - binind = NBINS + (size >> LG_PAGE) - 1; + binind = size2index(size); + usize = index2size(binind); + assert(usize <= tcache_maxclass); assert(binind < nhbins); tbin = &tcache->tbins[binind]; ret = tcache_alloc_easy(tbin); - if (ret == NULL) { + if (unlikely(ret == NULL)) { /* * Only allocate one large object at a time, because it's quite * expensive to create one and not use it. 
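
The tbin fast path that tcache_alloc_easy()/tcache_dalloc_small() implement above is just a bounded LIFO stack of cached pointers; a stripped-down, self-contained sketch of that idea (not the real tcache_bin_t, which also tracks low_water and stats) is:

#include <stdbool.h>
#include <stddef.h>

typedef struct {
	unsigned ncached;	/* current number of cached pointers */
	unsigned ncached_max;	/* capacity before a flush is needed */
	void **avail;		/* stack of cached objects */
} bin_t;

/* Pop the most recently cached object, or NULL to fall back to the arena. */
static void *
bin_alloc(bin_t *bin)
{
	if (bin->ncached == 0)
		return (NULL);
	bin->ncached--;
	return (bin->avail[bin->ncached]);
}

/* Push a freed object; returns true if the caller must flush first. */
static bool
bin_dalloc(bin_t *bin, void *ptr)
{
	if (bin->ncached == bin->ncached_max)
		return (true);
	bin->avail[bin->ncached++] = ptr;
	return (false);
}
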
*/ - ret = arena_malloc_large(tcache->arena, size, zero); + ret = arena_malloc_large(arena, usize, zero); if (ret == NULL) return (NULL); } else { - if (config_prof && prof_promote && size == PAGE) { + if (config_prof && usize == LARGE_MINCLASS) { arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> @@ -362,57 +339,54 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) arena_mapbits_large_binind_set(chunk, pageind, BININD_INVALID); } - if (zero == false) { + if (likely(!zero)) { if (config_fill) { - if (opt_junk) - memset(ret, 0xa5, size); - else if (opt_zero) - memset(ret, 0, size); + if (unlikely(opt_junk_alloc)) + memset(ret, 0xa5, usize); + else if (unlikely(opt_zero)) + memset(ret, 0, usize); } - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); - } else { - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); - memset(ret, 0, size); - } + } else + memset(ret, 0, usize); if (config_stats) tbin->tstats.nrequests++; if (config_prof) - tcache->prof_accumbytes += size; + tcache->prof_accumbytes += usize; } - tcache_event(tcache); + tcache_event(tsd, tcache); return (ret); } JEMALLOC_ALWAYS_INLINE void -tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind) +tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind) { tcache_bin_t *tbin; tcache_bin_info_t *tbin_info; assert(tcache_salloc(ptr) <= SMALL_MAXCLASS); - if (config_fill && opt_junk) + if (config_fill && unlikely(opt_junk_free)) arena_dalloc_junk_small(ptr, &arena_bin_info[binind]); tbin = &tcache->tbins[binind]; tbin_info = &tcache_bin_info[binind]; - if (tbin->ncached == tbin_info->ncached_max) { - tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >> - 1), tcache); + if (unlikely(tbin->ncached == tbin_info->ncached_max)) { + tcache_bin_flush_small(tsd, tcache, tbin, binind, + (tbin_info->ncached_max >> 1)); } assert(tbin->ncached < tbin_info->ncached_max); tbin->avail[tbin->ncached] = ptr; tbin->ncached++; - tcache_event(tcache); + tcache_event(tsd, tcache); } JEMALLOC_ALWAYS_INLINE void -tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) +tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, size_t size) { - size_t binind; + szind_t binind; tcache_bin_t *tbin; tcache_bin_info_t *tbin_info; @@ -420,22 +394,31 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) assert(tcache_salloc(ptr) > SMALL_MAXCLASS); assert(tcache_salloc(ptr) <= tcache_maxclass); - binind = NBINS + (size >> LG_PAGE) - 1; + binind = size2index(size); - if (config_fill && opt_junk) - memset(ptr, 0x5a, size); + if (config_fill && unlikely(opt_junk_free)) + arena_dalloc_junk_large(ptr, size); tbin = &tcache->tbins[binind]; tbin_info = &tcache_bin_info[binind]; - if (tbin->ncached == tbin_info->ncached_max) { - tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >> - 1), tcache); + if (unlikely(tbin->ncached == tbin_info->ncached_max)) { + tcache_bin_flush_large(tsd, tbin, binind, + (tbin_info->ncached_max >> 1), tcache); } assert(tbin->ncached < tbin_info->ncached_max); tbin->avail[tbin->ncached] = ptr; tbin->ncached++; - tcache_event(tcache); + tcache_event(tsd, tcache); +} + +JEMALLOC_ALWAYS_INLINE tcache_t * +tcaches_get(tsd_t *tsd, unsigned ind) +{ + tcaches_t *elm = &tcaches[ind]; + if (unlikely(elm->tcache == NULL)) + elm->tcache = tcache_create(tsd, arena_choose(tsd, NULL)); + return (elm->tcache); } #endif diff --git a/deps/jemalloc/include/jemalloc/internal/tsd.h b/deps/jemalloc/include/jemalloc/internal/tsd.h index 
9fb4a23ec..eed7aa013 100644 --- a/deps/jemalloc/include/jemalloc/internal/tsd.h +++ b/deps/jemalloc/include/jemalloc/internal/tsd.h @@ -2,7 +2,7 @@ #ifdef JEMALLOC_H_TYPES /* Maximum number of malloc_tsd users with cleanup functions. */ -#define MALLOC_TSD_CLEANUPS_MAX 8 +#define MALLOC_TSD_CLEANUPS_MAX 2 typedef bool (*malloc_tsd_cleanup_t)(void); @@ -12,9 +12,18 @@ typedef struct tsd_init_block_s tsd_init_block_t; typedef struct tsd_init_head_s tsd_init_head_t; #endif +typedef struct tsd_s tsd_t; + +typedef enum { + tsd_state_uninitialized, + tsd_state_nominal, + tsd_state_purgatory, + tsd_state_reincarnated +} tsd_state_t; + /* * TLS/TSD-agnostic macro-based implementation of thread-specific data. There - * are four macros that support (at least) three use cases: file-private, + * are five macros that support (at least) three use cases: file-private, * library-private, and library-private inlined. Following is an example * library-private tsd variable: * @@ -24,34 +33,36 @@ typedef struct tsd_init_head_s tsd_init_head_t; * int y; * } example_t; * #define EX_INITIALIZER JEMALLOC_CONCAT({0, 0}) - * malloc_tsd_protos(, example, example_t *) - * malloc_tsd_externs(example, example_t *) + * malloc_tsd_types(example_, example_t) + * malloc_tsd_protos(, example_, example_t) + * malloc_tsd_externs(example_, example_t) * In example.c: - * malloc_tsd_data(, example, example_t *, EX_INITIALIZER) - * malloc_tsd_funcs(, example, example_t *, EX_INITIALIZER, + * malloc_tsd_data(, example_, example_t, EX_INITIALIZER) + * malloc_tsd_funcs(, example_, example_t, EX_INITIALIZER, * example_tsd_cleanup) * * The result is a set of generated functions, e.g.: * * bool example_tsd_boot(void) {...} - * example_t **example_tsd_get() {...} - * void example_tsd_set(example_t **val) {...} + * example_t *example_tsd_get() {...} + * void example_tsd_set(example_t *val) {...} * * Note that all of the functions deal in terms of (a_type *) rather than - * (a_type) so that it is possible to support non-pointer types (unlike + * (a_type) so that it is possible to support non-pointer types (unlike * pthreads TSD). example_tsd_cleanup() is passed an (a_type *) pointer that is - * cast to (void *). This means that the cleanup function needs to cast *and* - * dereference the function argument, e.g.: + * cast to (void *). This means that the cleanup function needs to cast the + * function argument to (a_type *), then dereference the resulting pointer to + * access fields, e.g. * * void * example_tsd_cleanup(void *arg) * { - * example_t *example = *(example_t **)arg; + * example_t *example = (example_t *)arg; * + * example->x = 42; * [...] - * if ([want the cleanup function to be called again]) { - * example_tsd_set(&example); - * } + * if ([want the cleanup function to be called again]) + * example_tsd_set(example); * } * * If example_tsd_set() is called within example_tsd_cleanup(), it will be @@ -60,63 +71,96 @@ typedef struct tsd_init_head_s tsd_init_head_t; * non-NULL. */ +/* malloc_tsd_types(). */ +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +#define malloc_tsd_types(a_name, a_type) +#elif (defined(JEMALLOC_TLS)) +#define malloc_tsd_types(a_name, a_type) +#elif (defined(_WIN32)) +#define malloc_tsd_types(a_name, a_type) \ +typedef struct { \ + bool initialized; \ + a_type val; \ +} a_name##tsd_wrapper_t; +#else +#define malloc_tsd_types(a_name, a_type) \ +typedef struct { \ + bool initialized; \ + a_type val; \ +} a_name##tsd_wrapper_t; +#endif + /* malloc_tsd_protos(). 
*/ #define malloc_tsd_protos(a_attr, a_name, a_type) \ a_attr bool \ -a_name##_tsd_boot(void); \ +a_name##tsd_boot0(void); \ +a_attr void \ +a_name##tsd_boot1(void); \ +a_attr bool \ +a_name##tsd_boot(void); \ a_attr a_type * \ -a_name##_tsd_get(void); \ +a_name##tsd_get(void); \ a_attr void \ -a_name##_tsd_set(a_type *val); +a_name##tsd_set(a_type *val); /* malloc_tsd_externs(). */ #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP #define malloc_tsd_externs(a_name, a_type) \ -extern __thread a_type a_name##_tls; \ -extern __thread bool a_name##_initialized; \ -extern bool a_name##_booted; +extern __thread a_type a_name##tsd_tls; \ +extern __thread bool a_name##tsd_initialized; \ +extern bool a_name##tsd_booted; #elif (defined(JEMALLOC_TLS)) #define malloc_tsd_externs(a_name, a_type) \ -extern __thread a_type a_name##_tls; \ -extern pthread_key_t a_name##_tsd; \ -extern bool a_name##_booted; +extern __thread a_type a_name##tsd_tls; \ +extern pthread_key_t a_name##tsd_tsd; \ +extern bool a_name##tsd_booted; #elif (defined(_WIN32)) #define malloc_tsd_externs(a_name, a_type) \ -extern DWORD a_name##_tsd; \ -extern bool a_name##_booted; +extern DWORD a_name##tsd_tsd; \ +extern a_name##tsd_wrapper_t a_name##tsd_boot_wrapper; \ +extern bool a_name##tsd_booted; #else #define malloc_tsd_externs(a_name, a_type) \ -extern pthread_key_t a_name##_tsd; \ -extern tsd_init_head_t a_name##_tsd_init_head; \ -extern bool a_name##_booted; +extern pthread_key_t a_name##tsd_tsd; \ +extern tsd_init_head_t a_name##tsd_init_head; \ +extern a_name##tsd_wrapper_t a_name##tsd_boot_wrapper; \ +extern bool a_name##tsd_booted; #endif /* malloc_tsd_data(). */ #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ a_attr __thread a_type JEMALLOC_TLS_MODEL \ - a_name##_tls = a_initializer; \ + a_name##tsd_tls = a_initializer; \ a_attr __thread bool JEMALLOC_TLS_MODEL \ - a_name##_initialized = false; \ -a_attr bool a_name##_booted = false; + a_name##tsd_initialized = false; \ +a_attr bool a_name##tsd_booted = false; #elif (defined(JEMALLOC_TLS)) #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ a_attr __thread a_type JEMALLOC_TLS_MODEL \ - a_name##_tls = a_initializer; \ -a_attr pthread_key_t a_name##_tsd; \ -a_attr bool a_name##_booted = false; + a_name##tsd_tls = a_initializer; \ +a_attr pthread_key_t a_name##tsd_tsd; \ +a_attr bool a_name##tsd_booted = false; #elif (defined(_WIN32)) #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ -a_attr DWORD a_name##_tsd; \ -a_attr bool a_name##_booted = false; +a_attr DWORD a_name##tsd_tsd; \ +a_attr a_name##tsd_wrapper_t a_name##tsd_boot_wrapper = { \ + false, \ + a_initializer \ +}; \ +a_attr bool a_name##tsd_booted = false; #else #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ -a_attr pthread_key_t a_name##_tsd; \ -a_attr tsd_init_head_t a_name##_tsd_init_head = { \ +a_attr pthread_key_t a_name##tsd_tsd; \ +a_attr tsd_init_head_t a_name##tsd_init_head = { \ ql_head_initializer(blocks), \ MALLOC_MUTEX_INITIALIZER \ }; \ -a_attr bool a_name##_booted = false; +a_attr a_name##tsd_wrapper_t a_name##tsd_boot_wrapper = { \ + false, \ + a_initializer \ +}; \ +a_attr bool a_name##tsd_booted = false; #endif /* malloc_tsd_funcs(). */ @@ -125,75 +169,100 @@ a_attr bool a_name##_booted = false; a_cleanup) \ /* Initialization/cleanup. 
*/ \ a_attr bool \ -a_name##_tsd_cleanup_wrapper(void) \ +a_name##tsd_cleanup_wrapper(void) \ { \ \ - if (a_name##_initialized) { \ - a_name##_initialized = false; \ - a_cleanup(&a_name##_tls); \ + if (a_name##tsd_initialized) { \ + a_name##tsd_initialized = false; \ + a_cleanup(&a_name##tsd_tls); \ } \ - return (a_name##_initialized); \ + return (a_name##tsd_initialized); \ } \ a_attr bool \ -a_name##_tsd_boot(void) \ +a_name##tsd_boot0(void) \ { \ \ if (a_cleanup != malloc_tsd_no_cleanup) { \ malloc_tsd_cleanup_register( \ - &a_name##_tsd_cleanup_wrapper); \ + &a_name##tsd_cleanup_wrapper); \ } \ - a_name##_booted = true; \ + a_name##tsd_booted = true; \ return (false); \ } \ +a_attr void \ +a_name##tsd_boot1(void) \ +{ \ + \ + /* Do nothing. */ \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + return (a_name##tsd_boot0()); \ +} \ /* Get/set. */ \ a_attr a_type * \ -a_name##_tsd_get(void) \ +a_name##tsd_get(void) \ { \ \ - assert(a_name##_booted); \ - return (&a_name##_tls); \ + assert(a_name##tsd_booted); \ + return (&a_name##tsd_tls); \ } \ a_attr void \ -a_name##_tsd_set(a_type *val) \ +a_name##tsd_set(a_type *val) \ { \ \ - assert(a_name##_booted); \ - a_name##_tls = (*val); \ + assert(a_name##tsd_booted); \ + a_name##tsd_tls = (*val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ - a_name##_initialized = true; \ + a_name##tsd_initialized = true; \ } #elif (defined(JEMALLOC_TLS)) #define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ a_cleanup) \ /* Initialization/cleanup. */ \ a_attr bool \ -a_name##_tsd_boot(void) \ +a_name##tsd_boot0(void) \ { \ \ if (a_cleanup != malloc_tsd_no_cleanup) { \ - if (pthread_key_create(&a_name##_tsd, a_cleanup) != 0) \ + if (pthread_key_create(&a_name##tsd_tsd, a_cleanup) != \ + 0) \ return (true); \ } \ - a_name##_booted = true; \ + a_name##tsd_booted = true; \ return (false); \ } \ +a_attr void \ +a_name##tsd_boot1(void) \ +{ \ + \ + /* Do nothing. */ \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + return (a_name##tsd_boot0()); \ +} \ /* Get/set. */ \ a_attr a_type * \ -a_name##_tsd_get(void) \ +a_name##tsd_get(void) \ { \ \ - assert(a_name##_booted); \ - return (&a_name##_tls); \ + assert(a_name##tsd_booted); \ + return (&a_name##tsd_tls); \ } \ a_attr void \ -a_name##_tsd_set(a_type *val) \ +a_name##tsd_set(a_type *val) \ { \ \ - assert(a_name##_booted); \ - a_name##_tls = (*val); \ + assert(a_name##tsd_booted); \ + a_name##tsd_tls = (*val); \ if (a_cleanup != malloc_tsd_no_cleanup) { \ - if (pthread_setspecific(a_name##_tsd, \ - (void *)(&a_name##_tls))) { \ + if (pthread_setspecific(a_name##tsd_tsd, \ + (void *)(&a_name##tsd_tls))) { \ malloc_write("<jemalloc>: Error" \ " setting TSD for "#a_name"\n"); \ if (opt_abort) \ @@ -204,27 +273,21 @@ a_name##_tsd_set(a_type *val) \ #elif (defined(_WIN32)) #define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ a_cleanup) \ -/* Data structure. */ \ -typedef struct { \ - bool initialized; \ - a_type val; \ -} a_name##_tsd_wrapper_t; \ /* Initialization/cleanup. 
*/ \ a_attr bool \ -a_name##_tsd_cleanup_wrapper(void) \ +a_name##tsd_cleanup_wrapper(void) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + DWORD error = GetLastError(); \ + a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ + TlsGetValue(a_name##tsd_tsd); \ + SetLastError(error); \ \ - wrapper = (a_name##_tsd_wrapper_t *) TlsGetValue(a_name##_tsd); \ if (wrapper == NULL) \ return (false); \ if (a_cleanup != malloc_tsd_no_cleanup && \ wrapper->initialized) { \ - a_type val = wrapper->val; \ - a_type tsd_static_data = a_initializer; \ wrapper->initialized = false; \ - wrapper->val = tsd_static_data; \ - a_cleanup(&val); \ + a_cleanup(&wrapper->val); \ if (wrapper->initialized) { \ /* Trigger another cleanup round. */ \ return (true); \ @@ -233,63 +296,95 @@ a_name##_tsd_cleanup_wrapper(void) \ malloc_tsd_dalloc(wrapper); \ return (false); \ } \ -a_attr bool \ -a_name##_tsd_boot(void) \ +a_attr void \ +a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \ { \ \ - a_name##_tsd = TlsAlloc(); \ - if (a_name##_tsd == TLS_OUT_OF_INDEXES) \ - return (true); \ - if (a_cleanup != malloc_tsd_no_cleanup) { \ - malloc_tsd_cleanup_register( \ - &a_name##_tsd_cleanup_wrapper); \ + if (!TlsSetValue(a_name##tsd_tsd, (void *)wrapper)) { \ + malloc_write("<jemalloc>: Error setting" \ + " TSD for "#a_name"\n"); \ + abort(); \ } \ - a_name##_booted = true; \ - return (false); \ } \ -/* Get/set. */ \ -a_attr a_name##_tsd_wrapper_t * \ -a_name##_tsd_get_wrapper(void) \ +a_attr a_name##tsd_wrapper_t * \ +a_name##tsd_wrapper_get(void) \ { \ - a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *) \ - TlsGetValue(a_name##_tsd); \ + DWORD error = GetLastError(); \ + a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ + TlsGetValue(a_name##tsd_tsd); \ + SetLastError(error); \ \ - if (wrapper == NULL) { \ - wrapper = (a_name##_tsd_wrapper_t *) \ - malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t)); \ + if (unlikely(wrapper == NULL)) { \ + wrapper = (a_name##tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ if (wrapper == NULL) { \ malloc_write("<jemalloc>: Error allocating" \ " TSD for "#a_name"\n"); \ abort(); \ } else { \ - static a_type tsd_static_data = a_initializer; \ wrapper->initialized = false; \ - wrapper->val = tsd_static_data; \ - } \ - if (!TlsSetValue(a_name##_tsd, (void *)wrapper)) { \ - malloc_write("<jemalloc>: Error setting" \ - " TSD for "#a_name"\n"); \ - abort(); \ + wrapper->val = a_initializer; \ } \ + a_name##tsd_wrapper_set(wrapper); \ } \ return (wrapper); \ } \ +a_attr bool \ +a_name##tsd_boot0(void) \ +{ \ + \ + a_name##tsd_tsd = TlsAlloc(); \ + if (a_name##tsd_tsd == TLS_OUT_OF_INDEXES) \ + return (true); \ + if (a_cleanup != malloc_tsd_no_cleanup) { \ + malloc_tsd_cleanup_register( \ + &a_name##tsd_cleanup_wrapper); \ + } \ + a_name##tsd_wrapper_set(&a_name##tsd_boot_wrapper); \ + a_name##tsd_booted = true; \ + return (false); \ +} \ +a_attr void \ +a_name##tsd_boot1(void) \ +{ \ + a_name##tsd_wrapper_t *wrapper; \ + wrapper = (a_name##tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ + if (wrapper == NULL) { \ + malloc_write("<jemalloc>: Error allocating" \ + " TSD for "#a_name"\n"); \ + abort(); \ + } \ + memcpy(wrapper, &a_name##tsd_boot_wrapper, \ + sizeof(a_name##tsd_wrapper_t)); \ + a_name##tsd_wrapper_set(wrapper); \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + if (a_name##tsd_boot0()) \ + return (true); \ + a_name##tsd_boot1(); \ + return (false); \ +} \ +/* Get/set. 
*/ \ a_attr a_type * \ -a_name##_tsd_get(void) \ +a_name##tsd_get(void) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - assert(a_name##_booted); \ - wrapper = a_name##_tsd_get_wrapper(); \ + assert(a_name##tsd_booted); \ + wrapper = a_name##tsd_wrapper_get(); \ return (&wrapper->val); \ } \ a_attr void \ -a_name##_tsd_set(a_type *val) \ +a_name##tsd_set(a_type *val) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - assert(a_name##_booted); \ - wrapper = a_name##_tsd_get_wrapper(); \ + assert(a_name##tsd_booted); \ + wrapper = a_name##tsd_wrapper_get(); \ wrapper->val = *(val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ wrapper->initialized = true; \ @@ -297,16 +392,11 @@ a_name##_tsd_set(a_type *val) \ #else #define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ a_cleanup) \ -/* Data structure. */ \ -typedef struct { \ - bool initialized; \ - a_type val; \ -} a_name##_tsd_wrapper_t; \ /* Initialization/cleanup. */ \ a_attr void \ -a_name##_tsd_cleanup_wrapper(void *arg) \ +a_name##tsd_cleanup_wrapper(void *arg) \ { \ - a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)arg;\ + a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *)arg; \ \ if (a_cleanup != malloc_tsd_no_cleanup && \ wrapper->initialized) { \ @@ -314,7 +404,7 @@ a_name##_tsd_cleanup_wrapper(void *arg) \ a_cleanup(&wrapper->val); \ if (wrapper->initialized) { \ /* Trigger another cleanup round. */ \ - if (pthread_setspecific(a_name##_tsd, \ + if (pthread_setspecific(a_name##tsd_tsd, \ (void *)wrapper)) { \ malloc_write("<jemalloc>: Error" \ " setting TSD for "#a_name"\n"); \ @@ -326,67 +416,97 @@ a_name##_tsd_cleanup_wrapper(void *arg) \ } \ malloc_tsd_dalloc(wrapper); \ } \ -a_attr bool \ -a_name##_tsd_boot(void) \ +a_attr void \ +a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \ { \ \ - if (pthread_key_create(&a_name##_tsd, \ - a_name##_tsd_cleanup_wrapper) != 0) \ - return (true); \ - a_name##_booted = true; \ - return (false); \ + if (pthread_setspecific(a_name##tsd_tsd, \ + (void *)wrapper)) { \ + malloc_write("<jemalloc>: Error setting" \ + " TSD for "#a_name"\n"); \ + abort(); \ + } \ } \ -/* Get/set. 
*/ \ -a_attr a_name##_tsd_wrapper_t * \ -a_name##_tsd_get_wrapper(void) \ +a_attr a_name##tsd_wrapper_t * \ +a_name##tsd_wrapper_get(void) \ { \ - a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *) \ - pthread_getspecific(a_name##_tsd); \ + a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ + pthread_getspecific(a_name##tsd_tsd); \ \ - if (wrapper == NULL) { \ + if (unlikely(wrapper == NULL)) { \ tsd_init_block_t block; \ wrapper = tsd_init_check_recursion( \ - &a_name##_tsd_init_head, &block); \ + &a_name##tsd_init_head, &block); \ if (wrapper) \ return (wrapper); \ - wrapper = (a_name##_tsd_wrapper_t *) \ - malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t)); \ + wrapper = (a_name##tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ block.data = wrapper; \ if (wrapper == NULL) { \ malloc_write("<jemalloc>: Error allocating" \ " TSD for "#a_name"\n"); \ abort(); \ } else { \ - static a_type tsd_static_data = a_initializer; \ wrapper->initialized = false; \ - wrapper->val = tsd_static_data; \ - } \ - if (pthread_setspecific(a_name##_tsd, \ - (void *)wrapper)) { \ - malloc_write("<jemalloc>: Error setting" \ - " TSD for "#a_name"\n"); \ - abort(); \ + wrapper->val = a_initializer; \ } \ - tsd_init_finish(&a_name##_tsd_init_head, &block); \ + a_name##tsd_wrapper_set(wrapper); \ + tsd_init_finish(&a_name##tsd_init_head, &block); \ } \ return (wrapper); \ } \ +a_attr bool \ +a_name##tsd_boot0(void) \ +{ \ + \ + if (pthread_key_create(&a_name##tsd_tsd, \ + a_name##tsd_cleanup_wrapper) != 0) \ + return (true); \ + a_name##tsd_wrapper_set(&a_name##tsd_boot_wrapper); \ + a_name##tsd_booted = true; \ + return (false); \ +} \ +a_attr void \ +a_name##tsd_boot1(void) \ +{ \ + a_name##tsd_wrapper_t *wrapper; \ + wrapper = (a_name##tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ + if (wrapper == NULL) { \ + malloc_write("<jemalloc>: Error allocating" \ + " TSD for "#a_name"\n"); \ + abort(); \ + } \ + memcpy(wrapper, &a_name##tsd_boot_wrapper, \ + sizeof(a_name##tsd_wrapper_t)); \ + a_name##tsd_wrapper_set(wrapper); \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + if (a_name##tsd_boot0()) \ + return (true); \ + a_name##tsd_boot1(); \ + return (false); \ +} \ +/* Get/set. 
*/ \ a_attr a_type * \ -a_name##_tsd_get(void) \ +a_name##tsd_get(void) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - assert(a_name##_booted); \ - wrapper = a_name##_tsd_get_wrapper(); \ + assert(a_name##tsd_booted); \ + wrapper = a_name##tsd_wrapper_get(); \ return (&wrapper->val); \ } \ a_attr void \ -a_name##_tsd_set(a_type *val) \ +a_name##tsd_set(a_type *val) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - assert(a_name##_booted); \ - wrapper = a_name##_tsd_get_wrapper(); \ + assert(a_name##tsd_booted); \ + wrapper = a_name##tsd_wrapper_get(); \ wrapper->val = *(val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ wrapper->initialized = true; \ @@ -410,25 +530,136 @@ struct tsd_init_head_s { }; #endif +#define MALLOC_TSD \ +/* O(name, type) */ \ + O(tcache, tcache_t *) \ + O(thread_allocated, uint64_t) \ + O(thread_deallocated, uint64_t) \ + O(prof_tdata, prof_tdata_t *) \ + O(arena, arena_t *) \ + O(arenas_cache, arena_t **) \ + O(narenas_cache, unsigned) \ + O(arenas_cache_bypass, bool) \ + O(tcache_enabled, tcache_enabled_t) \ + O(quarantine, quarantine_t *) \ + +#define TSD_INITIALIZER { \ + tsd_state_uninitialized, \ + NULL, \ + 0, \ + 0, \ + NULL, \ + NULL, \ + NULL, \ + 0, \ + false, \ + tcache_enabled_default, \ + NULL \ +} + +struct tsd_s { + tsd_state_t state; +#define O(n, t) \ + t n; +MALLOC_TSD +#undef O +}; + +static const tsd_t tsd_initializer = TSD_INITIALIZER; + +malloc_tsd_types(, tsd_t) + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS void *malloc_tsd_malloc(size_t size); void malloc_tsd_dalloc(void *wrapper); -void malloc_tsd_no_cleanup(void *); +void malloc_tsd_no_cleanup(void *arg); void malloc_tsd_cleanup_register(bool (*f)(void)); -void malloc_tsd_boot(void); +bool malloc_tsd_boot0(void); +void malloc_tsd_boot1(void); #if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \ !defined(_WIN32)) void *tsd_init_check_recursion(tsd_init_head_t *head, tsd_init_block_t *block); void tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block); #endif +void tsd_cleanup(void *arg); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +malloc_tsd_protos(JEMALLOC_ATTR(unused), , tsd_t) + +tsd_t *tsd_fetch(void); +bool tsd_nominal(tsd_t *tsd); +#define O(n, t) \ +t *tsd_##n##p_get(tsd_t *tsd); \ +t tsd_##n##_get(tsd_t *tsd); \ +void tsd_##n##_set(tsd_t *tsd, t n); +MALLOC_TSD +#undef O +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TSD_C_)) +malloc_tsd_externs(, tsd_t) +malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, , tsd_t, tsd_initializer, tsd_cleanup) + +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch(void) +{ + tsd_t *tsd = tsd_get(); + + if (unlikely(tsd->state != tsd_state_nominal)) { + if (tsd->state == tsd_state_uninitialized) { + tsd->state = tsd_state_nominal; + /* Trigger cleanup handler registration. 
*/ + tsd_set(tsd); + } else if (tsd->state == tsd_state_purgatory) { + tsd->state = tsd_state_reincarnated; + tsd_set(tsd); + } else + assert(tsd->state == tsd_state_reincarnated); + } + + return (tsd); +} + +JEMALLOC_INLINE bool +tsd_nominal(tsd_t *tsd) +{ + + return (tsd->state == tsd_state_nominal); +} + +#define O(n, t) \ +JEMALLOC_ALWAYS_INLINE t * \ +tsd_##n##p_get(tsd_t *tsd) \ +{ \ + \ + return (&tsd->n); \ +} \ + \ +JEMALLOC_ALWAYS_INLINE t \ +tsd_##n##_get(tsd_t *tsd) \ +{ \ + \ + return (*tsd_##n##p_get(tsd)); \ +} \ + \ +JEMALLOC_ALWAYS_INLINE void \ +tsd_##n##_set(tsd_t *tsd, t n) \ +{ \ + \ + assert(tsd->state == tsd_state_nominal); \ + tsd->n = n; \ +} +MALLOC_TSD +#undef O +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ diff --git a/deps/jemalloc/include/jemalloc/internal/util.h b/deps/jemalloc/include/jemalloc/internal/util.h index 6b938f746..b2ea740fd 100644 --- a/deps/jemalloc/include/jemalloc/internal/util.h +++ b/deps/jemalloc/include/jemalloc/internal/util.h @@ -1,6 +1,36 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +#ifdef _WIN32 +# ifdef _WIN64 +# define FMT64_PREFIX "ll" +# define FMTPTR_PREFIX "ll" +# else +# define FMT64_PREFIX "ll" +# define FMTPTR_PREFIX "" +# endif +# define FMTd32 "d" +# define FMTu32 "u" +# define FMTx32 "x" +# define FMTd64 FMT64_PREFIX "d" +# define FMTu64 FMT64_PREFIX "u" +# define FMTx64 FMT64_PREFIX "x" +# define FMTdPTR FMTPTR_PREFIX "d" +# define FMTuPTR FMTPTR_PREFIX "u" +# define FMTxPTR FMTPTR_PREFIX "x" +#else +# include <inttypes.h> +# define FMTd32 PRId32 +# define FMTu32 PRIu32 +# define FMTx32 PRIx32 +# define FMTd64 PRId64 +# define FMTu64 PRIu64 +# define FMTx64 PRIx64 +# define FMTdPTR PRIdPTR +# define FMTuPTR PRIuPTR +# define FMTxPTR PRIxPTR +#endif + /* Size of stack-allocated buffer passed to buferror(). */ #define BUFERROR_BUF 64 @@ -22,9 +52,33 @@ * uninitialized. */ #ifdef JEMALLOC_CC_SILENCE -# define JEMALLOC_CC_SILENCE_INIT(v) = v +# define JEMALLOC_CC_SILENCE_INIT(v) = v #else -# define JEMALLOC_CC_SILENCE_INIT(v) +# define JEMALLOC_CC_SILENCE_INIT(v) +#endif + +#define JEMALLOC_GNUC_PREREQ(major, minor) \ + (!defined(__clang__) && \ + (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) +#ifndef __has_builtin +# define __has_builtin(builtin) (0) +#endif +#define JEMALLOC_CLANG_HAS_BUILTIN(builtin) \ + (defined(__clang__) && __has_builtin(builtin)) + +#ifdef __GNUC__ +# define likely(x) __builtin_expect(!!(x), 1) +# define unlikely(x) __builtin_expect(!!(x), 0) +# if JEMALLOC_GNUC_PREREQ(4, 6) || \ + JEMALLOC_CLANG_HAS_BUILTIN(__builtin_unreachable) +# define unreachable() __builtin_unreachable() +# else +# define unreachable() +# endif +#else +# define likely(x) !!(x) +# define unlikely(x) !!(x) +# define unreachable() #endif /* @@ -33,7 +87,7 @@ */ #ifndef assert #define assert(e) do { \ - if (config_debug && !(e)) { \ + if (unlikely(config_debug && !(e))) { \ malloc_printf( \ "<jemalloc>: %s:%d: Failed assertion: \"%s\"\n", \ __FILE__, __LINE__, #e); \ @@ -50,6 +104,7 @@ __FILE__, __LINE__); \ abort(); \ } \ + unreachable(); \ } while (0) #endif @@ -65,14 +120,14 @@ #ifndef assert_not_implemented #define assert_not_implemented(e) do { \ - if (config_debug && !(e)) \ + if (unlikely(config_debug && !(e))) \ not_implemented(); \ } while (0) #endif /* Use to assert a particular configuration, e.g., cassert(config_debug). 
*/ #define cassert(c) do { \ - if ((c) == false) \ + if (unlikely(!(c))) \ not_reached(); \ } while (0) @@ -96,25 +151,47 @@ void malloc_write(const char *s); int malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap); int malloc_snprintf(char *str, size_t size, const char *format, ...) - JEMALLOC_ATTR(format(printf, 3, 4)); + JEMALLOC_FORMAT_PRINTF(3, 4); void malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque, const char *format, va_list ap); void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque, - const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4)); -void malloc_printf(const char *format, ...) - JEMALLOC_ATTR(format(printf, 1, 2)); + const char *format, ...) JEMALLOC_FORMAT_PRINTF(3, 4); +void malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +int jemalloc_ffsl(long bitmap); +int jemalloc_ffs(int bitmap); size_t pow2_ceil(size_t x); +size_t lg_floor(size_t x); void set_errno(int errnum); int get_errno(void); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_UTIL_C_)) + +/* Sanity check. */ +#if !defined(JEMALLOC_INTERNAL_FFSL) || !defined(JEMALLOC_INTERNAL_FFS) +# error Both JEMALLOC_INTERNAL_FFSL && JEMALLOC_INTERNAL_FFS should have been defined by configure +#endif + +JEMALLOC_ALWAYS_INLINE int +jemalloc_ffsl(long bitmap) +{ + + return (JEMALLOC_INTERNAL_FFSL(bitmap)); +} + +JEMALLOC_ALWAYS_INLINE int +jemalloc_ffs(int bitmap) +{ + + return (JEMALLOC_INTERNAL_FFS(bitmap)); +} + /* Compute the smallest power of 2 that is >= x. */ JEMALLOC_INLINE size_t pow2_ceil(size_t x) @@ -133,7 +210,82 @@ pow2_ceil(size_t x) return (x); } -/* Sets error code */ +#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__)) +JEMALLOC_INLINE size_t +lg_floor(size_t x) +{ + size_t ret; + + assert(x != 0); + + asm ("bsr %1, %0" + : "=r"(ret) // Outputs. + : "r"(x) // Inputs. + ); + return (ret); +} +#elif (defined(_MSC_VER)) +JEMALLOC_INLINE size_t +lg_floor(size_t x) +{ + unsigned long ret; + + assert(x != 0); + +#if (LG_SIZEOF_PTR == 3) + _BitScanReverse64(&ret, x); +#elif (LG_SIZEOF_PTR == 2) + _BitScanReverse(&ret, x); +#else +# error "Unsupported type sizes for lg_floor()" +#endif + return (ret); +} +#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ)) +JEMALLOC_INLINE size_t +lg_floor(size_t x) +{ + + assert(x != 0); + +#if (LG_SIZEOF_PTR == LG_SIZEOF_INT) + return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x)); +#elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG) + return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x)); +#else +# error "Unsupported type sizes for lg_floor()" +#endif +} +#else +JEMALLOC_INLINE size_t +lg_floor(size_t x) +{ + + assert(x != 0); + + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); +#if (LG_SIZEOF_PTR == 3 && LG_SIZEOF_PTR == LG_SIZEOF_LONG) + x |= (x >> 32); + if (x == KZU(0xffffffffffffffff)) + return (63); + x++; + return (jemalloc_ffsl(x) - 2); +#elif (LG_SIZEOF_PTR == 2) + if (x == KZU(0xffffffff)) + return (31); + x++; + return (jemalloc_ffs(x) - 2); +#else +# error "Unsupported type sizes for lg_floor()" +#endif +} +#endif + +/* Set error code. */ JEMALLOC_INLINE void set_errno(int errnum) { @@ -145,7 +297,7 @@ set_errno(int errnum) #endif } -/* Get last error code */ +/* Get last error code. 
*/ JEMALLOC_INLINE int get_errno(void) { diff --git a/deps/jemalloc/include/jemalloc/internal/valgrind.h b/deps/jemalloc/include/jemalloc/internal/valgrind.h new file mode 100644 index 000000000..a3380df92 --- /dev/null +++ b/deps/jemalloc/include/jemalloc/internal/valgrind.h @@ -0,0 +1,112 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#ifdef JEMALLOC_VALGRIND +#include <valgrind/valgrind.h> + +/* + * The size that is reported to Valgrind must be consistent through a chain of + * malloc..realloc..realloc calls. Request size isn't recorded anywhere in + * jemalloc, so it is critical that all callers of these macros provide usize + * rather than request size. As a result, buffer overflow detection is + * technically weakened for the standard API, though it is generally accepted + * practice to consider any extra bytes reported by malloc_usable_size() as + * usable space. + */ +#define JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do { \ + if (unlikely(in_valgrind)) \ + valgrind_make_mem_noaccess(ptr, usize); \ +} while (0) +#define JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do { \ + if (unlikely(in_valgrind)) \ + valgrind_make_mem_undefined(ptr, usize); \ +} while (0) +#define JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do { \ + if (unlikely(in_valgrind)) \ + valgrind_make_mem_defined(ptr, usize); \ +} while (0) +/* + * The VALGRIND_MALLOCLIKE_BLOCK() and VALGRIND_RESIZEINPLACE_BLOCK() macro + * calls must be embedded in macros rather than in functions so that when + * Valgrind reports errors, there are no extra stack frames in the backtraces. + */ +#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do { \ + if (unlikely(in_valgrind && cond)) \ + VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, p2rz(ptr), zero); \ +} while (0) +#define JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize, \ + ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null, \ + zero) do { \ + if (unlikely(in_valgrind)) { \ + size_t rzsize = p2rz(ptr); \ + \ + if (!maybe_moved || ptr == old_ptr) { \ + VALGRIND_RESIZEINPLACE_BLOCK(ptr, old_usize, \ + usize, rzsize); \ + if (zero && old_usize < usize) { \ + valgrind_make_mem_defined( \ + (void *)((uintptr_t)ptr + \ + old_usize), usize - old_usize); \ + } \ + } else { \ + if (!old_ptr_maybe_null || old_ptr != NULL) { \ + valgrind_freelike_block(old_ptr, \ + old_rzsize); \ + } \ + if (!ptr_maybe_null || ptr != NULL) { \ + size_t copy_size = (old_usize < usize) \ + ? 
old_usize : usize; \ + size_t tail_size = usize - copy_size; \ + VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, \ + rzsize, false); \ + if (copy_size > 0) { \ + valgrind_make_mem_defined(ptr, \ + copy_size); \ + } \ + if (zero && tail_size > 0) { \ + valgrind_make_mem_defined( \ + (void *)((uintptr_t)ptr + \ + copy_size), tail_size); \ + } \ + } \ + } \ + } \ +} while (0) +#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do { \ + if (unlikely(in_valgrind)) \ + valgrind_freelike_block(ptr, rzsize); \ +} while (0) +#else +#define RUNNING_ON_VALGRIND ((unsigned)0) +#define JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do {} while (0) +#define JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do {} while (0) +#define JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do {} while (0) +#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do {} while (0) +#define JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize, \ + ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null, \ + zero) do {} while (0) +#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do {} while (0) +#endif + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +#ifdef JEMALLOC_VALGRIND +void valgrind_make_mem_noaccess(void *ptr, size_t usize); +void valgrind_make_mem_undefined(void *ptr, size_t usize); +void valgrind_make_mem_defined(void *ptr, size_t usize); +void valgrind_freelike_block(void *ptr, size_t usize); +#endif + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ + |
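A note on the tsd.h rewrite above: the new MALLOC_TSD list is an X-macro. Each O(name, type) entry is expanded once into a field of struct tsd_s and again into the tsd_<name>p_get()/tsd_<name>_get()/tsd_<name>_set() inline accessors, so the set of thread-specific fields is declared in exactly one place. The stand-alone sketch below (plain C; the EXAMPLE_TSD list, its two fields, and the example_tsd_* names are invented for illustration and are not jemalloc code) shows the same expansion technique on a much smaller scale:

/*
 * Minimal sketch of the X-macro pattern used by MALLOC_TSD: one list,
 * expanded once into struct fields and once into typed accessors.
 */
#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_TSD \
/* O(name, type) */ \
	O(thread_allocated,	uint64_t) \
	O(thread_deallocated,	uint64_t)

typedef struct {
#define O(n, t)	t n;
EXAMPLE_TSD
#undef O
} example_tsd_t;

/* Generate example_tsd_<name>_get()/_set() for every listed field. */
#define O(n, t) \
static t \
example_tsd_##n##_get(example_tsd_t *tsd) \
{ \
	return (tsd->n); \
} \
static void \
example_tsd_##n##_set(example_tsd_t *tsd, t val) \
{ \
	tsd->n = val; \
}
EXAMPLE_TSD
#undef O

int main(void)
{
	example_tsd_t tsd = {0, 0};

	example_tsd_thread_allocated_set(&tsd, 4096);
	example_tsd_thread_deallocated_set(&tsd, 1024);
	printf("allocated=%llu deallocated=%llu\n",
	    (unsigned long long)example_tsd_thread_allocated_get(&tsd),
	    (unsigned long long)example_tsd_thread_deallocated_get(&tsd));
	return (0);
}

This single-list approach is also why TSD_INITIALIZER in the diff is a plain brace list: adding a thread-specific field means adding one O() entry plus one corresponding initializer value, with the struct definition and the accessors kept in sync automatically by the macro expansion.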