feat(Deps/Jemalloc): update Jemalloc to 5.2.1 (#2413)

author: Viste <viste02@gmail.com> 2019-11-14 23:17:38 +0300
committer: Kargatum <dowlandtop@yandex.com> 2019-11-15 03:17:38 +0700
commit: 685538b01b27ba38c605448e3a0de225bed4bb29 (patch)
tree: 36196f0965c5fc2fccdbc45a86a8155f2c986e4d /deps/jemalloc/src
parent: fae7ae95a373530e0b206814662df557882c8f1a (diff)
28 files changed, 6655 insertions, 2297 deletions
diff --git a/deps/jemalloc/src/arena.c b/deps/jemalloc/src/arena.c
index 632fce5233..ba50e41033 100644
--- a/deps/jemalloc/src/arena.c
+++ b/deps/jemalloc/src/arena.c
@@ -3,13 +3,16 @@
 #include "jemalloc/internal/jemalloc_internal_includes.h"
 
 #include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/div.h"
 #include "jemalloc/internal/extent_dss.h"
 #include "jemalloc/internal/extent_mmap.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/rtree.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/safety_check.h"
 #include "jemalloc/internal/util.h"
 
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
 /******************************************************************************/
 /* Data. */
 
@@ -32,21 +35,6 @@ ssize_t opt_muzzy_decay_ms = MUZZY_DECAY_MS_DEFAULT;
 static atomic_zd_t dirty_decay_ms_default;
 static atomic_zd_t muzzy_decay_ms_default;
 
-const arena_bin_info_t arena_bin_info[NBINS] = {
-#define BIN_INFO_bin_yes(reg_size, slab_size, nregs)			\
-	{reg_size, slab_size, nregs, BITMAP_INFO_INITIALIZER(nregs)},
-#define BIN_INFO_bin_no(reg_size, slab_size, nregs)
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs,		\
-    lg_delta_lookup)							\
-	BIN_INFO_bin_##bin((1U<<lg_grp) + (ndelta<<lg_delta),		\
-	    (pgs << LG_PAGE), (pgs << LG_PAGE) / ((1U<<lg_grp) +	\
-	    (ndelta<<lg_delta)))
-	SIZE_CLASSES
-#undef BIN_INFO_bin_yes
-#undef BIN_INFO_bin_no
-#undef SC
-};
-
 const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
 #define STEP(step, h, x, y)			\
 		h,
@@ -54,6 +42,12 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
 #undef STEP
 };
 
+static div_info_t arena_binind_div_info[SC_NBINS];
+
+size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
+size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
+static unsigned huge_arena_ind;
+
 /******************************************************************************/
 /*
  * Function prototypes for static functions that are referenced prior to
@@ -62,155 +56,16 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
 
 static void arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena,
     arena_decay_t *decay, extents_t *extents, bool all, size_t npages_limit,
-    bool is_background_thread);
+    size_t npages_decay_max, bool is_background_thread);
 static bool arena_decay_dirty(tsdn_t *tsdn, arena_t *arena,
     bool is_background_thread, bool all);
 static void arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
-    arena_bin_t *bin);
+    bin_t *bin);
 static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
-    arena_bin_t *bin);
+    bin_t *bin);
 
 /******************************************************************************/
 
-static bool
-arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) {
-	if (config_debug) {
-		for (size_t i = 0; i < sizeof(arena_stats_t); i++) {
-			assert(((char *)arena_stats)[i] == 0);
-		}
-	}
-#ifndef JEMALLOC_ATOMIC_U64
-	if (malloc_mutex_init(&arena_stats->mtx, "arena_stats",
-	    WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) {
-		return true;
-	}
-#endif
-	/* Memory is zeroed, so there is no need to clear stats. */
-	return false;
-}
-
-static void
-arena_stats_lock(tsdn_t *tsdn, arena_stats_t *arena_stats) {
-#ifndef JEMALLOC_ATOMIC_U64
-	malloc_mutex_lock(tsdn, &arena_stats->mtx);
-#endif
-}
-
-static void
-arena_stats_unlock(tsdn_t *tsdn, arena_stats_t *arena_stats) {
-#ifndef JEMALLOC_ATOMIC_U64
-	malloc_mutex_unlock(tsdn, &arena_stats->mtx);
-#endif
-}
-
-static uint64_t
-arena_stats_read_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
-    arena_stats_u64_t *p) {
-#ifdef JEMALLOC_ATOMIC_U64
-	return atomic_load_u64(p, ATOMIC_RELAXED);
-#else
-	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
-	return *p;
-#endif
-}
-
-static void
-arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
-    arena_stats_u64_t *p, uint64_t x) {
-#ifdef JEMALLOC_ATOMIC_U64
-	atomic_fetch_add_u64(p, x, ATOMIC_RELAXED);
-#else
-	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
-	*p += x;
-#endif
-}
-
-UNUSED static void
-arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
-    arena_stats_u64_t *p, uint64_t x) {
-#ifdef JEMALLOC_ATOMIC_U64
-	UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED);
-	assert(r - x <= r);
-#else
-	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
-	*p -= x;
-	assert(*p + x >= *p);
-#endif
-}
-
-/*
- * Non-atomically sets *dst += src.  *dst needs external synchronization.
- * This lets us avoid the cost of a fetch_add when its unnecessary (note that
- * the types here are atomic).
- */
-static void
-arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) {
-#ifdef JEMALLOC_ATOMIC_U64
-	uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED);
-	atomic_store_u64(dst, src + cur_dst, ATOMIC_RELAXED);
-#else
-	*dst += src;
-#endif
-}
-
-static size_t
-arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) {
-#ifdef JEMALLOC_ATOMIC_U64
-	return atomic_load_zu(p, ATOMIC_RELAXED);
-#else
-	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
-	return atomic_load_zu(p, ATOMIC_RELAXED);
-#endif
-}
-
-static void
-arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
-    size_t x) {
-#ifdef JEMALLOC_ATOMIC_U64
-	atomic_fetch_add_zu(p, x, ATOMIC_RELAXED);
-#else
-	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
-	size_t cur = atomic_load_zu(p, ATOMIC_RELAXED);
-	atomic_store_zu(p, cur + x, ATOMIC_RELAXED);
-#endif
-}
-
-static void
-arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
-    size_t x) {
-#ifdef JEMALLOC_ATOMIC_U64
-	UNUSED size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED);
-	assert(r - x <= r);
-#else
-	malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
-	size_t cur = atomic_load_zu(p, ATOMIC_RELAXED);
-	atomic_store_zu(p, cur - x, ATOMIC_RELAXED);
-#endif
-}
-
-/* Like the _u64 variant, needs an externally synchronized *dst. */
-static void
-arena_stats_accum_zu(atomic_zu_t *dst, size_t src) {
-	size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED);
-	atomic_store_zu(dst, src + cur_dst, ATOMIC_RELAXED);
-}
-
-void
-arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
-    szind_t szind, uint64_t nrequests) {
-	arena_stats_lock(tsdn, arena_stats);
-	arena_stats_add_u64(tsdn, arena_stats, &arena_stats->lstats[szind -
-	    NBINS].nrequests, nrequests);
-	arena_stats_unlock(tsdn, arena_stats);
-}
-
-void
-arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t size) {
-	arena_stats_lock(tsdn, arena_stats);
-	arena_stats_add_zu(tsdn, arena_stats, &arena_stats->mapped, size);
-	arena_stats_unlock(tsdn, arena_stats);
-}
-
 void
 arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
@@ -228,15 +83,16 @@ void
 arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
     const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
     size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
-    malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats) {
+    bin_stats_t *bstats, arena_stats_large_t *lstats,
+    arena_stats_extents_t *estats) {
 	cassert(config_stats);
 
 	arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms,
 	    muzzy_decay_ms, nactive, ndirty, nmuzzy);
 
-	size_t base_allocated, base_resident, base_mapped;
+	size_t base_allocated, base_resident, base_mapped, metadata_thp;
 	base_stats_get(tsdn, arena->base, &base_allocated, &base_resident,
-	    &base_mapped);
+	    &base_mapped, &metadata_thp);
 
 	arena_stats_lock(tsdn, &arena->stats);
 
@@ -245,6 +101,10 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	arena_stats_accum_zu(&astats->retained,
 	    extents_npages_get(&arena->extents_retained) << LG_PAGE);
 
+	atomic_store_zu(&astats->extent_avail,
+	    atomic_load_zu(&arena->extent_avail_cnt, ATOMIC_RELAXED),
+	    ATOMIC_RELAXED);
+
 	arena_stats_accum_u64(&astats->decay_dirty.npurge,
 	    arena_stats_read_u64(tsdn, &arena->stats,
 	    &arena->stats.decay_dirty.npurge));
@@ -267,12 +127,15 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 
 	arena_stats_accum_zu(&astats->base, base_allocated);
 	arena_stats_accum_zu(&astats->internal, arena_internal_get(arena));
+	arena_stats_accum_zu(&astats->metadata_thp, metadata_thp);
 	arena_stats_accum_zu(&astats->resident, base_resident +
 	    (((atomic_load_zu(&arena->nactive, ATOMIC_RELAXED) +
 	    extents_npages_get(&arena->extents_dirty) +
 	    extents_npages_get(&arena->extents_muzzy)) << LG_PAGE)));
+	arena_stats_accum_zu(&astats->abandoned_vm, atomic_load_zu(
+	    &arena->stats.abandoned_vm, ATOMIC_RELAXED));
 
-	for (szind_t i = 0; i < NSIZES - NBINS; i++) {
+	for (szind_t i = 0; i < SC_NSIZES - SC_NBINS; i++) {
 		uint64_t nmalloc = arena_stats_read_u64(tsdn, &arena->stats,
 		    &arena->stats.lstats[i].nmalloc);
 		arena_stats_accum_u64(&lstats[i].nmalloc, nmalloc);
@@ -290,12 +153,43 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 		arena_stats_accum_u64(&astats->nrequests_large,
 		    nmalloc + nrequests);
 
+		/* nfill == nmalloc for large currently. */
+		arena_stats_accum_u64(&lstats[i].nfills, nmalloc);
+		arena_stats_accum_u64(&astats->nfills_large, nmalloc);
+
+		uint64_t nflush = arena_stats_read_u64(tsdn, &arena->stats,
+		    &arena->stats.lstats[i].nflushes);
+		arena_stats_accum_u64(&lstats[i].nflushes, nflush);
+		arena_stats_accum_u64(&astats->nflushes_large, nflush);
+
 		assert(nmalloc >= ndalloc);
 		assert(nmalloc - ndalloc <= SIZE_T_MAX);
 		size_t curlextents = (size_t)(nmalloc - ndalloc);
 		lstats[i].curlextents += curlextents;
 		arena_stats_accum_zu(&astats->allocated_large,
-		    curlextents * sz_index2size(NBINS + i));
+		    curlextents * sz_index2size(SC_NBINS + i));
+	}
+
+	for (pszind_t i = 0; i < SC_NPSIZES; i++) {
+		size_t dirty, muzzy, retained, dirty_bytes, muzzy_bytes,
+		    retained_bytes;
+		dirty = extents_nextents_get(&arena->extents_dirty, i);
+		muzzy = extents_nextents_get(&arena->extents_muzzy, i);
+		retained = extents_nextents_get(&arena->extents_retained, i);
+		dirty_bytes = extents_nbytes_get(&arena->extents_dirty, i);
+		muzzy_bytes = extents_nbytes_get(&arena->extents_muzzy, i);
+		retained_bytes =
+		    extents_nbytes_get(&arena->extents_retained, i);
+
+		atomic_store_zu(&estats[i].ndirty, dirty, ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].nmuzzy, muzzy, ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].nretained, retained, ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].dirty_bytes, dirty_bytes,
+		    ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].muzzy_bytes, muzzy_bytes,
+		    ATOMIC_RELAXED);
+		atomic_store_zu(&estats[i].retained_bytes, retained_bytes,
+		    ATOMIC_RELAXED);
 	}
 
 	arena_stats_unlock(tsdn, &arena->stats);
@@ -303,16 +197,16 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	/* tcache_bytes counts currently cached bytes. */
 	atomic_store_zu(&astats->tcache_bytes, 0, ATOMIC_RELAXED);
 	malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
-	tcache_t *tcache;
-	ql_foreach(tcache, &arena->tcache_ql, link) {
+	cache_bin_array_descriptor_t *descriptor;
+	ql_foreach(descriptor, &arena->cache_bin_array_descriptor_ql, link) {
 		szind_t i = 0;
-		for (; i < NBINS; i++) {
-			tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
+		for (; i < SC_NBINS; i++) {
+			cache_bin_t *tbin = &descriptor->bins_small[i];
 			arena_stats_accum_zu(&astats->tcache_bytes,
 			    tbin->ncached * sz_index2size(i));
 		}
 		for (; i < nhbins; i++) {
-			tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
+			cache_bin_t *tbin = &descriptor->bins_large[i];
 			arena_stats_accum_zu(&astats->tcache_bytes,
 			    tbin->ncached * sz_index2size(i));
 		}
@@ -350,21 +244,11 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	nstime_update(&astats->uptime);
 	nstime_subtract(&astats->uptime, &arena->create_time);
 
-	for (szind_t i = 0; i < NBINS; i++) {
-		arena_bin_t *bin = &arena->bins[i];
-
-		malloc_mutex_lock(tsdn, &bin->lock);
-		malloc_mutex_prof_read(tsdn, &bstats[i].mutex_data, &bin->lock);
-		bstats[i].nmalloc += bin->stats.nmalloc;
-		bstats[i].ndalloc += bin->stats.ndalloc;
-		bstats[i].nrequests += bin->stats.nrequests;
-		bstats[i].curregs += bin->stats.curregs;
-		bstats[i].nfills += bin->stats.nfills;
-		bstats[i].nflushes += bin->stats.nflushes;
-		bstats[i].nslabs += bin->stats.nslabs;
-		bstats[i].reslabs += bin->stats.reslabs;
-		bstats[i].curslabs += bin->stats.curslabs;
-		malloc_mutex_unlock(tsdn, &bin->lock);
+	for (szind_t i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			bin_stats_merge(tsdn, &bstats[i],
+			    &arena->bins[i].bin_shards[j]);
+		}
 	}
 }
 
@@ -384,8 +268,7 @@ arena_extents_dirty_dalloc(tsdn_t *tsdn, arena_t *arena,
 }
 
 static void *
-arena_slab_reg_alloc(tsdn_t *tsdn, extent_t *slab,
-    const arena_bin_info_t *bin_info) {
+arena_slab_reg_alloc(extent_t *slab, const bin_info_t *bin_info) {
 	void *ret;
 	arena_slab_data_t *slab_data = extent_slab_data_get(slab);
 	size_t regind;
@@ -400,6 +283,66 @@ arena_slab_reg_alloc(tsdn_t *tsdn, extent_t *slab,
 	return ret;
 }
 
+/*
+ * Carve cnt regions out of slab in one pass and store their addresses in
+ * ptrs[0] .. ptrs[cnt - 1].  The slab must have at least cnt free regions
+ * (asserted).  Each address is the slab base plus reg_size times a region
+ * index obtained from the slab bitmap; extent_nfree_sub() is then called once
+ * for the whole batch.
+ */
+static void
+arena_slab_reg_alloc_batch(extent_t *slab, const bin_info_t *bin_info,
+			   unsigned cnt, void** ptrs) {
+	arena_slab_data_t *slab_data = extent_slab_data_get(slab);
+
+	assert(extent_nfree_get(slab) >= cnt);
+	assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info));
+
+#if (! defined JEMALLOC_INTERNAL_POPCOUNTL) || (defined BITMAP_USE_TREE)
+	/* Generic path: claim regions one at a time through bitmap_sfu(). */
+	for (unsigned n = 0; n < cnt; n++) {
+		size_t regind = bitmap_sfu(slab_data->bitmap,
+		    &bin_info->bitmap_info);
+		uintptr_t offset = (uintptr_t)(bin_info->reg_size * regind);
+		ptrs[n] = (void *)((uintptr_t)extent_addr_get(slab) + offset);
+	}
+#else
+	/*
+	 * Popcount path: handle one bitmap group at a time, extracting bits
+	 * from a local copy of the group and storing it back once per group.
+	 */
+	unsigned grp = 0;
+	unsigned filled = 0;
+	bitmap_t word = slab_data->bitmap[grp];
+	while (filled < cnt) {
+		/* Advance to the next group that still has a set bit. */
+		while (word == 0) {
+			grp++;
+			word = slab_data->bitmap[grp];
+		}
+		size_t first_regind = grp << LG_BITMAP_GROUP_NBITS;
+		/* Take every set bit in this group, capped at what is left. */
+		size_t take = popcount_lu(word);
+		size_t remaining = cnt - filled;
+		if (take > remaining) {
+			take = remaining;
+		}
+
+		/* Read these once per group rather than once per region. */
+		uintptr_t base = (uintptr_t)extent_addr_get(slab);
+		uintptr_t regsize = (uintptr_t)bin_info->reg_size;
+		for (size_t k = 0; k < take; k++) {
+			size_t bit = cfs_lu(&word);
+			size_t regind = first_regind + bit;
+			ptrs[filled] = (void *)(base + regsize * regind);
+			filled++;
+		}
+		slab_data->bitmap[grp] = word;
+	}
+#endif
+	extent_nfree_sub(slab, cnt);
+}
+
 #ifndef JEMALLOC_JET
 static
 #endif
@@ -412,37 +343,22 @@ arena_slab_regind(extent_t *slab, szind_t binind, const void *ptr) {
 	assert((uintptr_t)ptr < (uintptr_t)extent_past_get(slab));
 	/* Freeing an interior pointer can cause assertion failure. */
 	assert(((uintptr_t)ptr - (uintptr_t)extent_addr_get(slab)) %
-	    (uintptr_t)arena_bin_info[binind].reg_size == 0);
+	    (uintptr_t)bin_infos[binind].reg_size == 0);
 
-	/* Avoid doing division with a variable divisor. */
 	diff = (size_t)((uintptr_t)ptr - (uintptr_t)extent_addr_get(slab));
-	switch (binind) {
-#define REGIND_bin_yes(index, reg_size)					\
-	case index:							\
-		regind = diff / (reg_size);				\
-		assert(diff == regind * (reg_size));			\
-		break;
-#define REGIND_bin_no(index, reg_size)
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs,		\
-    lg_delta_lookup)							\
-	REGIND_bin_##bin(index, (1U<<lg_grp) + (ndelta<<lg_delta))
-	SIZE_CLASSES
-#undef REGIND_bin_yes
-#undef REGIND_bin_no
-#undef SC
-	default: not_reached();
-	}
-
-	assert(regind < arena_bin_info[binind].nregs);
+
+	/* Avoid doing division with a variable divisor. */
+	regind = div_compute(&arena_binind_div_info[binind], diff);
+
+	assert(regind < bin_infos[binind].nregs);
 
 	return regind;
 }
 
 static void
-arena_slab_reg_dalloc(tsdn_t *tsdn, extent_t *slab,
-    arena_slab_data_t *slab_data, void *ptr) {
+arena_slab_reg_dalloc(extent_t *slab, arena_slab_data_t *slab_data, void *ptr) {
 	szind_t binind = extent_szind_get(slab);
-	const arena_bin_info_t *bin_info = &arena_bin_info[binind];
+	const bin_info_t *bin_info = &bin_infos[binind];
 	size_t regind = arena_slab_regind(slab, binind, ptr);
 
 	assert(extent_nfree_get(slab) < bin_info->nregs);
@@ -470,11 +386,11 @@ arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) {
 
 	cassert(config_stats);
 
-	if (usize < LARGE_MINCLASS) {
-		usize = LARGE_MINCLASS;
+	if (usize < SC_LARGE_MINCLASS) {
+		usize = SC_LARGE_MINCLASS;
 	}
 	index = sz_size2index(usize);
-	hindex = (index >= NBINS) ? index - NBINS : 0;
+	hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0;
 
 	arena_stats_add_u64(tsdn, &arena->stats,
 	    &arena->stats.lstats[hindex].nmalloc, 1);
@@ -486,11 +402,11 @@ arena_large_dalloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) {
 
 	cassert(config_stats);
 
-	if (usize < LARGE_MINCLASS) {
-		usize = LARGE_MINCLASS;
+	if (usize < SC_LARGE_MINCLASS) {
+		usize = SC_LARGE_MINCLASS;
 	}
 	index = sz_size2index(usize);
-	hindex = (index >= NBINS) ? index - NBINS : 0;
+	hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0;
 
 	arena_stats_add_u64(tsdn, &arena->stats,
 	    &arena->stats.lstats[hindex].ndalloc, 1);
@@ -503,6 +419,19 @@ arena_large_ralloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t oldusize,
 	arena_large_malloc_stats_update(tsdn, arena, usize);
 }
 
+/*
+ * Return true only if pages_can_purge_lazy is set and
+ * arena_muzzy_decay_ms_get(arena) is nonzero.  The extent allocation paths in
+ * this file test it before falling back to arena->extents_muzzy.
+ */
+static bool
+arena_may_have_muzzy(arena_t *arena) {
+	if (!pages_can_purge_lazy) {
+		return false;
+	}
+	return arena_muzzy_decay_ms_get(arena) != 0;
+}
+
 extent_t *
 arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, size_t usize,
     size_t alignment, bool *zero) {
@@ -517,7 +438,7 @@ arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, size_t usize,
 	extent_t *extent = extents_alloc(tsdn, arena, &extent_hooks,
 	    &arena->extents_dirty, NULL, usize, sz_large_pad, alignment, false,
 	    szind, zero, &commit);
-	if (extent == NULL) {
+	if (extent == NULL && arena_may_have_muzzy(arena)) {
 		extent = extents_alloc(tsdn, arena, &extent_hooks,
 		    &arena->extents_muzzy, NULL, usize, sz_large_pad, alignment,
 		    false, szind, zero, &commit);
@@ -692,7 +613,8 @@ arena_decay_try_purge(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
     bool is_background_thread) {
 	if (current_npages > npages_limit) {
 		arena_decay_to_limit(tsdn, arena, decay, extents, false,
-		    npages_limit, is_background_thread);
+		    npages_limit, current_npages - npages_limit,
+		    is_background_thread);
 	}
 }
 
@@ -738,7 +660,7 @@ arena_decay_epoch_advance(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
 }
 
 static void
-arena_decay_reinit(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms) {
+arena_decay_reinit(arena_decay_t *decay, ssize_t decay_ms) {
 	arena_decay_ms_write(decay, decay_ms);
 	if (decay_ms > 0) {
 		nstime_init(&decay->interval, (uint64_t)decay_ms *
@@ -755,8 +677,8 @@ arena_decay_reinit(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms) {
 }
 
 static bool
-arena_decay_init(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms,
-    decay_stats_t *stats) {
+arena_decay_init(arena_decay_t *decay, ssize_t decay_ms,
+    arena_stats_decay_t *stats) {
 	if (config_debug) {
 		for (size_t i = 0; i < sizeof(arena_decay_t); i++) {
 			assert(((char *)decay)[i] == 0);
@@ -768,7 +690,7 @@ arena_decay_init(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms,
 		return true;
 	}
 	decay->purging = false;
-	arena_decay_reinit(decay, extents, decay_ms);
+	arena_decay_reinit(decay, decay_ms);
 	/* Memory is zeroed, so there is no need to clear stats. */
 	if (config_stats) {
 		decay->stats = stats;
@@ -798,7 +720,8 @@ arena_maybe_decay(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
 	if (decay_ms <= 0) {
 		if (decay_ms == 0) {
 			arena_decay_to_limit(tsdn, arena, decay, extents, false,
-			    0, is_background_thread);
+			    0, extents_npages_get(extents),
+			    is_background_thread);
 		}
 		return false;
 	}
@@ -876,7 +799,7 @@ arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
 	 * infrequent, either between the {-1, 0, >0} states, or a one-time
 	 * arbitrary change during initial arena configuration.
 	 */
-	arena_decay_reinit(decay, extents, decay_ms);
+	arena_decay_reinit(decay, decay_ms);
 	arena_maybe_decay(tsdn, arena, decay, extents, false);
 	malloc_mutex_unlock(tsdn, &decay->mtx);
 
@@ -900,14 +823,15 @@ arena_muzzy_decay_ms_set(tsdn_t *tsdn, arena_t *arena,
 static size_t
 arena_stash_decayed(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extents_t *extents, size_t npages_limit,
-    extent_list_t *decay_extents) {
+	size_t npages_decay_max, extent_list_t *decay_extents) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
 
 	/* Stash extents according to npages_limit. */
 	size_t nstashed = 0;
 	extent_t *extent;
-	while ((extent = extents_evict(tsdn, arena, r_extent_hooks, extents,
+	while (nstashed < npages_decay_max &&
+	    (extent = extents_evict(tsdn, arena, r_extent_hooks, extents,
 	    npages_limit)) != NULL) {
 		extent_list_append(decay_extents, extent);
 		nstashed += extent_size_get(extent) >> LG_PAGE;
@@ -919,7 +843,7 @@ static size_t
 arena_decay_stashed(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, arena_decay_t *decay, extents_t *extents,
     bool all, extent_list_t *decay_extents, bool is_background_thread) {
-	UNUSED size_t nmadvise, nunmapped;
+	size_t nmadvise, nunmapped;
 	size_t npurged;
 
 	if (config_stats) {
@@ -982,12 +906,15 @@ arena_decay_stashed(tsdn_t *tsdn, arena_t *arena,
 }
 
 /*
- * npages_limit: Decay as many dirty extents as possible without violating the
- * invariant: (extents_npages_get(extents) >= npages_limit)
+ * Decay at most npages_decay_max pages, without violating the invariant
+ * (extents_npages_get(extents) >= npages_limit).  The upper bound on the
+ * number of pages is needed to prevent unbounded growth (namely of the stashed
+ * extents); otherwise new pages could keep being added to extents during the
+ * current decay run, so that the purging thread never finishes.
  */
 static void
 arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
-    extents_t *extents, bool all, size_t npages_limit,
+    extents_t *extents, bool all, size_t npages_limit, size_t npages_decay_max,
     bool is_background_thread) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 1);
@@ -1005,9 +932,9 @@ arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
 	extent_list_init(&decay_extents);
 
 	size_t npurge = arena_stash_decayed(tsdn, arena, &extent_hooks, extents,
-	    npages_limit, &decay_extents);
+	    npages_limit, npages_decay_max, &decay_extents);
 	if (npurge != 0) {
-		UNUSED size_t npurged = arena_decay_stashed(tsdn, arena,
+		size_t npurged = arena_decay_stashed(tsdn, arena,
 		    &extent_hooks, decay, extents, all, &decay_extents,
 		    is_background_thread);
 		assert(npurged == npurge);
@@ -1023,7 +950,7 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
 	if (all) {
 		malloc_mutex_lock(tsdn, &decay->mtx);
 		arena_decay_to_limit(tsdn, arena, decay, extents, all, 0,
-		    is_background_thread);
+		    extents_npages_get(extents), is_background_thread);
 		malloc_mutex_unlock(tsdn, &decay->mtx);
 
 		return false;
@@ -1045,7 +972,8 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
 
 	if (have_background_thread && background_thread_enabled() &&
 	    epoch_advanced && !is_background_thread) {
-		background_thread_interval_check(tsdn, arena, decay, npages_new);
+		background_thread_interval_check(tsdn, arena, decay,
+		    npages_new);
 	}
 
 	return false;
@@ -1082,30 +1010,37 @@ arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, extent_t *slab) {
 }
 
 static void
-arena_bin_slabs_nonfull_insert(arena_bin_t *bin, extent_t *slab) {
+arena_bin_slabs_nonfull_insert(bin_t *bin, extent_t *slab) {
 	assert(extent_nfree_get(slab) > 0);
 	extent_heap_insert(&bin->slabs_nonfull, slab);
+	if (config_stats) {
+		bin->stats.nonfull_slabs++;
+	}
 }
 
 static void
-arena_bin_slabs_nonfull_remove(arena_bin_t *bin, extent_t *slab) {
+arena_bin_slabs_nonfull_remove(bin_t *bin, extent_t *slab) {
 	extent_heap_remove(&bin->slabs_nonfull, slab);
+	if (config_stats) {
+		bin->stats.nonfull_slabs--;
+	}
 }
 
 static extent_t *
-arena_bin_slabs_nonfull_tryget(arena_bin_t *bin) {
+arena_bin_slabs_nonfull_tryget(bin_t *bin) {
 	extent_t *slab = extent_heap_remove_first(&bin->slabs_nonfull);
 	if (slab == NULL) {
 		return NULL;
 	}
 	if (config_stats) {
 		bin->stats.reslabs++;
+		bin->stats.nonfull_slabs--;
 	}
 	return slab;
 }
 
 static void
-arena_bin_slabs_full_insert(arena_t *arena, arena_bin_t *bin, extent_t *slab) {
+arena_bin_slabs_full_insert(arena_t *arena, bin_t *bin, extent_t *slab) {
 	assert(extent_nfree_get(slab) == 0);
 	/*
 	 *  Tracking extents is required by arena_reset, which is not allowed
@@ -1119,13 +1054,60 @@ arena_bin_slabs_full_insert(arena_t *arena, arena_bin_t *bin, extent_t *slab) {
 }
 
 static void
-arena_bin_slabs_full_remove(arena_t *arena, arena_bin_t *bin, extent_t *slab) {
+arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, extent_t *slab) {
 	if (arena_is_auto(arena)) {
 		return;
 	}
 	extent_list_remove(&bin->slabs_full, slab);
 }
 
+/*
+ * Release every slab held by bin: slabcur, then everything in the
+ * slabs_nonfull heap, then everything on the slabs_full list.  bin->lock is
+ * dropped around each arena_slab_dalloc() call and retaken afterwards.  With
+ * config_stats, the bin's region and slab counters are zeroed at the end.
+ */
+static void
+arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) {
+	extent_t *slab;
+
+	malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	if (bin->slabcur != NULL) {
+		slab = bin->slabcur;
+		bin->slabcur = NULL;
+		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
+		arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
+		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	}
+	while ((slab = extent_heap_remove_first(&bin->slabs_nonfull)) != NULL) {
+		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
+		arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
+		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	}
+	/*
+	 * arena_bin_slabs_full_remove() is a no-op for auto arenas, so this
+	 * loop only terminates there if slabs_full is already empty.
+	 */
+	for (slab = extent_list_first(&bin->slabs_full); slab != NULL;
+	     slab = extent_list_first(&bin->slabs_full)) {
+		arena_bin_slabs_full_remove(arena, bin, slab);
+		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
+		arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
+		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+	}
+	if (config_stats) {
+		bin->stats.curregs = 0;
+		bin->stats.curslabs = 0;
+		/*
+		 * The heap above was drained with extent_heap_remove_first()
+		 * directly, bypassing the nonfull_slabs bookkeeping done by
+		 * arena_bin_slabs_nonfull_{remove,tryget}(); it is empty now.
+		 */
+		bin->stats.nonfull_slabs = 0;
+	}
+	malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
+}
+
 void
 arena_reset(tsd_t *tsd, arena_t *arena) {
 	/*
@@ -1155,7 +1121,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
 		rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 		rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 		    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-		assert(alloc_ctx.szind != NSIZES);
+		assert(alloc_ctx.szind != SC_NSIZES);
 
 		if (config_stats || (config_prof && opt_prof)) {
 			usize = sz_index2size(alloc_ctx.szind);
@@ -1171,35 +1137,11 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
 	malloc_mutex_unlock(tsd_tsdn(tsd), &arena->large_mtx);
 
 	/* Bins. */
-	for (unsigned i = 0; i < NBINS; i++) {
-		extent_t *slab;
-		arena_bin_t *bin = &arena->bins[i];
-		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		if (bin->slabcur != NULL) {
-			slab = bin->slabcur;
-			bin->slabcur = NULL;
-			malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
-			arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
-			malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		}
-		while ((slab = extent_heap_remove_first(&bin->slabs_nonfull)) !=
-		    NULL) {
-			malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
-			arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
-			malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		}
-		for (slab = extent_list_first(&bin->slabs_full); slab != NULL;
-		    slab = extent_list_first(&bin->slabs_full)) {
-			arena_bin_slabs_full_remove(arena, bin, slab);
-			malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
-			arena_slab_dalloc(tsd_tsdn(tsd), arena, slab);
-			malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		}
-		if (config_stats) {
-			bin->stats.curregs = 0;
-			bin->stats.curslabs = 0;
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			arena_bin_reset(tsd, arena,
+			    &arena->bins[i].bin_shards[j]);
 		}
-		malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock);
 	}
 
 	atomic_store_zu(&arena->nactive, 0, ATOMIC_RELAXED);
@@ -1262,7 +1204,7 @@ arena_destroy(tsd_t *tsd, arena_t *arena) {
 
 static extent_t *
 arena_slab_alloc_hard(tsdn_t *tsdn, arena_t *arena,
-    extent_hooks_t **r_extent_hooks, const arena_bin_info_t *bin_info,
+    extent_hooks_t **r_extent_hooks, const bin_info_t *bin_info,
     szind_t szind) {
 	extent_t *slab;
 	bool zero, commit;
@@ -1284,8 +1226,8 @@ arena_slab_alloc_hard(tsdn_t *tsdn, arena_t *arena,
 }
 
 static extent_t *
-arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
-    const arena_bin_info_t *bin_info) {
+arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, unsigned binshard,
+    const bin_info_t *bin_info) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
 
@@ -1296,7 +1238,7 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
 	extent_t *slab = extents_alloc(tsdn, arena, &extent_hooks,
 	    &arena->extents_dirty, NULL, bin_info->slab_size, 0, PAGE, true,
 	    binind, &zero, &commit);
-	if (slab == NULL) {
+	if (slab == NULL && arena_may_have_muzzy(arena)) {
 		slab = extents_alloc(tsdn, arena, &extent_hooks,
 		    &arena->extents_muzzy, NULL, bin_info->slab_size, 0, PAGE,
 		    true, binind, &zero, &commit);
@@ -1312,7 +1254,7 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
 
 	/* Initialize slab internals. */
 	arena_slab_data_t *slab_data = extent_slab_data_get(slab);
-	extent_nfree_set(slab, bin_info->nregs);
+	extent_nfree_binshard_set(slab, bin_info->nregs, binshard);
 	bitmap_init(slab_data->bitmap, &bin_info->bitmap_info, false);
 
 	arena_nactive_add(arena, extent_size_get(slab) >> LG_PAGE);
@@ -1321,10 +1263,10 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
 }
 
 static extent_t *
-arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
-    szind_t binind) {
+arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    szind_t binind, unsigned binshard) {
 	extent_t *slab;
-	const arena_bin_info_t *bin_info;
+	const bin_info_t *bin_info;
 
 	/* Look for a usable slab. */
 	slab = arena_bin_slabs_nonfull_tryget(bin);
@@ -1333,12 +1275,12 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
 	}
 	/* No existing slabs have any space available. */
 
-	bin_info = &arena_bin_info[binind];
+	bin_info = &bin_infos[binind];
 
 	/* Allocate a new slab. */
 	malloc_mutex_unlock(tsdn, &bin->lock);
 	/******************************/
-	slab = arena_slab_alloc(tsdn, arena, binind, bin_info);
+	slab = arena_slab_alloc(tsdn, arena, binind, binshard, bin_info);
 	/********************************/
 	malloc_mutex_lock(tsdn, &bin->lock);
 	if (slab != NULL) {
@@ -1364,24 +1306,24 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
 
 /* Re-fill bin->slabcur, then call arena_slab_reg_alloc(). */
 static void *
-arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
-    szind_t binind) {
-	const arena_bin_info_t *bin_info;
+arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    szind_t binind, unsigned binshard) {
+	const bin_info_t *bin_info;
 	extent_t *slab;
 
-	bin_info = &arena_bin_info[binind];
+	bin_info = &bin_infos[binind];
 	if (!arena_is_auto(arena) && bin->slabcur != NULL) {
 		arena_bin_slabs_full_insert(arena, bin, bin->slabcur);
 		bin->slabcur = NULL;
 	}
-	slab = arena_bin_nonfull_slab_get(tsdn, arena, bin, binind);
+	slab = arena_bin_nonfull_slab_get(tsdn, arena, bin, binind, binshard);
 	if (bin->slabcur != NULL) {
 		/*
 		 * Another thread updated slabcur while this one ran without the
 		 * bin lock in arena_bin_nonfull_slab_get().
 		 */
 		if (extent_nfree_get(bin->slabcur) > 0) {
-			void *ret = arena_slab_reg_alloc(tsdn, bin->slabcur,
+			void *ret = arena_slab_reg_alloc(bin->slabcur,
 			    bin_info);
 			if (slab != NULL) {
 				/*
@@ -1415,51 +1357,78 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
 
 	assert(extent_nfree_get(bin->slabcur) > 0);
 
-	return arena_slab_reg_alloc(tsdn, slab, bin_info);
+	return arena_slab_reg_alloc(slab, bin_info);
+}
+
+/* Choose a bin shard and return the locked bin. */
+bin_t *
+arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    unsigned *binshard) {
+	bin_t *bin;
+	if (tsdn_null(tsdn) || tsd_arena_get(tsdn_tsd(tsdn)) == NULL) {
+		*binshard = 0;
+	} else {
+		*binshard = tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind];
+	}
+	assert(*binshard < bin_infos[binind].n_shards);
+	bin = &arena->bins[binind].bin_shards[*binshard];
+	malloc_mutex_lock(tsdn, &bin->lock);
+
+	return bin;
 }
 
 void
 arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
-    tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
-	unsigned i, nfill;
-	arena_bin_t *bin;
+    cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
+	unsigned i, nfill, cnt;
 
 	assert(tbin->ncached == 0);
 
 	if (config_prof && arena_prof_accum(tsdn, arena, prof_accumbytes)) {
 		prof_idump(tsdn);
 	}
-	bin = &arena->bins[binind];
-	malloc_mutex_lock(tsdn, &bin->lock);
+
+	unsigned binshard;
+	bin_t *bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard);
+
 	for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >>
-	    tcache->lg_fill_div[binind]); i < nfill; i++) {
+	    tcache->lg_fill_div[binind]); i < nfill; i += cnt) {
 		extent_t *slab;
-		void *ptr;
 		if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) >
 		    0) {
-			ptr = arena_slab_reg_alloc(tsdn, slab,
-			    &arena_bin_info[binind]);
+			unsigned tofill = nfill - i;
+			cnt = tofill < extent_nfree_get(slab) ?
+				tofill : extent_nfree_get(slab);
+			arena_slab_reg_alloc_batch(
+			   slab, &bin_infos[binind], cnt,
+			   tbin->avail - nfill + i);
 		} else {
-			ptr = arena_bin_malloc_hard(tsdn, arena, bin, binind);
-		}
-		if (ptr == NULL) {
+			cnt = 1;
+			void *ptr = arena_bin_malloc_hard(tsdn, arena, bin,
+			    binind, binshard);
 			/*
 			 * OOM.  tbin->avail isn't yet filled down to its first
 			 * element, so the successful allocations (if any) must
 			 * be moved just before tbin->avail before bailing out.
 			 */
-			if (i > 0) {
-				memmove(tbin->avail - i, tbin->avail - nfill,
-				    i * sizeof(void *));
+			if (ptr == NULL) {
+				if (i > 0) {
+					memmove(tbin->avail - i,
+						tbin->avail - nfill,
+						i * sizeof(void *));
+				}
+				break;
 			}
-			break;
+			/* Insert such that low regions get used first. */
+			*(tbin->avail - nfill + i) = ptr;
 		}
 		if (config_fill && unlikely(opt_junk_alloc)) {
-			arena_alloc_junk_small(ptr, &arena_bin_info[binind],
-			    true);
+			for (unsigned j = 0; j < cnt; j++) {
+				void* ptr = *(tbin->avail - nfill + i + j);
+				arena_alloc_junk_small(ptr, &bin_infos[binind],
+							true);
+			}
 		}
-		/* Insert such that low regions get used first. */
-		*(tbin->avail - nfill + i) = ptr;
 	}
 	if (config_stats) {
 		bin->stats.nmalloc += i;
@@ -1474,14 +1443,14 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 }
 
 void
-arena_alloc_junk_small(void *ptr, const arena_bin_info_t *bin_info, bool zero) {
+arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info, bool zero) {
 	if (!zero) {
 		memset(ptr, JEMALLOC_ALLOC_JUNK, bin_info->reg_size);
 	}
 }
 
 static void
-arena_dalloc_junk_small_impl(void *ptr, const arena_bin_info_t *bin_info) {
+arena_dalloc_junk_small_impl(void *ptr, const bin_info_t *bin_info) {
 	memset(ptr, JEMALLOC_FREE_JUNK, bin_info->reg_size);
 }
 arena_dalloc_junk_small_t *JET_MUTABLE arena_dalloc_junk_small =
@@ -1490,19 +1459,19 @@ arena_dalloc_junk_small_t *JET_MUTABLE arena_dalloc_junk_small =
 static void *
 arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) {
 	void *ret;
-	arena_bin_t *bin;
+	bin_t *bin;
 	size_t usize;
 	extent_t *slab;
 
-	assert(binind < NBINS);
-	bin = &arena->bins[binind];
+	assert(binind < SC_NBINS);
 	usize = sz_index2size(binind);
+	unsigned binshard;
+	bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard);
 
-	malloc_mutex_lock(tsdn, &bin->lock);
 	if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) > 0) {
-		ret = arena_slab_reg_alloc(tsdn, slab, &arena_bin_info[binind]);
+		ret = arena_slab_reg_alloc(slab, &bin_infos[binind]);
 	} else {
-		ret = arena_bin_malloc_hard(tsdn, arena, bin, binind);
+		ret = arena_bin_malloc_hard(tsdn, arena, bin, binind, binshard);
 	}
 
 	if (ret == NULL) {
@@ -1524,14 +1493,14 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) {
 		if (config_fill) {
 			if (unlikely(opt_junk_alloc)) {
 				arena_alloc_junk_small(ret,
-				    &arena_bin_info[binind], false);
+				    &bin_infos[binind], false);
 			} else if (unlikely(opt_zero)) {
 				memset(ret, 0, usize);
 			}
 		}
 	} else {
 		if (config_fill && unlikely(opt_junk_alloc)) {
-			arena_alloc_junk_small(ret, &arena_bin_info[binind],
+			arena_alloc_junk_small(ret, &bin_infos[binind],
 			    true);
 		}
 		memset(ret, 0, usize);
@@ -1547,13 +1516,13 @@ arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind,
 	assert(!tsdn_null(tsdn) || arena != NULL);
 
 	if (likely(!tsdn_null(tsdn))) {
-		arena = arena_choose(tsdn_tsd(tsdn), arena);
+		arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, size);
 	}
 	if (unlikely(arena == NULL)) {
 		return NULL;
 	}
 
-	if (likely(size <= SMALL_MAXCLASS)) {
+	if (likely(size <= SC_SMALL_MAXCLASS)) {
 		return arena_malloc_small(tsdn, arena, ind, zero);
 	}
 	return large_malloc(tsdn, arena, sz_index2size(ind), zero);
@@ -1564,8 +1533,9 @@ arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
     bool zero, tcache_t *tcache) {
 	void *ret;
 
-	if (usize <= SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE
-	    && (usize & PAGE_MASK) == 0))) {
+	if (usize <= SC_SMALL_MAXCLASS
+	    && (alignment < PAGE
+	    || (alignment == PAGE && (usize & PAGE_MASK) == 0))) {
 		/* Small; alignment doesn't require special slab placement. */
 		ret = arena_malloc(tsdn, arena, usize, sz_size2index(usize),
 		    zero, tcache, true);
@@ -1580,11 +1550,15 @@ arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
 }
 
 void
-arena_prof_promote(tsdn_t *tsdn, const void *ptr, size_t usize) {
+arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize) {
 	cassert(config_prof);
 	assert(ptr != NULL);
-	assert(isalloc(tsdn, ptr) == LARGE_MINCLASS);
-	assert(usize <= SMALL_MAXCLASS);
+	assert(isalloc(tsdn, ptr) == SC_LARGE_MINCLASS);
+	assert(usize <= SC_SMALL_MAXCLASS);
+
+	if (config_opt_safety_checks) {
+		safety_check_set_redzone(ptr, usize, SC_LARGE_MINCLASS);
+	}
 
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
@@ -1608,15 +1582,15 @@ arena_prof_demote(tsdn_t *tsdn, extent_t *extent, const void *ptr) {
 	cassert(config_prof);
 	assert(ptr != NULL);
 
-	extent_szind_set(extent, NBINS);
+	extent_szind_set(extent, SC_NBINS);
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 	rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr,
-	    NBINS, false);
+	    SC_NBINS, false);
 
-	assert(isalloc(tsdn, ptr) == LARGE_MINCLASS);
+	assert(isalloc(tsdn, ptr) == SC_LARGE_MINCLASS);
 
-	return LARGE_MINCLASS;
+	return SC_LARGE_MINCLASS;
 }
 
 void
@@ -1626,23 +1600,32 @@ arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
 	assert(opt_prof);
 
 	extent_t *extent = iealloc(tsdn, ptr);
-	size_t usize = arena_prof_demote(tsdn, extent, ptr);
-	if (usize <= tcache_maxclass) {
+	size_t usize = extent_usize_get(extent);
+	size_t bumped_usize = arena_prof_demote(tsdn, extent, ptr);
+	if (config_opt_safety_checks && usize < SC_LARGE_MINCLASS) {
+		/*
+		 * Currently, we only do redzoning for small sampled
+		 * allocations.
+		 */
+		assert(bumped_usize == SC_LARGE_MINCLASS);
+		safety_check_verify_redzone(ptr, usize, bumped_usize);
+	}
+	if (bumped_usize <= tcache_maxclass && tcache != NULL) {
 		tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr,
-		    sz_size2index(usize), slow_path);
+		    sz_size2index(bumped_usize), slow_path);
 	} else {
 		large_dalloc(tsdn, extent);
 	}
 }
 
 static void
-arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, arena_bin_t *bin) {
+arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, bin_t *bin) {
 	/* Dissociate slab from bin. */
 	if (slab == bin->slabcur) {
 		bin->slabcur = NULL;
 	} else {
 		szind_t binind = extent_szind_get(slab);
-		const arena_bin_info_t *bin_info = &arena_bin_info[binind];
+		const bin_info_t *bin_info = &bin_infos[binind];
 
 		/*
 		 * The following block's conditional is necessary because if the
@@ -1659,7 +1642,7 @@ arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, arena_bin_t *bin) {
 
 static void
 arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
-    arena_bin_t *bin) {
+    bin_t *bin) {
 	assert(slab != bin->slabcur);
 
 	malloc_mutex_unlock(tsdn, &bin->lock);
@@ -1674,7 +1657,7 @@ arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
 
 static void
 arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
-    arena_bin_t *bin) {
+    bin_t *bin) {
 	assert(extent_nfree_get(slab) > 0);
 
 	/*
@@ -1700,18 +1683,16 @@ arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
 }
 
 static void
-arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
-    void *ptr, bool junked) {
+arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    szind_t binind, extent_t *slab, void *ptr, bool junked) {
 	arena_slab_data_t *slab_data = extent_slab_data_get(slab);
-	szind_t binind = extent_szind_get(slab);
-	arena_bin_t *bin = &arena->bins[binind];
-	const arena_bin_info_t *bin_info = &arena_bin_info[binind];
+	const bin_info_t *bin_info = &bin_infos[binind];
 
 	if (!junked && config_fill && unlikely(opt_junk_free)) {
 		arena_dalloc_junk_small(ptr, bin_info);
 	}
 
-	arena_slab_reg_dalloc(tsdn, slab, slab_data, ptr);
+	arena_slab_reg_dalloc(slab, slab_data, ptr);
 	unsigned nfree = extent_nfree_get(slab);
 	if (nfree == bin_info->nregs) {
 		arena_dissociate_bin_slab(arena, slab, bin);
@@ -1728,18 +1709,21 @@ arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
 }
 
 void
-arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, extent_t *extent,
-    void *ptr) {
-	arena_dalloc_bin_locked_impl(tsdn, arena, extent, ptr, true);
+arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    szind_t binind, extent_t *extent, void *ptr) {
+	arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, extent, ptr,
+	    true);
 }
 
 static void
 arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, extent_t *extent, void *ptr) {
 	szind_t binind = extent_szind_get(extent);
-	arena_bin_t *bin = &arena->bins[binind];
+	unsigned binshard = extent_binshard_get(extent);
+	bin_t *bin = &arena->bins[binind].bin_shards[binshard];
 
 	malloc_mutex_lock(tsdn, &bin->lock);
-	arena_dalloc_bin_locked_impl(tsdn, arena, extent, ptr, false);
+	arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, extent, ptr,
+	    false);
 	malloc_mutex_unlock(tsdn, &bin->lock);
 }
 
@@ -1754,38 +1738,48 @@ arena_dalloc_small(tsdn_t *tsdn, void *ptr) {
 
 bool
 arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
-    size_t extra, bool zero) {
+    size_t extra, bool zero, size_t *newsize) {
+	bool ret;
 	/* Calls with non-zero extra had to clamp extra. */
-	assert(extra == 0 || size + extra <= LARGE_MAXCLASS);
+	assert(extra == 0 || size + extra <= SC_LARGE_MAXCLASS);
 
-	if (unlikely(size > LARGE_MAXCLASS)) {
-		return true;
+	extent_t *extent = iealloc(tsdn, ptr);
+	if (unlikely(size > SC_LARGE_MAXCLASS)) {
+		ret = true;
+		goto done;
 	}
 
-	extent_t *extent = iealloc(tsdn, ptr);
 	size_t usize_min = sz_s2u(size);
 	size_t usize_max = sz_s2u(size + extra);
-	if (likely(oldsize <= SMALL_MAXCLASS && usize_min <= SMALL_MAXCLASS)) {
+	if (likely(oldsize <= SC_SMALL_MAXCLASS && usize_min
+	    <= SC_SMALL_MAXCLASS)) {
 		/*
 		 * Avoid moving the allocation if the size class can be left the
 		 * same.
 		 */
-		assert(arena_bin_info[sz_size2index(oldsize)].reg_size ==
+		assert(bin_infos[sz_size2index(oldsize)].reg_size ==
 		    oldsize);
-		if ((usize_max > SMALL_MAXCLASS || sz_size2index(usize_max) !=
-		    sz_size2index(oldsize)) && (size > oldsize || usize_max <
-		    oldsize)) {
-			return true;
+		if ((usize_max > SC_SMALL_MAXCLASS
+		    || sz_size2index(usize_max) != sz_size2index(oldsize))
+		    && (size > oldsize || usize_max < oldsize)) {
+			ret = true;
+			goto done;
 		}
 
 		arena_decay_tick(tsdn, extent_arena_get(extent));
-		return false;
-	} else if (oldsize >= LARGE_MINCLASS && usize_max >= LARGE_MINCLASS) {
-		return large_ralloc_no_move(tsdn, extent, usize_min, usize_max,
+		ret = false;
+	} else if (oldsize >= SC_LARGE_MINCLASS
+	    && usize_max >= SC_LARGE_MINCLASS) {
+		ret = large_ralloc_no_move(tsdn, extent, usize_min, usize_max,
 		    zero);
+	} else {
+		ret = true;
 	}
+done:
+	assert(extent == iealloc(tsdn, ptr));
+	*newsize = extent_usize_get(extent);
 
-	return true;
+	return ret;
 }
 
 static void *
@@ -1796,7 +1790,7 @@ arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize,
 		    zero, tcache, true);
 	}
 	usize = sz_sa2u(usize, alignment);
-	if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
 		return NULL;
 	}
 	return ipalloct(tsdn, usize, alignment, zero, tcache, arena);
@@ -1804,22 +1798,30 @@ arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize,
 
 void *
 arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
-    size_t size, size_t alignment, bool zero, tcache_t *tcache) {
+    size_t size, size_t alignment, bool zero, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args) {
 	size_t usize = sz_s2u(size);
-	if (unlikely(usize == 0 || size > LARGE_MAXCLASS)) {
+	if (unlikely(usize == 0 || size > SC_LARGE_MAXCLASS)) {
 		return NULL;
 	}
 
-	if (likely(usize <= SMALL_MAXCLASS)) {
+	if (likely(usize <= SC_SMALL_MAXCLASS)) {
 		/* Try to avoid moving the allocation. */
-		if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero)) {
+		UNUSED size_t newsize;
+		if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero,
+		    &newsize)) {
+			hook_invoke_expand(hook_args->is_realloc
+			    ? hook_expand_realloc : hook_expand_rallocx,
+			    ptr, oldsize, usize, (uintptr_t)ptr,
+			    hook_args->args);
 			return ptr;
 		}
 	}
 
-	if (oldsize >= LARGE_MINCLASS && usize >= LARGE_MINCLASS) {
-		return large_ralloc(tsdn, arena, iealloc(tsdn, ptr), usize,
-		    alignment, zero, tcache);
+	if (oldsize >= SC_LARGE_MINCLASS
+	    && usize >= SC_LARGE_MINCLASS) {
+		return large_ralloc(tsdn, arena, ptr, usize,
+		    alignment, zero, tcache, hook_args);
 	}
 
 	/*
@@ -1832,11 +1834,16 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
 		return NULL;
 	}
 
+	hook_invoke_alloc(hook_args->is_realloc
+	    ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret,
+	    hook_args->args);
+	hook_invoke_dalloc(hook_args->is_realloc
+	    ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args);
+
 	/*
 	 * Junk/zero-filling were already done by
 	 * ipalloc()/arena_malloc().
 	 */
-
 	size_t copysize = (usize < oldsize) ? usize : oldsize;
 	memcpy(ret, ptr, copysize);
 	isdalloct(tsdn, ptr, oldsize, tcache, NULL, true);
@@ -1885,6 +1892,32 @@ arena_muzzy_decay_ms_default_set(ssize_t decay_ms) {
 	return false;
 }
 
+bool
+arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, size_t *old_limit,
+    size_t *new_limit) {
+	assert(opt_retain);
+
+	pszind_t new_ind JEMALLOC_CC_SILENCE_INIT(0);
+	if (new_limit != NULL) {
+		size_t limit = *new_limit;
+		/* Grow no more than the new limit. */
+		if ((new_ind = sz_psz2ind(limit + 1) - 1) >= SC_NPSIZES) {
+			return true;
+		}
+	}
+
+	malloc_mutex_lock(tsd_tsdn(tsd), &arena->extent_grow_mtx);
+	if (old_limit != NULL) {
+		*old_limit = sz_pind2sz(arena->retain_grow_limit);
+	}
+	if (new_limit != NULL) {
+		arena->retain_grow_limit = new_ind;
+	}
+	malloc_mutex_unlock(tsd_tsdn(tsd), &arena->extent_grow_mtx);
+
+	return false;
+}
+
 unsigned
 arena_nthreads_get(arena_t *arena, bool internal) {
 	return atomic_load_u(&arena->nthreads[internal], ATOMIC_RELAXED);
@@ -1920,7 +1953,12 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 		}
 	}
 
-	arena = (arena_t *)base_alloc(tsdn, base, sizeof(arena_t), CACHELINE);
+	unsigned nbins_total = 0;
+	for (i = 0; i < SC_NBINS; i++) {
+		nbins_total += bin_infos[i].n_shards;
+	}
+	size_t arena_size = sizeof(arena_t) + sizeof(bin_t) * nbins_total;
+	arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE);
 	if (arena == NULL) {
 		goto label_error;
 	}
@@ -1935,6 +1973,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 		}
 
 		ql_new(&arena->tcache_ql);
+		ql_new(&arena->cache_bin_array_descriptor_ql);
 		if (malloc_mutex_init(&arena->tcache_ql_mtx, "tcache_ql",
 		    WITNESS_RANK_TCACHE_QL, malloc_mutex_rank_exclusive)) {
 			goto label_error;
@@ -2001,16 +2040,17 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 		goto label_error;
 	}
 
-	if (arena_decay_init(&arena->decay_dirty, &arena->extents_dirty,
+	if (arena_decay_init(&arena->decay_dirty,
 	    arena_dirty_decay_ms_default_get(), &arena->stats.decay_dirty)) {
 		goto label_error;
 	}
-	if (arena_decay_init(&arena->decay_muzzy, &arena->extents_muzzy,
+	if (arena_decay_init(&arena->decay_muzzy,
 	    arena_muzzy_decay_ms_default_get(), &arena->stats.decay_muzzy)) {
 		goto label_error;
 	}
 
 	arena->extent_grow_next = sz_psz2ind(HUGEPAGE);
+	arena->retain_grow_limit = sz_psz2ind(SC_LARGE_MAXCLASS);
 	if (malloc_mutex_init(&arena->extent_grow_mtx, "extent_grow",
 	    WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) {
 		goto label_error;
@@ -2023,19 +2063,20 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	}
 
 	/* Initialize bins. */
-	for (i = 0; i < NBINS; i++) {
-		arena_bin_t *bin = &arena->bins[i];
-		if (malloc_mutex_init(&bin->lock, "arena_bin",
-		    WITNESS_RANK_ARENA_BIN, malloc_mutex_rank_exclusive)) {
-			goto label_error;
-		}
-		bin->slabcur = NULL;
-		extent_heap_new(&bin->slabs_nonfull);
-		extent_list_init(&bin->slabs_full);
-		if (config_stats) {
-			memset(&bin->stats, 0, sizeof(malloc_bin_stats_t));
+	uintptr_t bin_addr = (uintptr_t)arena + sizeof(arena_t);
+	atomic_store_u(&arena->binshard_next, 0, ATOMIC_RELEASE);
+	for (i = 0; i < SC_NBINS; i++) {
+		unsigned nshards = bin_infos[i].n_shards;
+		arena->bins[i].bin_shards = (bin_t *)bin_addr;
+		bin_addr += nshards * sizeof(bin_t);
+		for (unsigned j = 0; j < nshards; j++) {
+			bool err = bin_init(&arena->bins[i].bin_shards[j]);
+			if (err) {
+				goto label_error;
+			}
 		}
 	}
+	assert(bin_addr == (uintptr_t)arena + arena_size);
 
 	arena->base = base;
 	/* Set arena before creating background threads. */
@@ -2052,8 +2093,8 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 		 */
 		assert(!tsdn_null(tsdn));
 		pre_reentrancy(tsdn_tsd(tsdn), arena);
-		if (hooks_arena_new_hook) {
-			hooks_arena_new_hook();
+		if (test_hooks_arena_new_hook) {
+			test_hooks_arena_new_hook();
 		}
 		post_reentrancy(tsdn_tsd(tsdn));
 	}
@@ -2066,10 +2107,75 @@ label_error:
 	return NULL;
 }
 
+arena_t *
+arena_choose_huge(tsd_t *tsd) {
+	/* huge_arena_ind can be 0 during init (will use a0). */
+	if (huge_arena_ind == 0) {
+		assert(!malloc_initialized());
+	}
+
+	arena_t *huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, false);
+	if (huge_arena == NULL) {
+		/* Create the huge arena on demand. */
+		assert(huge_arena_ind != 0);
+		huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, true);
+		if (huge_arena == NULL) {
+			return NULL;
+		}
+		/*
+		 * Purge eagerly for huge allocations, because: 1) number of
+		 * huge allocations is usually small, which means ticker based
+		 * decay is not reliable; and 2) less immediate reuse is
+		 * expected for huge allocations.
+		 */
+		if (arena_dirty_decay_ms_default_get() > 0) {
+			arena_dirty_decay_ms_set(tsd_tsdn(tsd), huge_arena, 0);
+		}
+		if (arena_muzzy_decay_ms_default_get() > 0) {
+			arena_muzzy_decay_ms_set(tsd_tsdn(tsd), huge_arena, 0);
+		}
+	}
+
+	return huge_arena;
+}
+
+bool
+arena_init_huge(void) {
+	bool huge_enabled;
+
+	/* The threshold must lie within the large size class range. */
+	if (opt_oversize_threshold > SC_LARGE_MAXCLASS ||
+	    opt_oversize_threshold < SC_LARGE_MINCLASS) {
+		opt_oversize_threshold = 0;
+		oversize_threshold = SC_LARGE_MAXCLASS + PAGE;
+		huge_enabled = false;
+	} else {
+		/* Reserve the index for the huge arena. */
+		huge_arena_ind = narenas_total_get();
+		oversize_threshold = opt_oversize_threshold;
+		huge_enabled = true;
+	}
+
+	return huge_enabled;
+}
+
+bool
+arena_is_huge(unsigned arena_ind) {
+	if (huge_arena_ind == 0) {
+		return false;
+	}
+	return (arena_ind == huge_arena_ind);
+}
+
 void
-arena_boot(void) {
+arena_boot(sc_data_t *sc_data) {
 	arena_dirty_decay_ms_default_set(opt_dirty_decay_ms);
 	arena_muzzy_decay_ms_default_set(opt_muzzy_decay_ms);
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		sc_t *sc = &sc_data->sc[i];
+		div_init(&arena_binind_div_info[i],
+		    (1U << sc->lg_base) + (sc->ndelta << sc->lg_delta));
+	}
 }
 
 void
@@ -2114,8 +2220,10 @@ arena_prefork6(tsdn_t *tsdn, arena_t *arena) {
 
 void
 arena_prefork7(tsdn_t *tsdn, arena_t *arena) {
-	for (unsigned i = 0; i < NBINS; i++) {
-		malloc_mutex_prefork(tsdn, &arena->bins[i].lock);
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			bin_prefork(tsdn, &arena->bins[i].bin_shards[j]);
+		}
 	}
 }
 
@@ -2123,8 +2231,11 @@ void
 arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) {
 	unsigned i;
 
-	for (i = 0; i < NBINS; i++) {
-		malloc_mutex_postfork_parent(tsdn, &arena->bins[i].lock);
+	for (i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			bin_postfork_parent(tsdn,
+			    &arena->bins[i].bin_shards[j]);
+		}
 	}
 	malloc_mutex_postfork_parent(tsdn, &arena->large_mtx);
 	base_postfork_parent(tsdn, arena->base);
@@ -2154,15 +2265,23 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) {
 	}
 	if (config_stats) {
 		ql_new(&arena->tcache_ql);
+		ql_new(&arena->cache_bin_array_descriptor_ql);
 		tcache_t *tcache = tcache_get(tsdn_tsd(tsdn));
 		if (tcache != NULL && tcache->arena == arena) {
 			ql_elm_new(tcache, link);
 			ql_tail_insert(&arena->tcache_ql, tcache, link);
+			cache_bin_array_descriptor_init(
+			    &tcache->cache_bin_array_descriptor,
+			    tcache->bins_small, tcache->bins_large);
+			ql_tail_insert(&arena->cache_bin_array_descriptor_ql,
+			    &tcache->cache_bin_array_descriptor, link);
 		}
 	}
 
-	for (i = 0; i < NBINS; i++) {
-		malloc_mutex_postfork_child(tsdn, &arena->bins[i].lock);
+	for (i = 0; i < SC_NBINS; i++) {
+		for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+			bin_postfork_child(tsdn, &arena->bins[i].bin_shards[j]);
+		}
 	}
 	malloc_mutex_postfork_child(tsdn, &arena->large_mtx);
 	base_postfork_child(tsdn, arena->base);
diff --git a/deps/jemalloc/src/background_thread.c b/deps/jemalloc/src/background_thread.c
index eb30eb5b42..57b9b256bb 100644
--- a/deps/jemalloc/src/background_thread.c
+++ b/deps/jemalloc/src/background_thread.c
@@ -4,6 +4,8 @@
 
 #include "jemalloc/internal/assert.h"
 
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
 /******************************************************************************/
 /* Data. */
 
@@ -11,38 +13,37 @@
 #define BACKGROUND_THREAD_DEFAULT false
 /* Read-only after initialization. */
 bool opt_background_thread = BACKGROUND_THREAD_DEFAULT;
+size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT + 1;
 
 /* Used for thread creation, termination and stats. */
 malloc_mutex_t background_thread_lock;
 /* Indicates global state.  Atomic because decay reads this w/o locking. */
 atomic_b_t background_thread_enabled_state;
 size_t n_background_threads;
+size_t max_background_threads;
 /* Thread info per-index. */
 background_thread_info_t *background_thread_info;
 
-/* False if no necessary runtime support. */
-bool can_enable_background_thread;
-
 /******************************************************************************/
 
 #ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER
-#include <dlfcn.h>
 
 static int (*pthread_create_fptr)(pthread_t *__restrict, const pthread_attr_t *,
     void *(*)(void *), void *__restrict);
-static pthread_once_t once_control = PTHREAD_ONCE_INIT;
 
 static void
-pthread_create_wrapper_once(void) {
+pthread_create_wrapper_init(void) {
 #ifdef JEMALLOC_LAZY_LOCK
-	isthreaded = true;
+	if (!isthreaded) {
+		isthreaded = true;
+	}
 #endif
 }
 
 int
 pthread_create_wrapper(pthread_t *__restrict thread, const pthread_attr_t *attr,
     void *(*start_routine)(void *), void *__restrict arg) {
-	pthread_once(&once_control, pthread_create_wrapper_once);
+	pthread_create_wrapper_init();
 
 	return pthread_create_fptr(thread, attr, start_routine, arg);
 }
@@ -78,7 +79,7 @@ background_thread_info_init(tsdn_t *tsdn, background_thread_info_t *info) {
 }
 
 static inline bool
-set_current_thread_affinity(UNUSED int cpu) {
+set_current_thread_affinity(int cpu) {
 #if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY)
 	cpu_set_t cpuset;
 	CPU_ZERO(&cpuset);
@@ -286,7 +287,7 @@ background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info, unsigne
 	uint64_t min_interval = BACKGROUND_THREAD_INDEFINITE_SLEEP;
 	unsigned narenas = narenas_total_get();
 
-	for (unsigned i = ind; i < narenas; i += ncpus) {
+	for (unsigned i = ind; i < narenas; i += max_background_threads) {
 		arena_t *arena = arena_get(tsdn, i, false);
 		if (!arena) {
 			continue;
@@ -379,35 +380,32 @@ background_thread_create_signals_masked(pthread_t *thread,
 	return create_err;
 }
 
-static void
+static bool
 check_background_thread_creation(tsd_t *tsd, unsigned *n_created,
     bool *created_threads) {
+	bool ret = false;
 	if (likely(*n_created == n_background_threads)) {
-		return;
+		return ret;
 	}
 
-	malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_info[0].mtx);
-label_restart:
-	malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
-	for (unsigned i = 1; i < ncpus; i++) {
+	tsdn_t *tsdn = tsd_tsdn(tsd);
+	malloc_mutex_unlock(tsdn, &background_thread_info[0].mtx);
+	for (unsigned i = 1; i < max_background_threads; i++) {
 		if (created_threads[i]) {
 			continue;
 		}
 		background_thread_info_t *info = &background_thread_info[i];
-		malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
-		assert(info->state != background_thread_paused);
+		malloc_mutex_lock(tsdn, &info->mtx);
+		/*
+		 * In case of the background_thread_paused state because of
+		 * arena reset, delay the creation.
+		 */
 		bool create = (info->state == background_thread_started);
-		malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx);
+		malloc_mutex_unlock(tsdn, &info->mtx);
 		if (!create) {
 			continue;
 		}
 
-		/*
-		 * To avoid deadlock with prefork handlers (which waits for the
-		 * mutex held here), unlock before calling pthread_create().
-		 */
-		malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
-
 		pre_reentrancy(tsd, NULL);
 		int err = background_thread_create_signals_masked(&info->thread,
 		    NULL, background_thread_entry, (void *)(uintptr_t)i);
@@ -423,19 +421,21 @@ label_restart:
 				abort();
 			}
 		}
-		/* Restart since we unlocked. */
-		goto label_restart;
+		/* Return to restart the loop since we unlocked. */
+		ret = true;
+		break;
 	}
-	malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_info[0].mtx);
-	malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
+	malloc_mutex_lock(tsdn, &background_thread_info[0].mtx);
+
+	return ret;
 }
 
 static void
 background_thread0_work(tsd_t *tsd) {
 	/* Thread0 is also responsible for launching / terminating threads. */
-	VARIABLE_ARRAY(bool, created_threads, ncpus);
+	VARIABLE_ARRAY(bool, created_threads, max_background_threads);
 	unsigned i;
-	for (i = 1; i < ncpus; i++) {
+	for (i = 1; i < max_background_threads; i++) {
 		created_threads[i] = false;
 	}
 	/* Start working, and create more threads when asked. */
@@ -445,8 +445,10 @@ background_thread0_work(tsd_t *tsd) {
 		    &background_thread_info[0])) {
 			continue;
 		}
-		check_background_thread_creation(tsd, &n_created,
-		    (bool *)&created_threads);
+		if (check_background_thread_creation(tsd, &n_created,
+		    (bool *)&created_threads)) {
+			continue;
+		}
 		background_work_sleep_once(tsd_tsdn(tsd),
 		    &background_thread_info[0], 0);
 	}
@@ -456,15 +458,20 @@ background_thread0_work(tsd_t *tsd) {
 	 * the global background_thread mutex (and is waiting) for us.
 	 */
 	assert(!background_thread_enabled());
-	for (i = 1; i < ncpus; i++) {
+	for (i = 1; i < max_background_threads; i++) {
 		background_thread_info_t *info = &background_thread_info[i];
 		assert(info->state != background_thread_paused);
 		if (created_threads[i]) {
 			background_threads_disable_single(tsd, info);
 		} else {
 			malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
-			/* Clear in case the thread wasn't created. */
-			info->state = background_thread_stopped;
+			if (info->state != background_thread_stopped) {
+				/* The thread was not created. */
+				assert(info->state ==
+				    background_thread_started);
+				n_background_threads--;
+				info->state = background_thread_stopped;
+			}
 			malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx);
 		}
 	}
@@ -498,9 +505,11 @@ background_work(tsd_t *tsd, unsigned ind) {
 static void *
 background_thread_entry(void *ind_arg) {
 	unsigned thread_ind = (unsigned)(uintptr_t)ind_arg;
-	assert(thread_ind < ncpus);
+	assert(thread_ind < max_background_threads);
 #ifdef JEMALLOC_HAVE_PTHREAD_SETNAME_NP
 	pthread_setname_np(pthread_self(), "jemalloc_bg_thd");
+#elif defined(__FreeBSD__)
+	pthread_set_name_np(pthread_self(), "jemalloc_bg_thd");
 #endif
 	if (opt_percpu_arena != percpu_arena_disabled) {
 		set_current_thread_affinity((int)thread_ind);
@@ -525,14 +534,13 @@ background_thread_init(tsd_t *tsd, background_thread_info_t *info) {
 	n_background_threads++;
 }
 
-/* Create a new background thread if needed. */
-bool
-background_thread_create(tsd_t *tsd, unsigned arena_ind) {
+static bool
+background_thread_create_locked(tsd_t *tsd, unsigned arena_ind) {
 	assert(have_background_thread);
 	malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock);
 
 	/* We create at most NCPUs threads. */
-	size_t thread_ind = arena_ind % ncpus;
+	size_t thread_ind = arena_ind % max_background_threads;
 	background_thread_info_t *info = &background_thread_info[thread_ind];
 
 	bool need_new_thread;
@@ -580,37 +588,53 @@ background_thread_create(tsd_t *tsd, unsigned arena_ind) {
 	return false;
 }
 
+/* Create a new background thread if needed. */
+bool
+background_thread_create(tsd_t *tsd, unsigned arena_ind) {
+	assert(have_background_thread);
+
+	bool ret;
+	malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
+	ret = background_thread_create_locked(tsd, arena_ind);
+	malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
+
+	return ret;
+}
+
 bool
 background_threads_enable(tsd_t *tsd) {
 	assert(n_background_threads == 0);
 	assert(background_thread_enabled());
 	malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock);
 
-	VARIABLE_ARRAY(bool, marked, ncpus);
+	VARIABLE_ARRAY(bool, marked, max_background_threads);
 	unsigned i, nmarked;
-	for (i = 0; i < ncpus; i++) {
+	for (i = 0; i < max_background_threads; i++) {
 		marked[i] = false;
 	}
 	nmarked = 0;
+	/* Thread 0 is required and created at the end. */
+	marked[0] = true;
 	/* Mark the threads we need to create for thread 0. */
 	unsigned n = narenas_total_get();
 	for (i = 1; i < n; i++) {
-		if (marked[i % ncpus] ||
+		if (marked[i % max_background_threads] ||
 		    arena_get(tsd_tsdn(tsd), i, false) == NULL) {
 			continue;
 		}
-		background_thread_info_t *info = &background_thread_info[i];
+		background_thread_info_t *info = &background_thread_info[
+		    i % max_background_threads];
 		malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
 		assert(info->state == background_thread_stopped);
 		background_thread_init(tsd, info);
 		malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx);
-		marked[i % ncpus] = true;
-		if (++nmarked == ncpus) {
+		marked[i % max_background_threads] = true;
+		if (++nmarked == max_background_threads) {
 			break;
 		}
 	}
 
-	return background_thread_create(tsd, 0);
+	return background_thread_create_locked(tsd, 0);
 }
 
 bool
@@ -720,14 +744,14 @@ background_thread_prefork0(tsdn_t *tsdn) {
 
 void
 background_thread_prefork1(tsdn_t *tsdn) {
-	for (unsigned i = 0; i < ncpus; i++) {
+	for (unsigned i = 0; i < max_background_threads; i++) {
 		malloc_mutex_prefork(tsdn, &background_thread_info[i].mtx);
 	}
 }
 
 void
 background_thread_postfork_parent(tsdn_t *tsdn) {
-	for (unsigned i = 0; i < ncpus; i++) {
+	for (unsigned i = 0; i < max_background_threads; i++) {
 		malloc_mutex_postfork_parent(tsdn,
 		    &background_thread_info[i].mtx);
 	}
@@ -736,7 +760,7 @@ background_thread_postfork_parent(tsdn_t *tsdn) {
 
 void
 background_thread_postfork_child(tsdn_t *tsdn) {
-	for (unsigned i = 0; i < ncpus; i++) {
+	for (unsigned i = 0; i < max_background_threads; i++) {
 		malloc_mutex_postfork_child(tsdn,
 		    &background_thread_info[i].mtx);
 	}
@@ -749,7 +773,7 @@ background_thread_postfork_child(tsdn_t *tsdn) {
 	malloc_mutex_lock(tsdn, &background_thread_lock);
 	n_background_threads = 0;
 	background_thread_enabled_set(tsdn, false);
-	for (unsigned i = 0; i < ncpus; i++) {
+	for (unsigned i = 0; i < max_background_threads; i++) {
 		background_thread_info_t *info = &background_thread_info[i];
 		malloc_mutex_lock(tsdn, &info->mtx);
 		info->state = background_thread_stopped;
@@ -773,9 +797,15 @@ background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) {
 	stats->num_threads = n_background_threads;
 	uint64_t num_runs = 0;
 	nstime_init(&stats->run_interval, 0);
-	for (unsigned i = 0; i < ncpus; i++) {
+	for (unsigned i = 0; i < max_background_threads; i++) {
 		background_thread_info_t *info = &background_thread_info[i];
-		malloc_mutex_lock(tsdn, &info->mtx);
+		if (malloc_mutex_trylock(tsdn, &info->mtx)) {
+			/*
+			 * Each background thread run may take a long time;
+			 * avoid waiting on the stats if the thread is active.
+			 */
+			continue;
+		}
 		if (info->state != background_thread_stopped) {
 			num_runs += info->tot_n_runs;
 			nstime_add(&stats->run_interval, &info->tot_sleep_time);
@@ -795,6 +825,39 @@ background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) {
 #undef BILLION
 #undef BACKGROUND_THREAD_MIN_INTERVAL_NS
 
+#ifdef JEMALLOC_HAVE_DLSYM
+#include <dlfcn.h>
+#endif
+
+static bool
+pthread_create_fptr_init(void) {
+	if (pthread_create_fptr != NULL) {
+		return false;
+	}
+	/*
+	 * Try the next symbol first, because 1) when lazy_lock is in use we
+	 * have a wrapper for pthread_create; and 2) the application may define
+	 * its own wrapper as well (and can call malloc within the wrapper).
+	 */
+#ifdef JEMALLOC_HAVE_DLSYM
+	pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create");
+#else
+	pthread_create_fptr = NULL;
+#endif
+	if (pthread_create_fptr == NULL) {
+		if (config_lazy_lock) {
+			malloc_write("<jemalloc>: Error in dlsym(RTLD_NEXT, "
+			    "\"pthread_create\")\n");
+			abort();
+		} else {
+			/* Fall back to the default symbol. */
+			pthread_create_fptr = pthread_create;
+		}
+	}
+
+	return false;
+}
+
 /*
  * When lazy lock is enabled, we need to make sure setting isthreaded before
  * taking any background_thread locks.  This is called early in ctl (instead of
@@ -805,7 +868,8 @@ void
 background_thread_ctl_init(tsdn_t *tsdn) {
 	malloc_mutex_assert_not_owner(tsdn, &background_thread_lock);
 #ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER
-	pthread_once(&once_control, pthread_create_wrapper_once);
+	pthread_create_fptr_init();
+	pthread_create_wrapper_init();
 #endif
 }
 
@@ -818,18 +882,10 @@ background_thread_boot0(void) {
 		    "supports pthread only\n");
 		return true;
 	}
-
 #ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER
-	pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create");
-	if (pthread_create_fptr == NULL) {
-		can_enable_background_thread = false;
-		if (config_lazy_lock || opt_background_thread) {
-			malloc_write("<jemalloc>: Error in dlsym(RTLD_NEXT, "
-			    "\"pthread_create\")\n");
-			abort();
-		}
-	} else {
-		can_enable_background_thread = true;
+	if ((config_lazy_lock || opt_background_thread) &&
+	    pthread_create_fptr_init()) {
+		return true;
 	}
 #endif
 	return false;
@@ -841,6 +897,11 @@ background_thread_boot1(tsdn_t *tsdn) {
 	assert(have_background_thread);
 	assert(narenas_total_get() > 0);
 
+	if (opt_max_background_threads > MAX_BACKGROUND_THREAD_LIMIT) {
+		opt_max_background_threads = DEFAULT_NUM_BACKGROUND_THREAD;
+	}
+	max_background_threads = opt_max_background_threads;
+
 	background_thread_enabled_set(tsdn, opt_background_thread);
 	if (malloc_mutex_init(&background_thread_lock,
 	    "background_thread_global",
@@ -848,17 +909,15 @@ background_thread_boot1(tsdn_t *tsdn) {
 	    malloc_mutex_rank_exclusive)) {
 		return true;
 	}
-	if (opt_background_thread) {
-		background_thread_ctl_init(tsdn);
-	}
 
 	background_thread_info = (background_thread_info_t *)base_alloc(tsdn,
-	    b0get(), ncpus * sizeof(background_thread_info_t), CACHELINE);
+	    b0get(), opt_max_background_threads *
+	    sizeof(background_thread_info_t), CACHELINE);
 	if (background_thread_info == NULL) {
 		return true;
 	}
 
-	for (unsigned i = 0; i < ncpus; i++) {
+	for (unsigned i = 0; i < max_background_threads; i++) {
 		background_thread_info_t *info = &background_thread_info[i];
 		/* Thread mutex is rank_inclusive because of thread0. */
 		if (malloc_mutex_init(&info->mtx, "background_thread",
diff --git a/deps/jemalloc/src/base.c b/deps/jemalloc/src/base.c
index 97078b134d..f3c61661a2 100644
--- a/deps/jemalloc/src/base.c
+++ b/deps/jemalloc/src/base.c
@@ -10,25 +10,40 @@
 /******************************************************************************/
 /* Data. */
 
-static base_t	*b0;
+static base_t *b0;
+
+metadata_thp_mode_t opt_metadata_thp = METADATA_THP_DEFAULT;
+
+const char *metadata_thp_mode_names[] = {
+	"disabled",
+	"auto",
+	"always"
+};
 
 /******************************************************************************/
 
+static inline bool
+metadata_thp_madvise(void) {
+	return (metadata_thp_enabled() &&
+	    (init_system_thp_mode == thp_mode_default));
+}
+
 static void *
 base_map(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, size_t size) {
 	void *addr;
 	bool zero = true;
 	bool commit = true;
 
+	/* Use huge page sizes and alignment regardless of opt_metadata_thp. */
 	assert(size == HUGEPAGE_CEILING(size));
-
+	size_t alignment = HUGEPAGE;
 	if (extent_hooks == &extent_hooks_default) {
-		addr = extent_alloc_mmap(NULL, size, PAGE, &zero, &commit);
+		addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit);
 	} else {
 		/* No arena context as we are creating new arenas. */
 		tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn);
 		pre_reentrancy(tsd, NULL);
-		addr = extent_hooks->alloc(extent_hooks, NULL, size, PAGE,
+		addr = extent_hooks->alloc(extent_hooks, NULL, size, alignment,
 		    &zero, &commit, ind);
 		post_reentrancy(tsd);
 	}
@@ -51,16 +66,16 @@ base_unmap(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, void *addr,
 	 */
 	if (extent_hooks == &extent_hooks_default) {
 		if (!extent_dalloc_mmap(addr, size)) {
-			return;
+			goto label_done;
 		}
 		if (!pages_decommit(addr, size)) {
-			return;
+			goto label_done;
 		}
 		if (!pages_purge_forced(addr, size)) {
-			return;
+			goto label_done;
 		}
 		if (!pages_purge_lazy(addr, size)) {
-			return;
+			goto label_done;
 		}
 		/* Nothing worked.  This should never happen. */
 		not_reached();
@@ -70,27 +85,33 @@ base_unmap(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, void *addr,
 		if (extent_hooks->dalloc != NULL &&
 		    !extent_hooks->dalloc(extent_hooks, addr, size, true,
 		    ind)) {
-			goto label_done;
+			goto label_post_reentrancy;
 		}
 		if (extent_hooks->decommit != NULL &&
 		    !extent_hooks->decommit(extent_hooks, addr, size, 0, size,
 		    ind)) {
-			goto label_done;
+			goto label_post_reentrancy;
 		}
 		if (extent_hooks->purge_forced != NULL &&
 		    !extent_hooks->purge_forced(extent_hooks, addr, size, 0,
 		    size, ind)) {
-			goto label_done;
+			goto label_post_reentrancy;
 		}
 		if (extent_hooks->purge_lazy != NULL &&
 		    !extent_hooks->purge_lazy(extent_hooks, addr, size, 0, size,
 		    ind)) {
-			goto label_done;
+			goto label_post_reentrancy;
 		}
 		/* Nothing worked.  That's the application's problem. */
-	label_done:
+	label_post_reentrancy:
 		post_reentrancy(tsd);
-		return;
+	}
+label_done:
+	if (metadata_thp_madvise()) {
+		/* Set NOHUGEPAGE after unmap to avoid kernel defrag. */
+		assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 &&
+		    (size & HUGEPAGE_MASK) == 0);
+		pages_nohuge(addr, size);
 	}
 }
 
@@ -105,6 +126,56 @@ base_extent_init(size_t *extent_sn_next, extent_t *extent, void *addr,
 	extent_binit(extent, addr, size, sn);
 }
 
+static size_t
+base_get_num_blocks(base_t *base, bool with_new_block) {
+	base_block_t *b = base->blocks;
+	assert(b != NULL);
+
+	size_t n_blocks = with_new_block ? 2 : 1;
+	while (b->next != NULL) {
+		n_blocks++;
+		b = b->next;
+	}
+
+	return n_blocks;
+}
+
+static void
+base_auto_thp_switch(tsdn_t *tsdn, base_t *base) {
+	assert(opt_metadata_thp == metadata_thp_auto);
+	malloc_mutex_assert_owner(tsdn, &base->mtx);
+	if (base->auto_thp_switched) {
+		return;
+	}
+	/* Called when adding a new block. */
+	bool should_switch;
+	if (base_ind_get(base) != 0) {
+		should_switch = (base_get_num_blocks(base, true) ==
+		    BASE_AUTO_THP_THRESHOLD);
+	} else {
+		should_switch = (base_get_num_blocks(base, true) ==
+		    BASE_AUTO_THP_THRESHOLD_A0);
+	}
+	if (!should_switch) {
+		return;
+	}
+
+	base->auto_thp_switched = true;
+	assert(!config_stats || base->n_thp == 0);
+	/* Make the initial blocks THP lazily. */
+	base_block_t *block = base->blocks;
+	while (block != NULL) {
+		assert((block->size & HUGEPAGE_MASK) == 0);
+		pages_huge(block, block->size);
+		if (config_stats) {
+			base->n_thp += HUGEPAGE_CEILING(block->size -
+			    extent_bsize_get(&block->extent)) >> LG_HUGEPAGE;
+		}
+		block = block->next;
+		assert(block == NULL || (base_ind_get(base) == 0));
+	}
+}
+
 static void *
 base_extent_bump_alloc_helper(extent_t *extent, size_t *gap_size, size_t size,
     size_t alignment) {
@@ -124,8 +195,8 @@ base_extent_bump_alloc_helper(extent_t *extent, size_t *gap_size, size_t size,
 }
 
 static void
-base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, extent_t *extent,
-    size_t gap_size, void *addr, size_t size) {
+base_extent_bump_alloc_post(base_t *base, extent_t *extent, size_t gap_size,
+    void *addr, size_t size) {
 	if (extent_bsize_get(extent) > 0) {
 		/*
 		 * Compute the index for the largest size class that does not
@@ -140,23 +211,31 @@ base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, extent_t *extent,
 		base->allocated += size;
 		/*
 		 * Add one PAGE to base_resident for every page boundary that is
-		 * crossed by the new allocation.
+		 * crossed by the new allocation. Adjust n_thp similarly when
+		 * metadata_thp is enabled.
 		 */
 		base->resident += PAGE_CEILING((uintptr_t)addr + size) -
 		    PAGE_CEILING((uintptr_t)addr - gap_size);
 		assert(base->allocated <= base->resident);
 		assert(base->resident <= base->mapped);
+		if (metadata_thp_madvise() && (opt_metadata_thp ==
+		    metadata_thp_always || base->auto_thp_switched)) {
+			base->n_thp += (HUGEPAGE_CEILING((uintptr_t)addr + size)
+			    - HUGEPAGE_CEILING((uintptr_t)addr - gap_size)) >>
+			    LG_HUGEPAGE;
+			assert(base->mapped >= base->n_thp << LG_HUGEPAGE);
+		}
 	}
 }
 
 static void *
-base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, extent_t *extent,
-    size_t size, size_t alignment) {
+base_extent_bump_alloc(base_t *base, extent_t *extent, size_t size,
+    size_t alignment) {
 	void *ret;
 	size_t gap_size;
 
 	ret = base_extent_bump_alloc_helper(extent, &gap_size, size, alignment);
-	base_extent_bump_alloc_post(tsdn, base, extent, gap_size, ret, size);
+	base_extent_bump_alloc_post(base, extent, gap_size, ret, size);
 	return ret;
 }
 
@@ -166,8 +245,8 @@ base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, extent_t *extent,
  * On success a pointer to the initialized base_block_t header is returned.
  */
 static base_block_t *
-base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind,
-    pszind_t *pind_last, size_t *extent_sn_next, size_t size,
+base_block_alloc(tsdn_t *tsdn, base_t *base, extent_hooks_t *extent_hooks,
+    unsigned ind, pszind_t *pind_last, size_t *extent_sn_next, size_t size,
     size_t alignment) {
 	alignment = ALIGNMENT_CEILING(alignment, QUANTUM);
 	size_t usize = ALIGNMENT_CEILING(size, alignment);
@@ -183,8 +262,8 @@ base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind,
 	 */
 	size_t min_block_size = HUGEPAGE_CEILING(sz_psz2u(header_size + gap_size
 	    + usize));
-	pszind_t pind_next = (*pind_last + 1 < NPSIZES) ? *pind_last + 1 :
-	    *pind_last;
+	pszind_t pind_next = (*pind_last + 1 < sz_psz2ind(SC_LARGE_MAXCLASS)) ?
+	    *pind_last + 1 : *pind_last;
 	size_t next_block_size = HUGEPAGE_CEILING(sz_pind2sz(pind_next));
 	size_t block_size = (min_block_size > next_block_size) ? min_block_size
 	    : next_block_size;
@@ -193,6 +272,25 @@ base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind,
 	if (block == NULL) {
 		return NULL;
 	}
+
+	if (metadata_thp_madvise()) {
+		void *addr = (void *)block;
+		assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 &&
+		    (block_size & HUGEPAGE_MASK) == 0);
+		if (opt_metadata_thp == metadata_thp_always) {
+			pages_huge(addr, block_size);
+		} else if (opt_metadata_thp == metadata_thp_auto &&
+		    base != NULL) {
+			/* base != NULL indicates this is not a new base. */
+			malloc_mutex_lock(tsdn, &base->mtx);
+			base_auto_thp_switch(tsdn, base);
+			if (base->auto_thp_switched) {
+				pages_huge(addr, block_size);
+			}
+			malloc_mutex_unlock(tsdn, &base->mtx);
+		}
+	}
+
 	*pind_last = sz_psz2ind(block_size);
 	block->size = block_size;
 	block->next = NULL;
@@ -216,7 +314,7 @@ base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) {
 	 * called.
 	 */
 	malloc_mutex_unlock(tsdn, &base->mtx);
-	base_block_t *block = base_block_alloc(tsdn, extent_hooks,
+	base_block_t *block = base_block_alloc(tsdn, base, extent_hooks,
 	    base_ind_get(base), &base->pind_last, &base->extent_sn_next, size,
 	    alignment);
 	malloc_mutex_lock(tsdn, &base->mtx);
@@ -229,8 +327,16 @@ base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) {
 		base->allocated += sizeof(base_block_t);
 		base->resident += PAGE_CEILING(sizeof(base_block_t));
 		base->mapped += block->size;
+		if (metadata_thp_madvise() &&
+		    !(opt_metadata_thp == metadata_thp_auto
+		      && !base->auto_thp_switched)) {
+			assert(base->n_thp > 0);
+			base->n_thp += HUGEPAGE_CEILING(sizeof(base_block_t)) >>
+			    LG_HUGEPAGE;
+		}
 		assert(base->allocated <= base->resident);
 		assert(base->resident <= base->mapped);
+		assert(base->n_thp << LG_HUGEPAGE <= base->mapped);
 	}
 	return &block->extent;
 }
@@ -244,7 +350,7 @@ base_t *
 base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	pszind_t pind_last = 0;
 	size_t extent_sn_next = 0;
-	base_block_t *block = base_block_alloc(tsdn, extent_hooks, ind,
+	base_block_t *block = base_block_alloc(tsdn, NULL, extent_hooks, ind,
 	    &pind_last, &extent_sn_next, sizeof(base_t), QUANTUM);
 	if (block == NULL) {
 		return NULL;
@@ -265,17 +371,22 @@ base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	base->pind_last = pind_last;
 	base->extent_sn_next = extent_sn_next;
 	base->blocks = block;
-	for (szind_t i = 0; i < NSIZES; i++) {
+	base->auto_thp_switched = false;
+	for (szind_t i = 0; i < SC_NSIZES; i++) {
 		extent_heap_new(&base->avail[i]);
 	}
 	if (config_stats) {
 		base->allocated = sizeof(base_block_t);
 		base->resident = PAGE_CEILING(sizeof(base_block_t));
 		base->mapped = block->size;
+		base->n_thp = (opt_metadata_thp == metadata_thp_always) &&
+		    metadata_thp_madvise() ? HUGEPAGE_CEILING(sizeof(base_block_t))
+		    >> LG_HUGEPAGE : 0;
 		assert(base->allocated <= base->resident);
 		assert(base->resident <= base->mapped);
+		assert(base->n_thp << LG_HUGEPAGE <= base->mapped);
 	}
-	base_extent_bump_alloc_post(tsdn, base, &block->extent, gap_size, base,
+	base_extent_bump_alloc_post(base, &block->extent, gap_size, base,
 	    base_size);
 
 	return base;
@@ -315,7 +426,7 @@ base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment,
 
 	extent_t *extent = NULL;
 	malloc_mutex_lock(tsdn, &base->mtx);
-	for (szind_t i = sz_size2index(asize); i < NSIZES; i++) {
+	for (szind_t i = sz_size2index(asize); i < SC_NSIZES; i++) {
 		extent = extent_heap_remove_first(&base->avail[i]);
 		if (extent != NULL) {
 			/* Use existing space. */
@@ -332,7 +443,7 @@ base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment,
 		goto label_return;
 	}
 
-	ret = base_extent_bump_alloc(tsdn, base, extent, usize, alignment);
+	ret = base_extent_bump_alloc(base, extent, usize, alignment);
 	if (esn != NULL) {
 		*esn = extent_sn_get(extent);
 	}
@@ -368,7 +479,7 @@ base_alloc_extent(tsdn_t *tsdn, base_t *base) {
 
 void
 base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident,
-    size_t *mapped) {
+    size_t *mapped, size_t *n_thp) {
 	cassert(config_stats);
 
 	malloc_mutex_lock(tsdn, &base->mtx);
@@ -377,6 +488,7 @@ base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident,
 	*allocated = base->allocated;
 	*resident = base->resident;
 	*mapped = base->mapped;
+	*n_thp = base->n_thp;
 	malloc_mutex_unlock(tsdn, &base->mtx);
 }
 
diff --git a/deps/jemalloc/src/bin.c b/deps/jemalloc/src/bin.c
new file mode 100644
index 0000000000..bca6b12c35
--- /dev/null
+++ b/deps/jemalloc/src/bin.c
@@ -0,0 +1,95 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+#include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/witness.h"
+
+bin_info_t bin_infos[SC_NBINS];
+
+static void
+bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
+    bin_info_t bin_infos[SC_NBINS]) {
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		bin_info_t *bin_info = &bin_infos[i];
+		sc_t *sc = &sc_data->sc[i];
+		bin_info->reg_size = ((size_t)1U << sc->lg_base)
+		    + ((size_t)sc->ndelta << sc->lg_delta);
+		bin_info->slab_size = (sc->pgs << LG_PAGE);
+		bin_info->nregs =
+		    (uint32_t)(bin_info->slab_size / bin_info->reg_size);
+		bin_info->n_shards = bin_shard_sizes[i];
+		bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER(
+		    bin_info->nregs);
+		bin_info->bitmap_info = bitmap_info;
+	}
+}
+
+bool
+bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size,
+    size_t end_size, size_t nshards) {
+	if (nshards > BIN_SHARDS_MAX || nshards == 0) {
+		return true;
+	}
+
+	if (start_size > SC_SMALL_MAXCLASS) {
+		return false;
+	}
+	if (end_size > SC_SMALL_MAXCLASS) {
+		end_size = SC_SMALL_MAXCLASS;
+	}
+
+	/* Compute the index since this may happen before sz init. */
+	szind_t ind1 = sz_size2index_compute(start_size);
+	szind_t ind2 = sz_size2index_compute(end_size);
+	for (unsigned i = ind1; i <= ind2; i++) {
+		bin_shard_sizes[i] = (unsigned)nshards;
+	}
+
+	return false;
+}
+
+void
+bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) {
+	/* Load the default number of shards. */
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		bin_shard_sizes[i] = N_BIN_SHARDS_DEFAULT;
+	}
+}
+
+void
+bin_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) {
+	assert(sc_data->initialized);
+	bin_infos_init(sc_data, bin_shard_sizes, bin_infos);
+}
+
+bool
+bin_init(bin_t *bin) {
+	if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN,
+	    malloc_mutex_rank_exclusive)) {
+		return true;
+	}
+	bin->slabcur = NULL;
+	extent_heap_new(&bin->slabs_nonfull);
+	extent_list_init(&bin->slabs_full);
+	if (config_stats) {
+		memset(&bin->stats, 0, sizeof(bin_stats_t));
+	}
+	return false;
+}
+
+void
+bin_prefork(tsdn_t *tsdn, bin_t *bin) {
+	malloc_mutex_prefork(tsdn, &bin->lock);
+}
+
+void
+bin_postfork_parent(tsdn_t *tsdn, bin_t *bin) {
+	malloc_mutex_postfork_parent(tsdn, &bin->lock);
+}
+
+void
+bin_postfork_child(tsdn_t *tsdn, bin_t *bin) {
+	malloc_mutex_postfork_child(tsdn, &bin->lock);
+}
diff --git a/deps/jemalloc/src/ckh.c b/deps/jemalloc/src/ckh.c
index e95e0a3ed5..1bf6df5a11 100644
--- a/deps/jemalloc/src/ckh.c
+++ b/deps/jemalloc/src/ckh.c
@@ -275,7 +275,8 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) {
 
 		lg_curcells++;
 		usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
-		if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+		if (unlikely(usize == 0
+		    || usize > SC_LARGE_MAXCLASS)) {
 			ret = true;
 			goto label_return;
 		}
@@ -320,7 +321,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) {
 	lg_prevbuckets = ckh->lg_curbuckets;
 	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1;
 	usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
-	if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
 		return;
 	}
 	tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, NULL,
@@ -396,7 +397,7 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
 	ckh->keycomp = keycomp;
 
 	usize = sz_sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE);
-	if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
 		ret = true;
 		goto label_return;
 	}
diff --git a/deps/jemalloc/src/ctl.c b/deps/jemalloc/src/ctl.c
index 36bc8fb5b7..48afaa61f4 100644
--- a/deps/jemalloc/src/ctl.c
+++ b/deps/jemalloc/src/ctl.c
@@ -8,7 +8,7 @@
 #include "jemalloc/internal/extent_mmap.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/nstime.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/util.h"
 
 /******************************************************************************/
@@ -57,6 +57,7 @@ static const ctl_named_node_t	*n##_index(tsdn_t *tsdn,		\
 CTL_PROTO(version)
 CTL_PROTO(epoch)
 CTL_PROTO(background_thread)
+CTL_PROTO(max_background_threads)
 CTL_PROTO(thread_tcache_enabled)
 CTL_PROTO(thread_tcache_flush)
 CTL_PROTO(thread_prof_name)
@@ -71,20 +72,24 @@ CTL_PROTO(config_debug)
 CTL_PROTO(config_fill)
 CTL_PROTO(config_lazy_lock)
 CTL_PROTO(config_malloc_conf)
+CTL_PROTO(config_opt_safety_checks)
 CTL_PROTO(config_prof)
 CTL_PROTO(config_prof_libgcc)
 CTL_PROTO(config_prof_libunwind)
 CTL_PROTO(config_stats)
-CTL_PROTO(config_thp)
 CTL_PROTO(config_utrace)
 CTL_PROTO(config_xmalloc)
 CTL_PROTO(opt_abort)
 CTL_PROTO(opt_abort_conf)
+CTL_PROTO(opt_confirm_conf)
+CTL_PROTO(opt_metadata_thp)
 CTL_PROTO(opt_retain)
 CTL_PROTO(opt_dss)
 CTL_PROTO(opt_narenas)
 CTL_PROTO(opt_percpu_arena)
+CTL_PROTO(opt_oversize_threshold)
 CTL_PROTO(opt_background_thread)
+CTL_PROTO(opt_max_background_threads)
 CTL_PROTO(opt_dirty_decay_ms)
 CTL_PROTO(opt_muzzy_decay_ms)
 CTL_PROTO(opt_stats_print)
@@ -94,6 +99,8 @@ CTL_PROTO(opt_zero)
 CTL_PROTO(opt_utrace)
 CTL_PROTO(opt_xmalloc)
 CTL_PROTO(opt_tcache)
+CTL_PROTO(opt_thp)
+CTL_PROTO(opt_lg_extent_max_active_fit)
 CTL_PROTO(opt_lg_tcache_max)
 CTL_PROTO(opt_prof)
 CTL_PROTO(opt_prof_prefix)
@@ -117,10 +124,12 @@ CTL_PROTO(arena_i_dss)
 CTL_PROTO(arena_i_dirty_decay_ms)
 CTL_PROTO(arena_i_muzzy_decay_ms)
 CTL_PROTO(arena_i_extent_hooks)
+CTL_PROTO(arena_i_retain_grow_limit)
 INDEX_PROTO(arena_i)
 CTL_PROTO(arenas_bin_i_size)
 CTL_PROTO(arenas_bin_i_nregs)
 CTL_PROTO(arenas_bin_i_slab_size)
+CTL_PROTO(arenas_bin_i_nshards)
 INDEX_PROTO(arenas_bin_i)
 CTL_PROTO(arenas_lextent_i_size)
 INDEX_PROTO(arenas_lextent_i)
@@ -134,6 +143,7 @@ CTL_PROTO(arenas_nbins)
 CTL_PROTO(arenas_nhbins)
 CTL_PROTO(arenas_nlextents)
 CTL_PROTO(arenas_create)
+CTL_PROTO(arenas_lookup)
 CTL_PROTO(prof_thread_active_init)
 CTL_PROTO(prof_active)
 CTL_PROTO(prof_dump)
@@ -141,14 +151,20 @@ CTL_PROTO(prof_gdump)
 CTL_PROTO(prof_reset)
 CTL_PROTO(prof_interval)
 CTL_PROTO(lg_prof_sample)
+CTL_PROTO(prof_log_start)
+CTL_PROTO(prof_log_stop)
 CTL_PROTO(stats_arenas_i_small_allocated)
 CTL_PROTO(stats_arenas_i_small_nmalloc)
 CTL_PROTO(stats_arenas_i_small_ndalloc)
 CTL_PROTO(stats_arenas_i_small_nrequests)
+CTL_PROTO(stats_arenas_i_small_nfills)
+CTL_PROTO(stats_arenas_i_small_nflushes)
 CTL_PROTO(stats_arenas_i_large_allocated)
 CTL_PROTO(stats_arenas_i_large_nmalloc)
 CTL_PROTO(stats_arenas_i_large_ndalloc)
 CTL_PROTO(stats_arenas_i_large_nrequests)
+CTL_PROTO(stats_arenas_i_large_nfills)
+CTL_PROTO(stats_arenas_i_large_nflushes)
 CTL_PROTO(stats_arenas_i_bins_j_nmalloc)
 CTL_PROTO(stats_arenas_i_bins_j_ndalloc)
 CTL_PROTO(stats_arenas_i_bins_j_nrequests)
@@ -158,12 +174,20 @@ CTL_PROTO(stats_arenas_i_bins_j_nflushes)
 CTL_PROTO(stats_arenas_i_bins_j_nslabs)
 CTL_PROTO(stats_arenas_i_bins_j_nreslabs)
 CTL_PROTO(stats_arenas_i_bins_j_curslabs)
+CTL_PROTO(stats_arenas_i_bins_j_nonfull_slabs)
 INDEX_PROTO(stats_arenas_i_bins_j)
 CTL_PROTO(stats_arenas_i_lextents_j_nmalloc)
 CTL_PROTO(stats_arenas_i_lextents_j_ndalloc)
 CTL_PROTO(stats_arenas_i_lextents_j_nrequests)
 CTL_PROTO(stats_arenas_i_lextents_j_curlextents)
 INDEX_PROTO(stats_arenas_i_lextents_j)
+CTL_PROTO(stats_arenas_i_extents_j_ndirty)
+CTL_PROTO(stats_arenas_i_extents_j_nmuzzy)
+CTL_PROTO(stats_arenas_i_extents_j_nretained)
+CTL_PROTO(stats_arenas_i_extents_j_dirty_bytes)
+CTL_PROTO(stats_arenas_i_extents_j_muzzy_bytes)
+CTL_PROTO(stats_arenas_i_extents_j_retained_bytes)
+INDEX_PROTO(stats_arenas_i_extents_j)
 CTL_PROTO(stats_arenas_i_nthreads)
 CTL_PROTO(stats_arenas_i_uptime)
 CTL_PROTO(stats_arenas_i_dss)
@@ -174,6 +198,7 @@ CTL_PROTO(stats_arenas_i_pdirty)
 CTL_PROTO(stats_arenas_i_pmuzzy)
 CTL_PROTO(stats_arenas_i_mapped)
 CTL_PROTO(stats_arenas_i_retained)
+CTL_PROTO(stats_arenas_i_extent_avail)
 CTL_PROTO(stats_arenas_i_dirty_npurge)
 CTL_PROTO(stats_arenas_i_dirty_nmadvise)
 CTL_PROTO(stats_arenas_i_dirty_purged)
@@ -182,8 +207,10 @@ CTL_PROTO(stats_arenas_i_muzzy_nmadvise)
 CTL_PROTO(stats_arenas_i_muzzy_purged)
 CTL_PROTO(stats_arenas_i_base)
 CTL_PROTO(stats_arenas_i_internal)
+CTL_PROTO(stats_arenas_i_metadata_thp)
 CTL_PROTO(stats_arenas_i_tcache_bytes)
 CTL_PROTO(stats_arenas_i_resident)
+CTL_PROTO(stats_arenas_i_abandoned_vm)
 INDEX_PROTO(stats_arenas_i)
 CTL_PROTO(stats_allocated)
 CTL_PROTO(stats_active)
@@ -191,9 +218,16 @@ CTL_PROTO(stats_background_thread_num_threads)
 CTL_PROTO(stats_background_thread_num_runs)
 CTL_PROTO(stats_background_thread_run_interval)
 CTL_PROTO(stats_metadata)
+CTL_PROTO(stats_metadata_thp)
 CTL_PROTO(stats_resident)
 CTL_PROTO(stats_mapped)
 CTL_PROTO(stats_retained)
+CTL_PROTO(experimental_hooks_install)
+CTL_PROTO(experimental_hooks_remove)
+CTL_PROTO(experimental_utilization_query)
+CTL_PROTO(experimental_utilization_batch_query)
+CTL_PROTO(experimental_arenas_i_pactivep)
+INDEX_PROTO(experimental_arenas_i)
 
 #define MUTEX_STATS_CTL_PROTO_GEN(n)					\
 CTL_PROTO(stats_##n##_num_ops)						\
@@ -262,11 +296,11 @@ static const ctl_named_node_t	config_node[] = {
 	{NAME("fill"),		CTL(config_fill)},
 	{NAME("lazy_lock"),	CTL(config_lazy_lock)},
 	{NAME("malloc_conf"),	CTL(config_malloc_conf)},
+	{NAME("opt_safety_checks"),	CTL(config_opt_safety_checks)},
 	{NAME("prof"),		CTL(config_prof)},
 	{NAME("prof_libgcc"),	CTL(config_prof_libgcc)},
 	{NAME("prof_libunwind"), CTL(config_prof_libunwind)},
 	{NAME("stats"),		CTL(config_stats)},
-	{NAME("thp"),		CTL(config_thp)},
 	{NAME("utrace"),	CTL(config_utrace)},
 	{NAME("xmalloc"),	CTL(config_xmalloc)}
 };
@@ -274,11 +308,15 @@ static const ctl_named_node_t	config_node[] = {
 static const ctl_named_node_t opt_node[] = {
 	{NAME("abort"),		CTL(opt_abort)},
 	{NAME("abort_conf"),	CTL(opt_abort_conf)},
+	{NAME("confirm_conf"),	CTL(opt_confirm_conf)},
+	{NAME("metadata_thp"),	CTL(opt_metadata_thp)},
 	{NAME("retain"),	CTL(opt_retain)},
 	{NAME("dss"),		CTL(opt_dss)},
 	{NAME("narenas"),	CTL(opt_narenas)},
 	{NAME("percpu_arena"),	CTL(opt_percpu_arena)},
+	{NAME("oversize_threshold"),	CTL(opt_oversize_threshold)},
 	{NAME("background_thread"),	CTL(opt_background_thread)},
+	{NAME("max_background_threads"),	CTL(opt_max_background_threads)},
 	{NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)},
 	{NAME("muzzy_decay_ms"), CTL(opt_muzzy_decay_ms)},
 	{NAME("stats_print"),	CTL(opt_stats_print)},
@@ -288,6 +326,8 @@ static const ctl_named_node_t opt_node[] = {
 	{NAME("utrace"),	CTL(opt_utrace)},
 	{NAME("xmalloc"),	CTL(opt_xmalloc)},
 	{NAME("tcache"),	CTL(opt_tcache)},
+	{NAME("thp"),		CTL(opt_thp)},
+	{NAME("lg_extent_max_active_fit"), CTL(opt_lg_extent_max_active_fit)},
 	{NAME("lg_tcache_max"),	CTL(opt_lg_tcache_max)},
 	{NAME("prof"),		CTL(opt_prof)},
 	{NAME("prof_prefix"),	CTL(opt_prof_prefix)},
@@ -316,7 +356,8 @@ static const ctl_named_node_t arena_i_node[] = {
 	{NAME("dss"),		CTL(arena_i_dss)},
 	{NAME("dirty_decay_ms"), CTL(arena_i_dirty_decay_ms)},
 	{NAME("muzzy_decay_ms"), CTL(arena_i_muzzy_decay_ms)},
-	{NAME("extent_hooks"),	CTL(arena_i_extent_hooks)}
+	{NAME("extent_hooks"),	CTL(arena_i_extent_hooks)},
+	{NAME("retain_grow_limit"),	CTL(arena_i_retain_grow_limit)}
 };
 static const ctl_named_node_t super_arena_i_node[] = {
 	{NAME(""),		CHILD(named, arena_i)}
@@ -329,7 +370,8 @@ static const ctl_indexed_node_t arena_node[] = {
 static const ctl_named_node_t arenas_bin_i_node[] = {
 	{NAME("size"),		CTL(arenas_bin_i_size)},
 	{NAME("nregs"),		CTL(arenas_bin_i_nregs)},
-	{NAME("slab_size"),	CTL(arenas_bin_i_slab_size)}
+	{NAME("slab_size"),	CTL(arenas_bin_i_slab_size)},
+	{NAME("nshards"),	CTL(arenas_bin_i_nshards)}
 };
 static const ctl_named_node_t super_arenas_bin_i_node[] = {
 	{NAME(""),		CHILD(named, arenas_bin_i)}
@@ -362,7 +404,8 @@ static const ctl_named_node_t arenas_node[] = {
 	{NAME("bin"),		CHILD(indexed, arenas_bin)},
 	{NAME("nlextents"),	CTL(arenas_nlextents)},
 	{NAME("lextent"),	CHILD(indexed, arenas_lextent)},
-	{NAME("create"),	CTL(arenas_create)}
+	{NAME("create"),	CTL(arenas_create)},
+	{NAME("lookup"),	CTL(arenas_lookup)}
 };
 
 static const ctl_named_node_t	prof_node[] = {
@@ -372,21 +415,26 @@ static const ctl_named_node_t	prof_node[] = {
 	{NAME("gdump"),		CTL(prof_gdump)},
 	{NAME("reset"),		CTL(prof_reset)},
 	{NAME("interval"),	CTL(prof_interval)},
-	{NAME("lg_sample"),	CTL(lg_prof_sample)}
+	{NAME("lg_sample"),	CTL(lg_prof_sample)},
+	{NAME("log_start"),	CTL(prof_log_start)},
+	{NAME("log_stop"),	CTL(prof_log_stop)}
 };
-
 static const ctl_named_node_t stats_arenas_i_small_node[] = {
 	{NAME("allocated"),	CTL(stats_arenas_i_small_allocated)},
 	{NAME("nmalloc"),	CTL(stats_arenas_i_small_nmalloc)},
 	{NAME("ndalloc"),	CTL(stats_arenas_i_small_ndalloc)},
-	{NAME("nrequests"),	CTL(stats_arenas_i_small_nrequests)}
+	{NAME("nrequests"),	CTL(stats_arenas_i_small_nrequests)},
+	{NAME("nfills"),	CTL(stats_arenas_i_small_nfills)},
+	{NAME("nflushes"),	CTL(stats_arenas_i_small_nflushes)}
 };
 
 static const ctl_named_node_t stats_arenas_i_large_node[] = {
 	{NAME("allocated"),	CTL(stats_arenas_i_large_allocated)},
 	{NAME("nmalloc"),	CTL(stats_arenas_i_large_nmalloc)},
 	{NAME("ndalloc"),	CTL(stats_arenas_i_large_ndalloc)},
-	{NAME("nrequests"),	CTL(stats_arenas_i_large_nrequests)}
+	{NAME("nrequests"),	CTL(stats_arenas_i_large_nrequests)},
+	{NAME("nfills"),	CTL(stats_arenas_i_large_nfills)},
+	{NAME("nflushes"),	CTL(stats_arenas_i_large_nflushes)}
 };
 
 #define MUTEX_PROF_DATA_NODE(prefix)					\
@@ -420,6 +468,7 @@ static const ctl_named_node_t stats_arenas_i_bins_j_node[] = {
 	{NAME("nslabs"),	CTL(stats_arenas_i_bins_j_nslabs)},
 	{NAME("nreslabs"),	CTL(stats_arenas_i_bins_j_nreslabs)},
 	{NAME("curslabs"),	CTL(stats_arenas_i_bins_j_curslabs)},
+	{NAME("nonfull_slabs"),	CTL(stats_arenas_i_bins_j_nonfull_slabs)},
 	{NAME("mutex"),		CHILD(named, stats_arenas_i_bins_j_mutex)}
 };
 
@@ -445,6 +494,23 @@ static const ctl_indexed_node_t stats_arenas_i_lextents_node[] = {
 	{INDEX(stats_arenas_i_lextents_j)}
 };
 
+static const ctl_named_node_t stats_arenas_i_extents_j_node[] = {
+	{NAME("ndirty"),	CTL(stats_arenas_i_extents_j_ndirty)},
+	{NAME("nmuzzy"),	CTL(stats_arenas_i_extents_j_nmuzzy)},
+	{NAME("nretained"),	CTL(stats_arenas_i_extents_j_nretained)},
+	{NAME("dirty_bytes"),	CTL(stats_arenas_i_extents_j_dirty_bytes)},
+	{NAME("muzzy_bytes"),	CTL(stats_arenas_i_extents_j_muzzy_bytes)},
+	{NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)}
+};
+
+static const ctl_named_node_t super_stats_arenas_i_extents_j_node[] = {
+	{NAME(""),		CHILD(named, stats_arenas_i_extents_j)}
+};
+
+static const ctl_indexed_node_t stats_arenas_i_extents_node[] = {
+	{INDEX(stats_arenas_i_extents_j)}
+};
+
 #define OP(mtx)  MUTEX_PROF_DATA_NODE(arenas_i_mutexes_##mtx)
 MUTEX_PROF_ARENA_MUTEXES
 #undef OP
@@ -466,6 +532,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
 	{NAME("pmuzzy"),	CTL(stats_arenas_i_pmuzzy)},
 	{NAME("mapped"),	CTL(stats_arenas_i_mapped)},
 	{NAME("retained"),	CTL(stats_arenas_i_retained)},
+	{NAME("extent_avail"),	CTL(stats_arenas_i_extent_avail)},
 	{NAME("dirty_npurge"),	CTL(stats_arenas_i_dirty_npurge)},
 	{NAME("dirty_nmadvise"), CTL(stats_arenas_i_dirty_nmadvise)},
 	{NAME("dirty_purged"),	CTL(stats_arenas_i_dirty_purged)},
@@ -474,12 +541,15 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
 	{NAME("muzzy_purged"),	CTL(stats_arenas_i_muzzy_purged)},
 	{NAME("base"),		CTL(stats_arenas_i_base)},
 	{NAME("internal"),	CTL(stats_arenas_i_internal)},
+	{NAME("metadata_thp"),	CTL(stats_arenas_i_metadata_thp)},
 	{NAME("tcache_bytes"),	CTL(stats_arenas_i_tcache_bytes)},
 	{NAME("resident"),	CTL(stats_arenas_i_resident)},
+	{NAME("abandoned_vm"),	CTL(stats_arenas_i_abandoned_vm)},
 	{NAME("small"),		CHILD(named, stats_arenas_i_small)},
 	{NAME("large"),		CHILD(named, stats_arenas_i_large)},
 	{NAME("bins"),		CHILD(indexed, stats_arenas_i_bins)},
 	{NAME("lextents"),	CHILD(indexed, stats_arenas_i_lextents)},
+	{NAME("extents"),	CHILD(indexed, stats_arenas_i_extents)},
 	{NAME("mutexes"),	CHILD(named, stats_arenas_i_mutexes)}
 };
 static const ctl_named_node_t super_stats_arenas_i_node[] = {
@@ -512,6 +582,7 @@ static const ctl_named_node_t stats_node[] = {
 	{NAME("allocated"),	CTL(stats_allocated)},
 	{NAME("active"),	CTL(stats_active)},
 	{NAME("metadata"),	CTL(stats_metadata)},
+	{NAME("metadata_thp"),	CTL(stats_metadata_thp)},
 	{NAME("resident"),	CTL(stats_resident)},
 	{NAME("mapped"),	CTL(stats_mapped)},
 	{NAME("retained"),	CTL(stats_retained)},
@@ -521,10 +592,38 @@ static const ctl_named_node_t stats_node[] = {
 	{NAME("arenas"),	CHILD(indexed, stats_arenas)}
 };
 
+static const ctl_named_node_t experimental_hooks_node[] = {
+	{NAME("install"),	CTL(experimental_hooks_install)},
+	{NAME("remove"),	CTL(experimental_hooks_remove)}
+};
+
+static const ctl_named_node_t experimental_utilization_node[] = {
+	{NAME("query"),		CTL(experimental_utilization_query)},
+	{NAME("batch_query"),	CTL(experimental_utilization_batch_query)}
+};
+
+static const ctl_named_node_t experimental_arenas_i_node[] = {
+	{NAME("pactivep"),	CTL(experimental_arenas_i_pactivep)}
+};
+static const ctl_named_node_t super_experimental_arenas_i_node[] = {
+	{NAME(""),		CHILD(named, experimental_arenas_i)}
+};
+
+static const ctl_indexed_node_t experimental_arenas_node[] = {
+	{INDEX(experimental_arenas_i)}
+};
+
+static const ctl_named_node_t experimental_node[] = {
+	{NAME("hooks"),		CHILD(named, experimental_hooks)},
+	{NAME("utilization"),	CHILD(named, experimental_utilization)},
+	{NAME("arenas"),	CHILD(indexed, experimental_arenas)}
+};
+
 static const ctl_named_node_t	root_node[] = {
 	{NAME("version"),	CTL(version)},
 	{NAME("epoch"),		CTL(epoch)},
 	{NAME("background_thread"),	CTL(background_thread)},
+	{NAME("max_background_threads"),	CTL(max_background_threads)},
 	{NAME("thread"),	CHILD(named, thread)},
 	{NAME("config"),	CHILD(named, config)},
 	{NAME("opt"),		CHILD(named, opt)},
@@ -532,7 +631,8 @@ static const ctl_named_node_t	root_node[] = {
 	{NAME("arena"),		CHILD(indexed, arena)},
 	{NAME("arenas"),	CHILD(named, arenas)},
 	{NAME("prof"),		CHILD(named, prof)},
-	{NAME("stats"),		CHILD(named, stats)}
+	{NAME("stats"),		CHILD(named, stats)},
+	{NAME("experimental"),	CHILD(named, experimental)}
 };
 static const ctl_named_node_t super_root_node[] = {
 	{NAME(""),		CHILD(named, root)}
@@ -550,7 +650,7 @@ static const ctl_named_node_t super_root_node[] = {
  * synchronized by the ctl mutex.
  */
 static void
-accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) {
+ctl_accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) {
 #ifdef JEMALLOC_ATOMIC_U64
 	uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED);
 	uint64_t cur_src = atomic_load_u64(src, ATOMIC_RELAXED);
@@ -562,7 +662,7 @@ accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) {
 
 /* Likewise: with ctl mutex synchronization, reading is simple. */
 static uint64_t
-arena_stats_read_u64(arena_stats_u64_t *p) {
+ctl_arena_stats_read_u64(arena_stats_u64_t *p) {
 #ifdef JEMALLOC_ATOMIC_U64
 	return atomic_load_u64(p, ATOMIC_RELAXED);
 #else
@@ -570,7 +670,8 @@ arena_stats_read_u64(arena_stats_u64_t *p) {
 #endif
 }
 
-static void accum_atomic_zu(atomic_zu_t *dst, atomic_zu_t *src) {
+static void
+accum_atomic_zu(atomic_zu_t *dst, atomic_zu_t *src) {
 	size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED);
 	size_t cur_src = atomic_load_zu(src, ATOMIC_RELAXED);
 	atomic_store_zu(dst, cur_dst + cur_src, ATOMIC_RELAXED);
@@ -679,10 +780,14 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) {
 		ctl_arena->astats->nmalloc_small = 0;
 		ctl_arena->astats->ndalloc_small = 0;
 		ctl_arena->astats->nrequests_small = 0;
-		memset(ctl_arena->astats->bstats, 0, NBINS *
-		    sizeof(malloc_bin_stats_t));
-		memset(ctl_arena->astats->lstats, 0, (NSIZES - NBINS) *
-		    sizeof(malloc_large_stats_t));
+		ctl_arena->astats->nfills_small = 0;
+		ctl_arena->astats->nflushes_small = 0;
+		memset(ctl_arena->astats->bstats, 0, SC_NBINS *
+		    sizeof(bin_stats_t));
+		memset(ctl_arena->astats->lstats, 0, (SC_NSIZES - SC_NBINS) *
+		    sizeof(arena_stats_large_t));
+		memset(ctl_arena->astats->estats, 0, SC_NPSIZES *
+		    sizeof(arena_stats_extents_t));
 	}
 }
 
@@ -696,9 +801,9 @@ ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) {
 		    &ctl_arena->muzzy_decay_ms, &ctl_arena->pactive,
 		    &ctl_arena->pdirty, &ctl_arena->pmuzzy,
 		    &ctl_arena->astats->astats, ctl_arena->astats->bstats,
-		    ctl_arena->astats->lstats);
+		    ctl_arena->astats->lstats, ctl_arena->astats->estats);
 
-		for (i = 0; i < NBINS; i++) {
+		for (i = 0; i < SC_NBINS; i++) {
 			ctl_arena->astats->allocated_small +=
 			    ctl_arena->astats->bstats[i].curregs *
 			    sz_index2size(i);
@@ -708,6 +813,10 @@ ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) {
 			    ctl_arena->astats->bstats[i].ndalloc;
 			ctl_arena->astats->nrequests_small +=
 			    ctl_arena->astats->bstats[i].nrequests;
+			ctl_arena->astats->nfills_small +=
+			    ctl_arena->astats->bstats[i].nfills;
+			ctl_arena->astats->nflushes_small +=
+			    ctl_arena->astats->bstats[i].nflushes;
 		}
 	} else {
 		arena_basic_stats_merge(tsdn, arena, &ctl_arena->nthreads,
@@ -743,20 +852,22 @@ ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena,
 			    &astats->astats.mapped);
 			accum_atomic_zu(&sdstats->astats.retained,
 			    &astats->astats.retained);
+			accum_atomic_zu(&sdstats->astats.extent_avail,
+			    &astats->astats.extent_avail);
 		}
 
-		accum_arena_stats_u64(&sdstats->astats.decay_dirty.npurge,
+		ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.npurge,
 		    &astats->astats.decay_dirty.npurge);
-		accum_arena_stats_u64(&sdstats->astats.decay_dirty.nmadvise,
+		ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.nmadvise,
 		    &astats->astats.decay_dirty.nmadvise);
-		accum_arena_stats_u64(&sdstats->astats.decay_dirty.purged,
+		ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.purged,
 		    &astats->astats.decay_dirty.purged);
 
-		accum_arena_stats_u64(&sdstats->astats.decay_muzzy.npurge,
+		ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.npurge,
 		    &astats->astats.decay_muzzy.npurge);
-		accum_arena_stats_u64(&sdstats->astats.decay_muzzy.nmadvise,
+		ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.nmadvise,
 		    &astats->astats.decay_muzzy.nmadvise);
-		accum_arena_stats_u64(&sdstats->astats.decay_muzzy.purged,
+		ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.purged,
 		    &astats->astats.decay_muzzy.purged);
 
 #define OP(mtx) malloc_mutex_prof_merge(				\
@@ -773,6 +884,8 @@ MUTEX_PROF_ARENA_MUTEXES
 			    &astats->astats.internal);
 			accum_atomic_zu(&sdstats->astats.resident,
 			    &astats->astats.resident);
+			accum_atomic_zu(&sdstats->astats.metadata_thp,
+			    &astats->astats.metadata_thp);
 		} else {
 			assert(atomic_load_zu(
 			    &astats->astats.internal, ATOMIC_RELAXED) == 0);
@@ -786,6 +899,8 @@ MUTEX_PROF_ARENA_MUTEXES
 		sdstats->nmalloc_small += astats->nmalloc_small;
 		sdstats->ndalloc_small += astats->ndalloc_small;
 		sdstats->nrequests_small += astats->nrequests_small;
+		sdstats->nfills_small += astats->nfills_small;
+		sdstats->nflushes_small += astats->nflushes_small;
 
 		if (!destroyed) {
 			accum_atomic_zu(&sdstats->astats.allocated_large,
@@ -794,12 +909,14 @@ MUTEX_PROF_ARENA_MUTEXES
 			assert(atomic_load_zu(&astats->astats.allocated_large,
 			    ATOMIC_RELAXED) == 0);
 		}
-		accum_arena_stats_u64(&sdstats->astats.nmalloc_large,
+		ctl_accum_arena_stats_u64(&sdstats->astats.nmalloc_large,
 		    &astats->astats.nmalloc_large);
-		accum_arena_stats_u64(&sdstats->astats.ndalloc_large,
+		ctl_accum_arena_stats_u64(&sdstats->astats.ndalloc_large,
 		    &astats->astats.ndalloc_large);
-		accum_arena_stats_u64(&sdstats->astats.nrequests_large,
+		ctl_accum_arena_stats_u64(&sdstats->astats.nrequests_large,
 		    &astats->astats.nrequests_large);
+		accum_atomic_zu(&sdstats->astats.abandoned_vm,
+		    &astats->astats.abandoned_vm);
 
 		accum_atomic_zu(&sdstats->astats.tcache_bytes,
 		    &astats->astats.tcache_bytes);
@@ -808,7 +925,8 @@ MUTEX_PROF_ARENA_MUTEXES
 			sdstats->astats.uptime = astats->astats.uptime;
 		}
 
-		for (i = 0; i < NBINS; i++) {
+		/* Merge bin stats. */
+		for (i = 0; i < SC_NBINS; i++) {
 			sdstats->bstats[i].nmalloc += astats->bstats[i].nmalloc;
 			sdstats->bstats[i].ndalloc += astats->bstats[i].ndalloc;
 			sdstats->bstats[i].nrequests +=
@@ -827,19 +945,23 @@ MUTEX_PROF_ARENA_MUTEXES
 			if (!destroyed) {
 				sdstats->bstats[i].curslabs +=
 				    astats->bstats[i].curslabs;
+				sdstats->bstats[i].nonfull_slabs +=
+				    astats->bstats[i].nonfull_slabs;
 			} else {
 				assert(astats->bstats[i].curslabs == 0);
+				assert(astats->bstats[i].nonfull_slabs == 0);
 			}
 			malloc_mutex_prof_merge(&sdstats->bstats[i].mutex_data,
 			    &astats->bstats[i].mutex_data);
 		}
 
-		for (i = 0; i < NSIZES - NBINS; i++) {
-			accum_arena_stats_u64(&sdstats->lstats[i].nmalloc,
+		/* Merge stats for large allocations. */
+		for (i = 0; i < SC_NSIZES - SC_NBINS; i++) {
+			ctl_accum_arena_stats_u64(&sdstats->lstats[i].nmalloc,
 			    &astats->lstats[i].nmalloc);
-			accum_arena_stats_u64(&sdstats->lstats[i].ndalloc,
+			ctl_accum_arena_stats_u64(&sdstats->lstats[i].ndalloc,
 			    &astats->lstats[i].ndalloc);
-			accum_arena_stats_u64(&sdstats->lstats[i].nrequests,
+			ctl_accum_arena_stats_u64(&sdstats->lstats[i].nrequests,
 			    &astats->lstats[i].nrequests);
 			if (!destroyed) {
 				sdstats->lstats[i].curlextents +=
@@ -848,6 +970,22 @@ MUTEX_PROF_ARENA_MUTEXES
 				assert(astats->lstats[i].curlextents == 0);
 			}
 		}
+
+		/* Merge extents stats. */
+		for (i = 0; i < SC_NPSIZES; i++) {
+			accum_atomic_zu(&sdstats->estats[i].ndirty,
+			    &astats->estats[i].ndirty);
+			accum_atomic_zu(&sdstats->estats[i].nmuzzy,
+			    &astats->estats[i].nmuzzy);
+			accum_atomic_zu(&sdstats->estats[i].nretained,
+			    &astats->estats[i].nretained);
+			accum_atomic_zu(&sdstats->estats[i].dirty_bytes,
+			    &astats->estats[i].dirty_bytes);
+			accum_atomic_zu(&sdstats->estats[i].muzzy_bytes,
+			    &astats->estats[i].muzzy_bytes);
+			accum_atomic_zu(&sdstats->estats[i].retained_bytes,
+			    &astats->estats[i].retained_bytes);
+		}
 	}
 }
 
@@ -938,6 +1076,8 @@ ctl_refresh(tsdn_t *tsdn) {
 		    &ctl_sarena->astats->astats.base, ATOMIC_RELAXED) +
 		    atomic_load_zu(&ctl_sarena->astats->astats.internal,
 			ATOMIC_RELAXED);
+		ctl_stats->metadata_thp = atomic_load_zu(
+		    &ctl_sarena->astats->astats.metadata_thp, ATOMIC_RELAXED);
 		ctl_stats->resident = atomic_load_zu(
 		    &ctl_sarena->astats->astats.resident, ATOMIC_RELAXED);
 		ctl_stats->mapped = atomic_load_zu(
@@ -1357,8 +1497,8 @@ label_return:								\
 
 #define CTL_RO_CGEN(c, n, v, t)						\
 static int								\
-n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,	\
-    size_t *oldlenp, void *newp, size_t newlen) {			\
+n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {			\
 	int ret;							\
 	t oldval;							\
 									\
@@ -1400,8 +1540,8 @@ label_return:								\
  */
 #define CTL_RO_NL_CGEN(c, n, v, t)					\
 static int								\
-n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,	\
-    size_t *oldlenp, void *newp, size_t newlen) {			\
+n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {			\
 	int ret;							\
 	t oldval;							\
 									\
@@ -1419,8 +1559,8 @@ label_return:								\
 
 #define CTL_RO_NL_GEN(n, v, t)						\
 static int								\
-n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,	\
-    size_t *oldlenp, void *newp, size_t newlen) {			\
+n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {			\
 	int ret;							\
 	t oldval;							\
 									\
@@ -1454,8 +1594,8 @@ label_return:								\
 
 #define CTL_RO_CONFIG_GEN(n, t)						\
 static int								\
-n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,	\
-    size_t *oldlenp, void *newp, size_t newlen) {			\
+n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {			\
 	int ret;							\
 	t oldval;							\
 									\
@@ -1473,8 +1613,8 @@ label_return:								\
 CTL_RO_NL_GEN(version, JEMALLOC_VERSION, const char *)
 
 static int
-epoch_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+epoch_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	UNUSED uint64_t newval;
 
@@ -1492,8 +1632,9 @@ label_return:
 }
 
 static int
-background_thread_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+background_thread_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -1523,22 +1664,74 @@ background_thread_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
 
 		background_thread_enabled_set(tsd_tsdn(tsd), newval);
 		if (newval) {
-			if (!can_enable_background_thread) {
-				malloc_printf("<jemalloc>: Error in dlsym("
-			            "RTLD_NEXT, \"pthread_create\"). Cannot "
-				    "enable background_thread\n");
+			if (background_threads_enable(tsd)) {
 				ret = EFAULT;
 				goto label_return;
 			}
-			if (background_threads_enable(tsd)) {
+		} else {
+			if (background_threads_disable(tsd)) {
 				ret = EFAULT;
 				goto label_return;
 			}
-		} else {
+		}
+	}
+	ret = 0;
+label_return:
+	malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
+	malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx);
+
+	return ret;
+}
+
+static int
+max_background_threads_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
+	int ret;
+	size_t oldval;
+
+	if (!have_background_thread) {
+		return ENOENT;
+	}
+	background_thread_ctl_init(tsd_tsdn(tsd));
+
+	malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
+	malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
+	if (newp == NULL) {
+		oldval = max_background_threads;
+		READ(oldval, size_t);
+	} else {
+		if (newlen != sizeof(size_t)) {
+			ret = EINVAL;
+			goto label_return;
+		}
+		oldval = max_background_threads;
+		READ(oldval, size_t);
+
+		size_t newval = *(size_t *)newp;
+		if (newval == oldval) {
+			ret = 0;
+			goto label_return;
+		}
+		if (newval > opt_max_background_threads) {
+			ret = EINVAL;
+			goto label_return;
+		}
+
+		if (background_thread_enabled()) {
+			background_thread_enabled_set(tsd_tsdn(tsd), false);
 			if (background_threads_disable(tsd)) {
 				ret = EFAULT;
 				goto label_return;
 			}
+			max_background_threads = newval;
+			background_thread_enabled_set(tsd_tsdn(tsd), true);
+			if (background_threads_enable(tsd)) {
+				ret = EFAULT;
+				goto label_return;
+			}
+		} else {
+			max_background_threads = newval;
 		}
 	}
 	ret = 0;
@@ -1556,11 +1749,11 @@ CTL_RO_CONFIG_GEN(config_debug, bool)
 CTL_RO_CONFIG_GEN(config_fill, bool)
 CTL_RO_CONFIG_GEN(config_lazy_lock, bool)
 CTL_RO_CONFIG_GEN(config_malloc_conf, const char *)
+CTL_RO_CONFIG_GEN(config_opt_safety_checks, bool)
 CTL_RO_CONFIG_GEN(config_prof, bool)
 CTL_RO_CONFIG_GEN(config_prof_libgcc, bool)
 CTL_RO_CONFIG_GEN(config_prof_libunwind, bool)
 CTL_RO_CONFIG_GEN(config_stats, bool)
-CTL_RO_CONFIG_GEN(config_thp, bool)
 CTL_RO_CONFIG_GEN(config_utrace, bool)
 CTL_RO_CONFIG_GEN(config_xmalloc, bool)
 
@@ -1568,12 +1761,17 @@ CTL_RO_CONFIG_GEN(config_xmalloc, bool)
 
 CTL_RO_NL_GEN(opt_abort, opt_abort, bool)
 CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool)
+CTL_RO_NL_GEN(opt_confirm_conf, opt_confirm_conf, bool)
+CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp],
+    const char *)
 CTL_RO_NL_GEN(opt_retain, opt_retain, bool)
 CTL_RO_NL_GEN(opt_dss, opt_dss, const char *)
 CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned)
 CTL_RO_NL_GEN(opt_percpu_arena, percpu_arena_mode_names[opt_percpu_arena],
     const char *)
+CTL_RO_NL_GEN(opt_oversize_threshold, opt_oversize_threshold, size_t)
 CTL_RO_NL_GEN(opt_background_thread, opt_background_thread, bool)
+CTL_RO_NL_GEN(opt_max_background_threads, opt_max_background_threads, size_t)
 CTL_RO_NL_GEN(opt_dirty_decay_ms, opt_dirty_decay_ms, ssize_t)
 CTL_RO_NL_GEN(opt_muzzy_decay_ms, opt_muzzy_decay_ms, ssize_t)
 CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool)
@@ -1583,6 +1781,9 @@ CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool)
 CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool)
 CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool)
 CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool)
+CTL_RO_NL_GEN(opt_thp, thp_mode_names[opt_thp], const char *)
+CTL_RO_NL_GEN(opt_lg_extent_max_active_fit, opt_lg_extent_max_active_fit,
+    size_t)
 CTL_RO_NL_GEN(opt_lg_tcache_max, opt_lg_tcache_max, ssize_t)
 CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool)
 CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *)
@@ -1599,8 +1800,8 @@ CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool)
 /******************************************************************************/
 
 static int
-thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	arena_t *oldarena;
 	unsigned newind, oldind;
@@ -1664,8 +1865,9 @@ CTL_TSD_RO_NL_CGEN(config_stats, thread_deallocatedp,
     tsd_thread_deallocatedp_get, uint64_t *)
 
 static int
-thread_tcache_enabled_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+thread_tcache_enabled_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -1685,8 +1887,9 @@ label_return:
 }
 
 static int
-thread_tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+thread_tcache_flush_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 
 	if (!tcache_available(tsd)) {
@@ -1705,8 +1908,9 @@ label_return:
 }
 
 static int
-thread_prof_name_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+thread_prof_name_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 
 	if (!config_prof) {
@@ -1736,8 +1940,9 @@ label_return:
 }
 
 static int
-thread_prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+thread_prof_active_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -1766,8 +1971,8 @@ label_return:
 /******************************************************************************/
 
 static int
-tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	unsigned tcache_ind;
 
@@ -1784,8 +1989,8 @@ label_return:
 }
 
 static int
-tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	unsigned tcache_ind;
 
@@ -1804,8 +2009,8 @@ label_return:
 }
 
 static int
-tcache_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+tcache_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	unsigned tcache_ind;
 
@@ -1953,9 +2158,8 @@ arena_reset_prepare_background_thread(tsd_t *tsd, unsigned arena_ind) {
 	if (have_background_thread) {
 		malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
 		if (background_thread_enabled()) {
-			unsigned ind = arena_ind % ncpus;
 			background_thread_info_t *info =
-			    &background_thread_info[ind];
+			    background_thread_info_get(arena_ind);
 			assert(info->state == background_thread_started);
 			malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
 			info->state = background_thread_paused;
@@ -1968,9 +2172,8 @@ static void
 arena_reset_finish_background_thread(tsd_t *tsd, unsigned arena_ind) {
 	if (have_background_thread) {
 		if (background_thread_enabled()) {
-			unsigned ind = arena_ind % ncpus;
 			background_thread_info_t *info =
-			    &background_thread_info[ind];
+			    background_thread_info_get(arena_ind);
 			assert(info->state == background_thread_paused);
 			malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
 			info->state = background_thread_started;
@@ -2126,6 +2329,17 @@ arena_i_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen,
 			ret = EINVAL;
 			goto label_return;
 		}
+		if (arena_is_huge(arena_ind) && *(ssize_t *)newp > 0) {
+			/*
+			 * By default the huge arena purges eagerly.  If it is
+			 * set to non-zero decay time afterwards, background
+			 * thread might be needed.
+			 */
+			if (background_thread_create(tsd, arena_ind)) {
+				ret = EFAULT;
+				goto label_return;
+			}
+		}
 		if (dirty ? arena_dirty_decay_ms_set(tsd_tsdn(tsd), arena,
 		    *(ssize_t *)newp) : arena_muzzy_decay_ms_set(tsd_tsdn(tsd),
 		    arena, *(ssize_t *)newp)) {
@@ -2162,20 +2376,41 @@ arena_i_extent_hooks_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
 
 	malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
 	MIB_UNSIGNED(arena_ind, 1);
-	if (arena_ind < narenas_total_get() && (arena =
-	    arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) {
-		if (newp != NULL) {
-			extent_hooks_t *old_extent_hooks;
-			extent_hooks_t *new_extent_hooks
-			    JEMALLOC_CC_SILENCE_INIT(NULL);
-			WRITE(new_extent_hooks, extent_hooks_t *);
-			old_extent_hooks = extent_hooks_set(tsd, arena,
-			    new_extent_hooks);
+	if (arena_ind < narenas_total_get()) {
+		extent_hooks_t *old_extent_hooks;
+		arena = arena_get(tsd_tsdn(tsd), arena_ind, false);
+		if (arena == NULL) {
+			if (arena_ind >= narenas_auto) {
+				ret = EFAULT;
+				goto label_return;
+			}
+			old_extent_hooks =
+			    (extent_hooks_t *)&extent_hooks_default;
 			READ(old_extent_hooks, extent_hooks_t *);
+			if (newp != NULL) {
+				/* Initialize a new arena as a side effect. */
+				extent_hooks_t *new_extent_hooks
+				    JEMALLOC_CC_SILENCE_INIT(NULL);
+				WRITE(new_extent_hooks, extent_hooks_t *);
+				arena = arena_init(tsd_tsdn(tsd), arena_ind,
+				    new_extent_hooks);
+				if (arena == NULL) {
+					ret = EFAULT;
+					goto label_return;
+				}
+			}
 		} else {
-			extent_hooks_t *old_extent_hooks =
-			    extent_hooks_get(arena);
-			READ(old_extent_hooks, extent_hooks_t *);
+			if (newp != NULL) {
+				extent_hooks_t *new_extent_hooks
+				    JEMALLOC_CC_SILENCE_INIT(NULL);
+				WRITE(new_extent_hooks, extent_hooks_t *);
+				old_extent_hooks = extent_hooks_set(tsd, arena,
+				    new_extent_hooks);
+				READ(old_extent_hooks, extent_hooks_t *);
+			} else {
+				old_extent_hooks = extent_hooks_get(arena);
+				READ(old_extent_hooks, extent_hooks_t *);
+			}
 		}
 	} else {
 		ret = EFAULT;
@@ -2187,8 +2422,46 @@ label_return:
 	return ret;
 }
 
+static int
+arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
+	int ret;
+	unsigned arena_ind;
+	arena_t *arena;
+	/* Gets and, when newp is non-NULL, sets an arena's retain grow limit. */
+	if (!opt_retain) {
+		/* Only relevant when retain is enabled. */
+		return ENOENT;
+	}
+
+	malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
+	MIB_UNSIGNED(arena_ind, 1);	/* NOTE(review): presumably mib[1]. */
+	if (arena_ind < narenas_total_get() && (arena =
+	    arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) {
+		size_t old_limit, new_limit;
+		if (newp != NULL) {	/* new_limit is set and passed only on writes. */
+			WRITE(new_limit, size_t);
+		}
+		bool err = arena_retain_grow_limit_get_set(tsd, arena,
+		    &old_limit, newp != NULL ? &new_limit : NULL);
+		if (!err) {
+			READ(old_limit, size_t);	/* Previous limit goes back to the caller. */
+			ret = 0;
+		} else {
+			ret = EFAULT;	/* The get/set helper reported failure. */
+		}
+	} else {
+		ret = EFAULT;	/* arena_ind out of range, or arena_get() gave NULL. */
+	}
+label_return:
+	malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx);
+	return ret;
+}
+
 static const ctl_named_node_t *
-arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) {
+arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
+    size_t i) {
 	const ctl_named_node_t *ret;
 
 	malloc_mutex_lock(tsdn, &ctl_mtx);
@@ -2213,8 +2486,8 @@ label_return:
 /******************************************************************************/
 
 static int
-arenas_narenas_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+arenas_narenas_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	unsigned narenas;
 
@@ -2234,8 +2507,9 @@ label_return:
 }
 
 static int
-arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen, bool dirty) {
+arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen, bool dirty) {
 	int ret;
 
 	if (oldp != NULL && oldlenp != NULL) {
@@ -2248,7 +2522,7 @@ arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen,
 			ret = EINVAL;
 			goto label_return;
 		}
-		if (dirty ?  arena_dirty_decay_ms_default_set(*(ssize_t *)newp)
+		if (dirty ? arena_dirty_decay_ms_default_set(*(ssize_t *)newp)
 		    : arena_muzzy_decay_ms_default_set(*(ssize_t *)newp)) {
 			ret = EFAULT;
 			goto label_return;
@@ -2277,34 +2551,36 @@ arenas_muzzy_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
 CTL_RO_NL_GEN(arenas_quantum, QUANTUM, size_t)
 CTL_RO_NL_GEN(arenas_page, PAGE, size_t)
 CTL_RO_NL_GEN(arenas_tcache_max, tcache_maxclass, size_t)
-CTL_RO_NL_GEN(arenas_nbins, NBINS, unsigned)
+CTL_RO_NL_GEN(arenas_nbins, SC_NBINS, unsigned)
 CTL_RO_NL_GEN(arenas_nhbins, nhbins, unsigned)
-CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t)
-CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t)
-CTL_RO_NL_GEN(arenas_bin_i_slab_size, arena_bin_info[mib[2]].slab_size, size_t)
+CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t)
+CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t)
+CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t)
+CTL_RO_NL_GEN(arenas_bin_i_nshards, bin_infos[mib[2]].n_shards, uint32_t)
 static const ctl_named_node_t *
-arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) {
-	if (i > NBINS) {
+arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t i) {
+	if (i >= SC_NBINS) {	/* Bins are numbered [0, SC_NBINS). */
 		return NULL;
 	}
 	return super_arenas_bin_i_node;
 }
 
-CTL_RO_NL_GEN(arenas_nlextents, NSIZES - NBINS, unsigned)
-CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(NBINS+(szind_t)mib[2]),
+CTL_RO_NL_GEN(arenas_nlextents, SC_NSIZES - SC_NBINS, unsigned)
+CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(SC_NBINS+(szind_t)mib[2]),
     size_t)
 static const ctl_named_node_t *
-arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
-    size_t i) {
-	if (i > NSIZES - NBINS) {
+arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t i) {
+	if (i >= SC_NSIZES - SC_NBINS) {	/* Valid: [0, arenas.nlextents). */
 		return NULL;
 	}
 	return super_arenas_lextent_i_node;
 }
 
 static int
-arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	extent_hooks_t *extent_hooks;
 	unsigned arena_ind;
@@ -2325,11 +2601,43 @@ label_return:
 	return ret;
 }
 
+static int
+arenas_lookup_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
+	int ret;
+	unsigned arena_ind;
+	void *ptr;
+	extent_t *extent;
+	arena_t *arena;
+
+	ptr = NULL;
+	ret = EINVAL;
+	malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
+	WRITE(ptr, void *);
+	extent = iealloc(tsd_tsdn(tsd), ptr);
+	if (extent == NULL)
+		goto label_return;
+
+	arena = extent_arena_get(extent);
+	if (arena == NULL)
+		goto label_return;
+
+	arena_ind = arena_ind_get(arena);
+	READ(arena_ind, unsigned);
+
+	ret = 0;
+label_return:
+	malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx);
+	return ret;
+}
+
 /******************************************************************************/
 
 static int
-prof_thread_active_init_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+prof_thread_active_init_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp,
+    size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -2355,8 +2663,8 @@ label_return:
 }
 
 static int
-prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -2381,8 +2689,8 @@ label_return:
 }
 
 static int
-prof_dump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+prof_dump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	const char *filename = NULL;
 
@@ -2404,8 +2712,8 @@ label_return:
 }
 
 static int
-prof_gdump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+prof_gdump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	bool oldval;
 
@@ -2430,8 +2738,8 @@ label_return:
 }
 
 static int
-prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
-    size_t *oldlenp, void *newp, size_t newlen) {
+prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
 	int ret;
 	size_t lg_sample = lg_prof_sample;
 
@@ -2455,11 +2763,50 @@ label_return:
 CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t)
 CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t)
 
+static int
+prof_log_start_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen) {
+	int ret;
+
+	const char *filename = NULL;
+
+	if (!config_prof) {
+		return ENOENT;
+	}
+
+	WRITEONLY();
+	WRITE(filename, const char *);
+
+	if (prof_log_start(tsd_tsdn(tsd), filename)) {
+		ret = EFAULT;
+		goto label_return;
+	}
+
+	ret = 0;
+label_return:
+	return ret;
+}
+
+static int
+prof_log_stop_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen) {
+	if (!config_prof) {
+		return ENOENT;
+	}
+
+	if (prof_log_stop(tsd_tsdn(tsd))) {
+		return EFAULT;
+	}
+
+	return 0;
+}
+
 /******************************************************************************/
 
 CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats->allocated, size_t)
 CTL_RO_CGEN(config_stats, stats_active, ctl_stats->active, size_t)
 CTL_RO_CGEN(config_stats, stats_metadata, ctl_stats->metadata, size_t)
+CTL_RO_CGEN(config_stats, stats_metadata_thp, ctl_stats->metadata_thp, size_t)
 CTL_RO_CGEN(config_stats, stats_resident, ctl_stats->resident, size_t)
 CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats->mapped, size_t)
 CTL_RO_CGEN(config_stats, stats_retained, ctl_stats->retained, size_t)
@@ -2488,26 +2835,30 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_mapped,
 CTL_RO_CGEN(config_stats, stats_arenas_i_retained,
     atomic_load_zu(&arenas_i(mib[2])->astats->astats.retained, ATOMIC_RELAXED),
     size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extent_avail,
+    atomic_load_zu(&arenas_i(mib[2])->astats->astats.extent_avail,
+        ATOMIC_RELAXED),
+    size_t)
 
 CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_npurge,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_dirty.npurge),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.decay_dirty.npurge), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_nmadvise,
-    arena_stats_read_u64(
+    ctl_arena_stats_read_u64(
     &arenas_i(mib[2])->astats->astats.decay_dirty.nmadvise), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_purged,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_dirty.purged),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.decay_dirty.purged), uint64_t)
 
 CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_npurge,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_muzzy.npurge),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.decay_muzzy.npurge), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_nmadvise,
-    arena_stats_read_u64(
+    ctl_arena_stats_read_u64(
     &arenas_i(mib[2])->astats->astats.decay_muzzy.nmadvise), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_purged,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_muzzy.purged),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.decay_muzzy.purged), uint64_t)
 
 CTL_RO_CGEN(config_stats, stats_arenas_i_base,
     atomic_load_zu(&arenas_i(mib[2])->astats->astats.base, ATOMIC_RELAXED),
@@ -2515,12 +2866,18 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_base,
 CTL_RO_CGEN(config_stats, stats_arenas_i_internal,
     atomic_load_zu(&arenas_i(mib[2])->astats->astats.internal, ATOMIC_RELAXED),
     size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_thp,
+    atomic_load_zu(&arenas_i(mib[2])->astats->astats.metadata_thp,
+    ATOMIC_RELAXED), size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_tcache_bytes,
     atomic_load_zu(&arenas_i(mib[2])->astats->astats.tcache_bytes,
     ATOMIC_RELAXED), size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_resident,
     atomic_load_zu(&arenas_i(mib[2])->astats->astats.resident, ATOMIC_RELAXED),
     size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_abandoned_vm,
+    atomic_load_zu(&arenas_i(mib[2])->astats->astats.abandoned_vm,
+    ATOMIC_RELAXED), size_t)
 
 CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated,
     arenas_i(mib[2])->astats->allocated_small, size_t)
@@ -2530,18 +2887,32 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_small_ndalloc,
     arenas_i(mib[2])->astats->ndalloc_small, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_small_nrequests,
     arenas_i(mib[2])->astats->nrequests_small, uint64_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_small_nfills,
+    arenas_i(mib[2])->astats->nfills_small, uint64_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_small_nflushes,
+    arenas_i(mib[2])->astats->nflushes_small, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_large_allocated,
     atomic_load_zu(&arenas_i(mib[2])->astats->astats.allocated_large,
     ATOMIC_RELAXED), size_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_large_nmalloc,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmalloc_large),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.nmalloc_large), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_large_ndalloc,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.ndalloc_large),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.ndalloc_large), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_large_nrequests,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmalloc_large),
-    uint64_t) /* Intentional. */
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.nrequests_large), uint64_t)
+/*
+ * Note: "nmalloc_large" here instead of "nfills" in the read.  This is
+ * intentional (large has no batch fill).
+ */
+CTL_RO_CGEN(config_stats, stats_arenas_i_large_nfills,
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.nmalloc_large), uint64_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_large_nflushes,
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->astats.nflushes_large), uint64_t)
 
 /* Lock profiling related APIs below. */
 #define RO_MUTEX_CTL_GEN(n, l)						\
@@ -2580,8 +2951,9 @@ RO_MUTEX_CTL_GEN(arenas_i_bins_j_mutex,
 
 /* Resets all mutex stats, including global, arena and bin mutexes. */
 static int
-stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
-    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen) {
 	if (!config_stats) {
 		return ENOENT;
 	}
@@ -2621,9 +2993,11 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
 		MUTEX_PROF_RESET(arena->tcache_ql_mtx);
 		MUTEX_PROF_RESET(arena->base->mtx);
 
-		for (szind_t i = 0; i < NBINS; i++) {
-			arena_bin_t *bin = &arena->bins[i];
-			MUTEX_PROF_RESET(bin->lock);
+		for (szind_t i = 0; i < SC_NBINS; i++) {
+			for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
+				bin_t *bin = &arena->bins[i].bin_shards[j];
+				MUTEX_PROF_RESET(bin->lock);
+			}
 		}
 	}
 #undef MUTEX_PROF_RESET
@@ -2648,45 +3022,90 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nreslabs,
     arenas_i(mib[2])->astats->bstats[mib[4]].reslabs, uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs,
     arenas_i(mib[2])->astats->bstats[mib[4]].curslabs, size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nonfull_slabs,
+    arenas_i(mib[2])->astats->bstats[mib[4]].nonfull_slabs, size_t)
 
 static const ctl_named_node_t *
-stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
-    size_t j) {
-	if (j > NBINS) {
+stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t j) {
+	if (j >= SC_NBINS) { /* Valid bin indices are [0, SC_NBINS). */
 		return NULL;
 	}
 	return super_stats_arenas_i_bins_j_node;
 }
 
 CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nmalloc,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_ndalloc,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nrequests,
-    arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].nrequests),
-    uint64_t)
+    ctl_arena_stats_read_u64(
+    &arenas_i(mib[2])->astats->lstats[mib[4]].nrequests), uint64_t)
 CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents,
     arenas_i(mib[2])->astats->lstats[mib[4]].curlextents, size_t)
 
 static const ctl_named_node_t *
-stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
-    size_t j) {
-	if (j > NSIZES - NBINS) {
+stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t j) {
+	if (j >= SC_NSIZES - SC_NBINS) { /* Index must be < nlextents. */
 		return NULL;
 	}
 	return super_stats_arenas_i_lextents_j_node;
 }
 
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_ndirty,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].ndirty,
+	ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nmuzzy,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].nmuzzy,
+	ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nretained,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].nretained,
+	ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_dirty_bytes,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].dirty_bytes,
+	ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_muzzy_bytes,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].muzzy_bytes,
+	ATOMIC_RELAXED), size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_retained_bytes,
+    atomic_load_zu(
+        &arenas_i(mib[2])->astats->estats[mib[4]].retained_bytes,
+	ATOMIC_RELAXED), size_t)
+
+static const ctl_named_node_t *
+stats_arenas_i_extents_j_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t j) {
+	if (j >= SC_NPSIZES) {
+		return NULL;
+	}
+	return super_stats_arenas_i_extents_j_node;
+}
+
+static bool
+ctl_arenas_i_verify(size_t i) {
+	size_t a = arenas_i2a_impl(i, true, true);
+	if (a == UINT_MAX || !ctl_arenas->arenas[a]->initialized) {
+		return true;
+	}
+
+	return false;
+}
+
 static const ctl_named_node_t *
-stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) {
+stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t i) {
 	const ctl_named_node_t *ret;
-	size_t a;
 
 	malloc_mutex_lock(tsdn, &ctl_mtx);
-	a = arenas_i2a_impl(i, true, true);
-	if (a == UINT_MAX || !ctl_arenas->arenas[a]->initialized) {
+	if (ctl_arenas_i_verify(i)) {
 		ret = NULL;
 		goto label_return;
 	}
@@ -2696,3 +3115,321 @@ label_return:
 	malloc_mutex_unlock(tsdn, &ctl_mtx);
 	return ret;
 }
+
+static int
+experimental_hooks_install_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+	int ret;
+	if (oldp == NULL || oldlenp == NULL|| newp == NULL) {
+		ret = EINVAL;
+		goto label_return;
+	}
+	/*
+	 * Note: this is a *private* struct.  This is an experimental interface;
+	 * forcing the user to know the jemalloc internals well enough to
+	 * extract the ABI hopefully ensures nobody gets too comfortable with
+	 * this API, which can change at a moment's notice.
+	 */
+	hooks_t hooks;
+	WRITE(hooks, hooks_t);
+	void *handle = hook_install(tsd_tsdn(tsd), &hooks);
+	if (handle == NULL) {
+		ret = EAGAIN;
+		goto label_return;
+	}
+	READ(handle, void *);
+
+	ret = 0;
+label_return:
+	return ret;
+}
+
+static int
+experimental_hooks_remove_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
+    void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+	int ret;
+	WRITEONLY();
+	void *handle = NULL;
+	WRITE(handle, void *);
+	if (handle == NULL) {
+		ret = EINVAL;
+		goto label_return;
+	}
+	hook_remove(tsd_tsdn(tsd), handle);
+	ret = 0;
+label_return:
+	return ret;
+}
+
+/*
+ * Output six memory utilization entries for an input pointer, the first one of
+ * type (void *) and the remaining five of type size_t, describing the following
+ * (in the same order):
+ *
+ * (a) memory address of the extent a potential reallocation would go into,
+ * == the five fields below describe the extent the pointer resides in ==
+ * (b) number of free regions in the extent,
+ * (c) number of regions in the extent,
+ * (d) size of the extent in terms of bytes,
+ * (e) total number of free regions in the bin the extent belongs to, and
+ * (f) total number of regions in the bin the extent belongs to.
+ *
+ * Note that "(e)" and "(f)" are only available when stats are enabled;
+ * otherwise their values are undefined.
+ *
+ * This API is mainly intended for small class allocations, where extents are
+ * used as slab.
+ *
+ * In case of large class allocations, "(a)" will be NULL, and "(e)" and "(f)"
+ * will be zero (if stats are enabled; otherwise undefined).  The other three
+ * fields will be properly set though the values are trivial: "(b)" will be 0,
+ * "(c)" will be 1, and "(d)" will be the usable size.
+ *
+ * The input pointer and size are respectively passed in by newp and newlen,
+ * and the output fields and size are respectively oldp and *oldlenp.
+ *
+ * It can be beneficial to define the following macros to make it easier to
+ * access the output:
+ *
+ * #define SLABCUR_READ(out) (*(void **)out)
+ * #define COUNTS(out) ((size_t *)((void **)out + 1))
+ * #define NFREE_READ(out) COUNTS(out)[0]
+ * #define NREGS_READ(out) COUNTS(out)[1]
+ * #define SIZE_READ(out) COUNTS(out)[2]
+ * #define BIN_NFREE_READ(out) COUNTS(out)[3]
+ * #define BIN_NREGS_READ(out) COUNTS(out)[4]
+ *
+ * and then write e.g. NFREE_READ(oldp) to fetch the output.  See the unit test
+ * test_query in test/unit/extent_util.c for an example.
+ *
+ * For a typical defragmentation workflow making use of this API for
+ * understanding the fragmentation level, please refer to the comment for
+ * experimental_utilization_batch_query_ctl.
+ *
+ * It's up to the application how to determine the significance of
+ * fragmentation relying on the outputs returned.  Possible choices are:
+ *
+ * (a) if extent utilization ratio is below certain threshold,
+ * (b) if extent memory consumption is above certain threshold,
+ * (c) if extent utilization ratio is significantly below bin utilization ratio,
+ * (d) if input pointer deviates a lot from potential reallocation address, or
+ * (e) some selection/combination of the above.
+ *
+ * The caller needs to make sure that the input/output arguments are valid,
+ * in particular, that the size of the output is correct, i.e.:
+ *
+ *     *oldlenp = sizeof(void *) + sizeof(size_t) * 5
+ *
+ * Otherwise, the function immediately returns EINVAL without touching anything.
+ *
+ * In the rare case where there's no associated extent found for the input
+ * pointer, the function zeros out all output fields and returns.  Please refer
+ * to the comment for experimental_utilization_batch_query_ctl to understand the
+ * motivation from C++.
+ */
+static int
+experimental_utilization_query_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+	int ret;
+
+	assert(sizeof(extent_util_stats_verbose_t)
+	    == sizeof(void *) + sizeof(size_t) * 5);
+
+	if (oldp == NULL || oldlenp == NULL
+	    || *oldlenp != sizeof(extent_util_stats_verbose_t)
+	    || newp == NULL) {
+		ret = EINVAL;
+		goto label_return;
+	}
+
+	void *ptr = NULL;
+	WRITE(ptr, void *);
+	extent_util_stats_verbose_t *util_stats
+	    = (extent_util_stats_verbose_t *)oldp;
+	extent_util_stats_verbose_get(tsd_tsdn(tsd), ptr,
+	    &util_stats->nfree, &util_stats->nregs, &util_stats->size,
+	    &util_stats->bin_nfree, &util_stats->bin_nregs,
+	    &util_stats->slabcur_addr);
+	ret = 0;
+
+label_return:
+	return ret;
+}
+
+/*
+ * Given an input array of pointers, output three memory utilization entries of
+ * type size_t for each input pointer about the extent it resides in:
+ *
+ * (a) number of free regions in the extent,
+ * (b) number of regions in the extent, and
+ * (c) size of the extent in terms of bytes.
+ *
+ * This API is mainly intended for small class allocations, where extents are
+ * used as slab.  In case of large class allocations, the outputs are trivial:
+ * "(a)" will be 0, "(b)" will be 1, and "(c)" will be the usable size.
+ *
+ * Note that multiple input pointers may reside on the same extent so the output
+ * fields may contain duplicates.
+ *
+ * The format of the input/output looks like:
+ *
+ * input[0]:  1st_pointer_to_query	|  output[0]: 1st_extent_n_free_regions
+ *					|  output[1]: 1st_extent_n_regions
+ *					|  output[2]: 1st_extent_size
+ * input[1]:  2nd_pointer_to_query	|  output[3]: 2nd_extent_n_free_regions
+ *					|  output[4]: 2nd_extent_n_regions
+ *					|  output[5]: 2nd_extent_size
+ * ...					|  ...
+ *
+ * The input array and size are respectively passed in by newp and newlen, and
+ * the output array and size are respectively oldp and *oldlenp.
+ *
+ * It can be beneficial to define the following macros to make it easier to
+ * access the output:
+ *
+ * #define NFREE_READ(out, i) out[(i) * 3]
+ * #define NREGS_READ(out, i) out[(i) * 3 + 1]
+ * #define SIZE_READ(out, i) out[(i) * 3 + 2]
+ *
+ * and then write e.g. NFREE_READ(oldp, i) to fetch the output.  See the unit
+ * test test_batch in test/unit/extent_util.c for a concrete example.
+ *
+ * A typical workflow would be composed of the following steps:
+ *
+ * (1) flush tcache: mallctl("thread.tcache.flush", ...)
+ * (2) initialize input array of pointers to query fragmentation
+ * (3) allocate output array to hold utilization statistics
+ * (4) query utilization: mallctl("experimental.utilization.batch_query", ...)
+ * (5) (optional) decide if it's worthwhile to defragment; otherwise stop here
+ * (6) disable tcache: mallctl("thread.tcache.enabled", ...)
+ * (7) defragment allocations with significant fragmentation, e.g.:
+ *         for each allocation {
+ *             if it's fragmented {
+ *                 malloc(...);
+ *                 memcpy(...);
+ *                 free(...);
+ *             }
+ *         }
+ * (8) enable tcache: mallctl("thread.tcache.enabled", ...)
+ *
+ * The application can determine the significance of fragmentation themselves
+ * relying on the statistics returned, both at the overall level i.e. step "(5)"
+ * and at individual allocation level i.e. within step "(7)".  Possible choices
+ * are:
+ *
+ * (a) whether memory utilization ratio is below certain threshold,
+ * (b) whether memory consumption is above certain threshold, or
+ * (c) some combination of the two.
+ *
+ * The caller needs to make sure that the input/output arrays are valid and
+ * their sizes are proper as well as matched, meaning:
+ *
+ * (a) newlen = n_pointers * sizeof(const void *)
+ * (b) *oldlenp = n_pointers * sizeof(size_t) * 3
+ * (c) n_pointers > 0
+ *
+ * Otherwise, the function immediately returns EINVAL without touching anything.
+ *
+ * In the rare case where there's no associated extent found for some pointers,
+ * rather than immediately terminating the computation and raising an error,
+ * the function simply zeros out the corresponding output fields and continues
+ * the computation until all input pointers are handled.  The motivations of
+ * such a design are as follows:
+ *
+ * (a) The function always either processes nothing or processes everything, and
+ * never leaves the output half touched and half untouched.
+ *
+ * (b) It facilitates usage needs especially common in C++.  A vast variety of
+ * C++ objects are instantiated with multiple dynamic memory allocations.  For
+ * example, std::string and std::vector typically use at least two allocations,
+ * one for the metadata and one for the actual content.  Other types may use
+ * even more allocations.  When inquiring about utilization statistics, the
+ * caller often wants to examine all such allocations, especially internal
+ * one(s), rather than just the topmost one.  The issue comes when some
+ * implementations do certain optimizations to reduce/aggregate some internal
+ * allocations, e.g. putting short strings directly into the metadata, and such
+ * decisions are not known to the caller.  Therefore, we permit pointers to
+ * memory usages that may not be returned by previous malloc calls, and we
+ * provide the caller a convenient way to identify such cases.
+ */
+static int
+experimental_utilization_batch_query_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+	int ret;
+
+	assert(sizeof(extent_util_stats_t) == sizeof(size_t) * 3);
+
+	const size_t len = newlen / sizeof(const void *);
+	if (oldp == NULL || oldlenp == NULL || newp == NULL || newlen == 0
+	    || newlen != len * sizeof(const void *)
+	    || *oldlenp != len * sizeof(extent_util_stats_t)) {
+		ret = EINVAL;
+		goto label_return;
+	}
+
+	void **ptrs = (void **)newp;
+	extent_util_stats_t *util_stats = (extent_util_stats_t *)oldp;
+	size_t i;
+	for (i = 0; i < len; ++i) {
+		extent_util_stats_get(tsd_tsdn(tsd), ptrs[i],
+		    &util_stats[i].nfree, &util_stats[i].nregs,
+		    &util_stats[i].size);
+	}
+	ret = 0;
+
+label_return:
+	return ret;
+}
+
+static const ctl_named_node_t *
+experimental_arenas_i_index(tsdn_t *tsdn, const size_t *mib,
+    size_t miblen, size_t i) {
+	const ctl_named_node_t *ret;
+
+	malloc_mutex_lock(tsdn, &ctl_mtx);
+	if (ctl_arenas_i_verify(i)) {
+		ret = NULL;
+		goto label_return;
+	}
+	ret = super_experimental_arenas_i_node;
+label_return:
+	malloc_mutex_unlock(tsdn, &ctl_mtx);
+	return ret;
+}
+
+static int
+experimental_arenas_i_pactivep_ctl(tsd_t *tsd, const size_t *mib,
+    size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
+	if (!config_stats) {
+		return ENOENT;
+	}
+	if (oldp == NULL || oldlenp == NULL || *oldlenp != sizeof(size_t *)) {
+		return EINVAL;
+	}
+
+	unsigned arena_ind;
+	arena_t *arena;
+	int ret;
+	size_t *pactivep;
+
+	malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
+	READONLY();
+	MIB_UNSIGNED(arena_ind, 2);
+	if (arena_ind < narenas_total_get() && (arena =
+	    arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) {
+#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS) ||				\
+    defined(JEMALLOC_GCC_SYNC_ATOMICS) || defined(_MSC_VER)
+		/* Expose the underlying counter for fast read. */
+		pactivep = (size_t *)&(arena->nactive.repr);
+		READ(pactivep, size_t *);
+		ret = 0;
+#else
+		ret = EFAULT;
+#endif
+	} else {
+		ret = EFAULT;
+	}
+label_return:
+	malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx);
+	return ret;
+}
diff --git a/deps/jemalloc/src/div.c b/deps/jemalloc/src/div.c
new file mode 100644
index 0000000000..808892a133
--- /dev/null
+++ b/deps/jemalloc/src/div.c
@@ -0,0 +1,55 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+#include "jemalloc/internal/div.h"
+
+#include "jemalloc/internal/assert.h"
+
+/*
+ * Suppose we have n = q * d, all integers. We know n and d, and want q = n / d.
+ *
+ * For any k, we have (here, all division is exact; not C-style rounding):
+ * floor(ceil(2^k / d) * n / 2^k) = floor((2^k + r) / d * n / 2^k), where
+ * r = (-2^k) mod d.
+ *
+ * Expanding this out:
+ * ... = floor(2^k / d * n / 2^k + r / d * n / 2^k)
+ *     = floor(n / d + (r / d) * (n / 2^k)).
+ *
+ * The fractional part of n / d is 0 (because of the assumption that d divides n
+ * exactly), so we have:
+ * ... = n / d + floor((r / d) * (n / 2^k))
+ *
+ * So that our initial expression is equal to the quantity we seek, so long as
+ * (r / d) * (n / 2^k) < 1.
+ *
+ * r is a remainder mod d, so r < d and r / d < 1 always. We can make
+ * n / 2^k < 1 (for n < 2^32) by setting k = 32. This gives a working magic.
+ */
+
+void
+div_init(div_info_t *div_info, size_t d) {
+	/* Nonsensical. */
+	assert(d != 0);
+	/*
+	 * This would make the value of magic too high to fit into a uint32_t
+	 * (we would want magic = 2^32 exactly). This would mess with code gen
+	 * on 32-bit machines.
+	 */
+	assert(d != 1);
+
+	uint64_t two_to_k = ((uint64_t)1 << 32);
+	uint32_t magic = (uint32_t)(two_to_k / d);
+
+	/*
+	 * We want magic = ceil(2^k / d), but C gives us floor. We have to
+	 * increment it unless the result was exact (i.e. unless d is a power of
+	 * two).
+	 */
+	if (two_to_k % d != 0) {
+		magic++;
+	}
+	div_info->magic = magic;
+#ifdef JEMALLOC_DEBUG
+	div_info->d = d;
+#endif
+}
diff --git a/deps/jemalloc/src/extent.c b/deps/jemalloc/src/extent.c
index fa45c84d34..9237f903dc 100644
--- a/deps/jemalloc/src/extent.c
+++ b/deps/jemalloc/src/extent.c
@@ -17,8 +17,10 @@ rtree_t		extents_rtree;
 /* Keyed by the address of the extent_t being protected. */
 mutex_pool_t	extent_mutex_pool;
 
+size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT;
+
 static const bitmap_info_t extents_bitmap_info =
-    BITMAP_INFO_INITIALIZER(NPSIZES+1);
+    BITMAP_INFO_INITIALIZER(SC_NPSIZES+1);
 
 static void *extent_alloc_default(extent_hooks_t *extent_hooks, void *new_addr,
     size_t size, size_t alignment, bool *zero, bool *commit,
@@ -48,20 +50,16 @@ static bool extent_purge_forced_default(extent_hooks_t *extent_hooks,
 static bool extent_purge_forced_impl(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extent_t *extent, size_t offset,
     size_t length, bool growing_retained);
-#ifdef JEMALLOC_MAPS_COALESCE
 static bool extent_split_default(extent_hooks_t *extent_hooks, void *addr,
     size_t size, size_t size_a, size_t size_b, bool committed,
     unsigned arena_ind);
-#endif
 static extent_t *extent_split_impl(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extent_t *extent, size_t size_a,
     szind_t szind_a, bool slab_a, size_t size_b, szind_t szind_b, bool slab_b,
     bool growing_retained);
-#ifdef JEMALLOC_MAPS_COALESCE
 static bool extent_merge_default(extent_hooks_t *extent_hooks, void *addr_a,
     size_t size_a, void *addr_b, size_t size_b, bool committed,
     unsigned arena_ind);
-#endif
 static bool extent_merge_impl(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extent_t *a, extent_t *b,
     bool growing_retained);
@@ -86,11 +84,9 @@ const extent_hooks_t	extent_hooks_default = {
 	,
 	NULL
 #endif
-#ifdef JEMALLOC_MAPS_COALESCE
 	,
 	extent_split_default,
 	extent_merge_default
-#endif
 };
 
 /* Used exclusively for gdump triggering. */
@@ -117,9 +113,13 @@ static void extent_record(tsdn_t *tsdn, arena_t *arena,
 
 /******************************************************************************/
 
-rb_gen(UNUSED, extent_avail_, extent_tree_t, extent_t, rb_link,
+#define ATTR_NONE /* does nothing */
+
+ph_gen(ATTR_NONE, extent_avail_, extent_tree_t, extent_t, ph_link,
     extent_esnead_comp)
 
+#undef ATTR_NONE
+
 typedef enum {
 	lock_result_success,
 	lock_result_failure,
@@ -128,13 +128,16 @@ typedef enum {
 
 static lock_result_t
 extent_rtree_leaf_elm_try_lock(tsdn_t *tsdn, rtree_leaf_elm_t *elm,
-    extent_t **result) {
+    extent_t **result, bool inactive_only) {
 	extent_t *extent1 = rtree_leaf_elm_extent_read(tsdn, &extents_rtree,
 	    elm, true);
 
-	if (extent1 == NULL) {
+	/* Slab implies active extents and should be skipped. */
+	if (extent1 == NULL || (inactive_only && rtree_leaf_elm_slab_read(tsdn,
+	    &extents_rtree, elm, true))) {
 		return lock_result_no_extent;
 	}
+
 	/*
 	 * It's possible that the extent changed out from under us, and with it
 	 * the leaf->extent mapping.  We have to recheck while holding the lock.
@@ -157,7 +160,8 @@ extent_rtree_leaf_elm_try_lock(tsdn_t *tsdn, rtree_leaf_elm_t *elm,
  * address, and NULL otherwise.
  */
 static extent_t *
-extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr) {
+extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr,
+    bool inactive_only) {
 	extent_t *ret = NULL;
 	rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &extents_rtree,
 	    rtree_ctx, (uintptr_t)addr, false, false);
@@ -166,7 +170,8 @@ extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr) {
 	}
 	lock_result_t lock_result;
 	do {
-		lock_result = extent_rtree_leaf_elm_try_lock(tsdn, elm, &ret);
+		lock_result = extent_rtree_leaf_elm_try_lock(tsdn, elm, &ret,
+		    inactive_only);
 	} while (lock_result == lock_result_failure);
 	return ret;
 }
@@ -180,6 +185,7 @@ extent_alloc(tsdn_t *tsdn, arena_t *arena) {
 		return base_alloc_extent(tsdn, arena->base);
 	}
 	extent_avail_remove(&arena->extent_avail, extent);
+	atomic_fetch_sub_zu(&arena->extent_avail_cnt, 1, ATOMIC_RELAXED);
 	malloc_mutex_unlock(tsdn, &arena->extent_avail_mtx);
 	return extent;
 }
@@ -188,6 +194,7 @@ void
 extent_dalloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent) {
 	malloc_mutex_lock(tsdn, &arena->extent_avail_mtx);
 	extent_avail_insert(&arena->extent_avail, extent);
+	atomic_fetch_add_zu(&arena->extent_avail_cnt, 1, ATOMIC_RELAXED);
 	malloc_mutex_unlock(tsdn, &arena->extent_avail_mtx);
 }
 
@@ -253,7 +260,7 @@ extent_size_quantize_ceil(size_t size) {
 	size_t ret;
 
 	assert(size > 0);
-	assert(size - sz_large_pad <= LARGE_MAXCLASS);
+	assert(size - sz_large_pad <= SC_LARGE_MAXCLASS);
 	assert((size & PAGE_MASK) == 0);
 
 	ret = extent_size_quantize_floor(size);
@@ -282,7 +289,7 @@ extents_init(tsdn_t *tsdn, extents_t *extents, extent_state_t state,
 	    malloc_mutex_rank_exclusive)) {
 		return true;
 	}
-	for (unsigned i = 0; i < NPSIZES+1; i++) {
+	for (unsigned i = 0; i < SC_NPSIZES + 1; i++) {
 		extent_heap_new(&extents->heaps[i]);
 	}
 	bitmap_init(extents->bitmap, &extents_bitmap_info, true);
@@ -303,9 +310,34 @@ extents_npages_get(extents_t *extents) {
 	return atomic_load_zu(&extents->npages, ATOMIC_RELAXED);
 }
 
+size_t
+extents_nextents_get(extents_t *extents, pszind_t pind) {
+	return atomic_load_zu(&extents->nextents[pind], ATOMIC_RELAXED);
+}
+
+size_t
+extents_nbytes_get(extents_t *extents, pszind_t pind) {
+	return atomic_load_zu(&extents->nbytes[pind], ATOMIC_RELAXED);
+}
+
+static void
+extents_stats_add(extents_t *extent, pszind_t pind, size_t sz) {
+	size_t cur = atomic_load_zu(&extent->nextents[pind], ATOMIC_RELAXED);
+	atomic_store_zu(&extent->nextents[pind], cur + 1, ATOMIC_RELAXED);
+	cur = atomic_load_zu(&extent->nbytes[pind], ATOMIC_RELAXED);
+	atomic_store_zu(&extent->nbytes[pind], cur + sz, ATOMIC_RELAXED);
+}
+
+static void
+extents_stats_sub(extents_t *extent, pszind_t pind, size_t sz) {
+	size_t cur = atomic_load_zu(&extent->nextents[pind], ATOMIC_RELAXED);
+	atomic_store_zu(&extent->nextents[pind], cur - 1, ATOMIC_RELAXED);
+	cur = atomic_load_zu(&extent->nbytes[pind], ATOMIC_RELAXED);
+	atomic_store_zu(&extent->nbytes[pind], cur - sz, ATOMIC_RELAXED);
+}
+
 static void
-extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
-    bool preserve_lru) {
+extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) {
 	malloc_mutex_assert_owner(tsdn, &extents->mtx);
 	assert(extent_state_get(extent) == extents->state);
 
@@ -317,9 +349,12 @@ extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
 		    (size_t)pind);
 	}
 	extent_heap_insert(&extents->heaps[pind], extent);
-	if (!preserve_lru) {
-		extent_list_append(&extents->lru, extent);
+
+	if (config_stats) {
+		extents_stats_add(extents, pind, size);
 	}
+
+	extent_list_append(&extents->lru, extent);
 	size_t npages = size >> LG_PAGE;
 	/*
 	 * All modifications to npages hold the mutex (as asserted above), so we
@@ -333,8 +368,7 @@ extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
 }
 
 static void
-extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
-    bool preserve_lru) {
+extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) {
 	malloc_mutex_assert_owner(tsdn, &extents->mtx);
 	assert(extent_state_get(extent) == extents->state);
 
@@ -342,13 +376,16 @@ extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
 	size_t psz = extent_size_quantize_floor(size);
 	pszind_t pind = sz_psz2ind(psz);
 	extent_heap_remove(&extents->heaps[pind], extent);
+
+	if (config_stats) {
+		extents_stats_sub(extents, pind, size);
+	}
+
 	if (extent_heap_empty(&extents->heaps[pind])) {
 		bitmap_set(extents->bitmap, &extents_bitmap_info,
 		    (size_t)pind);
 	}
-	if (!preserve_lru) {
-		extent_list_remove(&extents->lru, extent);
-	}
+	extent_list_remove(&extents->lru, extent);
 	size_t npages = size >> LG_PAGE;
 	/*
 	 * As in extents_insert_locked, we hold extents->mtx and so don't need
@@ -361,18 +398,38 @@ extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
 	    cur_extents_npages - (size >> LG_PAGE), ATOMIC_RELAXED);
 }
 
-/* Do any-best-fit extent selection, i.e. select any extent that best fits. */
+/*
+ * Find an extent with size [min_size, max_size) to satisfy the alignment
+ * requirement.  For each size, try only the first extent in the heap.
+ */
 static extent_t *
-extents_best_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
-    size_t size) {
-	pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(size));
-	pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
-	    (size_t)pind);
-	if (i < NPSIZES+1) {
+extents_fit_alignment(extents_t *extents, size_t min_size, size_t max_size,
+    size_t alignment) {
+        pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(min_size));
+        pszind_t pind_max = sz_psz2ind(extent_size_quantize_ceil(max_size));
+
+	for (pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap,
+	    &extents_bitmap_info, (size_t)pind); i < pind_max; i =
+	    (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
+	    (size_t)i+1)) {
+		assert(i < SC_NPSIZES);
 		assert(!extent_heap_empty(&extents->heaps[i]));
-		extent_t *extent = extent_heap_any(&extents->heaps[i]);
-		assert(extent_size_get(extent) >= size);
-		return extent;
+		extent_t *extent = extent_heap_first(&extents->heaps[i]);
+		uintptr_t base = (uintptr_t)extent_base_get(extent);
+		size_t candidate_size = extent_size_get(extent);
+		assert(candidate_size >= min_size);
+
+		uintptr_t next_align = ALIGNMENT_CEILING((uintptr_t)base,
+		    PAGE_CEILING(alignment));
+		if (base > next_align || base + candidate_size <= next_align) {
+			/* Overflow or not crossing the next alignment. */
+			continue;
+		}
+
+		size_t leadsize = next_align - base;
+		if (candidate_size - leadsize >= min_size) {
+			return extent;
+		}
 	}
 
 	return NULL;
@@ -388,39 +445,75 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
 	extent_t *ret = NULL;
 
 	pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(size));
+
+	if (!maps_coalesce && !opt_retain) {
+		/*
+		 * No split / merge allowed (Windows w/o retain). Try exact fit
+		 * only.
+		 */
+		return extent_heap_empty(&extents->heaps[pind]) ? NULL :
+		    extent_heap_first(&extents->heaps[pind]);
+	}
+
 	for (pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap,
-	    &extents_bitmap_info, (size_t)pind); i < NPSIZES+1; i =
-	    (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
+	    &extents_bitmap_info, (size_t)pind);
+	    i < SC_NPSIZES + 1;
+	    i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
 	    (size_t)i+1)) {
 		assert(!extent_heap_empty(&extents->heaps[i]));
 		extent_t *extent = extent_heap_first(&extents->heaps[i]);
 		assert(extent_size_get(extent) >= size);
+		/*
+		 * In order to reduce fragmentation, avoid reusing and splitting
+		 * large extents for much smaller sizes.
+		 *
+		 * Only do check for dirty extents (delay_coalesce).
+		 */
+		if (extents->delay_coalesce &&
+		    (sz_pind2sz(i) >> opt_lg_extent_max_active_fit) > size) {
+			break;
+		}
 		if (ret == NULL || extent_snad_comp(extent, ret) < 0) {
 			ret = extent;
 		}
-		if (i == NPSIZES) {
+		if (i == SC_NPSIZES) {
 			break;
 		}
-		assert(i < NPSIZES);
+		assert(i < SC_NPSIZES);
 	}
 
 	return ret;
 }
 
 /*
- * Do {best,first}-fit extent selection, where the selection policy choice is
- * based on extents->delay_coalesce.  Best-fit selection requires less
- * searching, but its layout policy is less stable and may cause higher virtual
- * memory fragmentation as a side effect.
+ * Do first-fit selection on the worst-case padded size; if that fails and
+ * alignment > PAGE, retry with extents_fit_alignment() on [esize, max_size).
  */
 static extent_t *
 extents_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
-    size_t size) {
+    size_t esize, size_t alignment) {
 	malloc_mutex_assert_owner(tsdn, &extents->mtx);
 
-	return extents->delay_coalesce ? extents_best_fit_locked(tsdn, arena,
-	    extents, size) : extents_first_fit_locked(tsdn, arena, extents,
-	    size);
+	size_t max_size = esize + PAGE_CEILING(alignment) - PAGE;
+	/* Beware size_t wrap-around. */
+	if (max_size < esize) {
+		return NULL;
+	}
+
+	extent_t *extent =
+	    extents_first_fit_locked(tsdn, arena, extents, max_size);
+
+	if (alignment > PAGE && extent == NULL) {
+		/*
+		 * max_size guarantees the alignment requirement but is rather
+		 * pessimistic.  Next we try to satisfy the aligned allocation
+		 * with sizes in [esize, max_size).
+		 */
+		extent = extents_fit_alignment(extents, esize, max_size,
+		    alignment);
+	}
+
+	return extent;
 }
 
 static bool
@@ -436,7 +529,7 @@ extent_try_delayed_coalesce(tsdn_t *tsdn, arena_t *arena,
 	if (!coalesced) {
 		return true;
 	}
-	extents_insert_locked(tsdn, extents, extent, true);
+	extents_insert_locked(tsdn, extents, extent);
 	return false;
 }
 
@@ -449,8 +542,10 @@ extents_alloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
 
-	return extent_recycle(tsdn, arena, r_extent_hooks, extents, new_addr,
-	    size, pad, alignment, slab, szind, zero, commit, false);
+	extent_t *extent = extent_recycle(tsdn, arena, r_extent_hooks, extents,
+	    new_addr, size, pad, alignment, slab, szind, zero, commit, false);
+	assert(extent == NULL || extent_dumpable_get(extent));
+	return extent;
 }
 
 void
@@ -458,6 +553,7 @@ extents_dalloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
     extents_t *extents, extent_t *extent) {
 	assert(extent_base_get(extent) != NULL);
 	assert(extent_size_get(extent) != 0);
+	assert(extent_dumpable_get(extent));
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
 
@@ -487,14 +583,13 @@ extents_evict(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 			goto label_return;
 		}
 		/* Check the eviction limit. */
-		size_t npages = extent_size_get(extent) >> LG_PAGE;
 		size_t extents_npages = atomic_load_zu(&extents->npages,
 		    ATOMIC_RELAXED);
-		if (extents_npages - npages < npages_min) {
+		if (extents_npages <= npages_min) {
 			extent = NULL;
 			goto label_return;
 		}
-		extents_remove_locked(tsdn, extents, extent, false);
+		extents_remove_locked(tsdn, extents, extent);
 		if (!extents->delay_coalesce) {
 			break;
 		}
@@ -532,16 +627,24 @@ label_return:
 	return extent;
 }
 
+/*
+ * Leak an extent's VM.  Only reached when allocating a new extent struct
+ * fails (which indicates OOM), e.g. when trying to split an existing extent.
+ */
 static void
-extents_leak(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
+extents_abandon_vm(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
     extents_t *extents, extent_t *extent, bool growing_retained) {
+	size_t sz = extent_size_get(extent);
+	if (config_stats) {
+		arena_stats_accum_zu(&arena->stats.abandoned_vm, sz);
+	}
 	/*
 	 * Leak extent after making sure its pages have already been purged, so
 	 * that this is only a virtual memory leak.
 	 */
 	if (extents_state_get(extents) == extent_state_dirty) {
 		if (extent_purge_lazy_impl(tsdn, arena, r_extent_hooks,
-		    extent, 0, extent_size_get(extent), growing_retained)) {
+		    extent, 0, sz, growing_retained)) {
 			extent_purge_forced_impl(tsdn, arena, r_extent_hooks,
 			    extent, 0, extent_size_get(extent),
 			    growing_retained);
@@ -567,29 +670,29 @@ extents_postfork_child(tsdn_t *tsdn, extents_t *extents) {
 
 static void
 extent_deactivate_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
-    extent_t *extent, bool preserve_lru) {
+    extent_t *extent) {
 	assert(extent_arena_get(extent) == arena);
 	assert(extent_state_get(extent) == extent_state_active);
 
 	extent_state_set(extent, extents_state_get(extents));
-	extents_insert_locked(tsdn, extents, extent, preserve_lru);
+	extents_insert_locked(tsdn, extents, extent);
 }
 
 static void
 extent_deactivate(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
-    extent_t *extent, bool preserve_lru) {
+    extent_t *extent) {
 	malloc_mutex_lock(tsdn, &extents->mtx);
-	extent_deactivate_locked(tsdn, arena, extents, extent, preserve_lru);
+	extent_deactivate_locked(tsdn, arena, extents, extent);
 	malloc_mutex_unlock(tsdn, &extents->mtx);
 }
 
 static void
 extent_activate_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
-    extent_t *extent, bool preserve_lru) {
+    extent_t *extent) {
 	assert(extent_arena_get(extent) == arena);
 	assert(extent_state_get(extent) == extents_state_get(extents));
 
-	extents_remove_locked(tsdn, extents, extent, preserve_lru);
+	extents_remove_locked(tsdn, extents, extent);
 	extent_state_set(extent, extent_state_active);
 }
 
@@ -688,6 +791,7 @@ extent_register_impl(tsdn_t *tsdn, extent_t *extent, bool gdump_add) {
 
 	if (extent_rtree_leaf_elms_lookup(tsdn, rtree_ctx, extent, false, true,
 	    &elm_a, &elm_b)) {
+		extent_unlock(tsdn, extent);
 		return true;
 	}
 
@@ -723,6 +827,13 @@ extent_reregister(tsdn_t *tsdn, extent_t *extent) {
 	assert(!err);
 }
 
+/*
+ * Removes all pointers to the given extent from the global rtree indices for
+ * its interior.  This is relevant for slab extents, for which we need to do
+ * metadata lookups at places other than the head of the extent.  We deregister
+ * on the interior, then, when an extent moves from being an active slab to an
+ * inactive state.
+ */
 static void
 extent_interior_deregister(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx,
     extent_t *extent) {
@@ -737,8 +848,11 @@ extent_interior_deregister(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx,
 	}
 }
 
+/*
+ * Removes all pointers to the given extent from the global rtree.
+ */
 static void
-extent_deregister(tsdn_t *tsdn, extent_t *extent) {
+extent_deregister_impl(tsdn_t *tsdn, extent_t *extent, bool gdump) {
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 	rtree_leaf_elm_t *elm_a, *elm_b;
@@ -747,7 +861,7 @@ extent_deregister(tsdn_t *tsdn, extent_t *extent) {
 
 	extent_lock(tsdn, extent);
 
-	extent_rtree_write_acquired(tsdn, elm_a, elm_b, NULL, NSIZES, false);
+	extent_rtree_write_acquired(tsdn, elm_a, elm_b, NULL, SC_NSIZES, false);
 	if (extent_slab_get(extent)) {
 		extent_interior_deregister(tsdn, rtree_ctx, extent);
 		extent_slab_set(extent, false);
@@ -755,16 +869,30 @@ extent_deregister(tsdn_t *tsdn, extent_t *extent) {
 
 	extent_unlock(tsdn, extent);
 
-	if (config_prof) {
+	if (config_prof && gdump) {
 		extent_gdump_sub(tsdn, extent);
 	}
 }
 
+static void
+extent_deregister(tsdn_t *tsdn, extent_t *extent) {
+	extent_deregister_impl(tsdn, extent, true);
+}
+
+static void
+extent_deregister_no_gdump_sub(tsdn_t *tsdn, extent_t *extent) {
+	extent_deregister_impl(tsdn, extent, false);
+}
+
+/*
+ * Tries to find and remove an extent from extents that can be used for the
+ * given allocation request.
+ */
 static extent_t *
 extent_recycle_extract(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
     void *new_addr, size_t size, size_t pad, size_t alignment, bool slab,
-    bool *zero, bool *commit, bool growing_retained) {
+    bool growing_retained) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, growing_retained ? 1 : 0);
 	assert(alignment > 0);
@@ -786,16 +914,12 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena,
 	}
 
 	size_t esize = size + pad;
-	size_t alloc_size = esize + PAGE_CEILING(alignment) - PAGE;
-	/* Beware size_t wrap-around. */
-	if (alloc_size < esize) {
-		return NULL;
-	}
 	malloc_mutex_lock(tsdn, &extents->mtx);
 	extent_hooks_assure_initialized(arena, r_extent_hooks);
 	extent_t *extent;
 	if (new_addr != NULL) {
-		extent = extent_lock_from_addr(tsdn, rtree_ctx, new_addr);
+		extent = extent_lock_from_addr(tsdn, rtree_ctx, new_addr,
+		    false);
 		if (extent != NULL) {
 			/*
 			 * We might null-out extent to report an error, but we
@@ -812,86 +936,194 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena,
 			extent_unlock(tsdn, unlock_extent);
 		}
 	} else {
-		extent = extents_fit_locked(tsdn, arena, extents, alloc_size);
+		extent = extents_fit_locked(tsdn, arena, extents, esize,
+		    alignment);
 	}
 	if (extent == NULL) {
 		malloc_mutex_unlock(tsdn, &extents->mtx);
 		return NULL;
 	}
 
-	extent_activate_locked(tsdn, arena, extents, extent, false);
+	extent_activate_locked(tsdn, arena, extents, extent);
 	malloc_mutex_unlock(tsdn, &extents->mtx);
 
-	if (extent_zeroed_get(extent)) {
-		*zero = true;
-	}
-	if (extent_committed_get(extent)) {
-		*commit = true;
-	}
-
 	return extent;
 }
 
-static extent_t *
-extent_recycle_split(tsdn_t *tsdn, arena_t *arena,
-    extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
+/*
+ * Given an allocation request and an extent guaranteed to be able to satisfy
+ * it, this splits off lead and trail extents, leaving extent pointing to an
+ * extent satisfying the allocation.
+ * This function doesn't put lead or trail into any extents_t; it's the caller's
+ * job to ensure that they can be reused.
+ */
+typedef enum {
+	/*
+	 * Split successfully.  lead, extent, and trail are modified to extents
+	 * describing the ranges before, in, and after the given allocation.
+	 */
+	extent_split_interior_ok,
+	/*
+	 * The extent can't satisfy the given allocation request.  None of the
+	 * input extent_t *s are touched.
+	 */
+	extent_split_interior_cant_alloc,
+	/*
+	 * In a potentially invalid state.  Must leak (if *to_leak is non-NULL),
+	 * and salvage what's still salvageable (if *to_salvage is non-NULL).
+	 * None of lead, extent, or trail are valid.
+	 */
+	extent_split_interior_error
+} extent_split_interior_result_t;
+
+static extent_split_interior_result_t
+extent_split_interior(tsdn_t *tsdn, arena_t *arena,
+    extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx,
+    /* The result of splitting, in case of success. */
+    extent_t **extent, extent_t **lead, extent_t **trail,
+    /* The mess to clean up, in case of error. */
+    extent_t **to_leak, extent_t **to_salvage,
     void *new_addr, size_t size, size_t pad, size_t alignment, bool slab,
-    szind_t szind, extent_t *extent, bool growing_retained) {
+    szind_t szind, bool growing_retained) {
 	size_t esize = size + pad;
-	size_t leadsize = ALIGNMENT_CEILING((uintptr_t)extent_base_get(extent),
-	    PAGE_CEILING(alignment)) - (uintptr_t)extent_base_get(extent);
+	size_t leadsize = ALIGNMENT_CEILING((uintptr_t)extent_base_get(*extent),
+	    PAGE_CEILING(alignment)) - (uintptr_t)extent_base_get(*extent);
 	assert(new_addr == NULL || leadsize == 0);
-	assert(extent_size_get(extent) >= leadsize + esize);
-	size_t trailsize = extent_size_get(extent) - leadsize - esize;
+	if (extent_size_get(*extent) < leadsize + esize) {
+		return extent_split_interior_cant_alloc;
+	}
+	size_t trailsize = extent_size_get(*extent) - leadsize - esize;
+
+	*lead = NULL;
+	*trail = NULL;
+	*to_leak = NULL;
+	*to_salvage = NULL;
 
 	/* Split the lead. */
 	if (leadsize != 0) {
-		extent_t *lead = extent;
-		extent = extent_split_impl(tsdn, arena, r_extent_hooks,
-		    lead, leadsize, NSIZES, false, esize + trailsize, szind,
+		*lead = *extent;
+		*extent = extent_split_impl(tsdn, arena, r_extent_hooks,
+		    *lead, leadsize, SC_NSIZES, false, esize + trailsize, szind,
 		    slab, growing_retained);
-		if (extent == NULL) {
-			extent_deregister(tsdn, lead);
-			extents_leak(tsdn, arena, r_extent_hooks, extents,
-			    lead, growing_retained);
-			return NULL;
+		if (*extent == NULL) {
+			*to_leak = *lead;
+			*lead = NULL;
+			return extent_split_interior_error;
 		}
-		extent_deactivate(tsdn, arena, extents, lead, false);
 	}
 
 	/* Split the trail. */
 	if (trailsize != 0) {
-		extent_t *trail = extent_split_impl(tsdn, arena,
-		    r_extent_hooks, extent, esize, szind, slab, trailsize,
-		    NSIZES, false, growing_retained);
-		if (trail == NULL) {
-			extent_deregister(tsdn, extent);
-			extents_leak(tsdn, arena, r_extent_hooks, extents,
-			    extent, growing_retained);
-			return NULL;
+		*trail = extent_split_impl(tsdn, arena, r_extent_hooks, *extent,
+		    esize, szind, slab, trailsize, SC_NSIZES, false,
+		    growing_retained);
+		if (*trail == NULL) {
+			*to_leak = *extent;
+			*to_salvage = *lead;
+			*lead = NULL;
+			*extent = NULL;
+			return extent_split_interior_error;
 		}
-		extent_deactivate(tsdn, arena, extents, trail, false);
-	} else if (leadsize == 0) {
+	}
+
+	if (leadsize == 0 && trailsize == 0) {
 		/*
 		 * Splitting causes szind to be set as a side effect, but no
 		 * splitting occurred.
 		 */
-		extent_szind_set(extent, szind);
-		if (szind != NSIZES) {
+		extent_szind_set(*extent, szind);
+		if (szind != SC_NSIZES) {
 			rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx,
-			    (uintptr_t)extent_addr_get(extent), szind, slab);
-			if (slab && extent_size_get(extent) > PAGE) {
+			    (uintptr_t)extent_addr_get(*extent), szind, slab);
+			if (slab && extent_size_get(*extent) > PAGE) {
 				rtree_szind_slab_update(tsdn, &extents_rtree,
 				    rtree_ctx,
-				    (uintptr_t)extent_past_get(extent) -
+				    (uintptr_t)extent_past_get(*extent) -
 				    (uintptr_t)PAGE, szind, slab);
 			}
 		}
 	}
 
-	return extent;
+	return extent_split_interior_ok;
+}
+
+/*
+ * Carves the request out of the given extent (which the caller should have
+ * ensured is big enough), returning unused lead/trail space to extents.
+ * Returns the carved extent, or NULL on failure.  The locals start out NULL
+ * because extent_split_interior() leaves them unset on cant_alloc.
+ */
+static extent_t *
+extent_recycle_split(tsdn_t *tsdn, arena_t *arena,
+    extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
+    void *new_addr, size_t size, size_t pad, size_t alignment, bool slab,
+    szind_t szind, extent_t *extent, bool growing_retained) {
+	extent_t *lead = NULL;
+	extent_t *trail = NULL;
+	extent_t *to_leak = NULL;
+	extent_t *to_salvage = NULL;
+
+	extent_split_interior_result_t result = extent_split_interior(
+	    tsdn, arena, r_extent_hooks, rtree_ctx, &extent, &lead, &trail,
+	    &to_leak, &to_salvage, new_addr, size, pad, alignment, slab, szind,
+	    growing_retained);
+
+	if (!maps_coalesce && result != extent_split_interior_ok
+	    && !opt_retain) {
+		/*
+		 * Split isn't supported (implies Windows w/o retain).  Avoid
+		 * leaking the extents.
+		 */
+		assert(to_leak != NULL && lead == NULL && trail == NULL);
+		extent_deactivate(tsdn, arena, extents, to_leak);
+		return NULL;
+	}
+
+	if (result == extent_split_interior_ok) {
+		if (lead != NULL) {
+			extent_deactivate(tsdn, arena, extents, lead);
+		}
+		if (trail != NULL) {
+			extent_deactivate(tsdn, arena, extents, trail);
+		}
+		return extent;
+	} else {
+		/*
+		 * We should have picked an extent that was large enough to
+		 * fulfill our allocation request.
+		 */
+		assert(result == extent_split_interior_error);
+		if (to_salvage != NULL) {
+			extent_deregister(tsdn, to_salvage);
+		}
+		if (to_leak != NULL) {
+			void *leak = extent_base_get(to_leak);
+			extent_deregister_no_gdump_sub(tsdn, to_leak);
+			extents_abandon_vm(tsdn, arena, r_extent_hooks, extents,
+			    to_leak, growing_retained);
+			assert(extent_lock_from_addr(tsdn, rtree_ctx, leak,
+			    false) == NULL);
+		}
+		return NULL;
+	}
+	unreachable();
+}
+
+static bool
+extent_need_manual_zero(arena_t *arena) {
+	/*
+	 * Need to manually zero the extent on repopulating if either: 1)
+	 * non-default extent hooks are installed (in which case the purge
+	 * semantics may change); or 2) opt_thp is thp_mode_always.
+	 */
+	return (!arena_has_default_hooks(arena) ||
+		(opt_thp == thp_mode_always));
+}
 
+/*
+ * Tries to satisfy the given allocation request by reusing one of the extents
+ * in the given extents_t.
+ */
 static extent_t *
 extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
     extents_t *extents, void *new_addr, size_t size, size_t pad,
@@ -906,16 +1138,12 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 
-	bool committed = false;
 	extent_t *extent = extent_recycle_extract(tsdn, arena, r_extent_hooks,
-	    rtree_ctx, extents, new_addr, size, pad, alignment, slab, zero,
-	    &committed, growing_retained);
+	    rtree_ctx, extents, new_addr, size, pad, alignment, slab,
+	    growing_retained);
 	if (extent == NULL) {
 		return NULL;
 	}
-	if (committed) {
-		*commit = true;
-	}
 
 	extent = extent_recycle_split(tsdn, arena, r_extent_hooks, rtree_ctx,
 	    extents, new_addr, size, pad, alignment, slab, szind, extent,
@@ -931,7 +1159,16 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 			    extent, growing_retained);
 			return NULL;
 		}
-		extent_zeroed_set(extent, true);
+		if (!extent_need_manual_zero(arena)) {
+			extent_zeroed_set(extent, true);
+		}
+	}
+
+	if (extent_committed_get(extent)) {
+		*commit = true;
+	}
+	if (extent_zeroed_get(extent)) {
+		*zero = true;
 	}
 
 	if (pad != 0) {
@@ -945,14 +1182,16 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 
 	if (*zero) {
 		void *addr = extent_base_get(extent);
-		size_t size = extent_size_get(extent);
 		if (!extent_zeroed_get(extent)) {
-			if (pages_purge_forced(addr, size)) {
+			size_t size = extent_size_get(extent);
+			if (extent_need_manual_zero(arena) ||
+			    pages_purge_forced(addr, size)) {
 				memset(addr, 0, size);
 			}
 		} else if (config_debug) {
 			size_t *p = (size_t *)(uintptr_t)addr;
-			for (size_t i = 0; i < size / sizeof(size_t); i++) {
+			/* Check the first page only. */
+			for (size_t i = 0; i < PAGE / sizeof(size_t); i++) {
 				assert(p[i] == 0);
 			}
 		}
@@ -999,11 +1238,12 @@ extent_alloc_core(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
 static void *
 extent_alloc_default_impl(tsdn_t *tsdn, arena_t *arena, void *new_addr,
     size_t size, size_t alignment, bool *zero, bool *commit) {
-	void *ret;
-
-	ret = extent_alloc_core(tsdn, arena, new_addr, size, alignment, zero,
+	void *ret = extent_alloc_core(tsdn, arena, new_addr, size, alignment, zero,
 	    commit, (dss_prec_t)atomic_load_u(&arena->dss_prec,
 	    ATOMIC_RELAXED));
+	if (have_madvise_huge && ret) {
+		pages_set_thp_state(ret, size);
+	}
 	return ret;
 }
 
@@ -1022,13 +1262,24 @@ extent_alloc_default(extent_hooks_t *extent_hooks, void *new_addr, size_t size,
 	assert(arena != NULL);
 
 	return extent_alloc_default_impl(tsdn, arena, new_addr, size,
-	    alignment, zero, commit);
+	    ALIGNMENT_CEILING(alignment, PAGE), zero, commit);
 }
 
 static void
 extent_hook_pre_reentrancy(tsdn_t *tsdn, arena_t *arena) {
 	tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn);
-	pre_reentrancy(tsd, arena);
+	if (arena == arena_get(tsd_tsdn(tsd), 0, false)) {
+		/*
+		 * The only legitimate case of customized extent hooks for a0 is
+		 * hooks with no allocation activities.  One such example is to
+		 * place metadata on pre-allocated resources such as huge pages.
+		 * In that case, rely on reentrancy_level checks to catch
+		 * infinite recursions.
+		 */
+		pre_reentrancy(tsd, NULL);
+	} else {
+		pre_reentrancy(tsd, arena);
+	}
 }
 
 static void
@@ -1064,11 +1315,11 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 	size_t alloc_size = sz_pind2sz(arena->extent_grow_next + egn_skip);
 	while (alloc_size < alloc_size_min) {
 		egn_skip++;
-		if (arena->extent_grow_next + egn_skip == NPSIZES) {
+		if (arena->extent_grow_next + egn_skip >=
+		    sz_psz2ind(SC_LARGE_MAXCLASS)) {
 			/* Outside legal range. */
 			goto label_err;
 		}
-		assert(arena->extent_grow_next + egn_skip < NPSIZES);
 		alloc_size = sz_pind2sz(arena->extent_grow_next + egn_skip);
 	}
 
@@ -1081,9 +1332,8 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 
 	void *ptr;
 	if (*r_extent_hooks == &extent_hooks_default) {
-		ptr = extent_alloc_core(tsdn, arena, NULL, alloc_size, PAGE,
-		    &zeroed, &committed, (dss_prec_t)atomic_load_u(
-		    &arena->dss_prec, ATOMIC_RELAXED));
+		ptr = extent_alloc_default_impl(tsdn, arena, NULL,
+		    alloc_size, PAGE, &zeroed, &committed);
 	} else {
 		extent_hook_pre_reentrancy(tsdn, arena);
 		ptr = (*r_extent_hooks)->alloc(*r_extent_hooks, NULL,
@@ -1092,23 +1342,19 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 		extent_hook_post_reentrancy(tsdn);
 	}
 
-	extent_init(extent, arena, ptr, alloc_size, false, NSIZES,
+	extent_init(extent, arena, ptr, alloc_size, false, SC_NSIZES,
 	    arena_extent_sn_next(arena), extent_state_active, zeroed,
-	    committed);
+	    committed, true, EXTENT_IS_HEAD);
 	if (ptr == NULL) {
 		extent_dalloc(tsdn, arena, extent);
 		goto label_err;
 	}
+
 	if (extent_register_no_gdump_add(tsdn, extent)) {
-		extents_leak(tsdn, arena, r_extent_hooks,
-		    &arena->extents_retained, extent, true);
+		extent_dalloc(tsdn, arena, extent);
 		goto label_err;
 	}
 
-	size_t leadsize = ALIGNMENT_CEILING((uintptr_t)ptr,
-	    PAGE_CEILING(alignment)) - (uintptr_t)ptr;
-	assert(alloc_size >= leadsize + esize);
-	size_t trailsize = alloc_size - leadsize - esize;
 	if (extent_zeroed_get(extent) && extent_committed_get(extent)) {
 		*zero = true;
 	}
@@ -1116,54 +1362,46 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 		*commit = true;
 	}
 
-	/* Split the lead. */
-	if (leadsize != 0) {
-		extent_t *lead = extent;
-		extent = extent_split_impl(tsdn, arena, r_extent_hooks, lead,
-		    leadsize, NSIZES, false, esize + trailsize, szind, slab,
-		    true);
-		if (extent == NULL) {
-			extent_deregister(tsdn, lead);
-			extents_leak(tsdn, arena, r_extent_hooks,
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
+
+	extent_t *lead;
+	extent_t *trail;
+	extent_t *to_leak;
+	extent_t *to_salvage;
+	extent_split_interior_result_t result = extent_split_interior(
+	    tsdn, arena, r_extent_hooks, rtree_ctx, &extent, &lead, &trail,
+	    &to_leak, &to_salvage, NULL, size, pad, alignment, slab, szind,
+	    true);
+
+	if (result == extent_split_interior_ok) {
+		if (lead != NULL) {
+			extent_record(tsdn, arena, r_extent_hooks,
 			    &arena->extents_retained, lead, true);
-			goto label_err;
 		}
-		extent_record(tsdn, arena, r_extent_hooks,
-		    &arena->extents_retained, lead, true);
-	}
-
-	/* Split the trail. */
-	if (trailsize != 0) {
-		extent_t *trail = extent_split_impl(tsdn, arena, r_extent_hooks,
-		    extent, esize, szind, slab, trailsize, NSIZES, false, true);
-		if (trail == NULL) {
-			extent_deregister(tsdn, extent);
-			extents_leak(tsdn, arena, r_extent_hooks,
-			    &arena->extents_retained, extent, true);
-			goto label_err;
+		if (trail != NULL) {
+			extent_record(tsdn, arena, r_extent_hooks,
+			    &arena->extents_retained, trail, true);
 		}
-		extent_record(tsdn, arena, r_extent_hooks,
-		    &arena->extents_retained, trail, true);
-	} else if (leadsize == 0) {
+	} else {
 		/*
-		 * Splitting causes szind to be set as a side effect, but no
-		 * splitting occurred.
+		 * We should have allocated a sufficiently large extent; the
+		 * cant_alloc case should not occur.
 		 */
-		rtree_ctx_t rtree_ctx_fallback;
-		rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn,
-		    &rtree_ctx_fallback);
-
-		extent_szind_set(extent, szind);
-		if (szind != NSIZES) {
-			rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx,
-			    (uintptr_t)extent_addr_get(extent), szind, slab);
-			if (slab && extent_size_get(extent) > PAGE) {
-				rtree_szind_slab_update(tsdn, &extents_rtree,
-				    rtree_ctx,
-				    (uintptr_t)extent_past_get(extent) -
-				    (uintptr_t)PAGE, szind, slab);
+		assert(result == extent_split_interior_error);
+		if (to_salvage != NULL) {
+			if (config_prof) {
+				extent_gdump_add(tsdn, to_salvage);
 			}
+			extent_record(tsdn, arena, r_extent_hooks,
+			    &arena->extents_retained, to_salvage, true);
+		}
+		if (to_leak != NULL) {
+			extent_deregister_no_gdump_sub(tsdn, to_leak);
+			extents_abandon_vm(tsdn, arena, r_extent_hooks,
+			    &arena->extents_retained, to_leak, true);
 		}
+		goto label_err;
 	}
 
 	if (*commit && !extent_committed_get(extent)) {
@@ -1173,17 +1411,20 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 			    &arena->extents_retained, extent, true);
 			goto label_err;
 		}
-		extent_zeroed_set(extent, true);
+		if (!extent_need_manual_zero(arena)) {
+			extent_zeroed_set(extent, true);
+		}
 	}
 
 	/*
-	 * Increment extent_grow_next if doing so wouldn't exceed the legal
+	 * Increment extent_grow_next if doing so wouldn't exceed the allowed
 	 * range.
 	 */
-	if (arena->extent_grow_next + egn_skip + 1 < NPSIZES) {
+	if (arena->extent_grow_next + egn_skip + 1 <=
+	    arena->retain_grow_limit) {
 		arena->extent_grow_next += egn_skip + 1;
 	} else {
-		arena->extent_grow_next = NPSIZES - 1;
+		arena->extent_grow_next = arena->retain_grow_limit;
 	}
 	/* All opportunities for failure are past. */
 	malloc_mutex_unlock(tsdn, &arena->extent_grow_mtx);
@@ -1206,7 +1447,8 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
 	if (*zero && !extent_zeroed_get(extent)) {
 		void *addr = extent_base_get(extent);
 		size_t size = extent_size_get(extent);
-		if (pages_purge_forced(addr, size)) {
+		if (extent_need_manual_zero(arena) ||
+		    pages_purge_forced(addr, size)) {
 			memset(addr, 0, size);
 		}
 	}
@@ -1256,14 +1498,15 @@ extent_alloc_wrapper_hard(tsdn_t *tsdn, arena_t *arena,
 		return NULL;
 	}
 	void *addr;
+	size_t palignment = ALIGNMENT_CEILING(alignment, PAGE);
 	if (*r_extent_hooks == &extent_hooks_default) {
 		/* Call directly to propagate tsdn. */
 		addr = extent_alloc_default_impl(tsdn, arena, new_addr, esize,
-		    alignment, zero, commit);
+		    palignment, zero, commit);
 	} else {
 		extent_hook_pre_reentrancy(tsdn, arena);
 		addr = (*r_extent_hooks)->alloc(*r_extent_hooks, new_addr,
-		    esize, alignment, zero, commit, arena_ind_get(arena));
+		    esize, palignment, zero, commit, arena_ind_get(arena));
 		extent_hook_post_reentrancy(tsdn);
 	}
 	if (addr == NULL) {
@@ -1271,13 +1514,13 @@ extent_alloc_wrapper_hard(tsdn_t *tsdn, arena_t *arena,
 		return NULL;
 	}
 	extent_init(extent, arena, addr, esize, slab, szind,
-	    arena_extent_sn_next(arena), extent_state_active, zero, commit);
+	    arena_extent_sn_next(arena), extent_state_active, *zero, *commit,
+	    true, EXTENT_NOT_HEAD);
 	if (pad != 0) {
 		extent_addr_randomize(tsdn, extent, alignment);
 	}
 	if (extent_register(tsdn, extent)) {
-		extents_leak(tsdn, arena, r_extent_hooks,
-		    &arena->extents_retained, extent, false);
+		extent_dalloc(tsdn, arena, extent);
 		return NULL;
 	}
 
@@ -1296,10 +1539,20 @@ extent_alloc_wrapper(tsdn_t *tsdn, arena_t *arena,
 	extent_t *extent = extent_alloc_retained(tsdn, arena, r_extent_hooks,
 	    new_addr, size, pad, alignment, slab, szind, zero, commit);
 	if (extent == NULL) {
+		if (opt_retain && new_addr != NULL) {
+			/*
+			 * When retain is enabled and new_addr is set, we do not
+			 * attempt extent_alloc_wrapper_hard which does mmap
+			 * that is very unlikely to succeed (unless it happens
+			 * to be at the end).
+			 */
+			return NULL;
+		}
 		extent = extent_alloc_wrapper_hard(tsdn, arena, r_extent_hooks,
 		    new_addr, size, pad, alignment, slab, szind, zero, commit);
 	}
 
+	assert(extent == NULL || extent_dumpable_get(extent));
 	return extent;
 }
 
@@ -1329,16 +1582,7 @@ extent_coalesce(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
     bool growing_retained) {
 	assert(extent_can_coalesce(arena, extents, inner, outer));
 
-	if (forward && extents->delay_coalesce) {
-		/*
-		 * The extent that remains after coalescing must occupy the
-		 * outer extent's position in the LRU.  For forward coalescing,
-		 * swap the inner extent into the LRU.
-		 */
-		extent_list_replace(&extents->lru, outer, inner);
-	}
-	extent_activate_locked(tsdn, arena, extents, outer,
-	    extents->delay_coalesce);
+	extent_activate_locked(tsdn, arena, extents, outer);
 
 	malloc_mutex_unlock(tsdn, &extents->mtx);
 	bool err = extent_merge_impl(tsdn, arena, r_extent_hooks,
@@ -1346,20 +1590,22 @@ extent_coalesce(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 	malloc_mutex_lock(tsdn, &extents->mtx);
 
 	if (err) {
-		if (forward && extents->delay_coalesce) {
-			extent_list_replace(&extents->lru, inner, outer);
-		}
-		extent_deactivate_locked(tsdn, arena, extents, outer,
-		    extents->delay_coalesce);
+		extent_deactivate_locked(tsdn, arena, extents, outer);
 	}
 
 	return err;
 }
 
 static extent_t *
-extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
+extent_try_coalesce_impl(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
-    extent_t *extent, bool *coalesced, bool growing_retained) {
+    extent_t *extent, bool *coalesced, bool growing_retained,
+    bool inactive_only) {
+	/*
+	 * We avoid checking / locking inactive neighbors for large size
+	 * classes, since they are eagerly coalesced on deallocation which can
+	 * cause lock contention.
+	 */
 	/*
 	 * Continue attempting to coalesce until failure, to protect against
 	 * races with other threads that are thwarted by this one.
@@ -1370,7 +1616,7 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
 
 		/* Try to coalesce forward. */
 		extent_t *next = extent_lock_from_addr(tsdn, rtree_ctx,
-		    extent_past_get(extent));
+		    extent_past_get(extent), inactive_only);
 		if (next != NULL) {
 			/*
 			 * extents->mtx only protects against races for
@@ -1396,7 +1642,7 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
 
 		/* Try to coalesce backward. */
 		extent_t *prev = extent_lock_from_addr(tsdn, rtree_ctx,
-		    extent_before_get(extent));
+		    extent_before_get(extent), inactive_only);
 		if (prev != NULL) {
 			bool can_coalesce = extent_can_coalesce(arena, extents,
 			    extent, prev);
@@ -1422,6 +1668,26 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
 	return extent;
 }
 
+/*
+ * Coalesces extent with its neighbors by forwarding to
+ * extent_try_coalesce_impl() with inactive_only == false.
+ */
+static extent_t *
+extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
+    extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
+    extent_t *extent, bool *coalesced, bool growing_retained) {
+	const bool inactive_only = false;
+
+	return extent_try_coalesce_impl(tsdn, arena, r_extent_hooks, rtree_ctx,
+	    extents, extent, coalesced, growing_retained, inactive_only);
+}
+
+/*
+ * Variant used by extent_record() for extents of at least SC_LARGE_MINCLASS
+ * bytes; forwards to extent_try_coalesce_impl() with inactive_only == true.
+ */
+static extent_t *
+extent_try_coalesce_large(tsdn_t *tsdn, arena_t *arena,
+    extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
+    extent_t *extent, bool *coalesced, bool growing_retained) {
+	const bool inactive_only = true;
+
+	return extent_try_coalesce_impl(tsdn, arena, r_extent_hooks, rtree_ctx,
+	    extents, extent, coalesced, growing_retained, inactive_only);
+}
+
+/*
+ * Does the metadata management portions of putting an unused extent into the
+ * given extents_t (coalesces, deregisters slab interiors, the heap operations).
+ */
 static void
 extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
     extents_t *extents, extent_t *extent, bool growing_retained) {
@@ -1435,7 +1701,7 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 	malloc_mutex_lock(tsdn, &extents->mtx);
 	extent_hooks_assure_initialized(arena, r_extent_hooks);
 
-	extent_szind_set(extent, NSIZES);
+	extent_szind_set(extent, SC_NSIZES);
 	if (extent_slab_get(extent)) {
 		extent_interior_deregister(tsdn, rtree_ctx, extent);
 		extent_slab_set(extent, false);
@@ -1447,9 +1713,24 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
 	if (!extents->delay_coalesce) {
 		extent = extent_try_coalesce(tsdn, arena, r_extent_hooks,
 		    rtree_ctx, extents, extent, NULL, growing_retained);
+	} else if (extent_size_get(extent) >= SC_LARGE_MINCLASS) {
+		assert(extents == &arena->extents_dirty);
+		/* Always coalesce large extents eagerly. */
+		bool coalesced;
+		do {
+			assert(extent_state_get(extent) == extent_state_active);
+			extent = extent_try_coalesce_large(tsdn, arena,
+			    r_extent_hooks, rtree_ctx, extents, extent,
+			    &coalesced, growing_retained);
+		} while (coalesced);
+		if (extent_size_get(extent) >= oversize_threshold) {
+			/* Shortcut to purge the oversize extent eagerly. */
+			malloc_mutex_unlock(tsdn, &extents->mtx);
+			arena_decay_extent(tsdn, arena, r_extent_hooks, extent);
+			return;
+		}
 	}
-
-	extent_deactivate_locked(tsdn, arena, extents, extent, false);
+	extent_deactivate_locked(tsdn, arena, extents, extent);
 
 	malloc_mutex_unlock(tsdn, &extents->mtx);
 }
@@ -1462,14 +1743,19 @@ extent_dalloc_gap(tsdn_t *tsdn, arena_t *arena, extent_t *extent) {
 	    WITNESS_RANK_CORE, 0);
 
 	if (extent_register(tsdn, extent)) {
-		extents_leak(tsdn, arena, &extent_hooks,
-		    &arena->extents_retained, extent, false);
+		extent_dalloc(tsdn, arena, extent);
 		return;
 	}
 	extent_dalloc_wrapper(tsdn, arena, &extent_hooks, extent);
 }
 
 static bool
+extent_may_dalloc(void) {
+	/*
+	 * With retain enabled, the default dalloc always fails, so report
+	 * that deallocation should not be attempted.
+	 */
+	if (opt_retain) {
+		return false;
+	}
+	return true;
+}
+
+static bool
 extent_dalloc_default_impl(void *addr, size_t size) {
 	if (!have_dss || !extent_in_dss(addr)) {
 		return extent_dalloc_mmap(addr, size);
@@ -1520,19 +1806,24 @@ extent_dalloc_wrapper_try(tsdn_t *tsdn, arena_t *arena,
 void
 extent_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extent_t *extent) {
+	assert(extent_dumpable_get(extent));
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, 0);
 
-	/*
-	 * Deregister first to avoid a race with other allocating threads, and
-	 * reregister if deallocation fails.
-	 */
-	extent_deregister(tsdn, extent);
-	if (!extent_dalloc_wrapper_try(tsdn, arena, r_extent_hooks, extent)) {
-		return;
+	/* Avoid calling the default extent_dalloc unless have to. */
+	if (*r_extent_hooks != &extent_hooks_default || extent_may_dalloc()) {
+		/*
+		 * Deregister first to avoid a race with other allocating
+		 * threads, and reregister if deallocation fails.
+		 */
+		extent_deregister(tsdn, extent);
+		if (!extent_dalloc_wrapper_try(tsdn, arena, r_extent_hooks,
+		    extent)) {
+			return;
+		}
+		extent_reregister(tsdn, extent);
 	}
 
-	extent_reregister(tsdn, extent);
 	if (*r_extent_hooks != &extent_hooks_default) {
 		extent_hook_pre_reentrancy(tsdn, arena);
 	}
@@ -1772,14 +2063,28 @@ extent_purge_forced_wrapper(tsdn_t *tsdn, arena_t *arena,
 	    offset, length, false);
 }
 
-#ifdef JEMALLOC_MAPS_COALESCE
 static bool
 extent_split_default(extent_hooks_t *extent_hooks, void *addr, size_t size,
     size_t size_a, size_t size_b, bool committed, unsigned arena_ind) {
-	return !maps_coalesce;
+	if (maps_coalesce) {
+		/* maps_coalesce is set: never refuse the split. */
+		return false;
+	}
+
+	/*
+	 * Without retain, only whole regions can be purged (required by
+	 * MEM_RELEASE on Windows) -- therefore disallow splitting.  See
+	 * comments in extent_head_no_merge().
+	 */
+	return !opt_retain;
 }
-#endif
 
+/*
+ * Accepts the extent to split, and the characteristics of each side of the
+ * split.  The 'a' parameters go with the 'lead' of the resulting pair of
+ * extents (the lower addressed portion of the split), and the 'b' parameters go
+ * with the trail (the higher addressed portion).  This makes 'extent' the lead,
+ * and returns the trail (except in case of error).
+ */
 static extent_t *
 extent_split_impl(tsdn_t *tsdn, arena_t *arena,
     extent_hooks_t **r_extent_hooks, extent_t *extent, size_t size_a,
@@ -1803,7 +2108,8 @@ extent_split_impl(tsdn_t *tsdn, arena_t *arena,
 	extent_init(trail, arena, (void *)((uintptr_t)extent_base_get(extent) +
 	    size_a), size_b, slab_b, szind_b, extent_sn_get(extent),
 	    extent_state_get(extent), extent_zeroed_get(extent),
-	    extent_committed_get(extent));
+	    extent_committed_get(extent), extent_dumpable_get(extent),
+	    EXTENT_NOT_HEAD);
 
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
@@ -1814,7 +2120,8 @@ extent_split_impl(tsdn_t *tsdn, arena_t *arena,
 		extent_init(&lead, arena, extent_addr_get(extent), size_a,
 		    slab_a, szind_a, extent_sn_get(extent),
 		    extent_state_get(extent), extent_zeroed_get(extent),
-		    extent_committed_get(extent));
+		    extent_committed_get(extent), extent_dumpable_get(extent),
+		    EXTENT_NOT_HEAD);
 
 		extent_rtree_leaf_elms_lookup(tsdn, rtree_ctx, &lead, false,
 		    true, &lead_elm_a, &lead_elm_b);
@@ -1872,7 +2179,7 @@ extent_split_wrapper(tsdn_t *tsdn, arena_t *arena,
 
 static bool
 extent_merge_default_impl(void *addr_a, void *addr_b) {
-	if (!maps_coalesce) {
+	if (!maps_coalesce && !opt_retain) {
 		return true;
 	}
 	if (have_dss && !extent_dss_mergeable(addr_a, addr_b)) {
@@ -1882,13 +2189,51 @@ extent_merge_default_impl(void *addr_a, void *addr_b) {
 	return false;
 }
 
-#ifdef JEMALLOC_MAPS_COALESCE
+/*
+ * Reports whether the head bit settings forbid merging a with b (true means
+ * "do not merge").  b must be the higher-addressed of the two extents.
+ */
+static bool
+extent_head_no_merge(extent_t *a, extent_t *b) {
+	assert(extent_base_get(a) < extent_base_get(b));
+	/*
+	 * When coalesce is not always allowed (Windows), only merge extents
+	 * from the same VirtualAlloc region under opt.retain (in which case
+	 * MEM_DECOMMIT is utilized for purging).
+	 */
+	if (maps_coalesce) {
+		return false;
+	}
+	if (!opt_retain) {
+		return true;
+	}
+	if (!extent_is_head_get(b)) {
+		/* b is not a head extent; both are expected to share an sn. */
+		assert(extent_sn_comp(a, b) == 0);
+		return false;
+	}
+	/*
+	 * b is a head extent, so disallow the cross-region merge.
+	 * Additionally, sn should not overflow with retain; sanity check that
+	 * different regions have unique sn.
+	 */
+	assert(extent_sn_comp(a, b) != 0);
+	return true;
+}
+
 static bool
 extent_merge_default(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a,
     void *addr_b, size_t size_b, bool committed, unsigned arena_ind) {
+	if (!maps_coalesce) {
+		/*
+		 * Look up the extents owning the two addresses so that
+		 * extent_head_no_merge() can veto the merge.
+		 */
+		tsdn_t *tsdn = tsdn_fetch();
+		extent_t *lower = iealloc(tsdn, addr_a);
+		extent_t *upper = iealloc(tsdn, addr_b);
+		bool vetoed = extent_head_no_merge(lower, upper);
+		if (vetoed) {
+			return true;
+		}
+	}
 	return extent_merge_default_impl(addr_a, addr_b);
 }
-#endif
 
 static bool
 extent_merge_impl(tsdn_t *tsdn, arena_t *arena,
@@ -1896,10 +2241,11 @@ extent_merge_impl(tsdn_t *tsdn, arena_t *arena,
     bool growing_retained) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_CORE, growing_retained ? 1 : 0);
+	assert(extent_base_get(a) < extent_base_get(b));
 
 	extent_hooks_assure_initialized(arena, r_extent_hooks);
 
-	if ((*r_extent_hooks)->merge == NULL) {
+	if ((*r_extent_hooks)->merge == NULL || extent_head_no_merge(a, b)) {
 		return true;
 	}
 
@@ -1938,22 +2284,23 @@ extent_merge_impl(tsdn_t *tsdn, arena_t *arena,
 
 	if (a_elm_b != NULL) {
 		rtree_leaf_elm_write(tsdn, &extents_rtree, a_elm_b, NULL,
-		    NSIZES, false);
+		    SC_NSIZES, false);
 	}
 	if (b_elm_b != NULL) {
 		rtree_leaf_elm_write(tsdn, &extents_rtree, b_elm_a, NULL,
-		    NSIZES, false);
+		    SC_NSIZES, false);
 	} else {
 		b_elm_b = b_elm_a;
 	}
 
 	extent_size_set(a, extent_size_get(a) + extent_size_get(b));
-	extent_szind_set(a, NSIZES);
+	extent_szind_set(a, SC_NSIZES);
 	extent_sn_set(a, (extent_sn_get(a) < extent_sn_get(b)) ?
 	    extent_sn_get(a) : extent_sn_get(b));
 	extent_zeroed_set(a, extent_zeroed_get(a) && extent_zeroed_get(b));
 
-	extent_rtree_write_acquired(tsdn, a_elm_a, b_elm_b, a, NSIZES, false);
+	extent_rtree_write_acquired(tsdn, a_elm_a, b_elm_b, a, SC_NSIZES,
+	    false);
 
 	extent_unlock2(tsdn, a, b);
 
@@ -1985,3 +2332,72 @@ extent_boot(void) {
 
 	return false;
 }
+
+/*
+ * Looks up the extent containing ptr and reports its size (*size), its number
+ * of regions (*nregs) and its number of free regions (*nfree).  All three
+ * outputs are set to 0 when no extent is found.
+ */
+void
+extent_util_stats_get(tsdn_t *tsdn, const void *ptr,
+    size_t *nfree, size_t *nregs, size_t *size) {
+	assert(ptr != NULL && nfree != NULL && nregs != NULL && size != NULL);
+
+	const extent_t *extent = iealloc(tsdn, ptr);
+	if (unlikely(extent == NULL)) {
+		*nfree = *nregs = *size = 0;
+		return;
+	}
+
+	*size = extent_size_get(extent);
+	if (!extent_slab_get(extent)) {
+		/* Not a slab: reported as one region with nothing free. */
+		*nfree = 0;
+		*nregs = 1;
+		return;
+	}
+
+	const szind_t szind = extent_szind_get(extent);
+	*nfree = extent_nfree_get(extent);
+	*nregs = bin_infos[szind].nregs;
+	assert(*nfree <= *nregs);
+	assert(*nfree * extent_usize_get(extent) <= *size);
+}
+
+/*
+ * Like extent_util_stats_get(), and additionally reports statistics for the
+ * bin shard that owns the slab: its total and free region counts (both 0
+ * unless config_stats is enabled) and the address of its current slab.
+ * *slabcur_addr is NULL when no extent is found, when the extent is not a
+ * slab, or when the bin shard has no current slab.
+ */
+void
+extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr,
+    size_t *nfree, size_t *nregs, size_t *size,
+    size_t *bin_nfree, size_t *bin_nregs, void **slabcur_addr) {
+	assert(ptr != NULL && nfree != NULL && nregs != NULL && size != NULL
+	    && bin_nfree != NULL && bin_nregs != NULL && slabcur_addr != NULL);
+
+	const extent_t *extent = iealloc(tsdn, ptr);
+	if (unlikely(extent == NULL)) {
+		*nfree = *nregs = *size = *bin_nfree = *bin_nregs = 0;
+		*slabcur_addr = NULL;
+		return;
+	}
+
+	*size = extent_size_get(extent);
+	if (!extent_slab_get(extent)) {
+		/* Not a slab: one region, nothing free, no bin to report. */
+		*nfree = *bin_nfree = *bin_nregs = 0;
+		*nregs = 1;
+		*slabcur_addr = NULL;
+		return;
+	}
+
+	*nfree = extent_nfree_get(extent);
+	const szind_t szind = extent_szind_get(extent);
+	*nregs = bin_infos[szind].nregs;
+	assert(*nfree <= *nregs);
+	assert(*nfree * extent_usize_get(extent) <= *size);
+
+	const arena_t *arena = extent_arena_get(extent);
+	assert(arena != NULL);
+	const unsigned binshard = extent_binshard_get(extent);
+	bin_t *bin = &arena->bins[szind].bin_shards[binshard];
+
+	/* bin->stats and bin->slabcur are read while holding the bin lock. */
+	malloc_mutex_lock(tsdn, &bin->lock);
+	if (config_stats) {
+		*bin_nregs = *nregs * bin->stats.curslabs;
+		assert(*bin_nregs >= bin->stats.curregs);
+		*bin_nfree = *bin_nregs - bin->stats.curregs;
+	} else {
+		*bin_nfree = *bin_nregs = 0;
+	}
+	/*
+	 * The bin shard may have no current slab; report NULL in that case
+	 * instead of passing a NULL extent to extent_addr_get().
+	 */
+	*slabcur_addr = (bin->slabcur != NULL) ?
+	    extent_addr_get(bin->slabcur) : NULL;
+	malloc_mutex_unlock(tsdn, &bin->lock);
+}
diff --git a/deps/jemalloc/src/extent_dss.c b/deps/jemalloc/src/extent_dss.c
index e72da95870..8581789110 100644
--- a/deps/jemalloc/src/extent_dss.c
+++ b/deps/jemalloc/src/extent_dss.c
@@ -113,7 +113,7 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
 
 	cassert(have_dss);
 	assert(size > 0);
-	assert(alignment > 0);
+	assert(alignment == ALIGNMENT_CEILING(alignment, PAGE));
 
 	/*
 	 * sbrk() uses a signed increment argument, so take care not to
@@ -154,9 +154,10 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
 			    (uintptr_t)gap_addr_page;
 			if (gap_size_page != 0) {
 				extent_init(gap, arena, gap_addr_page,
-				    gap_size_page, false, NSIZES,
+				    gap_size_page, false, SC_NSIZES,
 				    arena_extent_sn_next(arena),
-				    extent_state_active, false, true);
+				    extent_state_active, false, true, true,
+				    EXTENT_NOT_HEAD);
 			}
 			/*
 			 * Compute the address just past the end of the desired
@@ -198,8 +199,9 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
 					extent_t extent;
 
 					extent_init(&extent, arena, ret, size,
-					    size, false, NSIZES,
-					    extent_state_active, false, true);
+					    size, false, SC_NSIZES,
+					    extent_state_active, false, true,
+					    true, EXTENT_NOT_HEAD);
 					if (extent_purge_forced_wrapper(tsdn,
 					    arena, &extent_hooks, &extent, 0,
 					    size)) {
diff --git a/deps/jemalloc/src/extent_mmap.c b/deps/jemalloc/src/extent_mmap.c
index 8d607dc803..17fd1c8f95 100644
--- a/deps/jemalloc/src/extent_mmap.c
+++ b/deps/jemalloc/src/extent_mmap.c
@@ -21,8 +21,8 @@ bool	opt_retain =
 void *
 extent_alloc_mmap(void *new_addr, size_t size, size_t alignment, bool *zero,
     bool *commit) {
-	void *ret = pages_map(new_addr, size, ALIGNMENT_CEILING(alignment,
-	    PAGE), commit);
+	assert(alignment == ALIGNMENT_CEILING(alignment, PAGE));
+	void *ret = pages_map(new_addr, size, alignment, commit);
 	if (ret == NULL) {
 		return NULL;
 	}
diff --git a/deps/jemalloc/src/hook.c b/deps/jemalloc/src/hook.c
new file mode 100644
index 0000000000..9ac703cf9f
--- /dev/null
+++ b/deps/jemalloc/src/hook.c
@@ -0,0 +1,195 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+#include "jemalloc/internal/hook.h"
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/seq.h"
+
+/* One slot of the hook table: a hook set plus an occupancy flag. */
+typedef struct hooks_internal_s hooks_internal_t;
+struct hooks_internal_s {
+	/* Copy of the hook set passed to hook_install(). */
+	hooks_t hooks;
+	/* Set by hook_install_locked(), cleared by hook_remove_locked(). */
+	bool in_use;
+};
+
+/*
+ * Presumably what provides seq_hooks_t and the seq_try_load_hooks() /
+ * seq_store_hooks() helpers used below -- confirm in seq.h.
+ */
+seq_define(hooks_internal_t, hooks)
+
+/* Number of installed hooks; HOOK_PROLOGUE returns early when it is 0. */
+static atomic_u_t nhooks = ATOMIC_INIT(0);
+/* Slot table; hook_install() returns the address of a slot as the handle. */
+static seq_hooks_t hooks[HOOK_MAX];
+/* Held by hook_install() and hook_remove() while they modify the table. */
+static malloc_mutex_t hooks_mu;
+
+/*
+ * Initializes hooks_mu and returns the result of malloc_mutex_init().
+ * Declared with (void) so that the definition is a real prototype.
+ */
+bool
+hook_boot(void) {
+	return malloc_mutex_init(&hooks_mu, "hooks", WITNESS_RANK_HOOK,
+	    malloc_mutex_rank_exclusive);
+}
+
+/*
+ * Copies *to_install into the first slot that is not in use, bumps nhooks, and
+ * returns the slot's address.  Returns NULL when all HOOK_MAX slots are in
+ * use.  The caller holds hooks_mu.
+ */
+static void *
+hook_install_locked(hooks_t *to_install) {
+	hooks_internal_t cur;
+	for (int slot = 0; slot < HOOK_MAX; slot++) {
+		bool success = seq_try_load_hooks(&cur, &hooks[slot]);
+		/* We hold mu; no concurrent access. */
+		assert(success);
+		if (cur.in_use) {
+			continue;
+		}
+		cur.hooks = *to_install;
+		cur.in_use = true;
+		seq_store_hooks(&hooks[slot], &cur);
+		atomic_store_u(&nhooks,
+		    atomic_load_u(&nhooks, ATOMIC_RELAXED) + 1,
+		    ATOMIC_RELAXED);
+		return &hooks[slot];
+	}
+	return NULL;
+}
+
+/*
+ * Installs *to_install under hooks_mu.  Returns an opaque handle for
+ * hook_remove(), or NULL if hook_install_locked() found no free slot.  On
+ * success tsd_global_slow_inc() is called before the mutex is released.
+ */
+void *
+hook_install(tsdn_t *tsdn, hooks_t *to_install) {
+	void *handle;
+
+	malloc_mutex_lock(tsdn, &hooks_mu);
+	handle = hook_install_locked(to_install);
+	if (handle != NULL) {
+		tsd_global_slow_inc(tsdn);
+	}
+	malloc_mutex_unlock(tsdn, &hooks_mu);
+
+	return handle;
+}
+
+/*
+ * Marks the slot at to_remove as no longer in use and decrements nhooks.  The
+ * caller holds hooks_mu.
+ */
+static void
+hook_remove_locked(seq_hooks_t *to_remove) {
+	hooks_internal_t slot;
+	bool success = seq_try_load_hooks(&slot, to_remove);
+	/* We hold mu; no concurrent access. */
+	assert(success);
+	/* Should only remove hooks that were added. */
+	assert(slot.in_use);
+
+	slot.in_use = false;
+	seq_store_hooks(to_remove, &slot);
+
+	atomic_store_u(&nhooks, atomic_load_u(&nhooks, ATOMIC_RELAXED) - 1,
+	    ATOMIC_RELAXED);
+}
+
+/*
+ * Uninstalls the hook set identified by opaque, which is expected to be a
+ * handle previously returned by hook_install().
+ */
+void
+hook_remove(tsdn_t *tsdn, void *opaque) {
+	seq_hooks_t *slot = (seq_hooks_t *)opaque;
+
+	if (config_debug) {
+		/* The handle must address one of the elements of hooks[]. */
+		char *hooks_begin = (char *)&hooks[0];
+		char *hooks_end = (char *)&hooks[HOOK_MAX];
+		char *hook = (char *)opaque;
+		assert(hooks_begin <= hook && hook < hooks_end
+		    && (hook - hooks_begin) % sizeof(seq_hooks_t) == 0);
+	}
+
+	malloc_mutex_lock(tsdn, &hooks_mu);
+	hook_remove_locked(slot);
+	tsd_global_slow_dec(tsdn);
+	malloc_mutex_unlock(tsdn, &hooks_mu);
+}
+
+/*
+ * Opens a loop over all HOOK_MAX slots.  Each slot is loaded into
+ * *hooks_internal_ptr; slots whose load fails, or that are not in use, are
+ * skipped.  The loop body follows the macro and must be closed with
+ * FOR_EACH_HOOK_END.
+ */
+#define FOR_EACH_HOOK_BEGIN(hooks_internal_ptr)				\
+for (int for_each_hook_counter = 0;					\
+    for_each_hook_counter < HOOK_MAX;					\
+    for_each_hook_counter++) {						\
+	bool for_each_hook_success = seq_try_load_hooks(		\
+	    (hooks_internal_ptr), &hooks[for_each_hook_counter]);	\
+	if (!for_each_hook_success) {					\
+		continue;						\
+	}								\
+	if (!(hooks_internal_ptr)->in_use) {				\
+		continue;						\
+	}
+/* Closes the loop opened by FOR_EACH_HOOK_BEGIN. */
+#define FOR_EACH_HOOK_END						\
+}
+
+/*
+ * Returns a pointer to the flag that records whether the caller is already
+ * inside a hook: the tcache's in_hook when a tcache is available, otherwise a
+ * global that is initialized to true.
+ */
+static bool *
+hook_reentrantp(void) {
+	/*
+	 * We prevent user reentrancy within hooks.  This is basically just a
+	 * thread-local bool that triggers an early-exit.
+	 *
+	 * We don't fold in_hook into reentrancy.  There are two reasons for
+	 * this:
+	 * - Right now, we turn on reentrancy during things like extent hook
+	 *   execution.  Allocating during extent hooks is not officially
+	 *   supported, but we don't want to break it for the time being.  These
+	 *   sorts of allocations should probably still be hooked, though.
+	 * - If a hook allocates, we may want it to be relatively fast (after
+	 *   all, it executes on every allocator operation).  Turning on
+	 *   reentrancy is a fairly heavyweight mode (disabling tcache,
+	 *   redirecting to arena 0, etc.).  It's possible we may one day want
+	 *   to turn on reentrant mode here, if it proves too difficult to keep
+	 *   this working.  But that's fairly easy for us to see; OTOH, people
+	 *   not using hooks because they're too slow is easy for us to miss.
+	 *
+	 * The tricky part is that this code might get invoked even if we don't
+	 * have access to tsd.  This function mimics getting a pointer to
+	 * thread-local data, except that it might secretly return a pointer to
+	 * some global data if we know that the caller will take the early-exit
+	 * path.  If we return a bool that indicates that we are reentrant, then
+	 * the caller will go down the early exit path, leaving the global
+	 * untouched.
+	 */
+	static bool in_hook_global = true;
+	tsdn_t *tsdn = tsdn_fetch();
+	tcache_t *tcache = tsdn_tcachep_get(tsdn);
+	if (tcache != NULL) {
+		return &tcache->in_hook;
+	}
+	return &in_hook_global;
+}
+
+/*
+ * Entry guard for the hook_invoke_*() functions: returns from the enclosing
+ * function when no hooks are installed or when the flag obtained from
+ * hook_reentrantp() is already set; otherwise sets that flag.  Declares the
+ * local in_hook that HOOK_EPILOGUE uses.
+ */
+#define HOOK_PROLOGUE							\
+	if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) {	\
+		return;							\
+	}								\
+	bool *in_hook = hook_reentrantp();				\
+	if (*in_hook) {							\
+		return;							\
+	}								\
+	*in_hook = true;
+
+/* Clears the flag that HOOK_PROLOGUE set. */
+#define HOOK_EPILOGUE							\
+	*in_hook = false;
+
+/*
+ * Calls the alloc_hook of every installed hook set that provides one.  Does
+ * nothing when no hooks are installed or when already inside a hook (see
+ * HOOK_PROLOGUE).
+ */
+void
+hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw,
+    uintptr_t args_raw[3]) {
+	HOOK_PROLOGUE
+
+	hooks_internal_t cur;
+	FOR_EACH_HOOK_BEGIN(&cur)
+		hook_alloc on_alloc = cur.hooks.alloc_hook;
+		if (on_alloc == NULL) {
+			continue;
+		}
+		on_alloc(cur.hooks.extra, type, result, result_raw, args_raw);
+	FOR_EACH_HOOK_END
+
+	HOOK_EPILOGUE
+}
+
+/*
+ * Calls the dalloc_hook of every installed hook set that provides one.  Does
+ * nothing when no hooks are installed or when already inside a hook (see
+ * HOOK_PROLOGUE).
+ */
+void
+hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) {
+	HOOK_PROLOGUE
+	hooks_internal_t cur;
+	FOR_EACH_HOOK_BEGIN(&cur)
+		hook_dalloc on_dalloc = cur.hooks.dalloc_hook;
+		if (on_dalloc == NULL) {
+			continue;
+		}
+		on_dalloc(cur.hooks.extra, type, address, args_raw);
+	FOR_EACH_HOOK_END
+	HOOK_EPILOGUE
+}
+
+/*
+ * Calls the expand_hook of every installed hook set that provides one.  Does
+ * nothing when no hooks are installed or when already inside a hook (see
+ * HOOK_PROLOGUE).
+ */
+void
+hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize,
+    size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]) {
+	HOOK_PROLOGUE
+	hooks_internal_t cur;
+	FOR_EACH_HOOK_BEGIN(&cur)
+		hook_expand on_expand = cur.hooks.expand_hook;
+		if (on_expand == NULL) {
+			continue;
+		}
+		on_expand(cur.hooks.extra, type, address, old_usize, new_usize,
+		    result_raw, args_raw);
+	FOR_EACH_HOOK_END
+	HOOK_EPILOGUE
+}
diff --git a/deps/jemalloc/src/jemalloc.c b/deps/jemalloc/src/jemalloc.c
index 0ee8ad48b9..ed13718d48 100644
--- a/deps/jemalloc/src/jemalloc.c
+++ b/deps/jemalloc/src/jemalloc.c
@@ -7,11 +7,14 @@
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/extent_dss.h"
 #include "jemalloc/internal/extent_mmap.h"
+#include "jemalloc/internal/hook.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/log.h"
 #include "jemalloc/internal/malloc_io.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/rtree.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/safety_check.h"
+#include "jemalloc/internal/sc.h"
 #include "jemalloc/internal/spin.h"
 #include "jemalloc/internal/sz.h"
 #include "jemalloc/internal/ticker.h"
@@ -40,6 +43,8 @@ bool	opt_abort_conf =
     false
 #endif
     ;
+/* Intentionally default off, even with debug builds. */
+bool	opt_confirm_conf = false;
 const char	*opt_junk =
 #if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL))
     "true"
@@ -84,8 +89,10 @@ malloc_mutex_t arenas_lock;
 JEMALLOC_ALIGNED(CACHELINE)
 atomic_p_t		arenas[MALLOCX_ARENA_LIMIT];
 static atomic_u_t	narenas_total; /* Use narenas_total_*(). */
-static arena_t		*a0; /* arenas[0]; read-only after initialization. */
-unsigned		narenas_auto; /* Read-only after initialization. */
+/* Below three are read-only after initialization. */
+static arena_t		*a0; /* arenas[0]. */
+unsigned		narenas_auto;
+unsigned		manual_arena_base;
 
 typedef enum {
 	malloc_init_uninitialized	= 3,
@@ -325,7 +332,7 @@ arena_init_locked(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	 */
 	arena = arena_get(tsdn, ind, false);
 	if (arena != NULL) {
-		assert(ind < narenas_auto);
+		assert(arena_is_auto(arena));
 		return arena;
 	}
 
@@ -340,12 +347,12 @@ arena_new_create_background_thread(tsdn_t *tsdn, unsigned ind) {
 	if (ind == 0) {
 		return;
 	}
-	if (have_background_thread) {
-		bool err;
-		malloc_mutex_lock(tsdn, &background_thread_lock);
-		err = background_thread_create(tsdn_tsd(tsdn), ind);
-		malloc_mutex_unlock(tsdn, &background_thread_lock);
-		if (err) {
+	/*
+	 * Avoid creating a new background thread just for the huge arena, which
+	 * purges eagerly by default.
+	 */
+	if (have_background_thread && !arena_is_huge(ind)) {
+		if (background_thread_create(tsdn_tsd(tsdn), ind)) {
 			malloc_printf("<jemalloc>: error in background thread "
 				      "creation for arena %u. Abort.\n", ind);
 			abort();
@@ -375,6 +382,14 @@ arena_bind(tsd_t *tsd, unsigned ind, bool internal) {
 		tsd_iarena_set(tsd, arena);
 	} else {
 		tsd_arena_set(tsd, arena);
+		unsigned shard = atomic_fetch_add_u(&arena->binshard_next, 1,
+		    ATOMIC_RELAXED);
+		tsd_binshards_t *bins = tsd_binshardsp_get(tsd);
+		for (unsigned i = 0; i < SC_NBINS; i++) {
+			assert(bin_infos[i].n_shards > 0 &&
+			    bin_infos[i].n_shards <= BIN_SHARDS_MAX);
+			bins->binshard[i] = shard % bin_infos[i].n_shards;
+		}
 	}
 }
 
@@ -760,6 +775,50 @@ init_opt_stats_print_opts(const char *v, size_t vlen) {
 	assert(opts_len == strlen(opt_stats_print_opts));
 }
 
+/*
+ * Reads the next "<start>-<end>:<size>" triple of a multi-sized option,
+ * consuming a trailing '|' separator if there is one.  On success, advances
+ * *slab_size_segment_cur past the consumed text, reduces *vlen_left by the
+ * number of characters consumed, and returns false.  Returns true on a parse
+ * error, in which case some of the outputs may already have been written.
+ */
+static bool
+malloc_conf_multi_sizes_next(const char **slab_size_segment_cur,
+    size_t *vlen_left, size_t *slab_start, size_t *slab_end, size_t *new_size) {
+	const char *pos = *slab_size_segment_cur;
+	char *stop;
+	uintmax_t parsed;
+
+	set_errno(0);
+
+	/* <start>, which must be followed by '-'. */
+	parsed = malloc_strtoumax(pos, &stop, 0);
+	if (get_errno() != 0 || *stop != '-') {
+		return true;
+	}
+	*slab_start = (size_t)parsed;
+	pos = stop + 1;
+
+	/* <end>, which must be followed by ':'. */
+	parsed = malloc_strtoumax(pos, &stop, 0);
+	if (get_errno() != 0 || *stop != ':') {
+		return true;
+	}
+	*slab_end = (size_t)parsed;
+	pos = stop + 1;
+
+	/* <size>; whatever follows it is not validated here. */
+	parsed = malloc_strtoumax(pos, &stop, 0);
+	if (get_errno() != 0) {
+		return true;
+	}
+	*new_size = (size_t)parsed;
+
+	/* Step over the '|' separator when one follows the triple. */
+	if (*stop == '|') {
+		stop++;
+	}
+
+	*vlen_left -= stop - *slab_size_segment_cur;
+	*slab_size_segment_cur = stop;
+
+	return false;
+}
+
 static bool
 malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p,
     char const **v_p, size_t *vlen_p) {
@@ -848,10 +907,13 @@ malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v,
     size_t vlen) {
 	malloc_printf("<jemalloc>: %s: %.*s:%.*s\n", msg, (int)klen, k,
 	    (int)vlen, v);
-	had_conf_error = true;
-	if (opt_abort_conf) {
-		malloc_abort_invalid_conf();
+	/* If abort_conf is set, error out after processing all options. */
+	const char *experimental = "experimental_";
+	if (strncmp(k, experimental, strlen(experimental)) == 0) {
+		/* However, tolerate experimental features. */
+		return;
 	}
+	had_conf_error = true;
 }
 
 static void
@@ -869,88 +931,141 @@ malloc_slow_flag_init(void) {
 	malloc_slow = (malloc_slow_flags != 0);
 }
 
-static void
-malloc_conf_init(void) {
-	unsigned i;
-	char buf[PATH_MAX + 1];
-	const char *opts, *k, *v;
-	size_t klen, vlen;
+/* Number of sources for initializing malloc_conf */
+#define MALLOC_CONF_NSOURCES 4
 
-	for (i = 0; i < 4; i++) {
-		/* Get runtime configuration. */
-		switch (i) {
-		case 0:
-			opts = config_malloc_conf;
-			break;
-		case 1:
-			if (je_malloc_conf != NULL) {
-				/*
-				 * Use options that were compiled into the
-				 * program.
-				 */
-				opts = je_malloc_conf;
-			} else {
-				/* No configuration specified. */
-				buf[0] = '\0';
-				opts = buf;
-			}
-			break;
-		case 2: {
-			ssize_t linklen = 0;
+static const char *
+obtain_malloc_conf(unsigned which_source, char buf[PATH_MAX + 1]) {
+	if (config_debug) {
+		static unsigned read_source = 0;
+		/*
+		 * Each source should only be read once, to minimize # of
+		 * syscalls on init.
+		 */
+		assert(read_source++ == which_source);
+	}
+	assert(which_source < MALLOC_CONF_NSOURCES);
+
+	const char *ret;
+	switch (which_source) {
+	case 0:
+		ret = config_malloc_conf;
+		break;
+	case 1:
+		if (je_malloc_conf != NULL) {
+			/* Use options that were compiled into the program. */
+			ret = je_malloc_conf;
+		} else {
+			/* No configuration specified. */
+			ret = NULL;
+		}
+		break;
+	case 2: {
+		ssize_t linklen = 0;
 #ifndef _WIN32
-			int saved_errno = errno;
-			const char *linkname =
+		int saved_errno = errno;
+		const char *linkname =
 #  ifdef JEMALLOC_PREFIX
-			    "/etc/"JEMALLOC_PREFIX"malloc.conf"
+		    "/etc/"JEMALLOC_PREFIX"malloc.conf"
 #  else
-			    "/etc/malloc.conf"
+		    "/etc/malloc.conf"
 #  endif
-			    ;
+		    ;
 
-			/*
-			 * Try to use the contents of the "/etc/malloc.conf"
-			 * symbolic link's name.
-			 */
-			linklen = readlink(linkname, buf, sizeof(buf) - 1);
-			if (linklen == -1) {
-				/* No configuration specified. */
-				linklen = 0;
-				/* Restore errno. */
-				set_errno(saved_errno);
-			}
+		/*
+		 * Try to use the contents of the "/etc/malloc.conf" symbolic
+		 * link's name.
+		 */
+#ifndef JEMALLOC_READLINKAT
+		linklen = readlink(linkname, buf, PATH_MAX);
+#else
+		linklen = readlinkat(AT_FDCWD, linkname, buf, PATH_MAX);
 #endif
-			buf[linklen] = '\0';
-			opts = buf;
-			break;
-		} case 3: {
-			const char *envname =
+		if (linklen == -1) {
+			/* No configuration specified. */
+			linklen = 0;
+			/* Restore errno. */
+			set_errno(saved_errno);
+		}
+#endif
+		buf[linklen] = '\0';
+		ret = buf;
+		break;
+	} case 3: {
+		const char *envname =
 #ifdef JEMALLOC_PREFIX
-			    JEMALLOC_CPREFIX"MALLOC_CONF"
+		    JEMALLOC_CPREFIX"MALLOC_CONF"
 #else
-			    "MALLOC_CONF"
+		    "MALLOC_CONF"
 #endif
-			    ;
+		    ;
 
-			if ((opts = jemalloc_secure_getenv(envname)) != NULL) {
-				/*
-				 * Do nothing; opts is already initialized to
-				 * the value of the MALLOC_CONF environment
-				 * variable.
-				 */
-			} else {
-				/* No configuration specified. */
-				buf[0] = '\0';
-				opts = buf;
-			}
-			break;
-		} default:
-			not_reached();
-			buf[0] = '\0';
-			opts = buf;
+		if ((ret = jemalloc_secure_getenv(envname)) != NULL) {
+			/*
+			 * Do nothing; ret is already initialized to the value
+			 * of the MALLOC_CONF environment variable.
+			 */
+		} else {
+			/* No configuration specified. */
+			ret = NULL;
+		}
+		break;
+	} default:
+		not_reached();
+		ret = NULL;
+	}
+	return ret;
+}
+
+static void
+malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
+    bool initial_call, const char *opts_cache[MALLOC_CONF_NSOURCES],
+    char buf[PATH_MAX + 1]) {
+	static const char *opts_explain[MALLOC_CONF_NSOURCES] = {
+		"string specified via --with-malloc-conf",
+		"string pointed to by the global variable malloc_conf",
+		"\"name\" of the file referenced by the symbolic link named "
+		    "/etc/malloc.conf",
+		"value of the environment variable MALLOC_CONF"
+	};
+	unsigned i;
+	const char *opts, *k, *v;
+	size_t klen, vlen;
+
+	for (i = 0; i < MALLOC_CONF_NSOURCES; i++) {
+		/* Get runtime configuration. */
+		if (initial_call) {
+			opts_cache[i] = obtain_malloc_conf(i, buf);
+		}
+		opts = opts_cache[i];
+		if (!initial_call && opt_confirm_conf) {
+			malloc_printf(
+			    "<jemalloc>: malloc_conf #%u (%s): \"%s\"\n",
+			    i + 1, opts_explain[i], opts != NULL ? opts : "");
+		}
+		if (opts == NULL) {
+			continue;
 		}
 
 		while (*opts != '\0' && !malloc_conf_next(&opts, &k, &klen, &v,
 		    &vlen)) {
+
+#define CONF_ERROR(msg, k, klen, v, vlen)				\
+			if (!initial_call) {				\
+				malloc_conf_error(			\
+				    msg, k, klen, v, vlen);		\
+				cur_opt_valid = false;			\
+			}
+#define CONF_CONTINUE	{						\
+				if (!initial_call && opt_confirm_conf	\
+				    && cur_opt_valid) {			\
+					malloc_printf("<jemalloc>: -- "	\
+					    "Set conf value: %.*s:%.*s"	\
+					    "\n", (int)klen, k,		\
+					    (int)vlen, v);		\
+				}					\
+				continue;				\
+			}
 #define CONF_MATCH(n)							\
 	(sizeof(n)-1 == klen && strncmp(n, k, klen) == 0)
 #define CONF_MATCH_VALUE(n)						\
@@ -962,16 +1077,23 @@ malloc_conf_init(void) {
 				} else if (CONF_MATCH_VALUE("false")) {	\
 					o = false;			\
 				} else {				\
-					malloc_conf_error(		\
-					    "Invalid conf value",	\
+					CONF_ERROR("Invalid conf value",\
 					    k, klen, v, vlen);		\
 				}					\
-				continue;				\
+				CONF_CONTINUE;				\
 			}
-#define CONF_MIN_no(um, min)	false
-#define CONF_MIN_yes(um, min)	((um) < (min))
-#define CONF_MAX_no(um, max)	false
-#define CONF_MAX_yes(um, max)	((um) > (max))
+      /*
+       * One of the CONF_MIN macros below expands, in one of the use points,
+       * to "unsigned integer < 0", which is always false, triggering the
+       * GCC -Wtype-limits warning, which we disable here and re-enable below.
+       */
+      JEMALLOC_DIAGNOSTIC_PUSH
+      JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+
+#define CONF_DONT_CHECK_MIN(um, min)	false
+#define CONF_CHECK_MIN(um, min)	((um) < (min))
+#define CONF_DONT_CHECK_MAX(um, max)	false
+#define CONF_CHECK_MAX(um, max)	((um) > (max))
 #define CONF_HANDLE_T_U(t, o, n, min, max, check_min, check_max, clip)	\
 			if (CONF_MATCH(n)) {				\
 				uintmax_t um;				\
@@ -981,26 +1103,21 @@ malloc_conf_init(void) {
 				um = malloc_strtoumax(v, &end, 0);	\
 				if (get_errno() != 0 || (uintptr_t)end -\
 				    (uintptr_t)v != vlen) {		\
-					malloc_conf_error(		\
-					    "Invalid conf value",	\
+					CONF_ERROR("Invalid conf value",\
 					    k, klen, v, vlen);		\
 				} else if (clip) {			\
-					if (CONF_MIN_##check_min(um,	\
-					    (t)(min))) {		\
+					if (check_min(um, (t)(min))) {	\
 						o = (t)(min);		\
 					} else if (			\
-					    CONF_MAX_##check_max(um,	\
-					    (t)(max))) {		\
+					    check_max(um, (t)(max))) {	\
 						o = (t)(max);		\
 					} else {			\
 						o = (t)um;		\
 					}				\
 				} else {				\
-					if (CONF_MIN_##check_min(um,	\
-					    (t)(min)) ||		\
-					    CONF_MAX_##check_max(um,	\
-					    (t)(max))) {		\
-						malloc_conf_error(	\
+					if (check_min(um, (t)(min)) ||	\
+					    check_max(um, (t)(max))) {	\
+						CONF_ERROR(		\
 						    "Out-of-range "	\
 						    "conf value",	\
 						    k, klen, v, vlen);	\
@@ -1008,7 +1125,7 @@ malloc_conf_init(void) {
 						o = (t)um;		\
 					}				\
 				}					\
-				continue;				\
+				CONF_CONTINUE;				\
 			}
 #define CONF_HANDLE_UNSIGNED(o, n, min, max, check_min, check_max,	\
     clip)								\
@@ -1026,18 +1143,17 @@ malloc_conf_init(void) {
 				l = strtol(v, &end, 0);			\
 				if (get_errno() != 0 || (uintptr_t)end -\
 				    (uintptr_t)v != vlen) {		\
-					malloc_conf_error(		\
-					    "Invalid conf value",	\
+					CONF_ERROR("Invalid conf value",\
 					    k, klen, v, vlen);		\
 				} else if (l < (ssize_t)(min) || l >	\
 				    (ssize_t)(max)) {			\
-					malloc_conf_error(		\
+					CONF_ERROR(			\
 					    "Out-of-range conf value",	\
 					    k, klen, v, vlen);		\
 				} else {				\
 					o = l;				\
 				}					\
-				continue;				\
+				CONF_CONTINUE;				\
 			}
 #define CONF_HANDLE_CHAR_P(o, n, d)					\
 			if (CONF_MATCH(n)) {				\
@@ -1046,13 +1162,34 @@ malloc_conf_init(void) {
 				    sizeof(o)-1;			\
 				strncpy(o, v, cpylen);			\
 				o[cpylen] = '\0';			\
-				continue;				\
+				CONF_CONTINUE;				\
+			}
+
+			bool cur_opt_valid = true;
+
+			CONF_HANDLE_BOOL(opt_confirm_conf, "confirm_conf")
+			if (initial_call) {
+				continue;
 			}
 
 			CONF_HANDLE_BOOL(opt_abort, "abort")
 			CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf")
-			if (opt_abort_conf && had_conf_error) {
-				malloc_abort_invalid_conf();
+			if (CONF_MATCH("metadata_thp")) { /* exact key match */
+				int i;
+				bool match = false;
+				for (i = 0; i < metadata_thp_mode_limit; i++) {
+					if (strncmp(metadata_thp_mode_names[i],
+					    v, vlen) == 0) {
+						opt_metadata_thp = i;
+						match = true;
+						break;
+					}
+				}
+				if (!match) {
+					CONF_ERROR("Invalid conf value",
+					    k, klen, v, vlen);
+				}
+				CONF_CONTINUE;
 			}
 			CONF_HANDLE_BOOL(opt_retain, "retain")
 			if (strncmp("dss", k, klen) == 0) {
@@ -1062,7 +1199,7 @@ malloc_conf_init(void) {
 					if (strncmp(dss_prec_names[i], v, vlen)
 					    == 0) {
 						if (extent_dss_prec_set(i)) {
-							malloc_conf_error(
+							CONF_ERROR(
 							    "Error setting dss",
 							    k, klen, v, vlen);
 						} else {
@@ -1074,13 +1211,36 @@ malloc_conf_init(void) {
 					}
 				}
 				if (!match) {
-					malloc_conf_error("Invalid conf value",
+					CONF_ERROR("Invalid conf value",
 					    k, klen, v, vlen);
 				}
-				continue;
+				CONF_CONTINUE;
 			}
 			CONF_HANDLE_UNSIGNED(opt_narenas, "narenas", 1,
-			    UINT_MAX, yes, no, false)
+			    UINT_MAX, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX,
+			    false)
+			if (CONF_MATCH("bin_shards")) {
+				const char *bin_shards_segment_cur = v;
+				size_t vlen_left = vlen;
+				do {
+					size_t size_start;
+					size_t size_end;
+					size_t nshards;
+					bool err = malloc_conf_multi_sizes_next(
+					    &bin_shards_segment_cur, &vlen_left,
+					    &size_start, &size_end, &nshards);
+					if (err || bin_update_shard_size(
+					    bin_shard_sizes, size_start,
+					    size_end, nshards)) {
+						CONF_ERROR(
+						    "Invalid settings for "
+						    "bin_shards", k, klen, v,
+						    vlen);
+						break;
+					}
+				} while (vlen_left > 0);
+				CONF_CONTINUE;
+			}
 			CONF_HANDLE_SSIZE_T(opt_dirty_decay_ms,
 			    "dirty_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) <
 			    QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) :
@@ -1092,7 +1252,7 @@ malloc_conf_init(void) {
 			CONF_HANDLE_BOOL(opt_stats_print, "stats_print")
 			if (CONF_MATCH("stats_print_opts")) {
 				init_opt_stats_print_opts(v, vlen);
-				continue;
+				CONF_CONTINUE;
 			}
 			if (config_fill) {
 				if (CONF_MATCH("junk")) {
@@ -1113,11 +1273,11 @@ malloc_conf_init(void) {
 						opt_junk_alloc = false;
 						opt_junk_free = true;
 					} else {
-						malloc_conf_error(
-						    "Invalid conf value", k,
-						    klen, v, vlen);
+						CONF_ERROR(
+						    "Invalid conf value",
+						    k, klen, v, vlen);
 					}
-					continue;
+					CONF_CONTINUE;
 				}
 				CONF_HANDLE_BOOL(opt_zero, "zero")
 			}
@@ -1130,15 +1290,31 @@ malloc_conf_init(void) {
 			CONF_HANDLE_BOOL(opt_tcache, "tcache")
 			CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max",
 			    -1, (sizeof(size_t) << 3) - 1)
+
+			/*
+			 * The runtime option of oversize_threshold remains
+			 * undocumented.  It may be tweaked in the next major
+			 * release (6.0).  The default value 8M is rather
+			 * conservative / safe.  Tuning it further down may
+			 * improve fragmentation a bit more, but may also cause
+			 * contention on the huge arena.
+			 */
+			CONF_HANDLE_SIZE_T(opt_oversize_threshold,
+			    "oversize_threshold", 0, SC_LARGE_MAXCLASS,
+			    CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, false)
+			CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit,
+			    "lg_extent_max_active_fit", 0,
+			    (sizeof(size_t) << 3), CONF_DONT_CHECK_MIN,
+			    CONF_CHECK_MAX, false)
+
 			if (strncmp("percpu_arena", k, klen) == 0) {
-				int i;
 				bool match = false;
-				for (i = percpu_arena_mode_names_base; i <
+				for (int i = percpu_arena_mode_names_base; i <
 				    percpu_arena_mode_names_limit; i++) {
 					if (strncmp(percpu_arena_mode_names[i],
 					    v, vlen) == 0) {
 						if (!have_percpu_arena) {
-							malloc_conf_error(
+							CONF_ERROR(
 							    "No getcpu support",
 							    k, klen, v, vlen);
 						}
@@ -1148,13 +1324,42 @@ malloc_conf_init(void) {
 					}
 				}
 				if (!match) {
-					malloc_conf_error("Invalid conf value",
+					CONF_ERROR("Invalid conf value",
 					    k, klen, v, vlen);
 				}
-				continue;
+				CONF_CONTINUE;
 			}
 			CONF_HANDLE_BOOL(opt_background_thread,
 			    "background_thread");
+			CONF_HANDLE_SIZE_T(opt_max_background_threads,
+					   "max_background_threads", 1,
+					   opt_max_background_threads,
+					   CONF_CHECK_MIN, CONF_CHECK_MAX,
+					   true);
+			if (CONF_MATCH("slab_sizes")) {
+				bool err;
+				const char *slab_size_segment_cur = v;
+				size_t vlen_left = vlen;
+				do {
+					size_t slab_start;
+					size_t slab_end;
+					size_t pgs;
+					err = malloc_conf_multi_sizes_next(
+					    &slab_size_segment_cur,
+					    &vlen_left, &slab_start, &slab_end,
+					    &pgs);
+					if (!err) {
+						sc_data_update_slab_size(
+						    sc_data, slab_start,
+						    slab_end, (int)pgs);
+					} else {
+						CONF_ERROR("Invalid settings "
+						    "for slab_sizes",
+						    k, klen, v, vlen);
+					}
+				} while (!err && vlen_left > 0);
+				CONF_CONTINUE;
+			}
 			if (config_prof) {
 				CONF_HANDLE_BOOL(opt_prof, "prof")
 				CONF_HANDLE_CHAR_P(opt_prof_prefix,
@@ -1164,7 +1369,8 @@ malloc_conf_init(void) {
 				    "prof_thread_active_init")
 				CONF_HANDLE_SIZE_T(opt_lg_prof_sample,
 				    "lg_prof_sample", 0, (sizeof(uint64_t) << 3)
-				    - 1, no, yes, true)
+				    - 1, CONF_DONT_CHECK_MIN, CONF_CHECK_MAX,
+				    true)
 				CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum")
 				CONF_HANDLE_SSIZE_T(opt_lg_prof_interval,
 				    "lg_prof_interval", -1,
@@ -1172,25 +1378,77 @@ malloc_conf_init(void) {
 				CONF_HANDLE_BOOL(opt_prof_gdump, "prof_gdump")
 				CONF_HANDLE_BOOL(opt_prof_final, "prof_final")
 				CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak")
+				CONF_HANDLE_BOOL(opt_prof_log, "prof_log")
+			}
+			if (config_log) {
+				if (CONF_MATCH("log")) {
+					/* Keep room for the trailing NUL. */
+					size_t cap = sizeof(log_var_names) - 1;
+					size_t cpylen = vlen < cap ? vlen : cap;
+					strncpy(log_var_names, v, cpylen);
+					log_var_names[cpylen] = '\0';
+					CONF_CONTINUE;
+				}
 			}
-			malloc_conf_error("Invalid conf pair", k, klen, v,
-			    vlen);
+			if (CONF_MATCH("thp")) {
+				bool match = false;
+				for (int i = 0; i < thp_mode_names_limit; i++) {
+					if (strncmp(thp_mode_names[i],v, vlen)
+					    == 0) {
+						if (!have_madvise_huge) {
+							CONF_ERROR(
+							    "No THP support",
+							    k, klen, v, vlen);
+						}
+						opt_thp = i;
+						match = true;
+						break;
+					}
+				}
+				if (!match) {
+					CONF_ERROR("Invalid conf value",
+					    k, klen, v, vlen);
+				}
+				CONF_CONTINUE;
+			}
+			CONF_ERROR("Invalid conf pair", k, klen, v, vlen);
+#undef CONF_ERROR
+#undef CONF_CONTINUE
 #undef CONF_MATCH
 #undef CONF_MATCH_VALUE
 #undef CONF_HANDLE_BOOL
-#undef CONF_MIN_no
-#undef CONF_MIN_yes
-#undef CONF_MAX_no
-#undef CONF_MAX_yes
+#undef CONF_DONT_CHECK_MIN
+#undef CONF_CHECK_MIN
+#undef CONF_DONT_CHECK_MAX
+#undef CONF_CHECK_MAX
 #undef CONF_HANDLE_T_U
 #undef CONF_HANDLE_UNSIGNED
 #undef CONF_HANDLE_SIZE_T
 #undef CONF_HANDLE_SSIZE_T
 #undef CONF_HANDLE_CHAR_P
+    /* Re-enable diagnostic "-Wtype-limits" */
+    JEMALLOC_DIAGNOSTIC_POP
+		}
+		if (opt_abort_conf && had_conf_error) {
+			malloc_abort_invalid_conf();
 		}
 	}
+	atomic_store_b(&log_init_done, true, ATOMIC_RELEASE);
+}
+
+static void
+malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) {
+	const char *opts_cache[MALLOC_CONF_NSOURCES] = {NULL, NULL, NULL, NULL};
+	char buf[PATH_MAX + 1];
+
+	/* The first call only sets the confirm_conf option and opts_cache. */
+	malloc_conf_init_helper(NULL, NULL, true, opts_cache, buf);
+	malloc_conf_init_helper(sc_data, bin_shard_sizes, false, opts_cache,
+	    NULL);
 }
 
+#undef MALLOC_CONF_NSOURCES
+
 static bool
 malloc_init_hard_needed(void) {
 	if (malloc_initialized() || (IS_INITIALIZER && malloc_init_state ==
@@ -1221,10 +1479,33 @@ static bool
 malloc_init_hard_a0_locked() {
 	malloc_initializer = INITIALIZER;
 
+	JEMALLOC_DIAGNOSTIC_PUSH
+	JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+	sc_data_t sc_data = {0};
+	JEMALLOC_DIAGNOSTIC_POP
+
+	/*
+	 * Ordering here is somewhat tricky; we need sc_boot() first, since that
+	 * determines what the size classes will be, and then
+	 * malloc_conf_init(), since any slab size tweaking will need to be done
+	 * before sz_boot and bin_boot, which assume that the values they read
+	 * out of sc_data_global are final.
+	 */
+	sc_boot(&sc_data);
+	unsigned bin_shard_sizes[SC_NBINS];
+	bin_shard_sizes_boot(bin_shard_sizes);
+	/*
+	 * prof_boot0 only initializes opt_prof_prefix.  We need to do it before
+	 * we parse malloc_conf options, in case malloc_conf parsing overwrites
+	 * it.
+	 */
 	if (config_prof) {
 		prof_boot0();
 	}
-	malloc_conf_init();
+	malloc_conf_init(&sc_data, bin_shard_sizes);
+	sz_boot(&sc_data);
+	bin_boot(&sc_data, bin_shard_sizes);
+
 	if (opt_stats_print) {
 		/* Print statistics at exit. */
 		if (atexit(stats_print_atexit) != 0) {
@@ -1249,7 +1530,7 @@ malloc_init_hard_a0_locked() {
 	if (config_prof) {
 		prof_boot1();
 	}
-	arena_boot();
+	arena_boot(&sc_data);
 	if (tcache_boot(TSDN_NULL)) {
 		return true;
 	}
@@ -1257,11 +1538,13 @@ malloc_init_hard_a0_locked() {
 	    malloc_mutex_rank_exclusive)) {
 		return true;
 	}
+	hook_boot();
 	/*
 	 * Create enough scaffolding to allow recursive allocation in
 	 * malloc_ncpus().
 	 */
 	narenas_auto = 1;
+	manual_arena_base = narenas_auto + 1;
 	memset(arenas, 0, sizeof(arena_t *) * narenas_auto);
 	/*
 	 * Initialize one arena here.  The rest are lazily created in
@@ -1409,6 +1692,10 @@ malloc_init_narenas(void) {
 		    narenas_auto);
 	}
 	narenas_total_set(narenas_auto);
+	if (arena_init_huge()) {
+		narenas_total_inc();
+	}
+	manual_arena_base = narenas_total_get();
 
 	return false;
 }
@@ -1493,6 +1780,8 @@ malloc_init_hard(void) {
 	post_reentrancy(tsd);
 	malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock);
 
+	witness_assert_lockless(witness_tsd_tsdn(
+	    tsd_witness_tsdp_get_unsafe(tsd)));
 	malloc_tsd_boot1();
 	/* Update TSD after tsd_boot1. */
 	tsd = tsd_fetch();
@@ -1500,12 +1789,11 @@ malloc_init_hard(void) {
 		assert(have_background_thread);
 		/*
 		 * Need to finish init & unlock first before creating background
-		 * threads (pthread_create depends on malloc).
+		 * threads (pthread_create depends on malloc).  ctl_init (which
+		 * sets isthreaded) needs to be called without holding any lock.
 		 */
-		malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
-		bool err = background_thread_create(tsd, 0);
-		malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
-		if (err) {
+		background_thread_ctl_init(tsd_tsdn(tsd));
+		if (background_thread_create(tsd, 0)) {
 			return true;
 		}
 	}
@@ -1528,8 +1816,12 @@ typedef struct static_opts_s static_opts_t;
 struct static_opts_s {
 	/* Whether or not allocation size may overflow. */
 	bool may_overflow;
-	/* Whether or not allocations of size 0 should be treated as size 1. */
-	bool bump_empty_alloc;
+
+	/*
+	 * Whether or not allocations (with alignment) of size 0 should be
+	 * treated as size 1.
+	 */
+	bool bump_empty_aligned_alloc;
 	/*
 	 * Whether to assert that allocations are not of size 0 (after any
 	 * bumping).
@@ -1562,12 +1854,16 @@ struct static_opts_s {
 	 * initialization) options.
 	 */
 	bool slow;
+	/*
+	 * Whether usize must be computed and stored in dopts->usize.
+	 */
+	bool usize;
 };
 
 JEMALLOC_ALWAYS_INLINE void
 static_opts_init(static_opts_t *static_opts) {
 	static_opts->may_overflow = false;
-	static_opts->bump_empty_alloc = false;
+	static_opts->bump_empty_aligned_alloc = false;
 	static_opts->assert_nonempty_alloc = false;
 	static_opts->null_out_result_on_error = false;
 	static_opts->set_errno_on_error = false;
@@ -1575,6 +1871,7 @@ static_opts_init(static_opts_t *static_opts) {
 	static_opts->oom_string = "";
 	static_opts->invalid_alignment_string = "";
 	static_opts->slow = false;
+	static_opts->usize = false;
 }
 
 /*
@@ -1589,6 +1886,7 @@ static_opts_init(static_opts_t *static_opts) {
 typedef struct dynamic_opts_s dynamic_opts_t;
 struct dynamic_opts_s {
 	void **result;
+	size_t usize;
 	size_t num_items;
 	size_t item_size;
 	size_t alignment;
@@ -1600,6 +1898,7 @@ struct dynamic_opts_s {
 JEMALLOC_ALWAYS_INLINE void
 dynamic_opts_init(dynamic_opts_t *dynamic_opts) {
 	dynamic_opts->result = NULL;
+	dynamic_opts->usize = 0;
 	dynamic_opts->num_items = 0;
 	dynamic_opts->item_size = 0;
 	dynamic_opts->alignment = 0;
@@ -1663,12 +1962,13 @@ imalloc_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd,
 	szind_t ind_large;
 	size_t bumped_usize = usize;
 
-	if (usize <= SMALL_MAXCLASS) {
-		assert(((dopts->alignment == 0) ? sz_s2u(LARGE_MINCLASS) :
-		    sz_sa2u(LARGE_MINCLASS, dopts->alignment))
-		    == LARGE_MINCLASS);
-		ind_large = sz_size2index(LARGE_MINCLASS);
-		bumped_usize = sz_s2u(LARGE_MINCLASS);
+	if (usize <= SC_SMALL_MAXCLASS) {
+		assert(((dopts->alignment == 0) ?
+		    sz_s2u(SC_LARGE_MINCLASS) :
+		    sz_sa2u(SC_LARGE_MINCLASS, dopts->alignment))
+			== SC_LARGE_MINCLASS);
+		ind_large = sz_size2index(SC_LARGE_MINCLASS);
+		bumped_usize = sz_s2u(SC_LARGE_MINCLASS);
 		ret = imalloc_no_sample(sopts, dopts, tsd, bumped_usize,
 		    bumped_usize, ind_large);
 		if (unlikely(ret == NULL)) {
@@ -1701,7 +2001,7 @@ compute_size_with_overflow(bool may_overflow, dynamic_opts_t *dopts,
 	}
 
 	/* A size_t with its high-half bits all set to 1. */
-	const static size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2);
+	static const size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2);
 
 	*size = dopts->item_size * dopts->num_items;
 
@@ -1751,17 +2051,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) {
 		goto label_oom;
 	}
 
-	/* Validate the user input. */
-	if (sopts->bump_empty_alloc) {
-		if (unlikely(size == 0)) {
-			size = 1;
-		}
-	}
-
-	if (sopts->assert_nonempty_alloc) {
-		assert (size != 0);
-	}
-
 	if (unlikely(dopts->alignment < sopts->min_alignment
 	    || (dopts->alignment & (dopts->alignment - 1)) != 0)) {
 		goto label_invalid_alignment;
@@ -1771,19 +2060,32 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) {
 
 	if (dopts->alignment == 0) {
 		ind = sz_size2index(size);
-		if (unlikely(ind >= NSIZES)) {
+		if (unlikely(ind >= SC_NSIZES)) {
 			goto label_oom;
 		}
-		if (config_stats || (config_prof && opt_prof)) {
+		if (config_stats || (config_prof && opt_prof) || sopts->usize) {
 			usize = sz_index2size(ind);
-			assert(usize > 0 && usize <= LARGE_MAXCLASS);
+			dopts->usize = usize;
+			assert(usize > 0 && usize
+			    <= SC_LARGE_MAXCLASS);
 		}
 	} else {
+		if (sopts->bump_empty_aligned_alloc) {
+			if (unlikely(size == 0)) {
+				size = 1;
+			}
+		}
 		usize = sz_sa2u(size, dopts->alignment);
-		if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+		dopts->usize = usize;
+		if (unlikely(usize == 0
+		    || usize > SC_LARGE_MAXCLASS)) {
 			goto label_oom;
 		}
 	}
+	/* Validate the user input. */
+	if (sopts->assert_nonempty_alloc) {
+		assert (size != 0);
+	}
 
 	check_entry_exit_locking(tsd_tsdn(tsd));
 
@@ -1816,7 +2118,8 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) {
 
 		alloc_ctx_t alloc_ctx;
 		if (likely((uintptr_t)tctx == (uintptr_t)1U)) {
-			alloc_ctx.slab = (usize <= SMALL_MAXCLASS);
+			alloc_ctx.slab = (usize
+			    <= SC_SMALL_MAXCLASS);
 			allocation = imalloc_no_sample(
 			    sopts, dopts, tsd, usize, usize, ind);
 		} else if ((uintptr_t)tctx > (uintptr_t)1U) {
@@ -1921,9 +2224,8 @@ label_invalid_alignment:
 	return EINVAL;
 }
 
-/* Returns the errno-style error code of the allocation. */
-JEMALLOC_ALWAYS_INLINE int
-imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
+JEMALLOC_ALWAYS_INLINE bool
+imalloc_init_check(static_opts_t *sopts, dynamic_opts_t *dopts) {
 	if (unlikely(!malloc_initialized()) && unlikely(malloc_init())) {
 		if (config_xmalloc && unlikely(opt_xmalloc)) {
 			malloc_write(sopts->oom_string);
@@ -1933,6 +2235,16 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
 		set_errno(ENOMEM);
 		*dopts->result = NULL;
 
+		return false;
+	}
+
+	return true;
+}
+
+/* Returns the errno-style error code of the allocation. */
+JEMALLOC_ALWAYS_INLINE int
+imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
+	if (tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) {
 		return ENOMEM;
 	}
 
@@ -1945,27 +2257,27 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {
 		sopts->slow = false;
 		return imalloc_body(sopts, dopts, tsd);
 	} else {
+		if (!tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) {
+			return ENOMEM;
+		}
+
 		sopts->slow = true;
 		return imalloc_body(sopts, dopts, tsd);
 	}
 }
-/******************************************************************************/
-/*
- * Begin malloc(3)-compatible functions.
- */
 
-JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
-void JEMALLOC_NOTHROW *
-JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
-je_malloc(size_t size) {
+JEMALLOC_NOINLINE
+void *
+malloc_default(size_t size) {
 	void *ret;
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
 
+	LOG("core.malloc.entry", "size: %zu", size);
+
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
 	sopts.null_out_result_on_error = true;
 	sopts.set_errno_on_error = true;
 	sopts.oom_string = "<jemalloc>: Error in malloc(): out of memory\n";
@@ -1975,10 +2287,107 @@ je_malloc(size_t size) {
 	dopts.item_size = size;
 
 	imalloc(&sopts, &dopts);
+	/*
+	 * Note that this branch gets optimized away -- it immediately follows
+	 * the check on tsd_fast that sets sopts.slow.
+	 */
+	if (sopts.slow) {
+		uintptr_t args[3] = {size};
+		hook_invoke_alloc(hook_alloc_malloc, ret, (uintptr_t)ret, args);
+	}
+
+	LOG("core.malloc.exit", "result: %p", ret);
 
 	return ret;
 }
 
+/******************************************************************************/
+/*
+ * Begin malloc(3)-compatible functions.
+ */
+
+/*
+ * malloc() fastpath.
+ *
+ * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
+ * tcache.  If either of these is false, we tail-call to the slowpath,
+ * malloc_default().  Tail-calling is used to avoid any caller-saved
+ * registers.
+ *
+ * fastpath supports ticker and profiling, both of which will also
+ * tail-call to the slowpath if they fire.
+ */
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+void JEMALLOC_NOTHROW *
+JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
+je_malloc(size_t size) {
+	LOG("core.malloc.entry", "size: %zu", size);
+
+	if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
+		return malloc_default(size);
+	}
+
+	tsd_t *tsd = tsd_get(false);
+	if (unlikely(!tsd || !tsd_fast(tsd) || (size > SC_LOOKUP_MAXCLASS))) {
+		return malloc_default(size);
+	}
+
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+
+	if (unlikely(ticker_trytick(&tcache->gc_ticker))) {
+		return malloc_default(size);
+	}
+
+	szind_t ind = sz_size2index_lookup(size);
+	size_t usize;
+	if (config_stats || config_prof) {
+		usize = sz_index2size(ind);
+	}
+	/* Fast path relies on size being a bin. I.e. SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS */
+	assert(ind < SC_NBINS);
+	assert(size <= SC_SMALL_MAXCLASS);
+
+	if (config_prof) {
+		int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd);
+		bytes_until_sample -= usize;
+		tsd_bytes_until_sample_set(tsd, bytes_until_sample);
+
+		if (unlikely(bytes_until_sample < 0)) {
+			/*
+			 * Avoid a prof_active check on the fastpath.
+			 * If prof_active is false, set bytes_until_sample to
+			 * a large value.  If prof_active is set to true,
+			 * bytes_until_sample will be reset.
+			 */
+			if (!prof_active) {
+				tsd_bytes_until_sample_set(tsd, SSIZE_MAX);
+			}
+			return malloc_default(size);
+		}
+	}
+
+	cache_bin_t *bin = tcache_small_bin_get(tcache, ind);
+	bool tcache_success;
+	void* ret = cache_bin_alloc_easy(bin, &tcache_success);
+
+	if (tcache_success) {
+		if (config_stats) {
+			*tsd_thread_allocatedp_get(tsd) += usize;
+			bin->tstats.nrequests++;
+		}
+		if (config_prof) {
+			tcache->prof_accumbytes += usize;
+		}
+
+		LOG("core.malloc.exit", "result: %p", ret);
+
+		/* Fastpath success */
+		return ret;
+	}
+
+	return malloc_default(size);
+}
+
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
 JEMALLOC_ATTR(nonnull(1))
 je_posix_memalign(void **memptr, size_t alignment, size_t size) {
@@ -1986,10 +2395,13 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) {
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
 
+	LOG("core.posix_memalign.entry", "mem ptr: %p, alignment: %zu, "
+	    "size: %zu", memptr, alignment, size);
+
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
+	sopts.bump_empty_aligned_alloc = true;
 	sopts.min_alignment = sizeof(void *);
 	sopts.oom_string =
 	    "<jemalloc>: Error allocating aligned memory: out of memory\n";
@@ -2002,6 +2414,16 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) {
 	dopts.alignment = alignment;
 
 	ret = imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {(uintptr_t)memptr, (uintptr_t)alignment,
+			(uintptr_t)size};
+		hook_invoke_alloc(hook_alloc_posix_memalign, *memptr,
+		    (uintptr_t)ret, args);
+	}
+
+	LOG("core.posix_memalign.exit", "result: %d, alloc ptr: %p", ret,
+	    *memptr);
+
 	return ret;
 }
 
@@ -2014,10 +2436,13 @@ je_aligned_alloc(size_t alignment, size_t size) {
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
 
+	LOG("core.aligned_alloc.entry", "alignment: %zu, size: %zu\n",
+	    alignment, size);
+
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
+	sopts.bump_empty_aligned_alloc = true;
 	sopts.null_out_result_on_error = true;
 	sopts.set_errno_on_error = true;
 	sopts.min_alignment = 1;
@@ -2032,6 +2457,14 @@ je_aligned_alloc(size_t alignment, size_t size) {
 	dopts.alignment = alignment;
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {(uintptr_t)alignment, (uintptr_t)size};
+		hook_invoke_alloc(hook_alloc_aligned_alloc, ret,
+		    (uintptr_t)ret, args);
+	}
+
+	LOG("core.aligned_alloc.exit", "result: %p", ret);
+
 	return ret;
 }
 
@@ -2043,11 +2476,12 @@ je_calloc(size_t num, size_t size) {
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
 
+	LOG("core.calloc.entry", "num: %zu, size: %zu\n", num, size);
+
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
 	sopts.may_overflow = true;
-	sopts.bump_empty_alloc = true;
 	sopts.null_out_result_on_error = true;
 	sopts.set_errno_on_error = true;
 	sopts.oom_string = "<jemalloc>: Error in calloc(): out of memory\n";
@@ -2058,26 +2492,34 @@ je_calloc(size_t num, size_t size) {
 	dopts.zero = true;
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {(uintptr_t)num, (uintptr_t)size};
+		hook_invoke_alloc(hook_alloc_calloc, ret, (uintptr_t)ret, args);
+	}
+
+	LOG("core.calloc.exit", "result: %p", ret);
 
 	return ret;
 }
 
 static void *
 irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize,
-    prof_tctx_t *tctx) {
+    prof_tctx_t *tctx, hook_ralloc_args_t *hook_args) {
 	void *p;
 
 	if (tctx == NULL) {
 		return NULL;
 	}
-	if (usize <= SMALL_MAXCLASS) {
-		p = iralloc(tsd, old_ptr, old_usize, LARGE_MINCLASS, 0, false);
+	if (usize <= SC_SMALL_MAXCLASS) {
+		p = iralloc(tsd, old_ptr, old_usize,
+		    SC_LARGE_MINCLASS, 0, false, hook_args);
 		if (p == NULL) {
 			return NULL;
 		}
 		arena_prof_promote(tsd_tsdn(tsd), p, usize);
 	} else {
-		p = iralloc(tsd, old_ptr, old_usize, usize, 0, false);
+		p = iralloc(tsd, old_ptr, old_usize, usize, 0, false,
+		    hook_args);
 	}
 
 	return p;
@@ -2085,7 +2527,7 @@ irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize,
 
 JEMALLOC_ALWAYS_INLINE void *
 irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize,
-   alloc_ctx_t *alloc_ctx) {
+   alloc_ctx_t *alloc_ctx, hook_ralloc_args_t *hook_args) {
 	void *p;
 	bool prof_active;
 	prof_tctx_t *old_tctx, *tctx;
@@ -2094,9 +2536,11 @@ irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize,
 	old_tctx = prof_tctx_get(tsd_tsdn(tsd), old_ptr, alloc_ctx);
 	tctx = prof_alloc_prep(tsd, usize, prof_active, true);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) {
-		p = irealloc_prof_sample(tsd, old_ptr, old_usize, usize, tctx);
+		p = irealloc_prof_sample(tsd, old_ptr, old_usize, usize, tctx,
+		    hook_args);
 	} else {
-		p = iralloc(tsd, old_ptr, old_usize, usize, 0, false);
+		p = iralloc(tsd, old_ptr, old_usize, usize, 0, false,
+		    hook_args);
 	}
 	if (unlikely(p == NULL)) {
 		prof_alloc_rollback(tsd, tctx, true);
@@ -2125,7 +2569,7 @@ ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) {
 	rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 	rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 	    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-	assert(alloc_ctx.szind != NSIZES);
+	assert(alloc_ctx.szind != SC_NSIZES);
 
 	size_t usize;
 	if (config_prof && opt_prof) {
@@ -2161,17 +2605,37 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) {
 	assert(malloc_initialized() || IS_INITIALIZER);
 
 	alloc_ctx_t alloc_ctx, *ctx;
-	if (config_prof && opt_prof) {
+	if (!config_cache_oblivious && ((uintptr_t)ptr & PAGE_MASK) != 0) {
+		/*
+		 * When cache_oblivious is disabled and ptr is not page aligned,
+		 * the allocation was not sampled -- usize can be used to
+		 * determine szind directly.
+		 */
+		alloc_ctx.szind = sz_size2index(usize);
+		alloc_ctx.slab = true;
+		ctx = &alloc_ctx;
+		if (config_debug) {
+			alloc_ctx_t dbg_ctx;
+			rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
+			rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree,
+			    rtree_ctx, (uintptr_t)ptr, true, &dbg_ctx.szind,
+			    &dbg_ctx.slab);
+			assert(dbg_ctx.szind == alloc_ctx.szind);
+			assert(dbg_ctx.slab == alloc_ctx.slab);
+		}
+	} else if (config_prof && opt_prof) {
 		rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 		rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 		    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
 		assert(alloc_ctx.szind == sz_size2index(usize));
 		ctx = &alloc_ctx;
-		prof_free(tsd, ptr, usize, ctx);
 	} else {
 		ctx = NULL;
 	}
 
+	if (config_prof && opt_prof) {
+		prof_free(tsd, ptr, usize, ctx);
+	}
 	if (config_stats) {
 		*tsd_thread_deallocatedp_get(tsd) += usize;
 	}
@@ -2186,11 +2650,14 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) {
 JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
 void JEMALLOC_NOTHROW *
 JEMALLOC_ALLOC_SIZE(2)
-je_realloc(void *ptr, size_t size) {
+je_realloc(void *ptr, size_t arg_size) {
 	void *ret;
 	tsdn_t *tsdn JEMALLOC_CC_SILENCE_INIT(NULL);
 	size_t usize JEMALLOC_CC_SILENCE_INIT(0);
 	size_t old_usize = 0;
+	size_t size = arg_size;
+
+	LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size);
 
 	if (unlikely(size == 0)) {
 		if (ptr != NULL) {
@@ -2203,7 +2670,13 @@ je_realloc(void *ptr, size_t size) {
 			} else {
 				tcache = NULL;
 			}
+
+			uintptr_t args[3] = {(uintptr_t)ptr, size};
+			hook_invoke_dalloc(hook_dalloc_realloc, ptr, args);
+
 			ifree(tsd, ptr, tcache, true);
+
+			LOG("core.realloc.exit", "result: %p", NULL);
 			return NULL;
 		}
 		size = 1;
@@ -2215,28 +2688,59 @@ je_realloc(void *ptr, size_t size) {
 
 		check_entry_exit_locking(tsd_tsdn(tsd));
 
+
+		hook_ralloc_args_t hook_args = {true, {(uintptr_t)ptr,
+			(uintptr_t)arg_size, 0, 0}};
+
 		alloc_ctx_t alloc_ctx;
 		rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 		rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 		    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-		assert(alloc_ctx.szind != NSIZES);
+		assert(alloc_ctx.szind != SC_NSIZES);
 		old_usize = sz_index2size(alloc_ctx.szind);
 		assert(old_usize == isalloc(tsd_tsdn(tsd), ptr));
 		if (config_prof && opt_prof) {
 			usize = sz_s2u(size);
-			ret = unlikely(usize == 0 || usize > LARGE_MAXCLASS) ?
-			    NULL : irealloc_prof(tsd, ptr, old_usize, usize,
-			    &alloc_ctx);
+			if (unlikely(usize == 0
+			    || usize > SC_LARGE_MAXCLASS)) {
+				ret = NULL;
+			} else {
+				ret = irealloc_prof(tsd, ptr, old_usize, usize,
+				    &alloc_ctx, &hook_args);
+			}
 		} else {
 			if (config_stats) {
 				usize = sz_s2u(size);
 			}
-			ret = iralloc(tsd, ptr, old_usize, size, 0, false);
+			ret = iralloc(tsd, ptr, old_usize, size, 0, false,
+			    &hook_args);
 		}
 		tsdn = tsd_tsdn(tsd);
 	} else {
 		/* realloc(NULL, size) is equivalent to malloc(size). */
-		return je_malloc(size);
+		static_opts_t sopts;
+		dynamic_opts_t dopts;
+
+		static_opts_init(&sopts);
+		dynamic_opts_init(&dopts);
+
+		sopts.null_out_result_on_error = true;
+		sopts.set_errno_on_error = true;
+		sopts.oom_string =
+		    "<jemalloc>: Error in realloc(): out of memory\n";
+
+		dopts.result = &ret;
+		dopts.num_items = 1;
+		dopts.item_size = size;
+
+		imalloc(&sopts, &dopts);
+		if (sopts.slow) {
+			uintptr_t args[3] = {(uintptr_t)ptr, arg_size};
+			hook_invoke_alloc(hook_alloc_realloc, ret,
+			    (uintptr_t)ret, args);
+		}
+
+		return ret;
 	}
 
 	if (unlikely(ret == NULL)) {
@@ -2257,11 +2761,14 @@ je_realloc(void *ptr, size_t size) {
 	}
 	UTRACE(ptr, size, ret);
 	check_entry_exit_locking(tsdn);
+
+	LOG("core.realloc.exit", "result: %p", ret);
 	return ret;
 }
 
-JEMALLOC_EXPORT void JEMALLOC_NOTHROW
-je_free(void *ptr) {
+JEMALLOC_NOINLINE
+void
+free_default(void *ptr) {
 	UTRACE(ptr, 0, 0);
 	if (likely(ptr != NULL)) {
 		/*
@@ -2287,12 +2794,82 @@ je_free(void *ptr) {
 			} else {
 				tcache = NULL;
 			}
+			uintptr_t args_raw[3] = {(uintptr_t)ptr};
+			hook_invoke_dalloc(hook_dalloc_free, ptr, args_raw);
 			ifree(tsd, ptr, tcache, true);
 		}
 		check_entry_exit_locking(tsd_tsdn(tsd));
 	}
 }
 
+JEMALLOC_ALWAYS_INLINE /* false => caller must take the default free path */
+bool free_fastpath(void *ptr, size_t size, bool size_hint) {
+	tsd_t *tsd = tsd_get(false);
+	if (unlikely(!tsd || !tsd_fast(tsd))) {
+		return false; /* no tsd, or tsd_fast() is false */
+	}
+
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+
+	alloc_ctx_t alloc_ctx;
+	/*
+	 * Without a size hint, or when config_cache_oblivious is set (addresses
+	 * are randomized), szind/slab have to be looked up in the rtree.  With
+	 * a size hint and !config_cache_oblivious, checking PAGE alignment is
+	 * enough to detect sampled objects.  See also isfree().
+	 */
+	if (!size_hint || config_cache_oblivious) {
+		rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
+		bool res = rtree_szind_slab_read_fast(tsd_tsdn(tsd), &extents_rtree,
+						      rtree_ctx, (uintptr_t)ptr,
+						      &alloc_ctx.szind, &alloc_ctx.slab);
+
+		/* Note: profiled objects will have alloc_ctx.slab set */
+		if (!res || !alloc_ctx.slab) {
+			return false;
+		}
+		assert(alloc_ctx.szind != SC_NSIZES);
+	} else {
+		/*
+		 * Check for both sizes that are too large, and for sampled objects.
+		 * Sampled objects are always page-aligned.  The sampled object check
+		 * will also check for null ptr.
+		 */
+		if (size > SC_LOOKUP_MAXCLASS || (((uintptr_t)ptr & PAGE_MASK) == 0)) {
+			return false;
+		}
+		alloc_ctx.szind = sz_size2index_lookup(size);
+	}
+
+	if (unlikely(ticker_trytick(&tcache->gc_ticker))) {
+		return false; /* ticker fired: leave this free to the default path */
+	}
+
+	cache_bin_t *bin = tcache_small_bin_get(tcache, alloc_ctx.szind);
+	cache_bin_info_t *bin_info = &tcache_bin_info[alloc_ctx.szind];
+	if (!cache_bin_dalloc_easy(bin, bin_info, ptr)) {
+		return false; /* cache_bin_dalloc_easy() did not take ptr */
+	}
+
+	if (config_stats) {
+		size_t usize = sz_index2size(alloc_ctx.szind);
+		*tsd_thread_deallocatedp_get(tsd) += usize;
+	}
+
+	return true;
+}
+
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW
+je_free(void *ptr) { /* Try free_fastpath(); fall back to free_default(). */
+	LOG("core.free.entry", "ptr: %p", ptr);
+	/* No size is known here: pass size 0 with size_hint == false. */
+	if (!free_fastpath(ptr, 0, false)) {
+		free_default(ptr);
+	}
+
+	LOG("core.free.exit", "");
+}
+
 /*
  * End malloc(3)-compatible functions.
  */
@@ -2310,10 +2887,12 @@ je_memalign(size_t alignment, size_t size) {
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
 
+	LOG("core.memalign.entry", "alignment: %zu, size: %zu\n", alignment,
+	    size);
+
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
 	sopts.min_alignment = 1;
 	sopts.oom_string =
 	    "<jemalloc>: Error allocating aligned memory: out of memory\n";
@@ -2327,6 +2906,13 @@ je_memalign(size_t alignment, size_t size) {
 	dopts.alignment = alignment;
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {alignment, size};
+		hook_invoke_alloc(hook_alloc_memalign, ret, (uintptr_t)ret,
+		    args);
+	}
+
+	LOG("core.memalign.exit", "result: %p", ret);
 	return ret;
 }
 #endif
@@ -2341,10 +2927,11 @@ je_valloc(size_t size) {
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
 
+	LOG("core.valloc.entry", "size: %zu\n", size);
+
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
-	sopts.bump_empty_alloc = true;
 	sopts.null_out_result_on_error = true;
 	sopts.min_alignment = PAGE;
 	sopts.oom_string =
@@ -2358,7 +2945,12 @@ je_valloc(size_t size) {
 	dopts.alignment = PAGE;
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {size};
+		hook_invoke_alloc(hook_alloc_valloc, ret, (uintptr_t)ret, args);
+	}
 
+	LOG("core.valloc.exit", "result: %p\n", ret);
 	return ret;
 }
 #endif
@@ -2424,6 +3016,82 @@ int __posix_memalign(void** r, size_t a, size_t s) PREALIAS(je_posix_memalign);
  * Begin non-standard functions.
  */
 
+#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API
+/* Two-level paste so that macro arguments are expanded before ## is applied. */
+#define JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y) x ## y
+#define JEMALLOC_SMALLOCX_CONCAT_HELPER2(x, y)  \
+  JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y)
+/* Result of smallocx: ptr is set via dopts.result, size from dopts.usize. */
+typedef struct {
+	void *ptr;
+	size_t size;
+} smallocx_return_t;
+
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+smallocx_return_t JEMALLOC_NOTHROW
+/*
+ * The attribute JEMALLOC_ATTR(malloc) cannot be used due to:
+ *  - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86488
+ */
+JEMALLOC_SMALLOCX_CONCAT_HELPER2(je_smallocx_, JEMALLOC_VERSION_GID_IDENT)
+  (size_t size, int flags) {
+	/*
+	 * mallocx()-style allocation that also returns dopts.usize in ret.size.
+	 * Note: the attribute JEMALLOC_ALLOC_SIZE(1) cannot be used here
+	 * because it makes writing beyond the `size` of the `ptr` undefined
+	 * behavior, but the objective of this function is to allow writing
+	 * beyond `size` up to `smallocx_return_t::size`.
+	 */
+	smallocx_return_t ret;
+	static_opts_t sopts;
+	dynamic_opts_t dopts;
+
+	LOG("core.smallocx.entry", "size: %zu, flags: %d", size, flags);
+
+	static_opts_init(&sopts);
+	dynamic_opts_init(&dopts);
+
+	sopts.assert_nonempty_alloc = true;
+	sopts.null_out_result_on_error = true;
+	sopts.oom_string = "<jemalloc>: Error in mallocx(): out of memory\n";
+	sopts.usize = true;
+
+	dopts.result = &ret.ptr;
+	dopts.num_items = 1;
+	dopts.item_size = size;
+	if (unlikely(flags != 0)) { /* decode MALLOCX_* bits into dopts */
+		if ((flags & MALLOCX_LG_ALIGN_MASK) != 0) {
+			dopts.alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags);
+		}
+
+		dopts.zero = MALLOCX_ZERO_GET(flags);
+
+		if ((flags & MALLOCX_TCACHE_MASK) != 0) {
+			if ((flags & MALLOCX_TCACHE_MASK)
+			    == MALLOCX_TCACHE_NONE) {
+				dopts.tcache_ind = TCACHE_IND_NONE;
+			} else {
+				dopts.tcache_ind = MALLOCX_TCACHE_GET(flags);
+			}
+		} else {
+			dopts.tcache_ind = TCACHE_IND_AUTOMATIC;
+		}
+
+		if ((flags & MALLOCX_ARENA_MASK) != 0)
+			dopts.arena_ind = MALLOCX_ARENA_GET(flags);
+	}
+
+	imalloc(&sopts, &dopts);
+	assert(dopts.usize == je_nallocx(size, flags));
+	ret.size = dopts.usize;
+
+	LOG("core.smallocx.exit", "result: %p, size: %zu", ret.ptr, ret.size);
+	return ret;
+}
+#undef JEMALLOC_SMALLOCX_CONCAT_HELPER
+#undef JEMALLOC_SMALLOCX_CONCAT_HELPER2
+#endif
+
 JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
 void JEMALLOC_NOTHROW *
 JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
@@ -2432,6 +3100,8 @@ je_mallocx(size_t size, int flags) {
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
 
+	LOG("core.mallocx.entry", "size: %zu, flags: %d", size, flags);
+
 	static_opts_init(&sopts);
 	dynamic_opts_init(&dopts);
 
@@ -2465,28 +3135,36 @@ je_mallocx(size_t size, int flags) {
 	}
 
 	imalloc(&sopts, &dopts);
+	if (sopts.slow) {
+		uintptr_t args[3] = {size, flags};
+		hook_invoke_alloc(hook_alloc_mallocx, ret, (uintptr_t)ret,
+		    args);
+	}
+
+	LOG("core.mallocx.exit", "result: %p", ret);
 	return ret;
 }
 
 static void *
 irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize,
     size_t usize, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena,
-    prof_tctx_t *tctx) {
+    prof_tctx_t *tctx, hook_ralloc_args_t *hook_args) {
 	void *p;
 
 	if (tctx == NULL) {
 		return NULL;
 	}
-	if (usize <= SMALL_MAXCLASS) {
-		p = iralloct(tsdn, old_ptr, old_usize, LARGE_MINCLASS,
-		    alignment, zero, tcache, arena);
+	if (usize <= SC_SMALL_MAXCLASS) {
+		p = iralloct(tsdn, old_ptr, old_usize,
+		    SC_LARGE_MINCLASS, alignment, zero, tcache,
+		    arena, hook_args);
 		if (p == NULL) {
 			return NULL;
 		}
 		arena_prof_promote(tsdn, p, usize);
 	} else {
 		p = iralloct(tsdn, old_ptr, old_usize, usize, alignment, zero,
-		    tcache, arena);
+		    tcache, arena, hook_args);
 	}
 
 	return p;
@@ -2495,7 +3173,7 @@ irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize,
 JEMALLOC_ALWAYS_INLINE void *
 irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size,
     size_t alignment, size_t *usize, bool zero, tcache_t *tcache,
-    arena_t *arena, alloc_ctx_t *alloc_ctx) {
+    arena_t *arena, alloc_ctx_t *alloc_ctx, hook_ralloc_args_t *hook_args) {
 	void *p;
 	bool prof_active;
 	prof_tctx_t *old_tctx, *tctx;
@@ -2505,10 +3183,10 @@ irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size,
 	tctx = prof_alloc_prep(tsd, *usize, prof_active, false);
 	if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) {
 		p = irallocx_prof_sample(tsd_tsdn(tsd), old_ptr, old_usize,
-		    *usize, alignment, zero, tcache, arena, tctx);
+		    *usize, alignment, zero, tcache, arena, tctx, hook_args);
 	} else {
 		p = iralloct(tsd_tsdn(tsd), old_ptr, old_usize, size, alignment,
-		    zero, tcache, arena);
+		    zero, tcache, arena, hook_args);
 	}
 	if (unlikely(p == NULL)) {
 		prof_alloc_rollback(tsd, tctx, false);
@@ -2545,6 +3223,10 @@ je_rallocx(void *ptr, size_t size, int flags) {
 	arena_t *arena;
 	tcache_t *tcache;
 
+	LOG("core.rallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr,
+	    size, flags);
+
+
 	assert(ptr != NULL);
 	assert(size != 0);
 	assert(malloc_initialized() || IS_INITIALIZER);
@@ -2575,23 +3257,27 @@ je_rallocx(void *ptr, size_t size, int flags) {
 	rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 	rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 	    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-	assert(alloc_ctx.szind != NSIZES);
+	assert(alloc_ctx.szind != SC_NSIZES);
 	old_usize = sz_index2size(alloc_ctx.szind);
 	assert(old_usize == isalloc(tsd_tsdn(tsd), ptr));
+
+	hook_ralloc_args_t hook_args = {false, {(uintptr_t)ptr, size, flags,
+		0}};
 	if (config_prof && opt_prof) {
 		usize = (alignment == 0) ?
 		    sz_s2u(size) : sz_sa2u(size, alignment);
-		if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) {
+		if (unlikely(usize == 0
+		    || usize > SC_LARGE_MAXCLASS)) {
 			goto label_oom;
 		}
 		p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize,
-		    zero, tcache, arena, &alloc_ctx);
+		    zero, tcache, arena, &alloc_ctx, &hook_args);
 		if (unlikely(p == NULL)) {
 			goto label_oom;
 		}
 	} else {
 		p = iralloct(tsd_tsdn(tsd), ptr, old_usize, size, alignment,
-		    zero, tcache, arena);
+		    zero, tcache, arena, &hook_args);
 		if (unlikely(p == NULL)) {
 			goto label_oom;
 		}
@@ -2607,6 +3293,8 @@ je_rallocx(void *ptr, size_t size, int flags) {
 	}
 	UTRACE(ptr, size, p);
 	check_entry_exit_locking(tsd_tsdn(tsd));
+
+	LOG("core.rallocx.exit", "result: %p", p);
 	return p;
 label_oom:
 	if (config_xmalloc && unlikely(opt_xmalloc)) {
@@ -2615,20 +3303,22 @@ label_oom:
 	}
 	UTRACE(ptr, size, 0);
 	check_entry_exit_locking(tsd_tsdn(tsd));
+
+	LOG("core.rallocx.exit", "result: %p", NULL);
 	return NULL;
 }
 
 JEMALLOC_ALWAYS_INLINE size_t
 ixallocx_helper(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size,
     size_t extra, size_t alignment, bool zero) {
-	size_t usize;
+	size_t newsize;
 
-	if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero)) {
+	if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero,
+	    &newsize)) {
 		return old_usize;
 	}
-	usize = isalloc(tsdn, ptr);
 
-	return usize;
+	return newsize; /* as reported by ixalloc() via &newsize (replaces the isalloc() lookup) */
 }
 
 static size_t
@@ -2662,17 +3352,19 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size,
 	 */
 	if (alignment == 0) {
 		usize_max = sz_s2u(size+extra);
-		assert(usize_max > 0 && usize_max <= LARGE_MAXCLASS);
+		assert(usize_max > 0
+		    && usize_max <= SC_LARGE_MAXCLASS);
 	} else {
 		usize_max = sz_sa2u(size+extra, alignment);
-		if (unlikely(usize_max == 0 || usize_max > LARGE_MAXCLASS)) {
+		if (unlikely(usize_max == 0
+		    || usize_max > SC_LARGE_MAXCLASS)) {
 			/*
 			 * usize_max is out of range, and chances are that
 			 * allocation will fail, but use the maximum possible
 			 * value and carry on with prof_alloc_prep(), just in
 			 * case allocation succeeds.
 			 */
-			usize_max = LARGE_MAXCLASS;
+			usize_max = SC_LARGE_MAXCLASS;
 		}
 	}
 	tctx = prof_alloc_prep(tsd, usize_max, prof_active, false);
@@ -2701,6 +3393,9 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) {
 	size_t alignment = MALLOCX_ALIGN_GET(flags);
 	bool zero = flags & MALLOCX_ZERO;
 
+	LOG("core.xallocx.entry", "ptr: %p, size: %zu, extra: %zu, "
+	    "flags: %d", ptr, size, extra, flags);
+
 	assert(ptr != NULL);
 	assert(size != 0);
 	assert(SIZE_T_MAX - size >= extra);
@@ -2712,24 +3407,24 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) {
 	rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
 	rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
 	    (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
-	assert(alloc_ctx.szind != NSIZES);
+	assert(alloc_ctx.szind != SC_NSIZES);
 	old_usize = sz_index2size(alloc_ctx.szind);
 	assert(old_usize == isalloc(tsd_tsdn(tsd), ptr));
 	/*
 	 * The API explicitly absolves itself of protecting against (size +
 	 * extra) numerical overflow, but we may need to clamp extra to avoid
-	 * exceeding LARGE_MAXCLASS.
+	 * exceeding SC_LARGE_MAXCLASS.
 	 *
 	 * Ordinarily, size limit checking is handled deeper down, but here we
 	 * have to check as part of (size + extra) clamping, since we need the
 	 * clamped value in the above helper functions.
 	 */
-	if (unlikely(size > LARGE_MAXCLASS)) {
+	if (unlikely(size > SC_LARGE_MAXCLASS)) {
 		usize = old_usize;
 		goto label_not_resized;
 	}
-	if (unlikely(LARGE_MAXCLASS - size < extra)) {
-		extra = LARGE_MAXCLASS - size;
+	if (unlikely(SC_LARGE_MAXCLASS - size < extra)) {
+		extra = SC_LARGE_MAXCLASS - size;
 	}
 
 	if (config_prof && opt_prof) {
@@ -2748,8 +3443,16 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) {
 		*tsd_thread_deallocatedp_get(tsd) += old_usize;
 	}
 label_not_resized:
+	if (unlikely(!tsd_fast(tsd))) {
+		uintptr_t args[4] = {(uintptr_t)ptr, size, extra, flags};
+		hook_invoke_expand(hook_expand_xallocx, ptr, old_usize,
+		    usize, (uintptr_t)usize, args);
+	}
+
 	UTRACE(ptr, size, ptr);
 	check_entry_exit_locking(tsd_tsdn(tsd));
+
+	LOG("core.xallocx.exit", "result: %zu", usize);
 	return usize;
 }
 
@@ -2759,6 +3462,8 @@ je_sallocx(const void *ptr, int flags) {
 	size_t usize;
 	tsdn_t *tsdn;
 
+	LOG("core.sallocx.entry", "ptr: %p, flags: %d", ptr, flags);
+
 	assert(malloc_initialized() || IS_INITIALIZER);
 	assert(ptr != NULL);
 
@@ -2773,11 +3478,15 @@ je_sallocx(const void *ptr, int flags) {
 	}
 
 	check_entry_exit_locking(tsdn);
+
+	LOG("core.sallocx.exit", "result: %zu", usize);
 	return usize;
 }
 
 JEMALLOC_EXPORT void JEMALLOC_NOTHROW
 je_dallocx(void *ptr, int flags) {
+	LOG("core.dallocx.entry", "ptr: %p, flags: %d", ptr, flags);
+
 	assert(ptr != NULL);
 	assert(malloc_initialized() || IS_INITIALIZER);
 
@@ -2812,9 +3521,13 @@ je_dallocx(void *ptr, int flags) {
 		tsd_assert_fast(tsd);
 		ifree(tsd, ptr, tcache, false);
 	} else {
+		uintptr_t args_raw[3] = {(uintptr_t)ptr, flags};
+		hook_invoke_dalloc(hook_dalloc_dallocx, ptr, args_raw);
 		ifree(tsd, ptr, tcache, true);
 	}
 	check_entry_exit_locking(tsd_tsdn(tsd));
+
+	LOG("core.dallocx.exit", "");
 }
 
 JEMALLOC_ALWAYS_INLINE size_t
@@ -2831,8 +3544,8 @@ inallocx(tsdn_t *tsdn, size_t size, int flags) {
 	return usize;
 }
 
-JEMALLOC_EXPORT void JEMALLOC_NOTHROW
-je_sdallocx(void *ptr, size_t size, int flags) {
+JEMALLOC_NOINLINE void
+sdallocx_default(void *ptr, size_t size, int flags) {
 	assert(ptr != NULL);
 	assert(malloc_initialized() || IS_INITIALIZER);
 
@@ -2869,9 +3582,36 @@ je_sdallocx(void *ptr, size_t size, int flags) {
 		tsd_assert_fast(tsd);
 		isfree(tsd, ptr, usize, tcache, false);
 	} else {
+		uintptr_t args_raw[3] = {(uintptr_t)ptr, size, flags};
+		hook_invoke_dalloc(hook_dalloc_sdallocx, ptr, args_raw);
 		isfree(tsd, ptr, usize, tcache, true);
 	}
 	check_entry_exit_locking(tsd_tsdn(tsd));
+
+}
+
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW
+je_sdallocx(void *ptr, size_t size, int flags) {
+	LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr,
+		size, flags);
+	/* free_fastpath() takes no flags, so it is only tried when flags == 0. */
+	if (flags !=0 || !free_fastpath(ptr, size, true)) {
+		sdallocx_default(ptr, size, flags);
+	}
+
+	LOG("core.sdallocx.exit", "");
+}
+
+void JEMALLOC_NOTHROW
+je_sdallocx_noflags(void *ptr, size_t size) {
+	LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: 0", ptr,
+		size);
+	/* Same as je_sdallocx() with flags == 0, minus the flags test. */
+	if (!free_fastpath(ptr, size, true)) {
+		sdallocx_default(ptr, size, 0);
+	}
+
+	LOG("core.sdallocx.exit", "");
+}
 
 JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW
@@ -2883,6 +3623,7 @@ je_nallocx(size_t size, int flags) {
 	assert(size != 0);
 
 	if (unlikely(malloc_init())) {
+		LOG("core.nallocx.exit", "result: %zu", ZU(0));
 		return 0;
 	}
 
@@ -2890,11 +3631,13 @@ je_nallocx(size_t size, int flags) {
 	check_entry_exit_locking(tsdn);
 
 	usize = inallocx(tsdn, size, flags);
-	if (unlikely(usize > LARGE_MAXCLASS)) {
+	if (unlikely(usize > SC_LARGE_MAXCLASS)) {
+		LOG("core.nallocx.exit", "result: %zu", ZU(0));
 		return 0;
 	}
 
 	check_entry_exit_locking(tsdn);
+	LOG("core.nallocx.exit", "result: %zu", usize);
 	return usize;
 }
 
@@ -2904,7 +3647,10 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp,
 	int ret;
 	tsd_t *tsd;
 
+	LOG("core.mallctl.entry", "name: %s", name);
+
 	if (unlikely(malloc_init())) {
+		LOG("core.mallctl.exit", "result: %d", EAGAIN);
 		return EAGAIN;
 	}
 
@@ -2912,6 +3658,8 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp,
 	check_entry_exit_locking(tsd_tsdn(tsd));
 	ret = ctl_byname(tsd, name, oldp, oldlenp, newp, newlen);
 	check_entry_exit_locking(tsd_tsdn(tsd));
+
+	LOG("core.mallctl.exit", "result: %d", ret);
 	return ret;
 }
 
@@ -2919,7 +3667,10 @@ JEMALLOC_EXPORT int JEMALLOC_NOTHROW
 je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) {
 	int ret;
 
+	LOG("core.mallctlnametomib.entry", "name: %s", name);
+
 	if (unlikely(malloc_init())) {
+		LOG("core.mallctlnametomib.exit", "result: %d", EAGAIN);
 		return EAGAIN;
 	}
 
@@ -2927,6 +3678,8 @@ je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) {
 	check_entry_exit_locking(tsd_tsdn(tsd));
 	ret = ctl_nametomib(tsd, name, mibp, miblenp);
 	check_entry_exit_locking(tsd_tsdn(tsd));
+
+	LOG("core.mallctlnametomib.exit", "result: %d", ret);
 	return ret;
 }
 
@@ -2936,7 +3689,10 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 	int ret;
 	tsd_t *tsd;
 
+	LOG("core.mallctlbymib.entry", "");
+
 	if (unlikely(malloc_init())) {
+		LOG("core.mallctlbymib.exit", "result: %d", EAGAIN);
 		return EAGAIN;
 	}
 
@@ -2944,6 +3700,7 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 	check_entry_exit_locking(tsd_tsdn(tsd));
 	ret = ctl_bymib(tsd, mib, miblen, oldp, oldlenp, newp, newlen);
 	check_entry_exit_locking(tsd_tsdn(tsd));
+	LOG("core.mallctlbymib.exit", "result: %d", ret);
 	return ret;
 }
 
@@ -2952,10 +3709,13 @@ je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
     const char *opts) {
 	tsdn_t *tsdn;
 
+	LOG("core.malloc_stats_print.entry", "");
+
 	tsdn = tsdn_fetch();
 	check_entry_exit_locking(tsdn);
 	stats_print(write_cb, cbopaque, opts);
 	check_entry_exit_locking(tsdn);
+	LOG("core.malloc_stats_print.exit", "");
 }
 
 JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW
@@ -2963,6 +3723,8 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) {
 	size_t ret;
 	tsdn_t *tsdn;
 
+	LOG("core.malloc_usable_size.entry", "ptr: %p", ptr);
+
 	assert(malloc_initialized() || IS_INITIALIZER);
 
 	tsdn = tsdn_fetch();
@@ -2980,6 +3742,7 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) {
 	}
 
 	check_entry_exit_locking(tsdn);
+	LOG("core.malloc_usable_size.exit", "result: %zu", ret);
 	return ret;
 }
 
@@ -3084,6 +3847,7 @@ _malloc_prefork(void)
 		}
 	}
 	prof_prefork1(tsd_tsdn(tsd));
+	tsd_prefork(tsd);
 }
 
 #ifndef JEMALLOC_MUTEX_INIT_CB
@@ -3106,6 +3870,8 @@ _malloc_postfork(void)
 
 	tsd = tsd_fetch();
 
+	tsd_postfork_parent(tsd);
+
 	witness_postfork_parent(tsd_witness_tsdp_get(tsd));
 	/* Release all mutexes, now that fork() has completed. */
 	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
@@ -3133,6 +3899,8 @@ jemalloc_postfork_child(void) {
 
 	tsd = tsd_fetch();
 
+	tsd_postfork_child(tsd);
+
 	witness_postfork_child(tsd_witness_tsdp_get(tsd));
 	/* Release all mutexes, now that fork() has completed. */
 	for (i = 0, narenas = narenas_total_get(); i < narenas; i++) {
diff --git a/deps/jemalloc/src/jemalloc_cpp.cpp b/deps/jemalloc/src/jemalloc_cpp.cpp
index 844ab398a7..da0441a7c9 100644
--- a/deps/jemalloc/src/jemalloc_cpp.cpp
+++ b/deps/jemalloc/src/jemalloc_cpp.cpp
@@ -39,12 +39,10 @@ void	operator delete(void *ptr, std::size_t size) noexcept;
 void	operator delete[](void *ptr, std::size_t size) noexcept;
 #endif
 
-template <bool IsNoExcept>
-void *
-newImpl(std::size_t size) noexcept(IsNoExcept) {
-	void *ptr = je_malloc(size);
-	if (likely(ptr != nullptr))
-		return ptr;
+JEMALLOC_NOINLINE
+static void *
+handleOOM(std::size_t size, bool nothrow) {
+	void *ptr = nullptr;
 
 	while (ptr == nullptr) {
 		std::new_handler handler;
@@ -68,11 +66,22 @@ newImpl(std::size_t size) noexcept(IsNoExcept) {
 		ptr = je_malloc(size);
 	}
 
-	if (ptr == nullptr && !IsNoExcept)
+	if (ptr == nullptr && !nothrow)
 		std::__throw_bad_alloc();
 	return ptr;
 }
 
+template <bool IsNoExcept>
+JEMALLOC_ALWAYS_INLINE
+void *
+newImpl(std::size_t size) noexcept(IsNoExcept) {
+	void *ptr = je_malloc(size);
+	if (likely(ptr != nullptr))
+		return ptr;
+	/* Allocation failed: defer to the out-of-line (NOINLINE) handleOOM(). */
+	return handleOOM(size, IsNoExcept);
+}
+
 void *
 operator new(std::size_t size) {
 	return newImpl<false>(size);
@@ -119,14 +128,14 @@ operator delete(void *ptr, std::size_t size) noexcept {
 	if (unlikely(ptr == nullptr)) {
 		return;
 	}
-	je_sdallocx(ptr, size, /*flags=*/0);
+	je_sdallocx_noflags(ptr, size);
 }
 
 void operator delete[](void *ptr, std::size_t size) noexcept {
 	if (unlikely(ptr == nullptr)) {
 		return;
 	}
-	je_sdallocx(ptr, size, /*flags=*/0);
+	je_sdallocx_noflags(ptr, size);
 }
 
 #endif  // __cpp_sized_deallocation
diff --git a/deps/jemalloc/src/large.c b/deps/jemalloc/src/large.c
index 27a2c67987..8e7a781d33 100644
--- a/deps/jemalloc/src/large.c
+++ b/deps/jemalloc/src/large.c
@@ -28,7 +28,7 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
 	assert(!tsdn_null(tsdn) || arena != NULL);
 
 	ausize = sz_sa2u(usize, alignment);
-	if (unlikely(ausize == 0 || ausize > LARGE_MAXCLASS)) {
+	if (unlikely(ausize == 0 || ausize > SC_LARGE_MAXCLASS)) {
 		return NULL;
 	}
 
@@ -42,7 +42,7 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
 	 */
 	is_zeroed = zero;
 	if (likely(!tsdn_null(tsdn))) {
-		arena = arena_choose(tsdn_tsd(tsdn), arena);
+		arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, usize);
 	}
 	if (unlikely(arena == NULL) || (extent = arena_extent_alloc_large(tsdn,
 	    arena, usize, alignment, &is_zeroed)) == NULL) {
@@ -109,7 +109,7 @@ large_ralloc_no_move_shrink(tsdn_t *tsdn, extent_t *extent, size_t usize) {
 	if (diff != 0) {
 		extent_t *trail = extent_split_wrapper(tsdn, arena,
 		    &extent_hooks, extent, usize + sz_large_pad,
-		    sz_size2index(usize), false, diff, NSIZES, false);
+		    sz_size2index(usize), false, diff, SC_NSIZES, false);
 		if (trail == NULL) {
 			return true;
 		}
@@ -154,17 +154,17 @@ large_ralloc_no_move_expand(tsdn_t *tsdn, extent_t *extent, size_t usize,
 	bool new_mapping;
 	if ((trail = extents_alloc(tsdn, arena, &extent_hooks,
 	    &arena->extents_dirty, extent_past_get(extent), trailsize, 0,
-	    CACHELINE, false, NSIZES, &is_zeroed_trail, &commit)) != NULL
+	    CACHELINE, false, SC_NSIZES, &is_zeroed_trail, &commit)) != NULL
 	    || (trail = extents_alloc(tsdn, arena, &extent_hooks,
 	    &arena->extents_muzzy, extent_past_get(extent), trailsize, 0,
-	    CACHELINE, false, NSIZES, &is_zeroed_trail, &commit)) != NULL) {
+	    CACHELINE, false, SC_NSIZES, &is_zeroed_trail, &commit)) != NULL) {
 		if (config_stats) {
 			new_mapping = false;
 		}
 	} else {
 		if ((trail = extent_alloc_wrapper(tsdn, arena, &extent_hooks,
 		    extent_past_get(extent), trailsize, 0, CACHELINE, false,
-		    NSIZES, &is_zeroed_trail, &commit)) == NULL) {
+		    SC_NSIZES, &is_zeroed_trail, &commit)) == NULL) {
 			return true;
 		}
 		if (config_stats) {
@@ -221,9 +221,10 @@ large_ralloc_no_move(tsdn_t *tsdn, extent_t *extent, size_t usize_min,
 	size_t oldusize = extent_usize_get(extent);
 
 	/* The following should have been caught by callers. */
-	assert(usize_min > 0 && usize_max <= LARGE_MAXCLASS);
+	assert(usize_min > 0 && usize_max <= SC_LARGE_MAXCLASS);
 	/* Both allocation sizes must be large to avoid a move. */
-	assert(oldusize >= LARGE_MINCLASS && usize_max >= LARGE_MINCLASS);
+	assert(oldusize >= SC_LARGE_MINCLASS
+	    && usize_max >= SC_LARGE_MINCLASS);
 
 	if (usize_max > oldusize) {
 		/* Attempt to expand the allocation in-place. */
@@ -270,17 +271,23 @@ large_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize,
 }
 
 void *
-large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize,
-    size_t alignment, bool zero, tcache_t *tcache) {
-	size_t oldusize = extent_usize_get(extent);
+large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize,
+    size_t alignment, bool zero, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args) {
+	extent_t *extent = iealloc(tsdn, ptr);
 
+	size_t oldusize = extent_usize_get(extent);
 	/* The following should have been caught by callers. */
-	assert(usize > 0 && usize <= LARGE_MAXCLASS);
+	assert(usize > 0 && usize <= SC_LARGE_MAXCLASS);
 	/* Both allocation sizes must be large to avoid a move. */
-	assert(oldusize >= LARGE_MINCLASS && usize >= LARGE_MINCLASS);
+	assert(oldusize >= SC_LARGE_MINCLASS
+	    && usize >= SC_LARGE_MINCLASS);
 
 	/* Try to avoid moving the allocation. */
 	if (!large_ralloc_no_move(tsdn, extent, usize, usize, zero)) {
+		hook_invoke_expand(hook_args->is_realloc
+		    ? hook_expand_realloc : hook_expand_rallocx, ptr, oldusize,
+		    usize, (uintptr_t)ptr, hook_args->args);
 		return extent_addr_get(extent);
 	}
 
@@ -295,6 +302,12 @@ large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize,
 		return NULL;
 	}
 
+	hook_invoke_alloc(hook_args->is_realloc
+	    ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret,
+	    hook_args->args);
+	hook_invoke_dalloc(hook_args->is_realloc
+	    ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args);
+
 	size_t copysize = (usize < oldusize) ? usize : oldusize;
 	memcpy(ret, extent_addr_get(extent), copysize);
 	isdalloct(tsdn, extent_addr_get(extent), oldusize, tcache, NULL, true);
@@ -318,8 +331,9 @@ large_dalloc_prep_impl(tsdn_t *tsdn, arena_t *arena, extent_t *extent,
 		large_dalloc_maybe_junk(extent_addr_get(extent),
 		    extent_usize_get(extent));
 	} else {
-		malloc_mutex_assert_owner(tsdn, &arena->large_mtx);
+		/* Only hold the large_mtx if necessary. */
 		if (!arena_is_auto(arena)) {
+			malloc_mutex_assert_owner(tsdn, &arena->large_mtx);
 			extent_list_remove(&arena->large, extent);
 		}
 	}
@@ -369,3 +383,13 @@ void
 large_prof_tctx_reset(tsdn_t *tsdn, extent_t *extent) {
 	large_prof_tctx_set(tsdn, extent, (prof_tctx_t *)(uintptr_t)1U);
 }
+
+nstime_t
+large_prof_alloc_time_get(const extent_t *extent) {
+	return extent_prof_alloc_time_get(extent);
+}
+
+void
+large_prof_alloc_time_set(extent_t *extent, nstime_t t) {
+	extent_prof_alloc_time_set(extent, t);
+}
diff --git a/deps/jemalloc/src/log.c b/deps/jemalloc/src/log.c
new file mode 100644
index 0000000000..778902fb9b
--- /dev/null
+++ b/deps/jemalloc/src/log.c
@@ -0,0 +1,78 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+#include "jemalloc/internal/log.h"
+
+char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE];
+atomic_b_t log_init_done = ATOMIC_INIT(false);
+
+/*
+ * Returns a pointer to the end of the segment that starts at segment_begin,
+ * i.e. the first '|' delimiter or the terminating '\0', whichever comes first.
+ */
+static const char *
+log_var_extract_segment(const char* segment_begin) {
+	const char *end;
+	for (end = segment_begin; *end != '\0' && *end != '|'; end++) {
+	}
+	return end;
+}
+
+static bool
+log_var_matches_segment(const char *segment_begin, const char *segment_end,
+    const char *log_var_begin, const char *log_var_end) {
+	assert(segment_begin <= segment_end);
+	assert(log_var_begin < log_var_end);
+
+	ptrdiff_t segment_len = segment_end - segment_begin;
+	ptrdiff_t log_var_len = log_var_end - log_var_begin;
+	/* The special '.' segment matches everything. */
+	if (segment_len == 1 && *segment_begin == '.') {
+		return true;
+	}
+        if (segment_len == log_var_len) {
+		return strncmp(segment_begin, log_var_begin, segment_len) == 0;
+	} else if (segment_len < log_var_len) {
+		return strncmp(segment_begin, log_var_begin, segment_len) == 0
+		    && log_var_begin[segment_len] == '.';
+        } else {
+		return false;
+	}
+}
+
+unsigned
+log_var_update_state(log_var_t *log_var) {
+	const char *log_var_begin = log_var->name;
+	const char *log_var_end = log_var->name + strlen(log_var->name);
+
+	/* Pointer to the beginning of the current segment. */
+	const char *segment_begin = log_var_names;
+
+	/*
+	 * If log_init_done is false, we haven't parsed the malloc conf yet.  To
+	 * avoid log-spew, we default to not displaying anything.
+	 */
+	if (!atomic_load_b(&log_init_done, ATOMIC_ACQUIRE)) {
+		return LOG_INITIALIZED_NOT_ENABLED;
+	}
+
+	while (true) {
+		const char *segment_end = log_var_extract_segment(
+		    segment_begin);
+		assert(segment_end < log_var_names + JEMALLOC_LOG_VAR_BUFSIZE);
+		if (log_var_matches_segment(segment_begin, segment_end,
+		    log_var_begin, log_var_end)) {
+			atomic_store_u(&log_var->state, LOG_ENABLED,
+			    ATOMIC_RELAXED);
+			return LOG_ENABLED;
+		}
+		if (*segment_end == '\0') {
+			/* Hit the end of the segment string with no match. */
+			atomic_store_u(&log_var->state,
+			    LOG_INITIALIZED_NOT_ENABLED, ATOMIC_RELAXED);
+			return LOG_INITIALIZED_NOT_ENABLED;
+		}
+		/* Otherwise, skip the delimiter and continue. */
+		segment_begin = segment_end + 1;
+	}
+}
diff --git a/deps/jemalloc/src/malloc_io.c b/deps/jemalloc/src/malloc_io.c
index 6b99afcd3f..d7cb0f5284 100644
--- a/deps/jemalloc/src/malloc_io.c
+++ b/deps/jemalloc/src/malloc_io.c
@@ -70,20 +70,7 @@ static char *x2s(uintmax_t x, bool alt_form, bool uppercase, char *s,
 /* malloc_message() setup. */
 static void
 wrtmessage(void *cbopaque, const char *s) {
-#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write)
-	/*
-	 * Use syscall(2) rather than write(2) when possible in order to avoid
-	 * the possibility of memory allocation within libc.  This is necessary
-	 * on FreeBSD; most operating systems do not have this problem though.
-	 *
-	 * syscall() returns long or int, depending on platform, so capture the
-	 * unused result in the widest plausible type to avoid compiler
-	 * warnings.
-	 */
-	UNUSED long result = syscall(SYS_write, STDERR_FILENO, s, strlen(s));
-#else
-	UNUSED ssize_t result = write(STDERR_FILENO, s, strlen(s));
-#endif
+	malloc_write_fd(STDERR_FILENO, s, strlen(s));
 }
 
 JEMALLOC_EXPORT void	(*je_malloc_message)(void *, const char *s);
@@ -111,7 +98,7 @@ buferror(int err, char *buf, size_t buflen) {
 	FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0,
 	    (LPSTR)buf, (DWORD)buflen, NULL);
 	return 0;
-#elif defined(__GLIBC__) && defined(_GNU_SOURCE)
+#elif defined(JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE) && defined(_GNU_SOURCE)
 	char *b = strerror_r(err, buf, buflen);
 	if (b != buf) {
 		strncpy(buf, b, buflen);
@@ -375,7 +362,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) {
 	}								\
 } while (0)
 #define GET_ARG_NUMERIC(val, len) do {					\
-	switch (len) {							\
+	switch ((unsigned char)len) {					\
 	case '?':							\
 		val = va_arg(ap, int);					\
 		break;							\
@@ -645,7 +632,6 @@ malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque,
 		 */
 		write_cb = (je_malloc_message != NULL) ? je_malloc_message :
 		    wrtmessage;
-		cbopaque = NULL;
 	}
 
 	malloc_vsnprintf(buf, sizeof(buf), format, ap);
diff --git a/deps/jemalloc/src/mutex.c b/deps/jemalloc/src/mutex.c
index a528ef0c24..3f920f5b1c 100644
--- a/deps/jemalloc/src/mutex.c
+++ b/deps/jemalloc/src/mutex.c
@@ -4,6 +4,7 @@
 
 #include "jemalloc/internal/assert.h"
 #include "jemalloc/internal/malloc_io.h"
+#include "jemalloc/internal/spin.h"
 
 #ifndef _CRT_SPINCOUNT
 #define _CRT_SPINCOUNT 4000
@@ -45,7 +46,7 @@ JEMALLOC_EXPORT int	_pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
 void
 malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
 	mutex_prof_data_t *data = &mutex->prof_data;
-	UNUSED nstime_t before = NSTIME_ZERO_INITIALIZER;
+	nstime_t before = NSTIME_ZERO_INITIALIZER;
 
 	if (ncpus == 1) {
 		goto label_spin_done;
@@ -53,8 +54,9 @@ malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
 
 	int cnt = 0, max_cnt = MALLOC_MUTEX_MAX_SPIN;
 	do {
-		CPU_SPINWAIT;
-		if (!malloc_mutex_trylock_final(mutex)) {
+		spin_cpu_spinwait();
+		if (!atomic_load_b(&mutex->locked, ATOMIC_RELAXED)
+                    && !malloc_mutex_trylock_final(mutex)) {
 			data->n_spin_acquired++;
 			return;
 		}
@@ -143,9 +145,7 @@ malloc_mutex_init(malloc_mutex_t *mutex, const char *name,
 	}
 #  endif
 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
-	mutex->lock = OS_UNFAIR_LOCK_INIT;
-#elif (defined(JEMALLOC_OSSPIN))
-	mutex->lock = 0;
+       mutex->lock = OS_UNFAIR_LOCK_INIT;
 #elif (defined(JEMALLOC_MUTEX_INIT_CB))
 	if (postpone_init) {
 		mutex->postponed_next = postponed_mutexes;
@@ -173,7 +173,7 @@ malloc_mutex_init(malloc_mutex_t *mutex, const char *name,
 		mutex->lock_order = lock_order;
 		if (lock_order == malloc_mutex_address_ordered) {
 			witness_init(&mutex->witness, name, rank,
-			    mutex_addr_comp, &mutex);
+			    mutex_addr_comp, mutex);
 		} else {
 			witness_init(&mutex->witness, name, rank, NULL, NULL);
 		}
diff --git a/deps/jemalloc/src/pages.c b/deps/jemalloc/src/pages.c
index 6f2ba5669b..13de27a008 100644
--- a/deps/jemalloc/src/pages.c
+++ b/deps/jemalloc/src/pages.c
@@ -10,6 +10,9 @@
 
 #ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
 #include <sys/sysctl.h>
+#ifdef __FreeBSD__
+#include <vm/vm_param.h>
+#endif
 #endif
 
 /******************************************************************************/
@@ -25,6 +28,18 @@ static int	mmap_flags;
 #endif
 static bool	os_overcommits;
 
+const char *thp_mode_names[] = {
+	"default",
+	"always",
+	"never",
+	"not supported"
+};
+thp_mode_t opt_thp = THP_MODE_DEFAULT;
+thp_mode_t init_system_thp_mode;
+
+/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
+static bool pages_can_purge_lazy_runtime = true;
+
 /******************************************************************************/
 /*
  * Function prototypes for static functions that are referenced prior to
@@ -165,6 +180,35 @@ pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
 	assert(alignment >= PAGE);
 	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);
 
+#if defined(__FreeBSD__) && defined(MAP_EXCL)
+	/*
+	 * FreeBSD has mechanisms both to mmap at specific address without
+	 * touching existing mappings, and to mmap with specific alignment.
+	 */
+	{
+		if (os_overcommits) {
+			*commit = true;
+		}
+
+		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
+		int flags = mmap_flags;
+
+		if (addr != NULL) {
+			flags |= MAP_FIXED | MAP_EXCL;
+		} else {
+			unsigned alignment_bits = ffs_zu(alignment);
+			assert(alignment_bits > 1);
+			flags |= MAP_ALIGNED(alignment_bits - 1);
+		}
+
+		void *ret = mmap(addr, size, prot, flags, -1, 0);
+		if (ret == MAP_FAILED) {
+			ret = NULL;
+		}
+
+		return ret;
+	}
+#endif
 	/*
 	 * Ideally, there would be a way to specify alignment to mmap() (like
 	 * NetBSD has), but in the absence of such a feature, we have to work
@@ -246,19 +290,31 @@ pages_decommit(void *addr, size_t size) {
 
 bool
 pages_purge_lazy(void *addr, size_t size) {
-	assert(PAGE_ADDR2BASE(addr) == addr);
+	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
 	assert(PAGE_CEILING(size) == size);
 
 	if (!pages_can_purge_lazy) {
 		return true;
 	}
+	if (!pages_can_purge_lazy_runtime) {
+		/*
+		 * Built with lazy purge enabled, but detected it was not
+		 * supported on the current system.
+		 */
+		return true;
+	}
 
 #ifdef _WIN32
 	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
 	return false;
-#elif defined(JEMALLOC_PURGE_MADVISE_FREE) && \
-    !defined(PAGES_CAN_PURGE_LAZY)
-	return (madvise(addr, size, MADV_FREE) != 0);
+#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
+	return (madvise(addr, size,
+#  ifdef MADV_FREE
+	    MADV_FREE
+#  else
+	    JEMALLOC_MADV_FREE
+#  endif
+	    ) != 0);
 #elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
     !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
 	return (madvise(addr, size, MADV_DONTNEED) != 0);
@@ -287,36 +343,88 @@ pages_purge_forced(void *addr, size_t size) {
 #endif
 }
 
+static bool
+pages_huge_impl(void *addr, size_t size, bool aligned) {
+	if (aligned) {
+		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
+		assert(HUGEPAGE_CEILING(size) == size);
+	}
+#ifdef JEMALLOC_HAVE_MADVISE_HUGE
+	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
+#else
+	return true;
+#endif
+}
+
 bool
 pages_huge(void *addr, size_t size) {
-	assert(HUGEPAGE_ADDR2BASE(addr) == addr);
-	assert(HUGEPAGE_CEILING(size) == size);
+	return pages_huge_impl(addr, size, true);
+}
 
-#ifdef JEMALLOC_THP
-	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
+static bool
+pages_huge_unaligned(void *addr, size_t size) {
+	return pages_huge_impl(addr, size, false);
+}
+
+static bool
+pages_nohuge_impl(void *addr, size_t size, bool aligned) {
+	if (aligned) {
+		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
+		assert(HUGEPAGE_CEILING(size) == size);
+	}
+
+#ifdef JEMALLOC_HAVE_MADVISE_HUGE
+	return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
 #else
-	return true;
+	return false;
 #endif
 }
 
 bool
 pages_nohuge(void *addr, size_t size) {
-	assert(HUGEPAGE_ADDR2BASE(addr) == addr);
-	assert(HUGEPAGE_CEILING(size) == size);
+	return pages_nohuge_impl(addr, size, true);
+}
 
-#ifdef JEMALLOC_THP
-	return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
+static bool
+pages_nohuge_unaligned(void *addr, size_t size) {
+	return pages_nohuge_impl(addr, size, false);
+}
+
+bool
+pages_dontdump(void *addr, size_t size) {
+	assert(PAGE_ADDR2BASE(addr) == addr);
+	assert(PAGE_CEILING(size) == size);
+#ifdef JEMALLOC_MADVISE_DONTDUMP
+	return madvise(addr, size, MADV_DONTDUMP) != 0;
 #else
 	return false;
 #endif
 }
 
+bool
+pages_dodump(void *addr, size_t size) {
+	assert(PAGE_ADDR2BASE(addr) == addr);
+	assert(PAGE_CEILING(size) == size);
+#ifdef JEMALLOC_MADVISE_DONTDUMP
+	return madvise(addr, size, MADV_DODUMP) != 0;
+#else
+	return false;
+#endif
+}
+
+
 static size_t
 os_page_detect(void) {
 #ifdef _WIN32
 	SYSTEM_INFO si;
 	GetSystemInfo(&si);
 	return si.dwPageSize;
+#elif defined(__FreeBSD__)
+	/*
+	 * This returns the value obtained from
+	 * the auxv vector, avoiding a syscall.
+	 */
+	return getpagesize();
 #else
 	long result = sysconf(_SC_PAGESIZE);
 	if (result == -1) {
@@ -333,9 +441,19 @@ os_overcommits_sysctl(void) {
 	size_t sz;
 
 	sz = sizeof(vm_overcommit);
+#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
+	int mib[2];
+
+	mib[0] = CTL_VM;
+	mib[1] = VM_OVERCOMMIT;
+	if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
+		return false; /* Error. */
+	}
+#else
 	if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
 		return false; /* Error. */
 	}
+#endif
 
 	return ((vm_overcommit & 0x3) == 0);
 }
@@ -351,27 +469,44 @@ static bool
 os_overcommits_proc(void) {
 	int fd;
 	char buf[1];
-	ssize_t nread;
 
 #if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
-	fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
-	    O_CLOEXEC);
+	#if defined(O_CLOEXEC)
+		fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
+			O_CLOEXEC);
+	#else
+		fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
+		if (fd != -1) {
+			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+		}
+	#endif
 #elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
-	fd = (int)syscall(SYS_openat,
-	    AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
+	#if defined(O_CLOEXEC)
+		fd = (int)syscall(SYS_openat,
+			AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
+	#else
+		fd = (int)syscall(SYS_openat,
+			AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY);
+		if (fd != -1) {
+			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+		}
+	#endif
 #else
-	fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
+	#if defined(O_CLOEXEC)
+		fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
+	#else
+		fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
+		if (fd != -1) {
+			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+		}
+	#endif
 #endif
+
 	if (fd == -1) {
 		return false; /* Error. */
 	}
 
-#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read)
-	nread = (ssize_t)syscall(SYS_read, fd, &buf, sizeof(buf));
-#else
-	nread = read(fd, &buf, sizeof(buf));
-#endif
-
+	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
 #if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
 	syscall(SYS_close, fd);
 #else
@@ -391,6 +526,75 @@ os_overcommits_proc(void) {
 }
 #endif
 
+void
+pages_set_thp_state (void *ptr, size_t size) {
+	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
+		return;
+	}
+	assert(opt_thp != thp_mode_not_supported &&
+	    init_system_thp_mode != thp_mode_not_supported);
+
+	if (opt_thp == thp_mode_always
+	    && init_system_thp_mode != thp_mode_never) {
+		assert(init_system_thp_mode == thp_mode_default);
+		pages_huge_unaligned(ptr, size);
+	} else if (opt_thp == thp_mode_never) {
+		assert(init_system_thp_mode == thp_mode_default ||
+		    init_system_thp_mode == thp_mode_always);
+		pages_nohuge_unaligned(ptr, size);
+	}
+}
+
+static void
+init_thp_state(void) {
+	if (!have_madvise_huge) {
+		if (metadata_thp_enabled() && opt_abort) {
+			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
+			abort();
+		}
+		goto label_error;
+	}
+
+	static const char sys_state_madvise[] = "always [madvise] never\n";
+	static const char sys_state_always[] = "[always] madvise never\n";
+	static const char sys_state_never[] = "always madvise [never]\n";
+	char buf[sizeof(sys_state_madvise)];
+
+#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
+	int fd = (int)syscall(SYS_open,
+	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
+#else
+	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
+#endif
+	if (fd == -1) {
+		goto label_error;
+	}
+
+	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
+#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
+	syscall(SYS_close, fd);
+#else
+	close(fd);
+#endif
+	/* Zero bytes read would make every strncmp() below compare equal. */
+	if (nread <= 0) {
+		goto label_error;
+	}
+
+	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
+		init_system_thp_mode = thp_mode_default;
+	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
+		init_system_thp_mode = thp_mode_always;
+	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
+		init_system_thp_mode = thp_mode_never;
+	} else {
+		goto label_error;
+	}
+	return;
+label_error:
+	opt_thp = init_system_thp_mode = thp_mode_not_supported;
+}
+
 bool
 pages_boot(void) {
 	os_page = os_page_detect();
@@ -419,5 +623,27 @@ pages_boot(void) {
 	os_overcommits = false;
 #endif
 
+	init_thp_state();
+
+#ifdef __FreeBSD__
+	/*
+	 * FreeBSD doesn't need the check; madvise(2) is known to work.
+	 */
+#else
+	/* Detect lazy purge runtime support. */
+	if (pages_can_purge_lazy) {
+		bool committed = false;
+		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
+		if (madv_free_page == NULL) {
+			return true;
+		}
+		assert(pages_can_purge_lazy_runtime);
+		if (pages_purge_lazy(madv_free_page, PAGE)) {
+			pages_can_purge_lazy_runtime = false;
+		}
+		os_pages_unmap(madv_free_page, PAGE);
+	}
+#endif
+
 	return false;
 }
diff --git a/deps/jemalloc/src/prof.c b/deps/jemalloc/src/prof.c
index 975722c4c3..13334cb4c0 100644
--- a/deps/jemalloc/src/prof.c
+++ b/deps/jemalloc/src/prof.c
@@ -7,6 +7,7 @@
 #include "jemalloc/internal/hash.h"
 #include "jemalloc/internal/malloc_io.h"
 #include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/emitter.h"
 
 /******************************************************************************/
 
@@ -23,7 +24,7 @@
  */
 #undef _Unwind_Backtrace
 #include <unwind.h>
-#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, hooks_libc_hook)
+#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, test_hooks_libc_hook)
 #endif
 
 /******************************************************************************/
@@ -38,6 +39,7 @@ bool		opt_prof_gdump = false;
 bool		opt_prof_final = false;
 bool		opt_prof_leak = false;
 bool		opt_prof_accum = false;
+bool		opt_prof_log = false;
 char		opt_prof_prefix[
     /* Minimize memory bloat for non-prof builds. */
 #ifdef JEMALLOC_PROF
@@ -70,6 +72,100 @@ uint64_t	prof_interval = 0;
 
 size_t		lg_prof_sample;
 
+typedef enum prof_logging_state_e prof_logging_state_t;
+enum prof_logging_state_e {
+	prof_logging_state_stopped,
+	prof_logging_state_started,
+	prof_logging_state_dumping
+};
+
+/*
+ * - stopped: log_start never called, or previous log_stop has completed.
+ * - started: log_start called, log_stop not called yet. Allocations are logged.
+ * - dumping: log_stop called but not finished; samples are not logged anymore.
+ */
+prof_logging_state_t prof_logging_state = prof_logging_state_stopped;
+
+#ifdef JEMALLOC_JET
+static bool prof_log_dummy = false;
+#endif
+
+/* Incremented for every log file that is output. */
+static uint64_t log_seq = 0;
+static char log_filename[
+    /* Minimize memory bloat for non-prof builds. */
+#ifdef JEMALLOC_PROF
+    PATH_MAX +
+#endif
+    1];
+
+/* Timestamp for most recent call to log_start(). */
+static nstime_t log_start_timestamp = NSTIME_ZERO_INITIALIZER;
+
+/* Increment these when adding to the log_bt and log_thr linked lists. */
+static size_t log_bt_index = 0;
+static size_t log_thr_index = 0;
+
+/* Linked list node definitions. These are only used in prof.c. */
+typedef struct prof_bt_node_s prof_bt_node_t;
+
+struct prof_bt_node_s {
+	prof_bt_node_t *next;
+	size_t index;
+	prof_bt_t bt;
+	/* Variable size backtrace vector pointed to by bt. */
+	void *vec[1];
+};
+
+typedef struct prof_thr_node_s prof_thr_node_t;
+
+struct prof_thr_node_s {
+	prof_thr_node_t *next;
+	size_t index;
+	uint64_t thr_uid;
+	/* Variable size: allocated to hold the thread name plus its NUL. */
+	char name[1];
+};
+
+typedef struct prof_alloc_node_s prof_alloc_node_t;
+
+/* This is output when logging sampled allocations. */
+struct prof_alloc_node_s {
+	prof_alloc_node_t *next;
+	/* Indices into an array of thread data. */
+	size_t alloc_thr_ind;
+	size_t free_thr_ind;
+
+	/* Indices into an array of backtraces. */
+	size_t alloc_bt_ind;
+	size_t free_bt_ind;
+
+	uint64_t alloc_time_ns;
+	uint64_t free_time_ns;
+
+	size_t usize;
+};
+
+/*
+ * Created on the first call to prof_log_start and deleted on prof_log_stop.
+ * These are the backtraces and threads that have already been logged by an
+ * allocation.
+ */
+static bool log_tables_initialized = false;
+static ckh_t log_bt_node_set;
+static ckh_t log_thr_node_set;
+
+/* Store linked lists for logged data. */
+static prof_bt_node_t *log_bt_first = NULL;
+static prof_bt_node_t *log_bt_last = NULL;
+static prof_thr_node_t *log_thr_first = NULL;
+static prof_thr_node_t *log_thr_last = NULL;
+static prof_alloc_node_t *log_alloc_first = NULL;
+static prof_alloc_node_t *log_alloc_last = NULL;
+
+/* Protects the prof_logging_state and any log_{...} variable. */
+static malloc_mutex_t log_mtx;
+
 /*
  * Table of mutexes that are shared among gctx's.  These are leaf locks, so
  * there is no problem with using them for more than one gctx at the same time.
@@ -145,6 +241,12 @@ static void	prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata,
     bool even_if_attached);
 static char	*prof_thread_name_alloc(tsdn_t *tsdn, const char *thread_name);
 
+/* Hashtable functions for log_bt_node_set and log_thr_node_set. */
+static void prof_thr_node_hash(const void *key, size_t r_hash[2]);
+static bool prof_thr_node_keycomp(const void *k1, const void *k2);
+static void prof_bt_node_hash(const void *key, size_t r_hash[2]);
+static bool prof_bt_node_keycomp(const void *k1, const void *k2);
+
 /******************************************************************************/
 /* Red-black trees. */
 
@@ -242,6 +344,12 @@ prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
     prof_tctx_t *tctx) {
 	prof_tctx_set(tsdn, ptr, usize, NULL, tctx);
 
+	/* Get the current time and set this in the extent_t. We'll read this
+	 * when free() is called. */
+	nstime_t t = NSTIME_ZERO_INITIALIZER;
+	nstime_update(&t);
+	prof_alloc_time_set(tsdn, ptr, NULL, t);
+
 	malloc_mutex_lock(tsdn, tctx->tdata->lock);
 	tctx->cnts.curobjs++;
 	tctx->cnts.curbytes += usize;
@@ -253,14 +361,174 @@ prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize,
 	malloc_mutex_unlock(tsdn, tctx->tdata->lock);
 }
 
+static size_t
+prof_log_bt_index(tsd_t *tsd, prof_bt_t *bt) {
+	assert(prof_logging_state == prof_logging_state_started);
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx);
+
+	prof_bt_node_t dummy_node;
+	dummy_node.bt = *bt;
+	prof_bt_node_t *node;
+
+	/* See if this backtrace is already cached in the table. */
+	if (ckh_search(&log_bt_node_set, (void *)(&dummy_node),
+	    (void **)(&node), NULL)) {
+		size_t sz = offsetof(prof_bt_node_t, vec) +
+			        (bt->len * sizeof(void *));
+		prof_bt_node_t *new_node = (prof_bt_node_t *)
+		    iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL,
+		    true, arena_get(TSDN_NULL, 0, true), true);
+		if (log_bt_first == NULL) {
+			log_bt_first = new_node;
+			log_bt_last = new_node;
+		} else {
+			log_bt_last->next = new_node;
+			log_bt_last = new_node;
+		}
+
+		new_node->next = NULL;
+		new_node->index = log_bt_index;
+		/*
+		 * Copy the backtrace: bt is inside a tdata or gctx, which
+		 * might die before prof_log_stop is called.
+		 */
+		new_node->bt.len = bt->len;
+		memcpy(new_node->vec, bt->vec, bt->len * sizeof(void *));
+		new_node->bt.vec = new_node->vec;
+
+		log_bt_index++;
+		ckh_insert(tsd, &log_bt_node_set, (void *)new_node, NULL);
+		return new_node->index;
+	} else {
+		return node->index;
+	}
+}
+static size_t
+prof_log_thr_index(tsd_t *tsd, uint64_t thr_uid, const char *name) {
+	assert(prof_logging_state == prof_logging_state_started);
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx);
+
+	prof_thr_node_t dummy_node;
+	dummy_node.thr_uid = thr_uid;
+	prof_thr_node_t *node;
+
+	/* See if this thread is already cached in the table. */
+	if (ckh_search(&log_thr_node_set, (void *)(&dummy_node),
+	    (void **)(&node), NULL)) {
+		size_t sz = offsetof(prof_thr_node_t, name) + strlen(name) + 1;
+		prof_thr_node_t *new_node = (prof_thr_node_t *)
+		    iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL,
+		    true, arena_get(TSDN_NULL, 0, true), true);
+		if (log_thr_first == NULL) {
+			log_thr_first = new_node;
+			log_thr_last = new_node;
+		} else {
+			log_thr_last->next = new_node;
+			log_thr_last = new_node;
+		}
+
+		new_node->next = NULL;
+		new_node->index = log_thr_index;
+		new_node->thr_uid = thr_uid;
+		strcpy(new_node->name, name);
+
+		log_thr_index++;
+		ckh_insert(tsd, &log_thr_node_set, (void *)new_node, NULL);
+		return new_node->index;
+	} else {
+		return node->index;
+	}
+}
+
+static void
+prof_try_log(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock);
+
+	prof_tdata_t *cons_tdata = prof_tdata_get(tsd, false);
+	if (cons_tdata == NULL) {
+		/*
+		 * We decide not to log these allocations. cons_tdata will be
+		 * NULL only when the current thread is in a weird state (e.g.
+		 * it's being destroyed).
+		 */
+		return;
+	}
+
+	malloc_mutex_lock(tsd_tsdn(tsd), &log_mtx);
+
+	if (prof_logging_state != prof_logging_state_started) {
+		goto label_done;
+	}
+
+	if (!log_tables_initialized) {
+		bool err1 = ckh_new(tsd, &log_bt_node_set, PROF_CKH_MINITEMS,
+				prof_bt_node_hash, prof_bt_node_keycomp);
+		bool err2 = ckh_new(tsd, &log_thr_node_set, PROF_CKH_MINITEMS,
+				prof_thr_node_hash, prof_thr_node_keycomp);
+		if (err1 || err2) {
+			goto label_done;
+		}
+		log_tables_initialized = true;
+	}
+
+	nstime_t alloc_time = prof_alloc_time_get(tsd_tsdn(tsd), ptr,
+			          (alloc_ctx_t *)NULL);
+	nstime_t free_time = NSTIME_ZERO_INITIALIZER;
+	nstime_update(&free_time);
+
+	size_t sz = sizeof(prof_alloc_node_t);
+	prof_alloc_node_t *new_node = (prof_alloc_node_t *)
+	    iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, true,
+	    arena_get(TSDN_NULL, 0, true), true);
+
+	const char *prod_thr_name = (tctx->tdata->thread_name == NULL)?
+				        "" : tctx->tdata->thread_name;
+	const char *cons_thr_name = prof_thread_name_get(tsd);
+
+	prof_bt_t bt;
+	/* Initialize the backtrace, using the buffer in tdata to store it. */
+	bt_init(&bt, cons_tdata->vec);
+	prof_backtrace(&bt);
+	prof_bt_t *cons_bt = &bt;
+
+	/* We haven't destroyed tctx yet, so gctx should be good to read. */
+	prof_bt_t *prod_bt = &tctx->gctx->bt;
+
+	new_node->next = NULL;
+	new_node->alloc_thr_ind = prof_log_thr_index(tsd, tctx->tdata->thr_uid,
+				      prod_thr_name);
+	new_node->free_thr_ind = prof_log_thr_index(tsd, cons_tdata->thr_uid,
+				     cons_thr_name);
+	new_node->alloc_bt_ind = prof_log_bt_index(tsd, prod_bt);
+	new_node->free_bt_ind = prof_log_bt_index(tsd, cons_bt);
+	new_node->alloc_time_ns = nstime_ns(&alloc_time);
+	new_node->free_time_ns = nstime_ns(&free_time);
+	new_node->usize = usize;
+
+	if (log_alloc_first == NULL) {
+		log_alloc_first = new_node;
+		log_alloc_last = new_node;
+	} else {
+		log_alloc_last->next = new_node;
+		log_alloc_last = new_node;
+	}
+
+label_done:
+	malloc_mutex_unlock(tsd_tsdn(tsd), &log_mtx);
+}
+
 void
-prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx) {
+prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize,
+    prof_tctx_t *tctx) {
 	malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock);
+
 	assert(tctx->cnts.curobjs > 0);
 	assert(tctx->cnts.curbytes >= usize);
 	tctx->cnts.curobjs--;
 	tctx->cnts.curbytes -= usize;
 
+	prof_try_log(tsd, ptr, usize, tctx);
+
 	if (prof_tctx_should_destroy(tsd_tsdn(tsd), tctx)) {
 		prof_tctx_destroy(tsd, tctx);
 	} else {
@@ -871,15 +1139,12 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) {
 void
 prof_sample_threshold_update(prof_tdata_t *tdata) {
 #ifdef JEMALLOC_PROF
-	uint64_t r;
-	double u;
-
 	if (!config_prof) {
 		return;
 	}
 
 	if (lg_prof_sample == 0) {
-		tdata->bytes_until_sample = 0;
+		tsd_bytes_until_sample_set(tsd_fetch(), 0);
 		return;
 	}
 
@@ -901,11 +1166,16 @@ prof_sample_threshold_update(prof_tdata_t *tdata) {
 	 *   pp 500
 	 *   (http://luc.devroye.org/rnbookindex.html)
 	 */
-	r = prng_lg_range_u64(&tdata->prng_state, 53);
-	u = (double)r * (1.0/9007199254740992.0L);
-	tdata->bytes_until_sample = (uint64_t)(log(u) /
+	uint64_t r = prng_lg_range_u64(&tdata->prng_state, 53);
+	double u = (double)r * (1.0/9007199254740992.0L);
+	uint64_t bytes_until_sample = (uint64_t)(log(u) /
 	    log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample))))
 	    + (uint64_t)1U;
+	if (bytes_until_sample > SSIZE_MAX) {
+		bytes_until_sample = SSIZE_MAX;
+	}
+	tsd_bytes_until_sample_set(tsd_fetch(), bytes_until_sample);
+
 #endif
 }
 
@@ -978,7 +1248,7 @@ prof_dump_flush(bool propagate_err) {
 
 	cassert(config_prof);
 
-	err = write(prof_dump_fd, prof_dump_buf, prof_dump_buf_end);
+	err = malloc_write_fd(prof_dump_fd, prof_dump_buf, prof_dump_buf_end);
 	if (err == -1) {
 		if (!propagate_err) {
 			malloc_write("<jemalloc>: write() failed during heap "
@@ -1022,7 +1292,7 @@ prof_dump_write(bool propagate_err, const char *s) {
 			}
 		}
 
-		if (prof_dump_buf_end + slen <= PROF_DUMP_BUFSIZE) {
+		if (prof_dump_buf_end + slen - i <= PROF_DUMP_BUFSIZE) {
 			/* Finish writing. */
 			n = slen - i;
 		} else {
@@ -1033,6 +1303,7 @@ prof_dump_write(bool propagate_err, const char *s) {
 		prof_dump_buf_end += n;
 		i += n;
 	}
+	assert(i == slen);
 
 	return false;
 }
@@ -1409,7 +1680,15 @@ prof_open_maps(const char *format, ...) {
 	va_start(ap, format);
 	malloc_vsnprintf(filename, sizeof(filename), format, ap);
 	va_end(ap);
+
+#if defined(O_CLOEXEC)
 	mfd = open(filename, O_RDONLY | O_CLOEXEC);
+#else
+	mfd = open(filename, O_RDONLY);
+	if (mfd != -1) {
+		fcntl(mfd, F_SETFD, fcntl(mfd, F_GETFD) | FD_CLOEXEC);
+	}
+#endif
 
 	return mfd;
 }
@@ -1463,8 +1742,9 @@ prof_dump_maps(bool propagate_err) {
 					goto label_return;
 				}
 			}
-			nread = read(mfd, &prof_dump_buf[prof_dump_buf_end],
-			    PROF_DUMP_BUFSIZE - prof_dump_buf_end);
+			nread = malloc_read_fd(mfd,
+			    &prof_dump_buf[prof_dump_buf_end], PROF_DUMP_BUFSIZE
+			    - prof_dump_buf_end);
 		} while (nread > 0);
 	} else {
 		ret = true;
@@ -1772,7 +2052,7 @@ prof_idump(tsdn_t *tsdn) {
 
 	cassert(config_prof);
 
-	if (!prof_booted || tsdn_null(tsdn)) {
+	if (!prof_booted || tsdn_null(tsdn) || !prof_active_get_unlocked()) {
 		return;
 	}
 	tsd = tsdn_tsd(tsdn);
@@ -1829,7 +2109,7 @@ prof_gdump(tsdn_t *tsdn) {
 
 	cassert(config_prof);
 
-	if (!prof_booted || tsdn_null(tsdn)) {
+	if (!prof_booted || tsdn_null(tsdn) || !prof_active_get_unlocked()) {
 		return;
 	}
 	tsd = tsdn_tsd(tsdn);
@@ -1878,6 +2158,33 @@ prof_bt_keycomp(const void *k1, const void *k2) {
 	return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0);
 }
 
+static void
+prof_bt_node_hash(const void *key, size_t r_hash[2]) {
+	const prof_bt_node_t *bt_node = (prof_bt_node_t *)key;
+	prof_bt_hash((void *)(&bt_node->bt), r_hash);
+}
+
+static bool
+prof_bt_node_keycomp(const void *k1, const void *k2) {
+	const prof_bt_node_t *bt_node1 = (prof_bt_node_t *)k1;
+	const prof_bt_node_t *bt_node2 = (prof_bt_node_t *)k2;
+	return prof_bt_keycomp((void *)(&bt_node1->bt),
+	    (void *)(&bt_node2->bt));
+}
+
+static void
+prof_thr_node_hash(const void *key, size_t r_hash[2]) {
+	const prof_thr_node_t *thr_node = (prof_thr_node_t *)key;
+	hash(&thr_node->thr_uid, sizeof(uint64_t), 0x94122f35U, r_hash);
+}
+
+static bool
+prof_thr_node_keycomp(const void *k1, const void *k2) {
+	const prof_thr_node_t *thr_node1 = (prof_thr_node_t *)k1;
+	const prof_thr_node_t *thr_node2 = (prof_thr_node_t *)k2;
+	return thr_node1->thr_uid == thr_node2->thr_uid;
+}
+
 static uint64_t
 prof_thr_uid_alloc(tsdn_t *tsdn) {
 	uint64_t thr_uid;
@@ -2110,6 +2417,368 @@ prof_active_set(tsdn_t *tsdn, bool active) {
 	return prof_active_old;
 }
 
+#ifdef JEMALLOC_JET
+size_t /* JEMALLOC_JET only: counts the nodes reachable from log_bt_first. */
+prof_log_bt_count(void) {
+	size_t cnt = 0;
+	prof_bt_node_t *node = log_bt_first;
+	while (node != NULL) { /* Follow next pointers until the list ends. */
+		cnt++;
+		node = node->next;
+	}
+	return cnt;
+}
+
+size_t /* JEMALLOC_JET only: counts the nodes reachable from log_alloc_first. */
+prof_log_alloc_count(void) {
+	size_t cnt = 0;
+	prof_alloc_node_t *node = log_alloc_first;
+	while (node != NULL) { /* Follow next pointers until the list ends. */
+		cnt++;
+		node = node->next;
+	}
+	return cnt;
+}
+
+size_t /* JEMALLOC_JET only: counts the nodes reachable from log_thr_first. */
+prof_log_thr_count(void) {
+	size_t cnt = 0;
+	prof_thr_node_t *node = log_thr_first;
+	while (node != NULL) { /* Follow next pointers until the list ends. */
+		cnt++;
+		node = node->next;
+	}
+	return cnt;
+}
+
+bool /* JEMALLOC_JET only: true exactly while the state is "started". */
+prof_log_is_logging(void) {
+	return prof_logging_state == prof_logging_state_started; /* Read without taking log_mtx. */
+}
+
+bool /* JEMALLOC_JET only: true if the log's state is inconsistent, else false. */
+prof_log_rep_check(void) {
+	if (prof_logging_state == prof_logging_state_stopped
+	    && log_tables_initialized) {
+		return true; /* Flagged: stopped while tables are marked initialized. */
+	}
+
+	if (log_bt_last != NULL && log_bt_last->next != NULL) { /* A tail has no successor. */
+		return true;
+	}
+	if (log_thr_last != NULL && log_thr_last->next != NULL) {
+		return true;
+	}
+	if (log_alloc_last != NULL && log_alloc_last->next != NULL) {
+		return true;
+	}
+
+	size_t bt_count = prof_log_bt_count();
+	size_t thr_count = prof_log_thr_count();
+	size_t alloc_count = prof_log_alloc_count();
+
+	/* A stopped log must not hold any nodes. */
+	if (prof_logging_state == prof_logging_state_stopped) {
+		if (bt_count != 0 || thr_count != 0 || alloc_count != 0) {
+			return true;
+		}
+	}
+
+	prof_alloc_node_t *node = log_alloc_first; /* Validate every allocation record. */
+	while (node != NULL) {
+		if (node->alloc_bt_ind >= bt_count) { /* Indices must refer to existing nodes. */
+			return true;
+		}
+		if (node->free_bt_ind >= bt_count) {
+			return true;
+		}
+		if (node->alloc_thr_ind >= thr_count) {
+			return true;
+		}
+		if (node->free_thr_ind >= thr_count) {
+			return true;
+		}
+		if (node->alloc_time_ns > node->free_time_ns) { /* Allocation cannot follow its free. */
+			return true;
+		}
+		node = node->next;
+	}
+
+	return false;
+}
+
+void /* JEMALLOC_JET only: sets prof_log_dummy, which suppresses log file output. */
+prof_log_dummy_set(bool new_value) {
+	prof_log_dummy = new_value;
+}
+#endif
+
+bool /* Begins logging to `filename` (NULL: generated name); true on failure. */
+prof_log_start(tsdn_t *tsdn, const char *filename) {
+	if (!opt_prof || !prof_booted) { /* Requires profiling enabled and booted. */
+		return true;
+	}
+
+	bool ret = false;
+	size_t buf_size = PATH_MAX + 1; /* Presumably log_filename's capacity -- TODO confirm. */
+
+	malloc_mutex_lock(tsdn, &log_mtx);
+
+	if (prof_logging_state != prof_logging_state_stopped) { /* Already started or dumping. */
+		ret = true;
+	} else if (filename == NULL) {
+		/* Make default name: <prefix>.<pid>.<seq>.json. */
+		malloc_snprintf(log_filename, buf_size, "%s.%d.%"FMTu64".json",
+		    opt_prof_prefix, prof_getpid(), log_seq);
+		log_seq++;
+		prof_logging_state = prof_logging_state_started;
+	} else if (strlen(filename) >= buf_size) { /* Name plus NUL would not fit in buf_size. */
+		ret = true;
+	} else {
+		strcpy(log_filename, filename);
+		prof_logging_state = prof_logging_state_started;
+	}
+
+	if (!ret) {
+		nstime_update(&log_start_timestamp); /* Baseline for the emitted "duration". */
+	}
+
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+	return ret;
+}
+
+/* atexit() handler (registered in prof_boot2) that stops logging on exit. */
+static void
+prof_log_stop_final(void) {
+	tsd_t *tsd = tsd_fetch();
+	prof_log_stop(tsd_tsdn(tsd)); /* Return value is not checked. */
+}
+
+struct prof_emitter_cb_arg_s { /* Opaque argument handed to prof_emitter_write_cb(). */
+	int fd; /* Descriptor passed to write(). */
+	ssize_t ret; /* Result of the most recent write(). */
+};
+
+static void /* Emitter write callback: writes to_write to the fd carried in opaque. */
+prof_emitter_write_cb(void *opaque, const char *to_write) {
+	struct prof_emitter_cb_arg_s *arg =
+	    (struct prof_emitter_cb_arg_s *)opaque;
+	size_t bytes = strlen(to_write);
+#ifdef JEMALLOC_JET
+	if (prof_log_dummy) { /* Dummy mode: produce no output. */
+		return;
+	}
+#endif
+	arg->ret = write(arg->fd, (void *)to_write, bytes); /* NOTE(review): a short write is not retried. */
+}
+
+/*
+ * Each prof_log_emit_{...} function walks its linked list from the head,
+ * emits every node to the json output, and deallocates the node afterwards.
+ */
+static void /* Emits the "threads" array from the log_thr_first list. */
+prof_log_emit_threads(tsd_t *tsd, emitter_t *emitter) {
+	emitter_json_array_kv_begin(emitter, "threads");
+	prof_thr_node_t *thr_node = log_thr_first;
+	prof_thr_node_t *thr_old_node;
+	while (thr_node != NULL) {
+		emitter_json_object_begin(emitter);
+
+		emitter_json_kv(emitter, "thr_uid", emitter_type_uint64,
+		    &thr_node->thr_uid);
+
+		char *thr_name = thr_node->name; /* Local pointer whose address is passed below. */
+
+		emitter_json_kv(emitter, "thr_name", emitter_type_string,
+		    &thr_name);
+
+		emitter_json_object_end(emitter);
+		thr_old_node = thr_node; /* Advance first, then free the emitted node. */
+		thr_node = thr_node->next;
+		idalloc(tsd, thr_old_node);
+	}
+	emitter_json_array_end(emitter);
+}
+
+static void /* Emits "stack_traces": one array of "%p" strings per bt node. */
+prof_log_emit_traces(tsd_t *tsd, emitter_t *emitter) {
+	emitter_json_array_kv_begin(emitter, "stack_traces");
+	prof_bt_node_t *bt_node = log_bt_first;
+	prof_bt_node_t *bt_old_node;
+	/*
+	 * Calculate how many hex digits we need: twice number of bytes, two for
+	 * "0x", and then one more for terminating '\0'.
+	 */
+	char buf[2 * sizeof(intptr_t) + 3];
+	size_t buf_sz = sizeof(buf);
+	while (bt_node != NULL) {
+		emitter_json_array_begin(emitter);
+		size_t i;
+		for (i = 0; i < bt_node->bt.len; i++) { /* One string per frame address. */
+			malloc_snprintf(buf, buf_sz, "%p", bt_node->bt.vec[i]);
+			char *trace_str = buf; /* Local pointer whose address is passed below. */
+			emitter_json_value(emitter, emitter_type_string,
+			    &trace_str);
+		}
+		emitter_json_array_end(emitter);
+
+		bt_old_node = bt_node; /* Advance first, then free the emitted node. */
+		bt_node = bt_node->next;
+		idalloc(tsd, bt_old_node);
+	}
+	emitter_json_array_end(emitter);
+}
+
+static void /* Emits "allocations": one object per node of the log_alloc_first list. */
+prof_log_emit_allocs(tsd_t *tsd, emitter_t *emitter) {
+	emitter_json_array_kv_begin(emitter, "allocations");
+	prof_alloc_node_t *alloc_node = log_alloc_first;
+	prof_alloc_node_t *alloc_old_node;
+	while (alloc_node != NULL) {
+		emitter_json_object_begin(emitter);
+
+		emitter_json_kv(emitter, "alloc_thread", emitter_type_size, /* Thread/trace fields are indices. */
+		    &alloc_node->alloc_thr_ind);
+
+		emitter_json_kv(emitter, "free_thread", emitter_type_size,
+		    &alloc_node->free_thr_ind);
+
+		emitter_json_kv(emitter, "alloc_trace", emitter_type_size,
+		    &alloc_node->alloc_bt_ind);
+
+		emitter_json_kv(emitter, "free_trace", emitter_type_size,
+		    &alloc_node->free_bt_ind);
+
+		emitter_json_kv(emitter, "alloc_timestamp",
+		    emitter_type_uint64, &alloc_node->alloc_time_ns);
+
+		emitter_json_kv(emitter, "free_timestamp", emitter_type_uint64,
+		    &alloc_node->free_time_ns);
+
+		emitter_json_kv(emitter, "usize", emitter_type_uint64, /* NOTE(review): assumes usize is 64 bits wide -- confirm. */
+		    &alloc_node->usize);
+
+		emitter_json_object_end(emitter);
+
+		alloc_old_node = alloc_node; /* Advance first, then free the emitted node. */
+		alloc_node = alloc_node->next;
+		idalloc(tsd, alloc_old_node);
+	}
+	emitter_json_array_end(emitter);
+}
+
+static void /* Emits the "info" object: duration, version, lg_sample_rate, pid. */
+prof_log_emit_metadata(emitter_t *emitter) {
+	emitter_json_object_kv_begin(emitter, "info");
+
+	nstime_t now = NSTIME_ZERO_INITIALIZER;
+
+	nstime_update(&now);
+	uint64_t ns = nstime_ns(&now) - nstime_ns(&log_start_timestamp); /* Time since prof_log_start(). */
+	emitter_json_kv(emitter, "duration", emitter_type_uint64, &ns);
+
+	char *vers = JEMALLOC_VERSION; /* Local pointer whose address is passed below. */
+	emitter_json_kv(emitter, "version",
+	    emitter_type_string, &vers);
+
+	emitter_json_kv(emitter, "lg_sample_rate", /* NOTE(review): assumes lg_prof_sample is int-sized -- confirm. */
+	    emitter_type_int, &lg_prof_sample);
+
+	int pid = prof_getpid();
+	emitter_json_kv(emitter, "pid", emitter_type_int, &pid);
+
+	emitter_json_object_end(emitter);
+}
+
+
+bool /* Dumps the log as JSON to log_filename and frees it; true on failure. */
+prof_log_stop(tsdn_t *tsdn) {
+	if (!opt_prof || !prof_booted) {
+		return true;
+	}
+
+	tsd_t *tsd = tsdn_tsd(tsdn);
+	malloc_mutex_lock(tsdn, &log_mtx);
+
+	if (prof_logging_state != prof_logging_state_started) {
+		malloc_mutex_unlock(tsdn, &log_mtx);
+		return true;
+	}
+
+	/*
+	 * Set the state to dumping. We'll set it to stopped when we're done.
+	 * Since other threads won't be able to start/stop/log when the state is
+	 * dumping, we don't have to hold the lock during the whole method.
+	 */
+	prof_logging_state = prof_logging_state_dumping;
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+	emitter_t emitter;
+	/* Create a file. */
+	int fd;
+#ifdef JEMALLOC_JET
+	if (prof_log_dummy) {
+		fd = 0;
+	} else {
+		fd = creat(log_filename, 0644);
+	}
+#else
+	fd = creat(log_filename, 0644);
+#endif
+	if (fd == -1) {
+		malloc_printf("<jemalloc>: creat() for log file \"%s\" "
+			      " failed with %d\n", log_filename, errno);
+		if (opt_abort) {
+			abort();
+		}
+		/* Leave "dumping": keep the log intact so stop can be retried. */
+		malloc_mutex_lock(tsdn, &log_mtx);
+		prof_logging_state = prof_logging_state_started;
+		malloc_mutex_unlock(tsdn, &log_mtx);
+		return true;
+	}
+
+	/* Emit to json. */
+	struct prof_emitter_cb_arg_s arg;
+	arg.fd = fd;
+	emitter_init(&emitter, emitter_output_json, &prof_emitter_write_cb,
+	    (void *)(&arg));
+
+	emitter_begin(&emitter);
+	prof_log_emit_metadata(&emitter);
+	prof_log_emit_threads(tsd, &emitter);
+	prof_log_emit_traces(tsd, &emitter);
+	prof_log_emit_allocs(tsd, &emitter);
+	emitter_end(&emitter);
+
+	/* Reset global state. */
+	if (log_tables_initialized) {
+		ckh_delete(tsd, &log_bt_node_set);
+		ckh_delete(tsd, &log_thr_node_set);
+	}
+	log_tables_initialized = false;
+	log_bt_index = 0;
+	log_thr_index = 0;
+	log_bt_first = NULL;
+	log_bt_last = NULL;
+	log_thr_first = NULL;
+	log_thr_last = NULL;
+	log_alloc_first = NULL;
+	log_alloc_last = NULL;
+
+	malloc_mutex_lock(tsdn, &log_mtx);
+	prof_logging_state = prof_logging_state_stopped;
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+#ifdef JEMALLOC_JET
+	if (prof_log_dummy) {
+		return false;
+	}
+#endif
+	return close(fd);
+}
+
 const char *
 prof_thread_name_get(tsd_t *tsd) {
 	prof_tdata_t *tdata;
@@ -2346,6 +3015,35 @@ prof_boot2(tsd_t *tsd) {
 			}
 		}
 
+		if (malloc_mutex_init(&log_mtx, "prof_log",
+		    WITNESS_RANK_PROF_LOG, malloc_mutex_rank_exclusive)) {
+			return true;
+		}
+		/* log_mtx is initialized above because prof_log_start() locks it. */
+		if (opt_prof_log) {
+			prof_log_start(tsd_tsdn(tsd), NULL);
+		}
+
+		if (atexit(prof_log_stop_final) != 0) {
+			malloc_write("<jemalloc>: Error in atexit() "
+				     "for logging\n");
+			if (opt_abort) {
+				abort();
+			}
+		}
+
+		if (ckh_new(tsd, &log_bt_node_set, PROF_CKH_MINITEMS,
+		    prof_bt_node_hash, prof_bt_node_keycomp)) {
+			return true;
+		}
+
+		if (ckh_new(tsd, &log_thr_node_set, PROF_CKH_MINITEMS,
+		    prof_thr_node_hash, prof_thr_node_keycomp)) {
+			return true;
+		}
+
+		log_tables_initialized = true;
+
 		gctx_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd),
 		    b0get(), PROF_NCTX_LOCKS * sizeof(malloc_mutex_t),
 		    CACHELINE);
@@ -2373,16 +3071,14 @@ prof_boot2(tsd_t *tsd) {
 				return true;
 			}
 		}
-	}
-
 #ifdef JEMALLOC_PROF_LIBGCC
-	/*
-	 * Cause the backtracing machinery to allocate its internal state
-	 * before enabling profiling.
-	 */
-	_Unwind_Backtrace(prof_unwind_init_callback, NULL);
+		/*
+		 * Cause the backtracing machinery to allocate its internal
+		 * state before enabling profiling.
+		 */
+		_Unwind_Backtrace(prof_unwind_init_callback, NULL);
 #endif
-
+	}
 	prof_booted = true;
 
 	return false;
diff --git a/deps/jemalloc/src/rtree.c b/deps/jemalloc/src/rtree.c
index 53702cf723..4ae41fe2fe 100644
--- a/deps/jemalloc/src/rtree.c
+++ b/deps/jemalloc/src/rtree.c
@@ -39,7 +39,7 @@ rtree_node_dalloc_impl(tsdn_t *tsdn, rtree_t *rtree, rtree_node_elm_t *node) {
 	/* Nodes are never deleted during normal operation. */
 	not_reached();
 }
-UNUSED rtree_node_dalloc_t *JET_MUTABLE rtree_node_dalloc =
+rtree_node_dalloc_t *JET_MUTABLE rtree_node_dalloc =
     rtree_node_dalloc_impl;
 
 static rtree_leaf_elm_t *
@@ -54,7 +54,7 @@ rtree_leaf_dalloc_impl(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *leaf) {
 	/* Leaves are never deleted during normal operation. */
 	not_reached();
 }
-UNUSED rtree_leaf_dalloc_t *JET_MUTABLE rtree_leaf_dalloc =
+rtree_leaf_dalloc_t *JET_MUTABLE rtree_leaf_dalloc =
     rtree_leaf_dalloc_impl;
 
 #ifdef JEMALLOC_JET
diff --git a/deps/jemalloc/src/safety_check.c b/deps/jemalloc/src/safety_check.c
new file mode 100644
index 0000000000..804155dcfc
--- /dev/null
+++ b/deps/jemalloc/src/safety_check.c
@@ -0,0 +1,24 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+static void (*safety_check_abort)(const char *message); /* Failure hook; NULL selects the built-in path. */
+
+void safety_check_set_abort(void (*abort_fn)(const char *)) { /* Installs abort_fn as the hook (NULL clears it). */
+	safety_check_abort = abort_fn;
+}
+
+void safety_check_fail(const char *format, ...) { /* Formats a message and reports a failed safety check. */
+	char buf[MALLOC_PRINTF_BUFSIZE];
+
+	va_list ap;
+	va_start(ap, format);
+	malloc_vsnprintf(buf, MALLOC_PRINTF_BUFSIZE, format, ap); /* Bounded by the buffer size. */
+	va_end(ap);
+
+	if (safety_check_abort == NULL) { /* No hook installed: print the message and abort(). */
+		malloc_write(buf);
+		abort();
+	} else {
+		safety_check_abort(buf); /* Hook receives the formatted message. */
+	}
+}
diff --git a/deps/jemalloc/src/sc.c b/deps/jemalloc/src/sc.c
new file mode 100644
index 0000000000..89ddb6ba6a
--- /dev/null
+++ b/deps/jemalloc/src/sc.c
@@ -0,0 +1,313 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+#include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/bitmap.h"
+#include "jemalloc/internal/pages.h"
+#include "jemalloc/internal/sc.h"
+
+/*
+ * This module computes the size classes used to satisfy allocations.  The logic
+ * here was ported more or less line-by-line from a shell script, and because of
+ * that is not the most idiomatic C.  Eventually we should fix this, but for now
+ * at least the damage is compartmentalized to this file.
+ */
+
+sc_data_t sc_data_global;
+
+static size_t /* Size of a class: 2^lg_base + ndelta * 2^lg_delta. */
+reg_size_compute(int lg_base, int lg_delta, int ndelta) {
+	return (ZU(1) << lg_base) + (ZU(ndelta) << lg_delta);
+}
+
+/* Returns the number of pages in the slab. */
+static int
+slab_size(int lg_page, int lg_base, int lg_delta, int ndelta) {
+	size_t page = (ZU(1) << lg_page);
+	size_t reg_size = reg_size_compute(lg_base, lg_delta, ndelta);
+
+	size_t try_slab_size = page; /* Candidate slab sizes are whole multiples of a page. */
+	size_t try_nregs = try_slab_size / reg_size;
+	size_t perfect_slab_size = 0;
+	bool perfect = false;
+	/*
+	 * This loop continues until we find the least common multiple of the
+	 * page size and size class size.  Size classes are all of the form
+	 * base + ndelta * delta == (ndelta + base/delta) * delta, which is
+	 * (ndelta + ngroup) * delta.  The way we choose slabbing strategies
+	 * means that delta is at most the page size and ndelta < ngroup.  So
+	 * the loop executes for at most 2 * ngroup - 1 iterations, which is
+	 * also the bound on the number of pages in a slab chosen by default.
+	 * With the current default settings, this is at most 7.
+	 */
+	while (!perfect) {
+		perfect_slab_size = try_slab_size; /* Test the current candidate ... */
+		size_t perfect_nregs = try_nregs;
+		try_slab_size += page; /* ... and prepare the next one. */
+		try_nregs = try_slab_size / reg_size;
+		if (perfect_slab_size == perfect_nregs * reg_size) { /* Regions fill the slab exactly. */
+			perfect = true;
+		}
+	}
+	return (int)(perfect_slab_size / page);
+}
+
+static void /* Fills one sc_t entry from its index and (lg_base, lg_delta, ndelta). */
+size_class(
+    /* Output. */
+    sc_t *sc,
+    /* Configuration decisions. */
+    int lg_max_lookup, int lg_page, int lg_ngroup,
+    /* Inputs specific to the size class. */
+    int index, int lg_base, int lg_delta, int ndelta) {
+	sc->index = index;
+	sc->lg_base = lg_base;
+	sc->lg_delta = lg_delta;
+	sc->ndelta = ndelta;
+	sc->psz = (reg_size_compute(lg_base, lg_delta, ndelta) /* True iff the size is a page multiple. */
+	    % (ZU(1) << lg_page) == 0);
+	size_t size = (ZU(1) << lg_base) + (ZU(ndelta) << lg_delta); /* Same formula as reg_size_compute(). */
+	if (index == 0) {
+		assert(!sc->psz);
+	}
+	if (size < (ZU(1) << (lg_page + lg_ngroup))) { /* Below 2^(lg_page + lg_ngroup): bin class. */
+		sc->bin = true;
+		sc->pgs = slab_size(lg_page, lg_base, lg_delta, ndelta);
+	} else {
+		sc->bin = false;
+		sc->pgs = 0;
+	}
+	if (size <= (ZU(1) << lg_max_lookup)) { /* Within the lookup range. */
+		sc->lg_delta_lookup = lg_delta;
+	} else {
+		sc->lg_delta_lookup = 0; /* 0 marks "outside the lookup range". */
+	}
+}
+
+static void /* Fills sc_data->sc[] class by class, then the summary fields. */
+size_classes(
+    /* Output. */
+    sc_data_t *sc_data,
+    /* Determined by the system. */
+    size_t lg_ptr_size, int lg_quantum,
+    /* Configuration decisions. */
+    int lg_tiny_min, int lg_max_lookup, int lg_page, int lg_ngroup) {
+	int ptr_bits = (1 << lg_ptr_size) * 8; /* Pointer width in bits. */
+	int ngroup = (1 << lg_ngroup);
+	int ntiny = 0;
+	int nlbins = 0;
+	int lg_tiny_maxclass = (unsigned)-1; /* NOTE(review): kept if there are no tiny classes; unsigned-to-int conversion is implementation-defined. */
+	int nbins = 0;
+	int npsizes = 0;
+
+	int index = 0;
+
+	int ndelta = 0;
+	int lg_base = lg_tiny_min;
+	int lg_delta = lg_base;
+
+	/* Outputs that we update as we go. */
+	size_t lookup_maxclass = 0;
+	size_t small_maxclass = 0;
+	int lg_large_minclass = 0;
+	size_t large_maxclass = 0;
+
+	/* Tiny size classes. */
+	while (lg_base < lg_quantum) {
+		sc_t *sc = &sc_data->sc[index];
+		size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index,
+		    lg_base, lg_delta, ndelta);
+		if (sc->lg_delta_lookup != 0) {
+			nlbins = index + 1;
+		}
+		if (sc->psz) {
+			npsizes++;
+		}
+		if (sc->bin) {
+			nbins++;
+		}
+		ntiny++;
+		/* Final written value is correct. */
+		lg_tiny_maxclass = lg_base;
+		index++;
+		lg_delta = lg_base;
+		lg_base++;
+	}
+
+	/* First non-tiny (pseudo) group. */
+	if (ntiny != 0) {
+		sc_t *sc = &sc_data->sc[index];
+		/*
+		 * See the note in sc.h; the first non-tiny size class has an
+		 * unusual encoding.
+		 */
+		lg_base--;
+		ndelta = 1;
+		size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index,
+		    lg_base, lg_delta, ndelta);
+		index++;
+		lg_base++;
+		lg_delta++;
+		if (sc->psz) {
+			npsizes++;
+		}
+		if (sc->bin) {
+			nbins++;
+		}
+	}
+	while (ndelta < ngroup) { /* Remaining classes of the first group. */
+		sc_t *sc = &sc_data->sc[index];
+		size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index,
+		    lg_base, lg_delta, ndelta);
+		index++;
+		ndelta++;
+		if (sc->psz) {
+			npsizes++;
+		}
+		if (sc->bin) {
+			nbins++;
+		}
+	}
+
+	/* All remaining groups. */
+	lg_base = lg_base + lg_ngroup;
+	while (lg_base < ptr_bits - 1) {
+		ndelta = 1;
+		int ndelta_limit;
+		if (lg_base == ptr_bits - 2) { /* The final group gets one class fewer. */
+			ndelta_limit = ngroup - 1;
+		} else {
+			ndelta_limit = ngroup;
+		}
+		while (ndelta <= ndelta_limit) {
+			sc_t *sc = &sc_data->sc[index];
+			size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index,
+			    lg_base, lg_delta, ndelta);
+			if (sc->lg_delta_lookup != 0) {
+				nlbins = index + 1;
+				/* Final written value is correct. */
+				lookup_maxclass = (ZU(1) << lg_base)
+				    + (ZU(ndelta) << lg_delta);
+			}
+			if (sc->psz) {
+				npsizes++;
+			}
+			if (sc->bin) {
+				nbins++;
+				/* Final written value is correct. */
+				small_maxclass = (ZU(1) << lg_base)
+				    + (ZU(ndelta) << lg_delta);
+				if (lg_ngroup > 0) {
+					lg_large_minclass = lg_base + 1;
+				} else {
+					lg_large_minclass = lg_base + 2;
+				}
+			}
+			large_maxclass = (ZU(1) << lg_base) /* Overwritten each pass; the last class wins. */
+			    + (ZU(ndelta) << lg_delta);
+			index++;
+			ndelta++;
+		}
+		lg_base++;
+		lg_delta++;
+	}
+	/* Additional outputs. */
+	int nsizes = index;
+	unsigned lg_ceil_nsizes = lg_ceil(nsizes);
+
+	/* Fill in the output data. */
+	sc_data->ntiny = ntiny;
+	sc_data->nlbins = nlbins;
+	sc_data->nbins = nbins;
+	sc_data->nsizes = nsizes;
+	sc_data->lg_ceil_nsizes = lg_ceil_nsizes;
+	sc_data->npsizes = npsizes;
+	sc_data->lg_tiny_maxclass = lg_tiny_maxclass;
+	sc_data->lookup_maxclass = lookup_maxclass;
+	sc_data->small_maxclass = small_maxclass;
+	sc_data->lg_large_minclass = lg_large_minclass;
+	sc_data->large_minclass = (ZU(1) << lg_large_minclass);
+	sc_data->large_maxclass = large_maxclass;
+
+	/*
+	 * We compute these values in two ways:
+	 *   - Incrementally, as above.
+	 *   - In macros, in sc.h.
+	 * The computation is easier when done incrementally, but putting it in
+	 * a constant makes it available to the fast paths without having to
+	 * touch the extra global cacheline.  We assert, however, that the two
+	 * computations are equivalent.
+	 */
+	assert(sc_data->npsizes == SC_NPSIZES);
+	assert(sc_data->lg_tiny_maxclass == SC_LG_TINY_MAXCLASS);
+	assert(sc_data->small_maxclass == SC_SMALL_MAXCLASS);
+	assert(sc_data->large_minclass == SC_LARGE_MINCLASS);
+	assert(sc_data->lg_large_minclass == SC_LG_LARGE_MINCLASS);
+	assert(sc_data->large_maxclass == SC_LARGE_MAXCLASS);
+
+	/*
+	 * In the allocation fastpath, we want to assume that we can
+	 * unconditionally subtract the requested allocation size from
+	 * a ssize_t, and detect passing through 0 correctly.  This
+	 * results in optimal generated code.  For this to work, the
+	 * maximum allocation size must be less than SSIZE_MAX.
+	 */
+	assert(SC_LARGE_MAXCLASS < SSIZE_MAX);
+}
+
+void /* One-time computation of the default size classes into sc_data. */
+sc_data_init(sc_data_t *sc_data) {
+	assert(!sc_data->initialized);
+
+	int lg_max_lookup = 12; /* Passed to size_classes() as lg_max_lookup. */
+
+	size_classes(sc_data, LG_SIZEOF_PTR, LG_QUANTUM, SC_LG_TINY_MIN,
+	    lg_max_lookup, LG_PAGE, 2); /* The trailing 2 is lg_ngroup. */
+
+	sc_data->initialized = true;
+}
+
+static void /* Sets sc->pgs to pgs_guess clamped into [min_pgs, max_pgs]. */
+sc_data_update_sc_slab_size(sc_t *sc, size_t reg_size, size_t pgs_guess) {
+	size_t min_pgs = reg_size / PAGE; /* Pages needed for one region, rounded up below. */
+	if (reg_size % PAGE != 0) {
+		min_pgs++;
+	}
+	/*
+	 * BITMAP_MAXBITS is actually determined by putting the smallest
+	 * possible size-class on one page, so this can never be 0.
+	 */
+	size_t max_pgs = BITMAP_MAXBITS * reg_size / PAGE; /* Pages spanned by BITMAP_MAXBITS regions. */
+
+	assert(min_pgs <= max_pgs);
+	assert(min_pgs > 0);
+	assert(max_pgs >= 1);
+	if (pgs_guess < min_pgs) {
+		sc->pgs = (int)min_pgs;
+	} else if (pgs_guess > max_pgs) {
+		sc->pgs = (int)max_pgs;
+	} else {
+		sc->pgs = (int)pgs_guess;
+	}
+}
+
+void /* Applies a pgs hint to every bin class whose size lies in [begin, end]. */
+sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end, int pgs) {
+	assert(data->initialized);
+	for (int i = 0; i < data->nsizes; i++) {
+		sc_t *sc = &data->sc[i];
+		if (!sc->bin) { /* Stop at the first non-bin class. */
+			break;
+		}
+		size_t reg_size = reg_size_compute(sc->lg_base, sc->lg_delta,
+		    sc->ndelta);
+		if (begin <= reg_size && reg_size <= end) { /* Both bounds are inclusive. */
+			sc_data_update_sc_slab_size(sc, reg_size, pgs);
+		}
+	}
+}
+
+void /* Boot entry point for this module; simply runs sc_data_init(). */
+sc_boot(sc_data_t *data) {
+	sc_data_init(data);
+}
diff --git a/deps/jemalloc/src/stats.c b/deps/jemalloc/src/stats.c
index 087df7676e..118e05d291 100644
--- a/deps/jemalloc/src/stats.c
+++ b/deps/jemalloc/src/stats.c
@@ -4,6 +4,7 @@
 
 #include "jemalloc/internal/assert.h"
 #include "jemalloc/internal/ctl.h"
+#include "jemalloc/internal/emitter.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/mutex_prof.h"
 
@@ -51,6 +52,20 @@ char opt_stats_print_opts[stats_print_tot_num_options+1] = "";
 
 /******************************************************************************/
 
+static uint64_t /* Approximate per-second rate of `value` over `uptime_ns`. */
+rate_per_second(uint64_t value, uint64_t uptime_ns) {
+	uint64_t billion = 1000000000; /* Nanoseconds per second. */
+	if (uptime_ns == 0 || value == 0) {
+		return 0;
+	}
+	if (uptime_ns < billion) { /* Under one second: report value unscaled. */
+		return value;
+	} else {
+		uint64_t uptime_s = uptime_ns / billion; /* Whole seconds; the remainder is dropped. */
+		return value / uptime_s;
+	}
+}
+
 /* Calculate x.yyy and output a string (takes a fixed sized char array). */
 static bool
 get_rate_str(uint64_t dividend, uint64_t divisor, char str[6]) {
@@ -84,41 +99,175 @@ gen_mutex_ctl_str(char *str, size_t buf_len, const char *prefix,
 }
 
 static void
-read_arena_bin_mutex_stats(unsigned arena_ind, unsigned bin_ind,
-    uint64_t results[mutex_prof_num_counters]) {
+mutex_stats_init_cols(emitter_row_t *row, const char *table_name,
+    emitter_col_t *name,
+    emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters],
+    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) {
+	mutex_prof_uint64_t_counter_ind_t k_uint64_t = 0;
+	mutex_prof_uint32_t_counter_ind_t k_uint32_t = 0;
+
+	emitter_col_t *col;
+
+	if (name != NULL) {
+		emitter_col_init(name, row);
+		name->justify = emitter_justify_left;
+		name->width = 21;
+		name->type = emitter_type_title;
+		name->str_val = table_name;
+	}
+
+#define WIDTH_uint32_t 12
+#define WIDTH_uint64_t 16
+#define OP(counter, counter_type, human, derived, base_counter)	\
+	col = &col_##counter_type[k_##counter_type];			\
+	++k_##counter_type;						\
+	emitter_col_init(col, row);					\
+	col->justify = emitter_justify_right;				\
+	col->width = derived ? 8 : WIDTH_##counter_type;		\
+	col->type = emitter_type_title;					\
+	col->str_val = human;
+	MUTEX_PROF_COUNTERS
+#undef OP
+#undef WIDTH_uint32_t
+#undef WIDTH_uint64_t
+	col_uint64_t[mutex_counter_total_wait_time_ps].width = 10;
+}
+
+static void
+mutex_stats_read_global(const char *name, emitter_col_t *col_name,
+    emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters],
+    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters],
+    uint64_t uptime) {
 	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
-#define OP(c, t)							\
-    gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,			\
-        "arenas.0.bins.0","mutex", #c);					\
-    CTL_M2_M4_GET(cmd, arena_ind, bin_ind,				\
-        (t *)&results[mutex_counter_##c], t);
-MUTEX_PROF_COUNTERS
+
+	col_name->str_val = name;
+
+	emitter_col_t *dst;
+#define EMITTER_TYPE_uint32_t emitter_type_uint32
+#define EMITTER_TYPE_uint64_t emitter_type_uint64
+#define OP(counter, counter_type, human, derived, base_counter)	\
+	dst = &col_##counter_type[mutex_counter_##counter];		\
+	dst->type = EMITTER_TYPE_##counter_type;			\
+	if (!derived) {							\
+		gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,	\
+		    "mutexes", name, #counter);				\
+		CTL_GET(cmd, (counter_type *)&dst->bool_val, counter_type);	\
+	} else { \
+	    emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter];	\
+	    dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \
+	}
+	MUTEX_PROF_COUNTERS
 #undef OP
+#undef EMITTER_TYPE_uint32_t
+#undef EMITTER_TYPE_uint64_t
 }
 
 static void
-mutex_stats_output_json(void (*write_cb)(void *, const char *), void *cbopaque,
-    const char *name, uint64_t stats[mutex_prof_num_counters],
-    const char *json_indent, bool last) {
-	malloc_cprintf(write_cb, cbopaque, "%s\"%s\": {\n", json_indent, name);
-
-	mutex_prof_counter_ind_t k = 0;
-	char *fmt_str[2] = {"%s\t\"%s\": %"FMTu32"%s\n",
-	    "%s\t\"%s\": %"FMTu64"%s\n"};
-#define OP(c, t)							\
-	malloc_cprintf(write_cb, cbopaque,				\
-	    fmt_str[sizeof(t) / sizeof(uint32_t) - 1], 			\
-	    json_indent, #c, (t)stats[mutex_counter_##c],		\
-	    (++k == mutex_prof_num_counters) ? "" : ",");
-MUTEX_PROF_COUNTERS
+mutex_stats_read_arena(unsigned arena_ind, mutex_prof_arena_ind_t mutex_ind,
+    const char *name, emitter_col_t *col_name,
+    emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters],
+    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters],
+    uint64_t uptime) {
+	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
+
+	col_name->str_val = name;
+
+	emitter_col_t *dst;
+#define EMITTER_TYPE_uint32_t emitter_type_uint32
+#define EMITTER_TYPE_uint64_t emitter_type_uint64
+#define OP(counter, counter_type, human, derived, base_counter)	\
+	dst = &col_##counter_type[mutex_counter_##counter];		\
+	dst->type = EMITTER_TYPE_##counter_type;			\
+	if (!derived) {                                   \
+		gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,        \
+		    "arenas.0.mutexes", arena_mutex_names[mutex_ind], #counter);\
+		CTL_M2_GET(cmd, arena_ind, (counter_type *)&dst->bool_val, counter_type); \
+	} else {                      \
+		emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter];	\
+		dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \
+	}
+	MUTEX_PROF_COUNTERS
+#undef OP
+#undef EMITTER_TYPE_uint32_t
+#undef EMITTER_TYPE_uint64_t
+}
+
+static void
+mutex_stats_read_arena_bin(unsigned arena_ind, unsigned bin_ind,
+    emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters],
+    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters],
+    uint64_t uptime) {
+	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
+	emitter_col_t *dst;
+
+#define EMITTER_TYPE_uint32_t emitter_type_uint32
+#define EMITTER_TYPE_uint64_t emitter_type_uint64
+#define OP(counter, counter_type, human, derived, base_counter)	\
+	dst = &col_##counter_type[mutex_counter_##counter];		\
+	dst->type = EMITTER_TYPE_##counter_type;			\
+	if (!derived) {                                   \
+		gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,        \
+		    "arenas.0.bins.0","mutex", #counter);            \
+		CTL_M2_M4_GET(cmd, arena_ind, bin_ind,                \
+		    (counter_type *)&dst->bool_val, counter_type);  \
+	} else {                      \
+		emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter]; \
+		dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \
+	}
+	MUTEX_PROF_COUNTERS
 #undef OP
-	malloc_cprintf(write_cb, cbopaque, "%s}%s\n", json_indent,
-	    last ? "" : ",");
+#undef EMITTER_TYPE_uint32_t
+#undef EMITTER_TYPE_uint64_t
 }
 
+/* "row" can be NULL to avoid emitting in table mode. */
 static void
-stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque,
-    bool json, bool large, bool mutex, unsigned i) {
+mutex_stats_emit(emitter_t *emitter, emitter_row_t *row,
+    emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters],
+    emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) {
+	if (row != NULL) {
+		emitter_table_row(emitter, row);
+	}
+
+	mutex_prof_uint64_t_counter_ind_t k_uint64_t = 0;
+	mutex_prof_uint32_t_counter_ind_t k_uint32_t = 0;
+
+	emitter_col_t *col;
+
+#define EMITTER_TYPE_uint32_t emitter_type_uint32
+#define EMITTER_TYPE_uint64_t emitter_type_uint64
+#define OP(counter, type, human, derived, base_counter)		\
+	if (!derived) {                    \
+		col = &col_##type[k_##type];                        \
+		++k_##type;                            \
+		emitter_json_kv(emitter, #counter, EMITTER_TYPE_##type,        \
+		    (const void *)&col->bool_val); \
+	}
+	MUTEX_PROF_COUNTERS;
+#undef OP
+#undef EMITTER_TYPE_uint32_t
+#undef EMITTER_TYPE_uint64_t
+}
+
+#define COL(row_name, column_name, left_or_right, col_width, etype)      \
+	emitter_col_t col_##column_name;                                     \
+	emitter_col_init(&col_##column_name, &row_name);                     \
+	col_##column_name.justify = emitter_justify_##left_or_right;         \
+	col_##column_name.width = col_width;                                 \
+	col_##column_name.type = emitter_type_##etype;
+
+#define COL_HDR(row_name, column_name, human, left_or_right, col_width, etype)  \
+	COL(row_name, column_name, left_or_right, col_width, etype)	         \
+	emitter_col_t header_##column_name;                                  \
+	emitter_col_init(&header_##column_name, &header_##row_name);         \
+	header_##column_name.justify = emitter_justify_##left_or_right;      \
+	header_##column_name.width = col_width;                              \
+	header_##column_name.type = emitter_type_title;                      \
+	header_##column_name.str_val = human ? human : #column_name;
+
+
+static void
+stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i, uint64_t uptime) {
 	size_t page;
 	bool in_gap, in_gap_prev;
 	unsigned nbins, j;
@@ -126,23 +275,71 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	CTL_GET("arenas.page", &page, size_t);
 
 	CTL_GET("arenas.nbins", &nbins, unsigned);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"bins\": [\n");
-	} else {
-		char *mutex_counters = "   n_lock_ops    n_waiting"
-		    "   n_spin_acq  total_wait_ns  max_wait_ns\n";
-		malloc_cprintf(write_cb, cbopaque,
-		    "bins:           size ind    allocated      nmalloc"
-		    "      ndalloc    nrequests      curregs     curslabs regs"
-		    " pgs  util       nfills     nflushes     newslabs"
-		    "      reslabs%s", mutex ? mutex_counters : "\n");
+
+	emitter_row_t header_row;
+	emitter_row_init(&header_row);
+
+	emitter_row_t row;
+	emitter_row_init(&row);
+
+	COL_HDR(row, size, NULL, right, 20, size)
+	COL_HDR(row, ind, NULL, right, 4, unsigned)
+	COL_HDR(row, allocated, NULL, right, 13, uint64)
+	COL_HDR(row, nmalloc, NULL, right, 13, uint64)
+	COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, ndalloc, NULL, right, 13, uint64)
+	COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, nrequests, NULL, right, 13, uint64)
+	COL_HDR(row, nrequests_ps, "(#/sec)", right, 10, uint64)
+	COL_HDR(row, nshards, NULL, right, 9, unsigned)
+	COL_HDR(row, curregs, NULL, right, 13, size)
+	COL_HDR(row, curslabs, NULL, right, 13, size)
+	COL_HDR(row, nonfull_slabs, NULL, right, 15, size)
+	COL_HDR(row, regs, NULL, right, 5, unsigned)
+	COL_HDR(row, pgs, NULL, right, 4, size)
+	/* To buffer a right- and left-justified column. */
+	COL_HDR(row, justify_spacer, NULL, right, 1, title)
+	COL_HDR(row, util, NULL, right, 6, title)
+	COL_HDR(row, nfills, NULL, right, 13, uint64)
+	COL_HDR(row, nfills_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, nflushes, NULL, right, 13, uint64)
+	COL_HDR(row, nflushes_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, nslabs, NULL, right, 13, uint64)
+	COL_HDR(row, nreslabs, NULL, right, 13, uint64)
+	COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64)
+
+	/* Don't want to actually print the name. */
+	header_justify_spacer.str_val = " ";
+	col_justify_spacer.str_val = " ";
+
+	emitter_col_t col_mutex64[mutex_prof_num_uint64_t_counters];
+	emitter_col_t col_mutex32[mutex_prof_num_uint32_t_counters];
+
+	emitter_col_t header_mutex64[mutex_prof_num_uint64_t_counters];
+	emitter_col_t header_mutex32[mutex_prof_num_uint32_t_counters];
+
+	if (mutex) {
+		mutex_stats_init_cols(&row, NULL, NULL, col_mutex64,
+		    col_mutex32);
+		mutex_stats_init_cols(&header_row, NULL, NULL, header_mutex64,
+		    header_mutex32);
 	}
+
+	/*
+	 * We print a "bins:" header as part of the table row; we need to adjust
+	 * the header size column to compensate.
+	 */
+	header_size.width -=5;
+	emitter_table_printf(emitter, "bins:");
+	emitter_table_row(emitter, &header_row);
+	emitter_json_array_kv_begin(emitter, "bins");
+
 	for (j = 0, in_gap = false; j < nbins; j++) {
 		uint64_t nslabs;
 		size_t reg_size, slab_size, curregs;
 		size_t curslabs;
-		uint32_t nregs;
+		size_t nonfull_slabs;
+		uint32_t nregs, nshards;
 		uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes;
 		uint64_t nreslabs;
 
@@ -151,14 +348,15 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		in_gap_prev = in_gap;
 		in_gap = (nslabs == 0);
 
-		if (!json && in_gap_prev && !in_gap) {
-			malloc_cprintf(write_cb, cbopaque,
+		if (in_gap_prev && !in_gap) {
+			emitter_table_printf(emitter,
 			    "                     ---\n");
 		}
 
 		CTL_M2_GET("arenas.bin.0.size", j, &reg_size, size_t);
 		CTL_M2_GET("arenas.bin.0.nregs", j, &nregs, uint32_t);
 		CTL_M2_GET("arenas.bin.0.slab_size", j, &slab_size, size_t);
+		CTL_M2_GET("arenas.bin.0.nshards", j, &nshards, uint32_t);
 
 		CTL_M2_M4_GET("stats.arenas.0.bins.0.nmalloc", i, j, &nmalloc,
 		    uint64_t);
@@ -176,106 +374,128 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		    uint64_t);
 		CTL_M2_M4_GET("stats.arenas.0.bins.0.curslabs", i, j, &curslabs,
 		    size_t);
+		CTL_M2_M4_GET("stats.arenas.0.bins.0.nonfull_slabs", i, j, &nonfull_slabs,
+		    size_t);
 
-		if (json) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t\t{\n"
-			    "\t\t\t\t\t\t\"nmalloc\": %"FMTu64",\n"
-			    "\t\t\t\t\t\t\"ndalloc\": %"FMTu64",\n"
-			    "\t\t\t\t\t\t\"curregs\": %zu,\n"
-			    "\t\t\t\t\t\t\"nrequests\": %"FMTu64",\n"
-			    "\t\t\t\t\t\t\"nfills\": %"FMTu64",\n"
-			    "\t\t\t\t\t\t\"nflushes\": %"FMTu64",\n"
-			    "\t\t\t\t\t\t\"nreslabs\": %"FMTu64",\n"
-			    "\t\t\t\t\t\t\"curslabs\": %zu%s\n",
-			    nmalloc, ndalloc, curregs, nrequests, nfills,
-			    nflushes, nreslabs, curslabs, mutex ? "," : "");
-			if (mutex) {
-				uint64_t mutex_stats[mutex_prof_num_counters];
-				read_arena_bin_mutex_stats(i, j, mutex_stats);
-				mutex_stats_output_json(write_cb, cbopaque,
-				    "mutex", mutex_stats, "\t\t\t\t\t\t", true);
-			}
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t\t}%s\n",
-			    (j + 1 < nbins) ? "," : "");
-		} else if (!in_gap) {
-			size_t availregs = nregs * curslabs;
-			char util[6];
-			if (get_rate_str((uint64_t)curregs, (uint64_t)availregs,
-			    util)) {
-				if (availregs == 0) {
-					malloc_snprintf(util, sizeof(util),
-					    "1");
-				} else if (curregs > availregs) {
-					/*
-					 * Race detected: the counters were read
-					 * in separate mallctl calls and
-					 * concurrent operations happened in
-					 * between. In this case no meaningful
-					 * utilization can be computed.
-					 */
-					malloc_snprintf(util, sizeof(util),
-					    " race");
-				} else {
-					not_reached();
-				}
-			}
-			uint64_t mutex_stats[mutex_prof_num_counters];
-			if (mutex) {
-				read_arena_bin_mutex_stats(i, j, mutex_stats);
-			}
+		if (mutex) {
+			mutex_stats_read_arena_bin(i, j, col_mutex64,
+			    col_mutex32, uptime);
+		}
 
-			malloc_cprintf(write_cb, cbopaque, "%20zu %3u %12zu %12"
-			    FMTu64" %12"FMTu64" %12"FMTu64" %12zu %12zu %4u"
-			    " %3zu %-5s %12"FMTu64" %12"FMTu64" %12"FMTu64
-			    " %12"FMTu64, reg_size, j, curregs * reg_size,
-			    nmalloc, ndalloc, nrequests, curregs, curslabs,
-			    nregs, slab_size / page, util, nfills, nflushes,
-			    nslabs, nreslabs);
-
-			/* Output less info for bin mutexes to save space. */
-			if (mutex) {
-				malloc_cprintf(write_cb, cbopaque,
-				    " %12"FMTu64" %12"FMTu64" %12"FMTu64
-				    " %14"FMTu64" %12"FMTu64"\n",
-				    mutex_stats[mutex_counter_num_ops],
-				    mutex_stats[mutex_counter_num_wait],
-				    mutex_stats[mutex_counter_num_spin_acq],
-				    mutex_stats[mutex_counter_total_wait_time],
-				    mutex_stats[mutex_counter_max_wait_time]);
+		emitter_json_object_begin(emitter);
+		emitter_json_kv(emitter, "nmalloc", emitter_type_uint64,
+		    &nmalloc);
+		emitter_json_kv(emitter, "ndalloc", emitter_type_uint64,
+		    &ndalloc);
+		emitter_json_kv(emitter, "curregs", emitter_type_size,
+		    &curregs);
+		emitter_json_kv(emitter, "nrequests", emitter_type_uint64,
+		    &nrequests);
+		emitter_json_kv(emitter, "nfills", emitter_type_uint64,
+		    &nfills);
+		emitter_json_kv(emitter, "nflushes", emitter_type_uint64,
+		    &nflushes);
+		emitter_json_kv(emitter, "nreslabs", emitter_type_uint64,
+		    &nreslabs);
+		emitter_json_kv(emitter, "curslabs", emitter_type_size,
+		    &curslabs);
+		emitter_json_kv(emitter, "nonfull_slabs", emitter_type_size,
+		    &nonfull_slabs);
+		if (mutex) {
+			emitter_json_object_kv_begin(emitter, "mutex");
+			mutex_stats_emit(emitter, NULL, col_mutex64,
+			    col_mutex32);
+			emitter_json_object_end(emitter);
+		}
+		emitter_json_object_end(emitter);
+
+		size_t availregs = nregs * curslabs;
+		char util[6];
+		if (get_rate_str((uint64_t)curregs, (uint64_t)availregs, util))
+		{
+			if (availregs == 0) {
+				malloc_snprintf(util, sizeof(util), "1");
+			} else if (curregs > availregs) {
+				/*
+				 * Race detected: the counters were read in
+				 * separate mallctl calls and concurrent
+				 * operations happened in between.  In this case
+				 * no meaningful utilization can be computed.
+				 */
+				malloc_snprintf(util, sizeof(util), " race");
 			} else {
-				malloc_cprintf(write_cb, cbopaque, "\n");
+				not_reached();
 			}
 		}
+
+		col_size.size_val = reg_size;
+		col_ind.unsigned_val = j;
+		col_allocated.size_val = curregs * reg_size;
+		col_nmalloc.uint64_val = nmalloc;
+		col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime);
+		col_ndalloc.uint64_val = ndalloc;
+		col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime);
+		col_nrequests.uint64_val = nrequests;
+		col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime);
+		col_nshards.unsigned_val = nshards;
+		col_curregs.size_val = curregs;
+		col_curslabs.size_val = curslabs;
+		col_nonfull_slabs.size_val = nonfull_slabs;
+		col_regs.unsigned_val = nregs;
+		col_pgs.size_val = slab_size / page;
+		col_util.str_val = util;
+		col_nfills.uint64_val = nfills;
+		col_nfills_ps.uint64_val = rate_per_second(nfills, uptime);
+		col_nflushes.uint64_val = nflushes;
+		col_nflushes_ps.uint64_val = rate_per_second(nflushes, uptime);
+		col_nslabs.uint64_val = nslabs;
+		col_nreslabs.uint64_val = nreslabs;
+		col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime);
+
+		/*
+		 * Note that mutex columns were initialized above, if mutex ==
+		 * true.
+		 */
+
+		emitter_table_row(emitter, &row);
 	}
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t]%s\n", large ? "," : "");
-	} else {
-		if (in_gap) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "                     ---\n");
-		}
+	emitter_json_array_end(emitter); /* Close "bins". */
+
+	if (in_gap) {
+		emitter_table_printf(emitter, "                     ---\n");
 	}
 }
 
 static void
-stats_arena_lextents_print(void (*write_cb)(void *, const char *),
-    void *cbopaque, bool json, unsigned i) {
+stats_arena_lextents_print(emitter_t *emitter, unsigned i, uint64_t uptime) {
 	unsigned nbins, nlextents, j;
 	bool in_gap, in_gap_prev;
 
 	CTL_GET("arenas.nbins", &nbins, unsigned);
 	CTL_GET("arenas.nlextents", &nlextents, unsigned);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"lextents\": [\n");
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "large:          size ind    allocated      nmalloc"
-		    "      ndalloc    nrequests  curlextents\n");
-	}
+
+	emitter_row_t header_row;
+	emitter_row_init(&header_row);
+	emitter_row_t row;
+	emitter_row_init(&row);
+
+	COL_HDR(row, size, NULL, right, 20, size)
+	COL_HDR(row, ind, NULL, right, 4, unsigned)
+	COL_HDR(row, allocated, NULL, right, 13, size)
+	COL_HDR(row, nmalloc, NULL, right, 13, uint64)
+	COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, ndalloc, NULL, right, 13, uint64)
+	COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, nrequests, NULL, right, 13, uint64)
+	COL_HDR(row, nrequests_ps, "(#/sec)", right, 8, uint64)
+	COL_HDR(row, curlextents, NULL, right, 13, size)
+
+	/* As with bins, we label the large extents table. */
+	header_size.width -= 6;
+	emitter_table_printf(emitter, "large:");
+	emitter_table_row(emitter, &header_row);
+	emitter_json_array_kv_begin(emitter, "lextents");
+
 	for (j = 0, in_gap = false; j < nlextents; j++) {
 		uint64_t nmalloc, ndalloc, nrequests;
 		size_t lextent_size, curlextents;
@@ -289,156 +509,186 @@ stats_arena_lextents_print(void (*write_cb)(void *, const char *),
 		in_gap_prev = in_gap;
 		in_gap = (nrequests == 0);
 
-		if (!json && in_gap_prev && !in_gap) {
-			malloc_cprintf(write_cb, cbopaque,
+		if (in_gap_prev && !in_gap) {
+			emitter_table_printf(emitter,
 			    "                     ---\n");
 		}
 
 		CTL_M2_GET("arenas.lextent.0.size", j, &lextent_size, size_t);
 		CTL_M2_M4_GET("stats.arenas.0.lextents.0.curlextents", i, j,
 		    &curlextents, size_t);
-		if (json) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t\t{\n"
-			    "\t\t\t\t\t\t\"curlextents\": %zu\n"
-			    "\t\t\t\t\t}%s\n",
-			    curlextents,
-			    (j + 1 < nlextents) ? "," : "");
-		} else if (!in_gap) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "%20zu %3u %12zu %12"FMTu64" %12"FMTu64
-			    " %12"FMTu64" %12zu\n",
-			    lextent_size, nbins + j,
-			    curlextents * lextent_size, nmalloc, ndalloc,
-			    nrequests, curlextents);
+
+		emitter_json_object_begin(emitter);
+		emitter_json_kv(emitter, "curlextents", emitter_type_size,
+		    &curlextents);
+		emitter_json_object_end(emitter);
+
+		col_size.size_val = lextent_size;
+		col_ind.unsigned_val = nbins + j;
+		col_allocated.size_val = curlextents * lextent_size;
+		col_nmalloc.uint64_val = nmalloc;
+		col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime);
+		col_ndalloc.uint64_val = ndalloc;
+		col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime);
+		col_nrequests.uint64_val = nrequests;
+		col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime);
+		col_curlextents.size_val = curlextents;
+
+		if (!in_gap) {
+			emitter_table_row(emitter, &row);
 		}
 	}
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t]\n");
-	} else {
-		if (in_gap) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "                     ---\n");
-		}
+	emitter_json_array_end(emitter); /* Close "lextents". */
+	if (in_gap) {
+		emitter_table_printf(emitter, "                     ---\n");
 	}
 }
 
 static void
-read_arena_mutex_stats(unsigned arena_ind,
-    uint64_t results[mutex_prof_num_arena_mutexes][mutex_prof_num_counters]) {
-	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
-
-	mutex_prof_arena_ind_t i;
-	for (i = 0; i < mutex_prof_num_arena_mutexes; i++) {
-#define OP(c, t)							\
-		gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,	\
-		    "arenas.0.mutexes",	arena_mutex_names[i], #c);	\
-		CTL_M2_GET(cmd, arena_ind,				\
-		    (t *)&results[i][mutex_counter_##c], t);
-MUTEX_PROF_COUNTERS
-#undef OP
-	}
-}
+stats_arena_extents_print(emitter_t *emitter, unsigned i) {
+	unsigned j;
+	bool in_gap, in_gap_prev;
+	emitter_row_t header_row;
+	emitter_row_init(&header_row);
+	emitter_row_t row;
+	emitter_row_init(&row);
+
+	COL_HDR(row, size, NULL, right, 20, size)
+	COL_HDR(row, ind, NULL, right, 4, unsigned)
+	COL_HDR(row, ndirty, NULL, right, 13, size)
+	COL_HDR(row, dirty, NULL, right, 13, size)
+	COL_HDR(row, nmuzzy, NULL, right, 13, size)
+	COL_HDR(row, muzzy, NULL, right, 13, size)
+	COL_HDR(row, nretained, NULL, right, 13, size)
+	COL_HDR(row, retained, NULL, right, 13, size)
+	COL_HDR(row, ntotal, NULL, right, 13, size)
+	COL_HDR(row, total, NULL, right, 13, size)
+
+	/* Label this section. */
+	header_size.width -= 8;
+	emitter_table_printf(emitter, "extents:");
+	emitter_table_row(emitter, &header_row);
+	emitter_json_array_kv_begin(emitter, "extents");
+
+	in_gap = false;
+	for (j = 0; j < SC_NPSIZES; j++) {
+		size_t ndirty, nmuzzy, nretained, total, dirty_bytes,
+		    muzzy_bytes, retained_bytes, total_bytes;
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.ndirty", i, j,
+		    &ndirty, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.nmuzzy", i, j,
+		    &nmuzzy, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.nretained", i, j,
+		    &nretained, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.dirty_bytes", i, j,
+		    &dirty_bytes, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.muzzy_bytes", i, j,
+		    &muzzy_bytes, size_t);
+		CTL_M2_M4_GET("stats.arenas.0.extents.0.retained_bytes", i, j,
+		    &retained_bytes, size_t);
+		total = ndirty + nmuzzy + nretained;
+		total_bytes = dirty_bytes + muzzy_bytes + retained_bytes;
 
-static void
-mutex_stats_output(void (*write_cb)(void *, const char *), void *cbopaque,
-    const char *name, uint64_t stats[mutex_prof_num_counters],
-    bool first_mutex) {
-	if (first_mutex) {
-		/* Print title. */
-		malloc_cprintf(write_cb, cbopaque,
-		    "                           n_lock_ops       n_waiting"
-		    "      n_spin_acq  n_owner_switch   total_wait_ns"
-		    "     max_wait_ns  max_n_thds\n");
-	}
+		in_gap_prev = in_gap;
+		in_gap = (total == 0);
 
-	malloc_cprintf(write_cb, cbopaque, "%s", name);
-	malloc_cprintf(write_cb, cbopaque, ":%*c",
-	    (int)(20 - strlen(name)), ' ');
+		if (in_gap_prev && !in_gap) {
+			emitter_table_printf(emitter,
+			    "                     ---\n");
+		}
 
-	char *fmt_str[2] = {"%12"FMTu32, "%16"FMTu64};
-#define OP(c, t)							\
-	malloc_cprintf(write_cb, cbopaque,				\
-	    fmt_str[sizeof(t) / sizeof(uint32_t) - 1],			\
-	    (t)stats[mutex_counter_##c]);
-MUTEX_PROF_COUNTERS
-#undef OP
-	malloc_cprintf(write_cb, cbopaque, "\n");
+		emitter_json_object_begin(emitter);
+		emitter_json_kv(emitter, "ndirty", emitter_type_size, &ndirty);
+		emitter_json_kv(emitter, "nmuzzy", emitter_type_size, &nmuzzy);
+		emitter_json_kv(emitter, "nretained", emitter_type_size,
+		    &nretained);
+
+		emitter_json_kv(emitter, "dirty_bytes", emitter_type_size,
+		    &dirty_bytes);
+		emitter_json_kv(emitter, "muzzy_bytes", emitter_type_size,
+		    &muzzy_bytes);
+		emitter_json_kv(emitter, "retained_bytes", emitter_type_size,
+		    &retained_bytes);
+		emitter_json_object_end(emitter);
+
+		col_size.size_val = sz_pind2sz(j);
+		col_ind.unsigned_val = j; /* ind column is unsigned. */
+		col_ndirty.size_val = ndirty;
+		col_dirty.size_val = dirty_bytes;
+		col_nmuzzy.size_val = nmuzzy;
+		col_muzzy.size_val = muzzy_bytes;
+		col_nretained.size_val = nretained;
+		col_retained.size_val = retained_bytes;
+		col_ntotal.size_val = total;
+		col_total.size_val = total_bytes;
+
+		if (!in_gap) {
+			emitter_table_row(emitter, &row);
+		}
+	}
+	emitter_json_array_end(emitter); /* Close "extents". */
+	if (in_gap) {
+		emitter_table_printf(emitter, "                     ---\n");
+	}
 }
 
 static void
-stats_arena_mutexes_print(void (*write_cb)(void *, const char *),
-    void *cbopaque, bool json, bool json_end, unsigned arena_ind) {
-	uint64_t mutex_stats[mutex_prof_num_arena_mutexes][mutex_prof_num_counters];
-	read_arena_mutex_stats(arena_ind, mutex_stats);
-
-	/* Output mutex stats. */
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque, "\t\t\t\t\"mutexes\": {\n");
-		mutex_prof_arena_ind_t i, last_mutex;
-		last_mutex = mutex_prof_num_arena_mutexes - 1;
-		for (i = 0; i < mutex_prof_num_arena_mutexes; i++) {
-			mutex_stats_output_json(write_cb, cbopaque,
-			    arena_mutex_names[i], mutex_stats[i],
-			    "\t\t\t\t\t", (i == last_mutex));
-		}
-		malloc_cprintf(write_cb, cbopaque, "\t\t\t\t}%s\n",
-		    json_end ? "" : ",");
-	} else {
-		mutex_prof_arena_ind_t i;
-		for (i = 0; i < mutex_prof_num_arena_mutexes; i++) {
-			mutex_stats_output(write_cb, cbopaque,
-			    arena_mutex_names[i], mutex_stats[i], i == 0);
-		}
+stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind, uint64_t uptime) {
+	emitter_row_t row;
+	emitter_col_t col_name;
+	emitter_col_t col64[mutex_prof_num_uint64_t_counters];
+	emitter_col_t col32[mutex_prof_num_uint32_t_counters];
+
+	emitter_row_init(&row);
+	mutex_stats_init_cols(&row, "", &col_name, col64, col32);
+
+	emitter_json_object_kv_begin(emitter, "mutexes");
+	emitter_table_row(emitter, &row);
+
+	for (mutex_prof_arena_ind_t i = 0; i < mutex_prof_num_arena_mutexes;
+	    i++) {
+		const char *name = arena_mutex_names[i];
+		emitter_json_object_kv_begin(emitter, name);
+		mutex_stats_read_arena(arena_ind, i, name, &col_name, col64,
+		    col32, uptime);
+		mutex_stats_emit(emitter, &row, col64, col32);
+		emitter_json_object_end(emitter); /* Close the mutex dict. */
 	}
+	emitter_json_object_end(emitter); /* End "mutexes". */
 }
 
 static void
-stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
-    bool json, unsigned i, bool bins, bool large, bool mutex) {
+stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
+    bool mutex, bool extents) {
 	unsigned nthreads;
 	const char *dss;
 	ssize_t dirty_decay_ms, muzzy_decay_ms;
 	size_t page, pactive, pdirty, pmuzzy, mapped, retained;
-	size_t base, internal, resident;
+	size_t base, internal, resident, metadata_thp, extent_avail;
 	uint64_t dirty_npurge, dirty_nmadvise, dirty_purged;
 	uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged;
 	size_t small_allocated;
-	uint64_t small_nmalloc, small_ndalloc, small_nrequests;
+	uint64_t small_nmalloc, small_ndalloc, small_nrequests, small_nfills,
+	    small_nflushes;
 	size_t large_allocated;
-	uint64_t large_nmalloc, large_ndalloc, large_nrequests;
-	size_t tcache_bytes;
+	uint64_t large_nmalloc, large_ndalloc, large_nrequests, large_nfills,
+	    large_nflushes;
+	size_t tcache_bytes, abandoned_vm;
 	uint64_t uptime;
 
 	CTL_GET("arenas.page", &page, size_t);
 
 	CTL_M2_GET("stats.arenas.0.nthreads", i, &nthreads, unsigned);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"nthreads\": %u,\n", nthreads);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "assigned threads: %u\n", nthreads);
-	}
+	emitter_kv(emitter, "nthreads", "assigned threads",
+	    emitter_type_unsigned, &nthreads);
 
 	CTL_M2_GET("stats.arenas.0.uptime", i, &uptime, uint64_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"uptime_ns\": %"FMTu64",\n", uptime);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "uptime: %"FMTu64"\n", uptime);
-	}
+	emitter_kv(emitter, "uptime_ns", "uptime", emitter_type_uint64,
+	    &uptime);
 
 	CTL_M2_GET("stats.arenas.0.dss", i, &dss, const char *);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"dss\": \"%s\",\n", dss);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "dss allocation precedence: %s\n", dss);
-	}
+	emitter_kv(emitter, "dss", "dss allocation precedence",
+	    emitter_type_string, &dss);
 
 	CTL_M2_GET("stats.arenas.0.dirty_decay_ms", i, &dirty_decay_ms,
 	    ssize_t);
@@ -455,205 +705,290 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	CTL_M2_GET("stats.arenas.0.muzzy_nmadvise", i, &muzzy_nmadvise,
 	    uint64_t);
 	CTL_M2_GET("stats.arenas.0.muzzy_purged", i, &muzzy_purged, uint64_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"dirty_decay_ms\": %zd,\n", dirty_decay_ms);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"muzzy_decay_ms\": %zd,\n", muzzy_decay_ms);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"pactive\": %zu,\n", pactive);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"pdirty\": %zu,\n", pdirty);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"pmuzzy\": %zu,\n", pmuzzy);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"dirty_npurge\": %"FMTu64",\n", dirty_npurge);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"dirty_nmadvise\": %"FMTu64",\n", dirty_nmadvise);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"dirty_purged\": %"FMTu64",\n", dirty_purged);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"muzzy_npurge\": %"FMTu64",\n", muzzy_npurge);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"muzzy_nmadvise\": %"FMTu64",\n", muzzy_nmadvise);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"muzzy_purged\": %"FMTu64",\n", muzzy_purged);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "decaying:  time       npages       sweeps     madvises"
-		    "       purged\n");
-		if (dirty_decay_ms >= 0) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "   dirty: %5zd %12zu %12"FMTu64" %12"FMTu64" %12"
-			    FMTu64"\n", dirty_decay_ms, pdirty, dirty_npurge,
-			    dirty_nmadvise, dirty_purged);
-		} else {
-			malloc_cprintf(write_cb, cbopaque,
-			    "   dirty:   N/A %12zu %12"FMTu64" %12"FMTu64" %12"
-			    FMTu64"\n", pdirty, dirty_npurge, dirty_nmadvise,
-			    dirty_purged);
-		}
-		if (muzzy_decay_ms >= 0) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "   muzzy: %5zd %12zu %12"FMTu64" %12"FMTu64" %12"
-			    FMTu64"\n", muzzy_decay_ms, pmuzzy, muzzy_npurge,
-			    muzzy_nmadvise, muzzy_purged);
-		} else {
-			malloc_cprintf(write_cb, cbopaque,
-			    "   muzzy:   N/A %12zu %12"FMTu64" %12"FMTu64" %12"
-			    FMTu64"\n", pmuzzy, muzzy_npurge, muzzy_nmadvise,
-			    muzzy_purged);
-		}
-	}
 
-	CTL_M2_GET("stats.arenas.0.small.allocated", i, &small_allocated,
-	    size_t);
-	CTL_M2_GET("stats.arenas.0.small.nmalloc", i, &small_nmalloc, uint64_t);
-	CTL_M2_GET("stats.arenas.0.small.ndalloc", i, &small_ndalloc, uint64_t);
-	CTL_M2_GET("stats.arenas.0.small.nrequests", i, &small_nrequests,
-	    uint64_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"small\": {\n");
-
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\t\"allocated\": %zu,\n", small_allocated);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\t\"nmalloc\": %"FMTu64",\n", small_nmalloc);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\t\"ndalloc\": %"FMTu64",\n", small_ndalloc);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\t\"nrequests\": %"FMTu64"\n", small_nrequests);
-
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t},\n");
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "                            allocated      nmalloc"
-		    "      ndalloc    nrequests\n");
-		malloc_cprintf(write_cb, cbopaque,
-		    "small:                   %12zu %12"FMTu64" %12"FMTu64
-		    " %12"FMTu64"\n",
-		    small_allocated, small_nmalloc, small_ndalloc,
-		    small_nrequests);
-	}
+	emitter_row_t decay_row;
+	emitter_row_init(&decay_row);
 
-	CTL_M2_GET("stats.arenas.0.large.allocated", i, &large_allocated,
-	    size_t);
-	CTL_M2_GET("stats.arenas.0.large.nmalloc", i, &large_nmalloc, uint64_t);
-	CTL_M2_GET("stats.arenas.0.large.ndalloc", i, &large_ndalloc, uint64_t);
-	CTL_M2_GET("stats.arenas.0.large.nrequests", i, &large_nrequests,
-	    uint64_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"large\": {\n");
-
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\t\"allocated\": %zu,\n", large_allocated);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\t\"nmalloc\": %"FMTu64",\n", large_nmalloc);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\t\"ndalloc\": %"FMTu64",\n", large_ndalloc);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\t\"nrequests\": %"FMTu64"\n", large_nrequests);
-
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t},\n");
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "large:                   %12zu %12"FMTu64" %12"FMTu64
-		    " %12"FMTu64"\n",
-		    large_allocated, large_nmalloc, large_ndalloc,
-		    large_nrequests);
-		malloc_cprintf(write_cb, cbopaque,
-		    "total:                   %12zu %12"FMTu64" %12"FMTu64
-		    " %12"FMTu64"\n",
-		    small_allocated + large_allocated, small_nmalloc +
-		    large_nmalloc, small_ndalloc + large_ndalloc,
-		    small_nrequests + large_nrequests);
-	}
-	if (!json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "active:                  %12zu\n", pactive * page);
-	}
+	/* JSON-style emission. */
+	emitter_json_kv(emitter, "dirty_decay_ms", emitter_type_ssize,
+	    &dirty_decay_ms);
+	emitter_json_kv(emitter, "muzzy_decay_ms", emitter_type_ssize,
+	    &muzzy_decay_ms);
 
-	CTL_M2_GET("stats.arenas.0.mapped", i, &mapped, size_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"mapped\": %zu,\n", mapped);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "mapped:                  %12zu\n", mapped);
-	}
+	emitter_json_kv(emitter, "pactive", emitter_type_size, &pactive);
+	emitter_json_kv(emitter, "pdirty", emitter_type_size, &pdirty);
+	emitter_json_kv(emitter, "pmuzzy", emitter_type_size, &pmuzzy);
 
-	CTL_M2_GET("stats.arenas.0.retained", i, &retained, size_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"retained\": %zu,\n", retained);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "retained:                %12zu\n", retained);
-	}
+	emitter_json_kv(emitter, "dirty_npurge", emitter_type_uint64,
+	    &dirty_npurge);
+	emitter_json_kv(emitter, "dirty_nmadvise", emitter_type_uint64,
+	    &dirty_nmadvise);
+	emitter_json_kv(emitter, "dirty_purged", emitter_type_uint64,
+	    &dirty_purged);
 
-	CTL_M2_GET("stats.arenas.0.base", i, &base, size_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"base\": %zu,\n", base);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "base:                    %12zu\n", base);
-	}
+	emitter_json_kv(emitter, "muzzy_npurge", emitter_type_uint64,
+	    &muzzy_npurge);
+	emitter_json_kv(emitter, "muzzy_nmadvise", emitter_type_uint64,
+	    &muzzy_nmadvise);
+	emitter_json_kv(emitter, "muzzy_purged", emitter_type_uint64,
+	    &muzzy_purged);
 
-	CTL_M2_GET("stats.arenas.0.internal", i, &internal, size_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"internal\": %zu,\n", internal);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "internal:                %12zu\n", internal);
-	}
+	/* Table-style emission. */
+	COL(decay_row, decay_type, right, 9, title);
+	col_decay_type.str_val = "decaying:";
+
+	COL(decay_row, decay_time, right, 6, title);
+	col_decay_time.str_val = "time";
+
+	COL(decay_row, decay_npages, right, 13, title);
+	col_decay_npages.str_val = "npages";
+
+	COL(decay_row, decay_sweeps, right, 13, title);
+	col_decay_sweeps.str_val = "sweeps";
+
+	COL(decay_row, decay_madvises, right, 13, title);
+	col_decay_madvises.str_val = "madvises";
+
+	COL(decay_row, decay_purged, right, 13, title);
+	col_decay_purged.str_val = "purged";
+
+	/* Title row. */
+	emitter_table_row(emitter, &decay_row);
 
-	CTL_M2_GET("stats.arenas.0.tcache_bytes", i, &tcache_bytes, size_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"tcache\": %zu,\n", tcache_bytes);
+	/* Dirty row. */
+	col_decay_type.str_val = "dirty:";
+
+	if (dirty_decay_ms >= 0) {
+		col_decay_time.type = emitter_type_ssize;
+		col_decay_time.ssize_val = dirty_decay_ms;
 	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "tcache:                  %12zu\n", tcache_bytes);
+		col_decay_time.type = emitter_type_title;
+		col_decay_time.str_val = "N/A";
 	}
 
-	CTL_M2_GET("stats.arenas.0.resident", i, &resident, size_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"resident\": %zu%s\n", resident,
-		    (bins || large || mutex) ? "," : "");
+	col_decay_npages.type = emitter_type_size;
+	col_decay_npages.size_val = pdirty;
+
+	col_decay_sweeps.type = emitter_type_uint64;
+	col_decay_sweeps.uint64_val = dirty_npurge;
+
+	col_decay_madvises.type = emitter_type_uint64;
+	col_decay_madvises.uint64_val = dirty_nmadvise;
+
+	col_decay_purged.type = emitter_type_uint64;
+	col_decay_purged.uint64_val = dirty_purged;
+
+	emitter_table_row(emitter, &decay_row);
+
+	/* Muzzy row. */
+	col_decay_type.str_val = "muzzy:";
+
+	if (muzzy_decay_ms >= 0) {
+		col_decay_time.type = emitter_type_ssize;
+		col_decay_time.ssize_val = muzzy_decay_ms;
 	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "resident:                %12zu\n", resident);
+		col_decay_time.type = emitter_type_title;
+		col_decay_time.str_val = "N/A";
 	}
 
+	col_decay_npages.type = emitter_type_size;
+	col_decay_npages.size_val = pmuzzy;
+
+	col_decay_sweeps.type = emitter_type_uint64;
+	col_decay_sweeps.uint64_val = muzzy_npurge;
+
+	col_decay_madvises.type = emitter_type_uint64;
+	col_decay_madvises.uint64_val = muzzy_nmadvise;
+
+	col_decay_purged.type = emitter_type_uint64;
+	col_decay_purged.uint64_val = muzzy_purged;
+
+	emitter_table_row(emitter, &decay_row);
+
+	/* Small / large / total allocation counts. */
+	emitter_row_t alloc_count_row;
+	emitter_row_init(&alloc_count_row);
+
+	COL(alloc_count_row, count_title, left, 21, title);
+	col_count_title.str_val = "";
+
+	COL(alloc_count_row, count_allocated, right, 16, title);
+	col_count_allocated.str_val = "allocated";
+
+	COL(alloc_count_row, count_nmalloc, right, 16, title);
+	col_count_nmalloc.str_val = "nmalloc";
+	COL(alloc_count_row, count_nmalloc_ps, right, 8, title);
+	col_count_nmalloc_ps.str_val = "(#/sec)";
+
+	COL(alloc_count_row, count_ndalloc, right, 16, title);
+	col_count_ndalloc.str_val = "ndalloc";
+	COL(alloc_count_row, count_ndalloc_ps, right, 8, title);
+	col_count_ndalloc_ps.str_val = "(#/sec)";
+
+	COL(alloc_count_row, count_nrequests, right, 16, title);
+	col_count_nrequests.str_val = "nrequests";
+	COL(alloc_count_row, count_nrequests_ps, right, 10, title);
+	col_count_nrequests_ps.str_val = "(#/sec)";
+
+	COL(alloc_count_row, count_nfills, right, 16, title);
+	col_count_nfills.str_val = "nfill";
+	COL(alloc_count_row, count_nfills_ps, right, 10, title);
+	col_count_nfills_ps.str_val = "(#/sec)";
+
+	COL(alloc_count_row, count_nflushes, right, 16, title);
+	col_count_nflushes.str_val = "nflush";
+	COL(alloc_count_row, count_nflushes_ps, right, 10, title);
+	col_count_nflushes_ps.str_val = "(#/sec)";
+
+	emitter_table_row(emitter, &alloc_count_row);
+
+	col_count_nmalloc_ps.type = emitter_type_uint64;
+	col_count_ndalloc_ps.type = emitter_type_uint64;
+	col_count_nrequests_ps.type = emitter_type_uint64;
+	col_count_nfills_ps.type = emitter_type_uint64;
+	col_count_nflushes_ps.type = emitter_type_uint64;
+
+#define GET_AND_EMIT_ALLOC_STAT(small_or_large, name, valtype)		\
+	CTL_M2_GET("stats.arenas.0." #small_or_large "." #name, i,	\
+	    &small_or_large##_##name, valtype##_t);			\
+	emitter_json_kv(emitter, #name, emitter_type_##valtype,		\
+	    &small_or_large##_##name);					\
+	col_count_##name.type = emitter_type_##valtype;		\
+	col_count_##name.valtype##_val = small_or_large##_##name;
+
+	emitter_json_object_kv_begin(emitter, "small");
+	col_count_title.str_val = "small:";
+
+	GET_AND_EMIT_ALLOC_STAT(small, allocated, size)
+	GET_AND_EMIT_ALLOC_STAT(small, nmalloc, uint64)
+	col_count_nmalloc_ps.uint64_val =
+	    rate_per_second(col_count_nmalloc.uint64_val, uptime);
+	GET_AND_EMIT_ALLOC_STAT(small, ndalloc, uint64)
+	col_count_ndalloc_ps.uint64_val =
+	    rate_per_second(col_count_ndalloc.uint64_val, uptime);
+	GET_AND_EMIT_ALLOC_STAT(small, nrequests, uint64)
+	col_count_nrequests_ps.uint64_val =
+	    rate_per_second(col_count_nrequests.uint64_val, uptime);
+	GET_AND_EMIT_ALLOC_STAT(small, nfills, uint64)
+	col_count_nfills_ps.uint64_val =
+	    rate_per_second(col_count_nfills.uint64_val, uptime);
+	GET_AND_EMIT_ALLOC_STAT(small, nflushes, uint64)
+	col_count_nflushes_ps.uint64_val =
+	    rate_per_second(col_count_nflushes.uint64_val, uptime);
+
+	emitter_table_row(emitter, &alloc_count_row);
+	emitter_json_object_end(emitter); /* Close "small". */
+
+	emitter_json_object_kv_begin(emitter, "large");
+	col_count_title.str_val = "large:";
+
+	GET_AND_EMIT_ALLOC_STAT(large, allocated, size)
+	GET_AND_EMIT_ALLOC_STAT(large, nmalloc, uint64)
+	col_count_nmalloc_ps.uint64_val =
+	    rate_per_second(col_count_nmalloc.uint64_val, uptime);
+	GET_AND_EMIT_ALLOC_STAT(large, ndalloc, uint64)
+	col_count_ndalloc_ps.uint64_val =
+	    rate_per_second(col_count_ndalloc.uint64_val, uptime);
+	GET_AND_EMIT_ALLOC_STAT(large, nrequests, uint64)
+	col_count_nrequests_ps.uint64_val =
+	    rate_per_second(col_count_nrequests.uint64_val, uptime);
+	GET_AND_EMIT_ALLOC_STAT(large, nfills, uint64)
+	col_count_nfills_ps.uint64_val =
+	    rate_per_second(col_count_nfills.uint64_val, uptime);
+	GET_AND_EMIT_ALLOC_STAT(large, nflushes, uint64)
+	col_count_nflushes_ps.uint64_val =
+	    rate_per_second(col_count_nflushes.uint64_val, uptime);
+
+	emitter_table_row(emitter, &alloc_count_row);
+	emitter_json_object_end(emitter); /* Close "large". */
+
+#undef GET_AND_EMIT_ALLOC_STAT
+
+	/* Aggregated small + large stats are emitted only in table mode. */
+	col_count_title.str_val = "total:";
+	col_count_allocated.size_val = small_allocated + large_allocated;
+	col_count_nmalloc.uint64_val = small_nmalloc + large_nmalloc;
+	col_count_ndalloc.uint64_val = small_ndalloc + large_ndalloc;
+	col_count_nrequests.uint64_val = small_nrequests + large_nrequests;
+	col_count_nfills.uint64_val = small_nfills + large_nfills;
+	col_count_nflushes.uint64_val = small_nflushes + large_nflushes;
+	col_count_nmalloc_ps.uint64_val =
+	    rate_per_second(col_count_nmalloc.uint64_val, uptime);
+	col_count_ndalloc_ps.uint64_val =
+	    rate_per_second(col_count_ndalloc.uint64_val, uptime);
+	col_count_nrequests_ps.uint64_val =
+	    rate_per_second(col_count_nrequests.uint64_val, uptime);
+	col_count_nfills_ps.uint64_val =
+	    rate_per_second(col_count_nfills.uint64_val, uptime);
+	col_count_nflushes_ps.uint64_val =
+	    rate_per_second(col_count_nflushes.uint64_val, uptime);
+	emitter_table_row(emitter, &alloc_count_row);
+
+	emitter_row_t mem_count_row;
+	emitter_row_init(&mem_count_row);
+
+	emitter_col_t mem_count_title;
+	emitter_col_init(&mem_count_title, &mem_count_row);
+	mem_count_title.justify = emitter_justify_left;
+	mem_count_title.width = 21;
+	mem_count_title.type = emitter_type_title;
+	mem_count_title.str_val = "";
+
+	emitter_col_t mem_count_val;
+	emitter_col_init(&mem_count_val, &mem_count_row);
+	mem_count_val.justify = emitter_justify_right;
+	mem_count_val.width = 16;
+	mem_count_val.type = emitter_type_title;
+	mem_count_val.str_val = "";
+
+	emitter_table_row(emitter, &mem_count_row);
+	mem_count_val.type = emitter_type_size;
+
+	/* Active count in bytes is emitted only in table mode. */
+	mem_count_title.str_val = "active:";
+	mem_count_val.size_val = pactive * page;
+	emitter_table_row(emitter, &mem_count_row);
+
+#define GET_AND_EMIT_MEM_STAT(stat)					\
+	CTL_M2_GET("stats.arenas.0."#stat, i, &stat, size_t);		\
+	emitter_json_kv(emitter, #stat, emitter_type_size, &stat);	\
+	mem_count_title.str_val = #stat":";				\
+	mem_count_val.size_val = stat;					\
+	emitter_table_row(emitter, &mem_count_row);
+
+	GET_AND_EMIT_MEM_STAT(mapped)
+	GET_AND_EMIT_MEM_STAT(retained)
+	GET_AND_EMIT_MEM_STAT(base)
+	GET_AND_EMIT_MEM_STAT(internal)
+	GET_AND_EMIT_MEM_STAT(metadata_thp)
+	GET_AND_EMIT_MEM_STAT(tcache_bytes)
+	GET_AND_EMIT_MEM_STAT(resident)
+	GET_AND_EMIT_MEM_STAT(abandoned_vm)
+	GET_AND_EMIT_MEM_STAT(extent_avail)
+#undef GET_AND_EMIT_MEM_STAT
+
 	if (mutex) {
-		stats_arena_mutexes_print(write_cb, cbopaque, json,
-		    !(bins || large), i);
+		stats_arena_mutexes_print(emitter, i, uptime);
 	}
 	if (bins) {
-		stats_arena_bins_print(write_cb, cbopaque, json, large, mutex,
-		    i);
+		stats_arena_bins_print(emitter, mutex, i, uptime);
 	}
 	if (large) {
-		stats_arena_lextents_print(write_cb, cbopaque, json, i);
+		stats_arena_lextents_print(emitter, i, uptime);
+	}
+	if (extents) {
+		stats_arena_extents_print(emitter, i);
 	}
 }
 
 static void
-stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque,
-    bool json, bool more) {
+stats_general_print(emitter_t *emitter) {
 	const char *cpv;
-	bool bv;
+	bool bv, bv2;
 	unsigned uv;
 	uint32_t u32v;
 	uint64_t u64v;
-	ssize_t ssv;
+	ssize_t ssv, ssv2;
 	size_t sv, bsz, usz, ssz, sssz, cpsz;
 
 	bsz = sizeof(bool);
@@ -663,365 +998,257 @@ stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	cpsz = sizeof(const char *);
 
 	CTL_GET("version", &cpv, const char *);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		"\t\t\"version\": \"%s\",\n", cpv);
-	} else {
-		malloc_cprintf(write_cb, cbopaque, "Version: %s\n", cpv);
-	}
+	emitter_kv(emitter, "version", "Version", emitter_type_string, &cpv);
 
 	/* config. */
-#define CONFIG_WRITE_BOOL_JSON(n, c)					\
-	if (json) {							\
-		CTL_GET("config."#n, &bv, bool);			\
-		malloc_cprintf(write_cb, cbopaque,			\
-		    "\t\t\t\""#n"\": %s%s\n", bv ? "true" : "false",	\
-		    (c));						\
+	emitter_dict_begin(emitter, "config", "Build-time option settings");
+#define CONFIG_WRITE_BOOL(name)						\
+	do {								\
+		CTL_GET("config."#name, &bv, bool);			\
+		emitter_kv(emitter, #name, "config."#name,		\
+		    emitter_type_bool, &bv);				\
+	} while (0)
+
+	CONFIG_WRITE_BOOL(cache_oblivious);
+	CONFIG_WRITE_BOOL(debug);
+	CONFIG_WRITE_BOOL(fill);
+	CONFIG_WRITE_BOOL(lazy_lock);
+	emitter_kv(emitter, "malloc_conf", "config.malloc_conf",
+	    emitter_type_string, &config_malloc_conf);
+
+	CONFIG_WRITE_BOOL(opt_safety_checks);
+	CONFIG_WRITE_BOOL(prof);
+	CONFIG_WRITE_BOOL(prof_libgcc);
+	CONFIG_WRITE_BOOL(prof_libunwind);
+	CONFIG_WRITE_BOOL(stats);
+	CONFIG_WRITE_BOOL(utrace);
+	CONFIG_WRITE_BOOL(xmalloc);
+#undef CONFIG_WRITE_BOOL
+	emitter_dict_end(emitter); /* Close "config" dict. */
+
+	/* opt. */
+#define OPT_WRITE(name, var, size, emitter_type)			\
+	if (je_mallctl("opt."name, (void *)&var, &size, NULL, 0) ==	\
+	    0) {							\
+		emitter_kv(emitter, name, "opt."name, emitter_type,	\
+		    &var);						\
 	}
 
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\"config\": {\n");
+#define OPT_WRITE_MUTABLE(name, var1, var2, size, emitter_type,		\
+    altname)								\
+	if (je_mallctl("opt."name, (void *)&var1, &size, NULL, 0) ==	\
+	    0 && je_mallctl(altname, (void *)&var2, &size, NULL, 0)	\
+	    == 0) {							\
+		emitter_kv_note(emitter, name, "opt."name,		\
+		    emitter_type, &var1, altname, emitter_type,		\
+		    &var2);						\
 	}
 
-	CONFIG_WRITE_BOOL_JSON(cache_oblivious, ",")
+#define OPT_WRITE_BOOL(name) OPT_WRITE(name, bv, bsz, emitter_type_bool)
+#define OPT_WRITE_BOOL_MUTABLE(name, altname)				\
+	OPT_WRITE_MUTABLE(name, bv, bv2, bsz, emitter_type_bool, altname)
+
+#define OPT_WRITE_UNSIGNED(name)					\
+	OPT_WRITE(name, uv, usz, emitter_type_unsigned)
+
+#define OPT_WRITE_SIZE_T(name)						\
+	OPT_WRITE(name, sv, ssz, emitter_type_size)
+#define OPT_WRITE_SSIZE_T(name)						\
+	OPT_WRITE(name, ssv, sssz, emitter_type_ssize)
+#define OPT_WRITE_SSIZE_T_MUTABLE(name, altname)			\
+	OPT_WRITE_MUTABLE(name, ssv, ssv2, sssz, emitter_type_ssize,	\
+	    altname)
+
+#define OPT_WRITE_CHAR_P(name)						\
+	OPT_WRITE(name, cpv, cpsz, emitter_type_string)
+
+	emitter_dict_begin(emitter, "opt", "Run-time option settings");
+
+	OPT_WRITE_BOOL("abort")
+	OPT_WRITE_BOOL("abort_conf")
+	OPT_WRITE_BOOL("confirm_conf")
+	OPT_WRITE_BOOL("retain")
+	OPT_WRITE_CHAR_P("dss")
+	OPT_WRITE_UNSIGNED("narenas")
+	OPT_WRITE_CHAR_P("percpu_arena")
+	OPT_WRITE_SIZE_T("oversize_threshold")
+	OPT_WRITE_CHAR_P("metadata_thp")
+	OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread")
+	OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms")
+	OPT_WRITE_SSIZE_T_MUTABLE("muzzy_decay_ms", "arenas.muzzy_decay_ms")
+	OPT_WRITE_SIZE_T("lg_extent_max_active_fit")
+	OPT_WRITE_CHAR_P("junk")
+	OPT_WRITE_BOOL("zero")
+	OPT_WRITE_BOOL("utrace")
+	OPT_WRITE_BOOL("xmalloc")
+	OPT_WRITE_BOOL("tcache")
+	OPT_WRITE_SSIZE_T("lg_tcache_max")
+	OPT_WRITE_CHAR_P("thp")
+	OPT_WRITE_BOOL("prof")
+	OPT_WRITE_CHAR_P("prof_prefix")
+	OPT_WRITE_BOOL_MUTABLE("prof_active", "prof.active")
+	OPT_WRITE_BOOL_MUTABLE("prof_thread_active_init",
+	    "prof.thread_active_init")
+	OPT_WRITE_SSIZE_T_MUTABLE("lg_prof_sample", "prof.lg_sample")
+	OPT_WRITE_BOOL("prof_accum")
+	OPT_WRITE_SSIZE_T("lg_prof_interval")
+	OPT_WRITE_BOOL("prof_gdump")
+	OPT_WRITE_BOOL("prof_final")
+	OPT_WRITE_BOOL("prof_leak")
+	OPT_WRITE_BOOL("stats_print")
+	OPT_WRITE_CHAR_P("stats_print_opts")
+
+	emitter_dict_end(emitter);
+
+#undef OPT_WRITE
+#undef OPT_WRITE_MUTABLE
+#undef OPT_WRITE_BOOL
+#undef OPT_WRITE_BOOL_MUTABLE
+#undef OPT_WRITE_UNSIGNED
+#undef OPT_WRITE_SSIZE_T
+#undef OPT_WRITE_SSIZE_T_MUTABLE
+#undef OPT_WRITE_CHAR_P
 
-	CTL_GET("config.debug", &bv, bool);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"debug\": %s,\n", bv ? "true" : "false");
-	} else {
-		malloc_cprintf(write_cb, cbopaque, "Assertions %s\n",
-		    bv ? "enabled" : "disabled");
-	}
+	/* prof. */
+	if (config_prof) {
+		emitter_dict_begin(emitter, "prof", "Profiling settings");
 
-	CONFIG_WRITE_BOOL_JSON(fill, ",")
-	CONFIG_WRITE_BOOL_JSON(lazy_lock, ",")
+		CTL_GET("prof.thread_active_init", &bv, bool);
+		emitter_kv(emitter, "thread_active_init",
+		    "prof.thread_active_init", emitter_type_bool, &bv);
 
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"malloc_conf\": \"%s\",\n",
-		    config_malloc_conf);
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "config.malloc_conf: \"%s\"\n", config_malloc_conf);
-	}
+		CTL_GET("prof.active", &bv, bool);
+		emitter_kv(emitter, "active", "prof.active", emitter_type_bool,
+		    &bv);
 
-	CONFIG_WRITE_BOOL_JSON(prof, ",")
-	CONFIG_WRITE_BOOL_JSON(prof_libgcc, ",")
-	CONFIG_WRITE_BOOL_JSON(prof_libunwind, ",")
-	CONFIG_WRITE_BOOL_JSON(stats, ",")
-	CONFIG_WRITE_BOOL_JSON(thp, ",")
-	CONFIG_WRITE_BOOL_JSON(utrace, ",")
-	CONFIG_WRITE_BOOL_JSON(xmalloc, "")
-
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t},\n");
-	}
-#undef CONFIG_WRITE_BOOL_JSON
+		CTL_GET("prof.gdump", &bv, bool);
+		emitter_kv(emitter, "gdump", "prof.gdump", emitter_type_bool,
+		    &bv);
 
-	/* opt. */
-#define OPT_WRITE_BOOL(n, c)						\
-	if (je_mallctl("opt."#n, (void *)&bv, &bsz, NULL, 0) == 0) {	\
-		if (json) {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "\t\t\t\""#n"\": %s%s\n", bv ? "true" :	\
-			    "false", (c));				\
-		} else {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "  opt."#n": %s\n", bv ? "true" : "false");	\
-		}							\
-	}
-#define OPT_WRITE_BOOL_MUTABLE(n, m, c) {				\
-	bool bv2;							\
-	if (je_mallctl("opt."#n, (void *)&bv, &bsz, NULL, 0) == 0 &&	\
-	    je_mallctl(#m, (void *)&bv2, &bsz, NULL, 0) == 0) {		\
-		if (json) {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "\t\t\t\""#n"\": %s%s\n", bv ? "true" :	\
-			    "false", (c));				\
-		} else {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "  opt."#n": %s ("#m": %s)\n", bv ? "true"	\
-			    : "false", bv2 ? "true" : "false");		\
-		}							\
-	}								\
-}
-#define OPT_WRITE_UNSIGNED(n, c)					\
-	if (je_mallctl("opt."#n, (void *)&uv, &usz, NULL, 0) == 0) {	\
-		if (json) {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "\t\t\t\""#n"\": %u%s\n", uv, (c));		\
-		} else {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			"  opt."#n": %u\n", uv);			\
-		}							\
-	}
-#define OPT_WRITE_SSIZE_T(n, c)						\
-	if (je_mallctl("opt."#n, (void *)&ssv, &sssz, NULL, 0) == 0) {	\
-		if (json) {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "\t\t\t\""#n"\": %zd%s\n", ssv, (c));	\
-		} else {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "  opt."#n": %zd\n", ssv);			\
-		}							\
-	}
-#define OPT_WRITE_SSIZE_T_MUTABLE(n, m, c) {				\
-	ssize_t ssv2;							\
-	if (je_mallctl("opt."#n, (void *)&ssv, &sssz, NULL, 0) == 0 &&	\
-	    je_mallctl(#m, (void *)&ssv2, &sssz, NULL, 0) == 0) {	\
-		if (json) {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "\t\t\t\""#n"\": %zd%s\n", ssv, (c));	\
-		} else {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "  opt."#n": %zd ("#m": %zd)\n",		\
-			    ssv, ssv2);					\
-		}							\
-	}								\
-}
-#define OPT_WRITE_CHAR_P(n, c)						\
-	if (je_mallctl("opt."#n, (void *)&cpv, &cpsz, NULL, 0) == 0) {	\
-		if (json) {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "\t\t\t\""#n"\": \"%s\"%s\n", cpv, (c));	\
-		} else {						\
-			malloc_cprintf(write_cb, cbopaque,		\
-			    "  opt."#n": \"%s\"\n", cpv);		\
-		}							\
-	}
+		CTL_GET("prof.interval", &u64v, uint64_t);
+		emitter_kv(emitter, "interval", "prof.interval",
+		    emitter_type_uint64, &u64v);
 
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\"opt\": {\n");
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "Run-time option settings:\n");
-	}
-	OPT_WRITE_BOOL(abort, ",")
-	OPT_WRITE_BOOL(abort_conf, ",")
-	OPT_WRITE_BOOL(retain, ",")
-	OPT_WRITE_CHAR_P(dss, ",")
-	OPT_WRITE_UNSIGNED(narenas, ",")
-	OPT_WRITE_CHAR_P(percpu_arena, ",")
-	OPT_WRITE_BOOL_MUTABLE(background_thread, background_thread, ",")
-	OPT_WRITE_SSIZE_T_MUTABLE(dirty_decay_ms, arenas.dirty_decay_ms, ",")
-	OPT_WRITE_SSIZE_T_MUTABLE(muzzy_decay_ms, arenas.muzzy_decay_ms, ",")
-	OPT_WRITE_CHAR_P(junk, ",")
-	OPT_WRITE_BOOL(zero, ",")
-	OPT_WRITE_BOOL(utrace, ",")
-	OPT_WRITE_BOOL(xmalloc, ",")
-	OPT_WRITE_BOOL(tcache, ",")
-	OPT_WRITE_SSIZE_T(lg_tcache_max, ",")
-	OPT_WRITE_BOOL(prof, ",")
-	OPT_WRITE_CHAR_P(prof_prefix, ",")
-	OPT_WRITE_BOOL_MUTABLE(prof_active, prof.active, ",")
-	OPT_WRITE_BOOL_MUTABLE(prof_thread_active_init, prof.thread_active_init,
-	    ",")
-	OPT_WRITE_SSIZE_T_MUTABLE(lg_prof_sample, prof.lg_sample, ",")
-	OPT_WRITE_BOOL(prof_accum, ",")
-	OPT_WRITE_SSIZE_T(lg_prof_interval, ",")
-	OPT_WRITE_BOOL(prof_gdump, ",")
-	OPT_WRITE_BOOL(prof_final, ",")
-	OPT_WRITE_BOOL(prof_leak, ",")
-	OPT_WRITE_BOOL(stats_print, ",")
-	if (json || opt_stats_print) {
-		/*
-		 * stats_print_opts is always emitted for JSON, so as long as it
-		 * comes last it's safe to unconditionally omit the comma here
-		 * (rather than having to conditionally omit it elsewhere
-		 * depending on configuration).
-		 */
-		OPT_WRITE_CHAR_P(stats_print_opts, "")
-	}
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t},\n");
-	}
+		CTL_GET("prof.lg_sample", &ssv, ssize_t);
+		emitter_kv(emitter, "lg_sample", "prof.lg_sample",
+		    emitter_type_ssize, &ssv);
 
-#undef OPT_WRITE_BOOL
-#undef OPT_WRITE_BOOL_MUTABLE
-#undef OPT_WRITE_SSIZE_T
-#undef OPT_WRITE_CHAR_P
+		emitter_dict_end(emitter); /* Close "prof". */
+	}
 
 	/* arenas. */
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\"arenas\": {\n");
-	}
+	/*
+	 * The json output sticks arena info into an "arenas" dict; the table
+	 * output puts them at the top-level.
+	 */
+	emitter_json_object_kv_begin(emitter, "arenas");
 
 	CTL_GET("arenas.narenas", &uv, unsigned);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"narenas\": %u,\n", uv);
-	} else {
-		malloc_cprintf(write_cb, cbopaque, "Arenas: %u\n", uv);
-	}
+	emitter_kv(emitter, "narenas", "Arenas", emitter_type_unsigned, &uv);
 
-	if (json) {
-		CTL_GET("arenas.dirty_decay_ms", &ssv, ssize_t);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"dirty_decay_ms\": %zd,\n", ssv);
+	/*
+	 * Decay settings are emitted only in json mode; in table mode, they're
+	 * emitted as notes with the opt output, above.
+	 */
+	CTL_GET("arenas.dirty_decay_ms", &ssv, ssize_t);
+	emitter_json_kv(emitter, "dirty_decay_ms", emitter_type_ssize, &ssv);
 
-		CTL_GET("arenas.muzzy_decay_ms", &ssv, ssize_t);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"muzzy_decay_ms\": %zd,\n", ssv);
-	}
+	CTL_GET("arenas.muzzy_decay_ms", &ssv, ssize_t);
+	emitter_json_kv(emitter, "muzzy_decay_ms", emitter_type_ssize, &ssv);
 
 	CTL_GET("arenas.quantum", &sv, size_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"quantum\": %zu,\n", sv);
-	} else {
-		malloc_cprintf(write_cb, cbopaque, "Quantum size: %zu\n", sv);
-	}
+	emitter_kv(emitter, "quantum", "Quantum size", emitter_type_size, &sv);
 
 	CTL_GET("arenas.page", &sv, size_t);
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"page\": %zu,\n", sv);
-	} else {
-		malloc_cprintf(write_cb, cbopaque, "Page size: %zu\n", sv);
-	}
+	emitter_kv(emitter, "page", "Page size", emitter_type_size, &sv);
 
 	if (je_mallctl("arenas.tcache_max", (void *)&sv, &ssz, NULL, 0) == 0) {
-		if (json) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\"tcache_max\": %zu,\n", sv);
-		} else {
-			malloc_cprintf(write_cb, cbopaque,
-			    "Maximum thread-cached size class: %zu\n", sv);
-		}
+		emitter_kv(emitter, "tcache_max",
+		    "Maximum thread-cached size class", emitter_type_size, &sv);
 	}
 
-	if (json) {
-		unsigned nbins, nlextents, i;
-
-		CTL_GET("arenas.nbins", &nbins, unsigned);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"nbins\": %u,\n", nbins);
+	unsigned nbins;
+	CTL_GET("arenas.nbins", &nbins, unsigned);
+	emitter_kv(emitter, "nbins", "Number of bin size classes",
+	    emitter_type_unsigned, &nbins);
 
-		CTL_GET("arenas.nhbins", &uv, unsigned);
-		malloc_cprintf(write_cb, cbopaque, "\t\t\t\"nhbins\": %u,\n",
-		    uv);
+	unsigned nhbins;
+	CTL_GET("arenas.nhbins", &nhbins, unsigned);
+	emitter_kv(emitter, "nhbins", "Number of thread-cache bin size classes",
+	    emitter_type_unsigned, &nhbins);
 
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"bin\": [\n");
-		for (i = 0; i < nbins; i++) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t{\n");
+	/*
+	 * We do enough mallctls in a loop that we actually want to omit them
+	 * (not just omit the printing).
+	 */
+	if (emitter->output == emitter_output_json) {
+		emitter_json_array_kv_begin(emitter, "bin");
+		for (unsigned i = 0; i < nbins; i++) {
+			emitter_json_object_begin(emitter);
 
 			CTL_M2_GET("arenas.bin.0.size", i, &sv, size_t);
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t\t\"size\": %zu,\n", sv);
+			emitter_json_kv(emitter, "size", emitter_type_size,
+			    &sv);
 
 			CTL_M2_GET("arenas.bin.0.nregs", i, &u32v, uint32_t);
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t\t\"nregs\": %"FMTu32",\n", u32v);
+			emitter_json_kv(emitter, "nregs", emitter_type_uint32,
+			    &u32v);
 
 			CTL_M2_GET("arenas.bin.0.slab_size", i, &sv, size_t);
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t\t\"slab_size\": %zu\n", sv);
+			emitter_json_kv(emitter, "slab_size", emitter_type_size,
+			    &sv);
 
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t}%s\n", (i + 1 < nbins) ? "," : "");
-		}
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t],\n");
-
-		CTL_GET("arenas.nlextents", &nlextents, unsigned);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"nlextents\": %u,\n", nlextents);
-
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"lextent\": [\n");
-		for (i = 0; i < nlextents; i++) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t{\n");
-
-			CTL_M2_GET("arenas.lextent.0.size", i, &sv, size_t);
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t\t\"size\": %zu\n", sv);
+			CTL_M2_GET("arenas.bin.0.nshards", i, &u32v, uint32_t);
+			emitter_json_kv(emitter, "nshards", emitter_type_uint32,
+			    &u32v);
 
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\t}%s\n", (i + 1 < nlextents) ? "," : "");
+			emitter_json_object_end(emitter);
 		}
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t]\n");
-
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t}%s\n", (config_prof || more) ? "," : "");
+		emitter_json_array_end(emitter); /* Close "bin". */
 	}
 
-	/* prof. */
-	if (config_prof && json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\"prof\": {\n");
-
-		CTL_GET("prof.thread_active_init", &bv, bool);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"thread_active_init\": %s,\n", bv ? "true" :
-		    "false");
-
-		CTL_GET("prof.active", &bv, bool);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"active\": %s,\n", bv ? "true" : "false");
-
-		CTL_GET("prof.gdump", &bv, bool);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"gdump\": %s,\n", bv ? "true" : "false");
+	unsigned nlextents;
+	CTL_GET("arenas.nlextents", &nlextents, unsigned);
+	emitter_kv(emitter, "nlextents", "Number of large size classes",
+	    emitter_type_unsigned, &nlextents);
 
-		CTL_GET("prof.interval", &u64v, uint64_t);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"interval\": %"FMTu64",\n", u64v);
+	if (emitter->output == emitter_output_json) {
+		emitter_json_array_kv_begin(emitter, "lextent");
+		for (unsigned i = 0; i < nlextents; i++) {
+			emitter_json_object_begin(emitter);
 
-		CTL_GET("prof.lg_sample", &ssv, ssize_t);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"lg_sample\": %zd\n", ssv);
+			CTL_M2_GET("arenas.lextent.0.size", i, &sv, size_t);
+			emitter_json_kv(emitter, "size", emitter_type_size,
+			    &sv);
 
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t}%s\n", more ? "," : "");
+			emitter_json_object_end(emitter);
+		}
+		emitter_json_array_end(emitter); /* Close "lextent". */
 	}
-}
 
-static void
-read_global_mutex_stats(
-    uint64_t results[mutex_prof_num_global_mutexes][mutex_prof_num_counters]) {
-	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
-
-	mutex_prof_global_ind_t i;
-	for (i = 0; i < mutex_prof_num_global_mutexes; i++) {
-#define OP(c, t)							\
-		gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,	\
-		    "mutexes", global_mutex_names[i], #c);		\
-		CTL_GET(cmd, (t *)&results[i][mutex_counter_##c], t);
-MUTEX_PROF_COUNTERS
-#undef OP
-	}
+	emitter_json_object_end(emitter); /* Close "arenas" */
 }
 
 static void
-stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque,
-    bool json, bool merged, bool destroyed, bool unmerged, bool bins,
-    bool large, bool mutex) {
-	size_t allocated, active, metadata, resident, mapped, retained;
+stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
+    bool unmerged, bool bins, bool large, bool mutex, bool extents) {
+	/*
+	 * These should be deleted.  We keep them around for a while, to aid in
+	 * the transition to the emitter code.
+	 */
+	size_t allocated, active, metadata, metadata_thp, resident, mapped,
+	    retained;
 	size_t num_background_threads;
 	uint64_t background_thread_num_runs, background_thread_run_interval;
 
 	CTL_GET("stats.allocated", &allocated, size_t);
 	CTL_GET("stats.active", &active, size_t);
 	CTL_GET("stats.metadata", &metadata, size_t);
+	CTL_GET("stats.metadata_thp", &metadata_thp, size_t);
 	CTL_GET("stats.resident", &resident, size_t);
 	CTL_GET("stats.mapped", &mapped, size_t);
 	CTL_GET("stats.retained", &retained, size_t);
 
-	uint64_t mutex_stats[mutex_prof_num_global_mutexes][mutex_prof_num_counters];
-	if (mutex) {
-		read_global_mutex_stats(mutex_stats);
-	}
-
 	if (have_background_thread) {
 		CTL_GET("stats.background_thread.num_threads",
 		    &num_background_threads, size_t);
@@ -1035,182 +1262,133 @@ stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque,
 		background_thread_run_interval = 0;
 	}
 
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\"stats\": {\n");
-
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"allocated\": %zu,\n", allocated);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"active\": %zu,\n", active);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"metadata\": %zu,\n", metadata);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"resident\": %zu,\n", resident);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"mapped\": %zu,\n", mapped);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"retained\": %zu,\n", retained);
-
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\"background_thread\": {\n");
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"num_threads\": %zu,\n", num_background_threads);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"num_runs\": %"FMTu64",\n",
-		    background_thread_num_runs);
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t\t\t\"run_interval\": %"FMTu64"\n",
-		    background_thread_run_interval);
-		malloc_cprintf(write_cb, cbopaque, "\t\t\t}%s\n",
-		    mutex ? "," : "");
+	/* Generic global stats. */
+	emitter_json_object_kv_begin(emitter, "stats");
+	emitter_json_kv(emitter, "allocated", emitter_type_size, &allocated);
+	emitter_json_kv(emitter, "active", emitter_type_size, &active);
+	emitter_json_kv(emitter, "metadata", emitter_type_size, &metadata);
+	emitter_json_kv(emitter, "metadata_thp", emitter_type_size,
+	    &metadata_thp);
+	emitter_json_kv(emitter, "resident", emitter_type_size, &resident);
+	emitter_json_kv(emitter, "mapped", emitter_type_size, &mapped);
+	emitter_json_kv(emitter, "retained", emitter_type_size, &retained);
+
+	emitter_table_printf(emitter, "Allocated: %zu, active: %zu, "
+	    "metadata: %zu (n_thp %zu), resident: %zu, mapped: %zu, "
+	    "retained: %zu\n", allocated, active, metadata, metadata_thp,
+	    resident, mapped, retained);
+
+	/* Background thread stats. */
+	emitter_json_object_kv_begin(emitter, "background_thread");
+	emitter_json_kv(emitter, "num_threads", emitter_type_size,
+	    &num_background_threads);
+	emitter_json_kv(emitter, "num_runs", emitter_type_uint64,
+	    &background_thread_num_runs);
+	emitter_json_kv(emitter, "run_interval", emitter_type_uint64,
+	    &background_thread_run_interval);
+	emitter_json_object_end(emitter); /* Close "background_thread". */
+
+	emitter_table_printf(emitter, "Background threads: %zu, "
+	    "num_runs: %"FMTu64", run_interval: %"FMTu64" ns\n",
+	    num_background_threads, background_thread_num_runs,
+	    background_thread_run_interval);
 
-		if (mutex) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\t\"mutexes\": {\n");
-			mutex_prof_global_ind_t i;
-			for (i = 0; i < mutex_prof_num_global_mutexes; i++) {
-				mutex_stats_output_json(write_cb, cbopaque,
-				    global_mutex_names[i], mutex_stats[i],
-				    "\t\t\t\t",
-				    i == mutex_prof_num_global_mutexes - 1);
-			}
-			malloc_cprintf(write_cb, cbopaque, "\t\t\t}\n");
-		}
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t\t}%s\n", (merged || unmerged || destroyed) ? "," : "");
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "Allocated: %zu, active: %zu, metadata: %zu,"
-		    " resident: %zu, mapped: %zu, retained: %zu\n",
-		    allocated, active, metadata, resident, mapped, retained);
-
-		if (have_background_thread && num_background_threads > 0) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "Background threads: %zu, num_runs: %"FMTu64", "
-			    "run_interval: %"FMTu64" ns\n",
-			    num_background_threads,
-			    background_thread_num_runs,
-			    background_thread_run_interval);
-		}
-		if (mutex) {
-			mutex_prof_global_ind_t i;
-			for (i = 0; i < mutex_prof_num_global_mutexes; i++) {
-				mutex_stats_output(write_cb, cbopaque,
-				    global_mutex_names[i], mutex_stats[i],
-				    i == 0);
-			}
+	if (mutex) {
+		emitter_row_t row;
+		emitter_col_t name;
+		emitter_col_t col64[mutex_prof_num_uint64_t_counters];
+		emitter_col_t col32[mutex_prof_num_uint32_t_counters];
+		uint64_t uptime;
+
+		emitter_row_init(&row);
+		mutex_stats_init_cols(&row, "", &name, col64, col32);
+
+		emitter_table_row(emitter, &row);
+		emitter_json_object_kv_begin(emitter, "mutexes");
+
+		CTL_M2_GET("stats.arenas.0.uptime", 0, &uptime, uint64_t);
+
+		for (int i = 0; i < mutex_prof_num_global_mutexes; i++) {
+			mutex_stats_read_global(global_mutex_names[i], &name,
+			    col64, col32, uptime);
+			emitter_json_object_kv_begin(emitter, global_mutex_names[i]);
+			mutex_stats_emit(emitter, &row, col64, col32);
+			emitter_json_object_end(emitter);
 		}
+
+		emitter_json_object_end(emitter); /* Close "mutexes". */
 	}
 
+	emitter_json_object_end(emitter); /* Close "stats". */
+
 	if (merged || destroyed || unmerged) {
 		unsigned narenas;
 
-		if (json) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t\"stats.arenas\": {\n");
-		}
+		emitter_json_object_kv_begin(emitter, "stats.arenas");
 
 		CTL_GET("arenas.narenas", &narenas, unsigned);
-		{
-			size_t mib[3];
-			size_t miblen = sizeof(mib) / sizeof(size_t);
-			size_t sz;
-			VARIABLE_ARRAY(bool, initialized, narenas);
-			bool destroyed_initialized;
-			unsigned i, j, ninitialized;
-
-			xmallctlnametomib("arena.0.initialized", mib, &miblen);
-			for (i = ninitialized = 0; i < narenas; i++) {
-				mib[1] = i;
-				sz = sizeof(bool);
-				xmallctlbymib(mib, miblen, &initialized[i], &sz,
-				    NULL, 0);
-				if (initialized[i]) {
-					ninitialized++;
-				}
-			}
-			mib[1] = MALLCTL_ARENAS_DESTROYED;
+		size_t mib[3];
+		size_t miblen = sizeof(mib) / sizeof(size_t);
+		size_t sz;
+		VARIABLE_ARRAY(bool, initialized, narenas);
+		bool destroyed_initialized;
+		unsigned i, j, ninitialized;
+
+		xmallctlnametomib("arena.0.initialized", mib, &miblen);
+		for (i = ninitialized = 0; i < narenas; i++) {
+			mib[1] = i;
 			sz = sizeof(bool);
-			xmallctlbymib(mib, miblen, &destroyed_initialized, &sz,
+			xmallctlbymib(mib, miblen, &initialized[i], &sz,
 			    NULL, 0);
-
-			/* Merged stats. */
-			if (merged && (ninitialized > 1 || !unmerged)) {
-				/* Print merged arena stats. */
-				if (json) {
-					malloc_cprintf(write_cb, cbopaque,
-					    "\t\t\t\"merged\": {\n");
-				} else {
-					malloc_cprintf(write_cb, cbopaque,
-					    "\nMerged arenas stats:\n");
-				}
-				stats_arena_print(write_cb, cbopaque, json,
-				    MALLCTL_ARENAS_ALL, bins, large, mutex);
-				if (json) {
-					malloc_cprintf(write_cb, cbopaque,
-					    "\t\t\t}%s\n",
-					    ((destroyed_initialized &&
-					    destroyed) || unmerged) ?  "," :
-					    "");
-				}
+			if (initialized[i]) {
+				ninitialized++;
 			}
+		}
+		mib[1] = MALLCTL_ARENAS_DESTROYED;
+		sz = sizeof(bool);
+		xmallctlbymib(mib, miblen, &destroyed_initialized, &sz,
+		    NULL, 0);
+
+		/* Merged stats. */
+		if (merged && (ninitialized > 1 || !unmerged)) {
+			/* Print merged arena stats. */
+			emitter_table_printf(emitter, "Merged arenas stats:\n");
+			emitter_json_object_kv_begin(emitter, "merged");
+			stats_arena_print(emitter, MALLCTL_ARENAS_ALL, bins,
+			    large, mutex, extents);
+			emitter_json_object_end(emitter); /* Close "merged". */
+		}
 
-			/* Destroyed stats. */
-			if (destroyed_initialized && destroyed) {
-				/* Print destroyed arena stats. */
-				if (json) {
-					malloc_cprintf(write_cb, cbopaque,
-					    "\t\t\t\"destroyed\": {\n");
-				} else {
-					malloc_cprintf(write_cb, cbopaque,
-					    "\nDestroyed arenas stats:\n");
-				}
-				stats_arena_print(write_cb, cbopaque, json,
-				    MALLCTL_ARENAS_DESTROYED, bins, large,
-				    mutex);
-				if (json) {
-					malloc_cprintf(write_cb, cbopaque,
-					    "\t\t\t}%s\n", unmerged ?  "," :
-					    "");
-				}
-			}
+		/* Destroyed stats. */
+		if (destroyed_initialized && destroyed) {
+			/* Print destroyed arena stats. */
+			emitter_table_printf(emitter,
+			    "Destroyed arenas stats:\n");
+			emitter_json_object_kv_begin(emitter, "destroyed");
+			stats_arena_print(emitter, MALLCTL_ARENAS_DESTROYED,
+			    bins, large, mutex, extents);
+			emitter_json_object_end(emitter); /* Close "destroyed". */
+		}
 
-			/* Unmerged stats. */
-			if (unmerged) {
-				for (i = j = 0; i < narenas; i++) {
-					if (initialized[i]) {
-						if (json) {
-							j++;
-							malloc_cprintf(write_cb,
-							    cbopaque,
-							    "\t\t\t\"%u\": {\n",
-							    i);
-						} else {
-							malloc_cprintf(write_cb,
-							    cbopaque,
-							    "\narenas[%u]:\n",
-							    i);
-						}
-						stats_arena_print(write_cb,
-						    cbopaque, json, i, bins,
-						    large, mutex);
-						if (json) {
-							malloc_cprintf(write_cb,
-							    cbopaque,
-							    "\t\t\t}%s\n", (j <
-							    ninitialized) ? ","
-							    : "");
-						}
-					}
+		/* Unmerged stats. */
+		if (unmerged) {
+			for (i = j = 0; i < narenas; i++) {
+				if (initialized[i]) {
+					char arena_ind_str[20];
+					malloc_snprintf(arena_ind_str,
+					    sizeof(arena_ind_str), "%u", i);
+					emitter_json_object_kv_begin(emitter,
+					    arena_ind_str);
+					emitter_table_printf(emitter,
+					    "arenas[%s]:\n", arena_ind_str);
+					stats_arena_print(emitter, i, bins,
+					    large, mutex, extents);
+					/* Close "<arena-ind>". */
+					emitter_json_object_end(emitter);
 				}
 			}
 		}
-
-		if (json) {
-			malloc_cprintf(write_cb, cbopaque,
-			    "\t\t}\n");
-		}
+		emitter_json_object_end(emitter); /* Close "stats.arenas". */
 	}
 }
 
@@ -1257,29 +1435,23 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		}
 	}
 
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "{\n"
-		    "\t\"jemalloc\": {\n");
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "___ Begin jemalloc statistics ___\n");
-	}
+	emitter_t emitter;
+	emitter_init(&emitter,
+	    json ? emitter_output_json : emitter_output_table, write_cb,
+	    cbopaque);
+	emitter_begin(&emitter);
+	emitter_table_printf(&emitter, "___ Begin jemalloc statistics ___\n");
+	emitter_json_object_kv_begin(&emitter, "jemalloc");
 
 	if (general) {
-		stats_general_print(write_cb, cbopaque, json, config_stats);
+		stats_general_print(&emitter);
 	}
 	if (config_stats) {
-		stats_print_helper(write_cb, cbopaque, json, merged, destroyed,
-		    unmerged, bins, large, mutex);
+		stats_print_helper(&emitter, merged, destroyed, unmerged,
+		    bins, large, mutex, extents);
 	}
 
-	if (json) {
-		malloc_cprintf(write_cb, cbopaque,
-		    "\t}\n"
-		    "}\n");
-	} else {
-		malloc_cprintf(write_cb, cbopaque,
-		    "--- End jemalloc statistics ---\n");
-	}
+	emitter_json_object_end(&emitter); /* Closes the "jemalloc" dict. */
+	emitter_table_printf(&emitter, "--- End jemalloc statistics ---\n");
+	emitter_end(&emitter);
 }
diff --git a/deps/jemalloc/src/sz.c b/deps/jemalloc/src/sz.c
index 0986615f71..8633fb0500 100644
--- a/deps/jemalloc/src/sz.c
+++ b/deps/jemalloc/src/sz.c
@@ -2,105 +2,63 @@
 #include "jemalloc/internal/sz.h"
 
 JEMALLOC_ALIGNED(CACHELINE)
-const size_t sz_pind2sz_tab[NPSIZES+1] = {
-#define PSZ_yes(lg_grp, ndelta, lg_delta)				\
-	(((ZU(1)<<lg_grp) + (ZU(ndelta)<<lg_delta))),
-#define PSZ_no(lg_grp, ndelta, lg_delta)
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup) \
-	PSZ_##psz(lg_grp, ndelta, lg_delta)
-	SIZE_CLASSES
-#undef PSZ_yes
-#undef PSZ_no
-#undef SC
-	(LARGE_MAXCLASS + PAGE)
-};
+size_t sz_pind2sz_tab[SC_NPSIZES+1];
+
+static void
+sz_boot_pind2sz_tab(const sc_data_t *sc_data) { /* Fill sz_pind2sz_tab. */
+	int pind = 0; /* Next unfilled slot; only psz classes consume one. */
+	for (unsigned i = 0; i < SC_NSIZES; i++) {
+		const sc_t *sc = &sc_data->sc[i];
+		if (sc->psz) {
+			sz_pind2sz_tab[pind] = (ZU(1) << sc->lg_base)
+			    + (ZU(sc->ndelta) << sc->lg_delta);
+			pind++;
+		}
+	}
+	for (int i = pind; i <= (int)SC_NPSIZES; i++) { /* Pad the tail. */
+		sz_pind2sz_tab[i] = sc_data->large_maxclass + PAGE;
+	}
+}
 
 JEMALLOC_ALIGNED(CACHELINE)
-const size_t sz_index2size_tab[NSIZES] = {
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup) \
-	((ZU(1)<<lg_grp) + (ZU(ndelta)<<lg_delta)),
-	SIZE_CLASSES
-#undef SC
-};
+size_t sz_index2size_tab[SC_NSIZES];
+
+static void
+sz_boot_index2size_tab(const sc_data_t *sc_data) { /* Fill sz_index2size_tab. */
+	for (unsigned szind = 0; szind < SC_NSIZES; szind++) {
+		const sc_t *cls = &sc_data->sc[szind];
+		size_t delta = ZU(cls->ndelta) << cls->lg_delta;
+		sz_index2size_tab[szind] = (ZU(1) << cls->lg_base) + delta;
+	}
+}
 
+/*
+ * To keep this table small, we divide sizes by the tiny min size, which gives
+ * the smallest interval for which the result can change.
+ */
 JEMALLOC_ALIGNED(CACHELINE)
-const uint8_t sz_size2index_tab[] = {
-#if LG_TINY_MIN == 0
-#warning "Dangerous LG_TINY_MIN"
-#define S2B_0(i)	i,
-#elif LG_TINY_MIN == 1
-#warning "Dangerous LG_TINY_MIN"
-#define S2B_1(i)	i,
-#elif LG_TINY_MIN == 2
-#warning "Dangerous LG_TINY_MIN"
-#define S2B_2(i)	i,
-#elif LG_TINY_MIN == 3
-#define S2B_3(i)	i,
-#elif LG_TINY_MIN == 4
-#define S2B_4(i)	i,
-#elif LG_TINY_MIN == 5
-#define S2B_5(i)	i,
-#elif LG_TINY_MIN == 6
-#define S2B_6(i)	i,
-#elif LG_TINY_MIN == 7
-#define S2B_7(i)	i,
-#elif LG_TINY_MIN == 8
-#define S2B_8(i)	i,
-#elif LG_TINY_MIN == 9
-#define S2B_9(i)	i,
-#elif LG_TINY_MIN == 10
-#define S2B_10(i)	i,
-#elif LG_TINY_MIN == 11
-#define S2B_11(i)	i,
-#else
-#error "Unsupported LG_TINY_MIN"
-#endif
-#if LG_TINY_MIN < 1
-#define S2B_1(i)	S2B_0(i) S2B_0(i)
-#endif
-#if LG_TINY_MIN < 2
-#define S2B_2(i)	S2B_1(i) S2B_1(i)
-#endif
-#if LG_TINY_MIN < 3
-#define S2B_3(i)	S2B_2(i) S2B_2(i)
-#endif
-#if LG_TINY_MIN < 4
-#define S2B_4(i)	S2B_3(i) S2B_3(i)
-#endif
-#if LG_TINY_MIN < 5
-#define S2B_5(i)	S2B_4(i) S2B_4(i)
-#endif
-#if LG_TINY_MIN < 6
-#define S2B_6(i)	S2B_5(i) S2B_5(i)
-#endif
-#if LG_TINY_MIN < 7
-#define S2B_7(i)	S2B_6(i) S2B_6(i)
-#endif
-#if LG_TINY_MIN < 8
-#define S2B_8(i)	S2B_7(i) S2B_7(i)
-#endif
-#if LG_TINY_MIN < 9
-#define S2B_9(i)	S2B_8(i) S2B_8(i)
-#endif
-#if LG_TINY_MIN < 10
-#define S2B_10(i)	S2B_9(i) S2B_9(i)
-#endif
-#if LG_TINY_MIN < 11
-#define S2B_11(i)	S2B_10(i) S2B_10(i)
-#endif
-#define S2B_no(i)
-#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup) \
-	S2B_##lg_delta_lookup(index)
-	SIZE_CLASSES
-#undef S2B_3
-#undef S2B_4
-#undef S2B_5
-#undef S2B_6
-#undef S2B_7
-#undef S2B_8
-#undef S2B_9
-#undef S2B_10
-#undef S2B_11
-#undef S2B_no
-#undef SC
-};
+uint8_t sz_size2index_tab[(SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1];
+
+static void
+sz_boot_size2index_tab(const sc_data_t *sc_data) { /* Fill sz_size2index_tab. */
+	size_t dst_max = (SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1; /* Table length. */
+	size_t dst_ind = 0; /* Next slot to fill; never moves backwards. */
+	for (unsigned sc_ind = 0; sc_ind < SC_NSIZES && dst_ind < dst_max;
+	    sc_ind++) {
+		const sc_t *sc = &sc_data->sc[sc_ind];
+		size_t sz = (ZU(1) << sc->lg_base)
+		    + (ZU(sc->ndelta) << sc->lg_delta); /* Size of class sc_ind. */
+		size_t max_ind = ((sz + (ZU(1) << SC_LG_TINY_MIN) - 1)
+				   >> SC_LG_TINY_MIN); /* sz in tiny-min units, rounded up. */
+		for (; dst_ind <= max_ind && dst_ind < dst_max; dst_ind++) {
+			sz_size2index_tab[dst_ind] = sc_ind; /* Unfilled slots <= max_ind. */
+		}
+	}
+}
+
+void
+sz_boot(const sc_data_t *sc_data) { /* Populate all three sz tables. */
+	sz_boot_pind2sz_tab(sc_data); /* sz_pind2sz_tab */
+	sz_boot_index2size_tab(sc_data); /* sz_index2size_tab */
+	sz_boot_size2index_tab(sc_data); /* sz_size2index_tab */
+}
diff --git a/deps/jemalloc/src/tcache.c b/deps/jemalloc/src/tcache.c
index 936ef3140d..50099a9f2c 100644
--- a/deps/jemalloc/src/tcache.c
+++ b/deps/jemalloc/src/tcache.c
@@ -4,7 +4,8 @@
 
 #include "jemalloc/internal/assert.h"
 #include "jemalloc/internal/mutex.h"
-#include "jemalloc/internal/size_classes.h"
+#include "jemalloc/internal/safety_check.h"
+#include "jemalloc/internal/sc.h"
 
 /******************************************************************************/
 /* Data. */
@@ -12,7 +13,7 @@
 bool	opt_tcache = true;
 ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
 
-tcache_bin_info_t	*tcache_bin_info;
+cache_bin_info_t	*tcache_bin_info;
 static unsigned		stack_nelms; /* Total stack elms per tcache. */
 
 unsigned		nhbins;
@@ -40,8 +41,8 @@ void
 tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 	szind_t binind = tcache->next_gc_bin;
 
-	tcache_bin_t *tbin;
-	if (binind < NBINS) {
+	cache_bin_t *tbin;
+	if (binind < SC_NBINS) {
 		tbin = tcache_small_bin_get(tcache, binind);
 	} else {
 		tbin = tcache_large_bin_get(tcache, binind);
@@ -50,7 +51,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 		/*
 		 * Flush (ceiling) 3/4 of the objects below the low water mark.
 		 */
-		if (binind < NBINS) {
+		if (binind < SC_NBINS) {
 			tcache_bin_flush_small(tsd, tcache, tbin, binind,
 			    tbin->ncached - tbin->low_water + (tbin->low_water
 			    >> 2));
@@ -58,7 +59,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 			 * Reduce fill count by 2X.  Limit lg_fill_div such that
 			 * the fill count is always at least 1.
 			 */
-			tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
+			cache_bin_info_t *tbin_info = &tcache_bin_info[binind];
 			if ((tbin_info->ncached_max >>
 			     (tcache->lg_fill_div[binind] + 1)) >= 1) {
 				tcache->lg_fill_div[binind]++;
@@ -72,7 +73,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 		 * Increase fill count by 2X for small bins.  Make sure
 		 * lg_fill_div stays greater than 0.
 		 */
-		if (binind < NBINS && tcache->lg_fill_div[binind] > 1) {
+		if (binind < SC_NBINS && tcache->lg_fill_div[binind] > 1) {
 			tcache->lg_fill_div[binind]--;
 		}
 	}
@@ -86,7 +87,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
 
 void *
 tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
-    tcache_bin_t *tbin, szind_t binind, bool *tcache_success) {
+    cache_bin_t *tbin, szind_t binind, bool *tcache_success) {
 	void *ret;
 
 	assert(tcache->arena != NULL);
@@ -95,33 +96,72 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
 	if (config_prof) {
 		tcache->prof_accumbytes = 0;
 	}
-	ret = tcache_alloc_easy(tbin, tcache_success);
+	ret = cache_bin_alloc_easy(tbin, tcache_success);
 
 	return ret;
 }
 
+/* Enabled with --enable-extra-size-check.  Also fills extents[0..nflush). */
+static void
+tbin_extents_lookup_size_check(tsdn_t *tsdn, cache_bin_t *tbin, szind_t binind,
+    size_t nflush, extent_t **extents){
+	rtree_ctx_t rtree_ctx_fallback;
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
+
+	/*
+	 * Verify that the items in the tcache all have the correct size; this
+	 * is useful for catching sized deallocation bugs, also to fail early
+	 * instead of corrupting metadata.  Since this can be turned on for opt
+	 * builds, avoid the branch in the loop: only the sum is compared.
+	 */
+	szind_t szind;
+	size_t sz_sum = binind * nflush; /* Expected sum of all szinds. */
+	for (unsigned i = 0 ; i < nflush; i++) {
+		rtree_extent_szind_read(tsdn, &extents_rtree,
+		    rtree_ctx, (uintptr_t)*(tbin->avail - 1 - i), true,
+		    &extents[i], &szind); /* Outputs: extents[i] and szind. */
+		sz_sum -= szind; /* Unsigned; any net difference leaves nonzero. */
+	}
+	if (sz_sum != 0) { /* NOTE(review): offsetting errors would cancel out. */
+		safety_check_fail("<jemalloc>: size mismatch in thread cache "
+		    "detected, likely caused by sized deallocation bugs by "
+		    "application. Abort.\n");
+		abort();
+	}
+}
+
 void
-tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
+tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
     szind_t binind, unsigned rem) {
 	bool merged_stats = false;
 
-	assert(binind < NBINS);
-	assert(rem <= tbin->ncached);
+	assert(binind < SC_NBINS);
+	assert((cache_bin_sz_t)rem <= tbin->ncached);
 
 	arena_t *arena = tcache->arena;
 	assert(arena != NULL);
 	unsigned nflush = tbin->ncached - rem;
 	VARIABLE_ARRAY(extent_t *, item_extent, nflush);
+
 	/* Look up extent once per item. */
-	for (unsigned i = 0 ; i < nflush; i++) {
-		item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i));
+	if (config_opt_safety_checks) {
+		tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind,
+		    nflush, item_extent);
+	} else {
+		for (unsigned i = 0 ; i < nflush; i++) {
+			item_extent[i] = iealloc(tsd_tsdn(tsd),
+			    *(tbin->avail - 1 - i));
+		}
 	}
-
 	while (nflush > 0) {
 		/* Lock the arena bin associated with the first object. */
 		extent_t *extent = item_extent[0];
-		arena_t *bin_arena = extent_arena_get(extent);
-		arena_bin_t *bin = &bin_arena->bins[binind];
+		unsigned bin_arena_ind = extent_arena_ind_get(extent);
+		arena_t *bin_arena = arena_get(tsd_tsdn(tsd), bin_arena_ind,
+		    false);
+		unsigned binshard = extent_binshard_get(extent);
+		assert(binshard < bin_infos[binind].n_shards);
+		bin_t *bin = &bin_arena->bins[binind].bin_shards[binshard];
 
 		if (config_prof && bin_arena == arena) {
 			if (arena_prof_accum(tsd_tsdn(tsd), arena,
@@ -132,8 +172,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 		}
 
 		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		if (config_stats && bin_arena == arena) {
-			assert(!merged_stats);
+		if (config_stats && bin_arena == arena && !merged_stats) {
 			merged_stats = true;
 			bin->stats.nflushes++;
 			bin->stats.nrequests += tbin->tstats.nrequests;
@@ -145,9 +184,10 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 			extent = item_extent[i];
 			assert(ptr != NULL && extent != NULL);
 
-			if (extent_arena_get(extent) == bin_arena) {
+			if (extent_arena_ind_get(extent) == bin_arena_ind
+			    && extent_binshard_get(extent) == binshard) {
 				arena_dalloc_bin_junked_locked(tsd_tsdn(tsd),
-				    bin_arena, extent, ptr);
+				    bin_arena, bin, binind, extent, ptr);
 			} else {
 				/*
 				 * This object was allocated via a different
@@ -169,8 +209,9 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 		 * The flush loop didn't happen to flush to this thread's
 		 * arena, so the stats didn't get merged.  Manually do so now.
 		 */
-		arena_bin_t *bin = &arena->bins[binind];
-		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
+		unsigned binshard;
+		bin_t *bin = arena_bin_choose_lock(tsd_tsdn(tsd), arena, binind,
+		    &binshard);
 		bin->stats.nflushes++;
 		bin->stats.nrequests += tbin->tstats.nrequests;
 		tbin->tstats.nrequests = 0;
@@ -180,63 +221,76 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
 	    sizeof(void *));
 	tbin->ncached = rem;
-	if ((low_water_t)tbin->ncached < tbin->low_water) {
+	if (tbin->ncached < tbin->low_water) {
 		tbin->low_water = tbin->ncached;
 	}
 }
 
 void
-tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
+tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind,
     unsigned rem, tcache_t *tcache) {
 	bool merged_stats = false;
 
 	assert(binind < nhbins);
-	assert(rem <= tbin->ncached);
+	assert((cache_bin_sz_t)rem <= tbin->ncached);
 
-	arena_t *arena = tcache->arena;
-	assert(arena != NULL);
+	arena_t *tcache_arena = tcache->arena;
+	assert(tcache_arena != NULL);
 	unsigned nflush = tbin->ncached - rem;
 	VARIABLE_ARRAY(extent_t *, item_extent, nflush);
+
+#ifndef JEMALLOC_EXTRA_SIZE_CHECK
 	/* Look up extent once per item. */
 	for (unsigned i = 0 ; i < nflush; i++) {
 		item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i));
 	}
-
+#else
+	tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind, nflush,
+	    item_extent);
+#endif
 	while (nflush > 0) {
 		/* Lock the arena associated with the first object. */
 		extent_t *extent = item_extent[0];
-		arena_t *locked_arena = extent_arena_get(extent);
-		UNUSED bool idump;
+		unsigned locked_arena_ind = extent_arena_ind_get(extent);
+		arena_t *locked_arena = arena_get(tsd_tsdn(tsd),
+		    locked_arena_ind, false);
+		bool idump;
 
 		if (config_prof) {
 			idump = false;
 		}
 
-		malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx);
+		bool lock_large = !arena_is_auto(locked_arena);
+		if (lock_large) {
+			malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx);
+		}
 		for (unsigned i = 0; i < nflush; i++) {
 			void *ptr = *(tbin->avail - 1 - i);
 			assert(ptr != NULL);
 			extent = item_extent[i];
-			if (extent_arena_get(extent) == locked_arena) {
+			if (extent_arena_ind_get(extent) == locked_arena_ind) {
 				large_dalloc_prep_junked_locked(tsd_tsdn(tsd),
 				    extent);
 			}
 		}
-		if ((config_prof || config_stats) && locked_arena == arena) {
+		if ((config_prof || config_stats) &&
+		    (locked_arena == tcache_arena)) {
 			if (config_prof) {
-				idump = arena_prof_accum(tsd_tsdn(tsd), arena,
-				    tcache->prof_accumbytes);
+				idump = arena_prof_accum(tsd_tsdn(tsd),
+				    tcache_arena, tcache->prof_accumbytes);
 				tcache->prof_accumbytes = 0;
 			}
 			if (config_stats) {
 				merged_stats = true;
-				arena_stats_large_nrequests_add(tsd_tsdn(tsd),
-				    &arena->stats, binind,
+				arena_stats_large_flush_nrequests_add(
+				    tsd_tsdn(tsd), &tcache_arena->stats, binind,
 				    tbin->tstats.nrequests);
 				tbin->tstats.nrequests = 0;
 			}
 		}
-		malloc_mutex_unlock(tsd_tsdn(tsd), &locked_arena->large_mtx);
+		if (lock_large) {
+			malloc_mutex_unlock(tsd_tsdn(tsd), &locked_arena->large_mtx);
+		}
 
 		unsigned ndeferred = 0;
 		for (unsigned i = 0; i < nflush; i++) {
@@ -244,7 +298,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 			extent = item_extent[i];
 			assert(ptr != NULL && extent != NULL);
 
-			if (extent_arena_get(extent) == locked_arena) {
+			if (extent_arena_ind_get(extent) == locked_arena_ind) {
 				large_dalloc_finish(tsd_tsdn(tsd), extent);
 			} else {
 				/*
@@ -270,15 +324,15 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 		 * The flush loop didn't happen to flush to this thread's
 		 * arena, so the stats didn't get merged.  Manually do so now.
 		 */
-		arena_stats_large_nrequests_add(tsd_tsdn(tsd), &arena->stats,
-		    binind, tbin->tstats.nrequests);
+		arena_stats_large_flush_nrequests_add(tsd_tsdn(tsd),
+		    &tcache_arena->stats, binind, tbin->tstats.nrequests);
 		tbin->tstats.nrequests = 0;
 	}
 
 	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
 	    sizeof(void *));
 	tbin->ncached = rem;
-	if ((low_water_t)tbin->ncached < tbin->low_water) {
+	if (tbin->ncached < tbin->low_water) {
 		tbin->low_water = tbin->ncached;
 	}
 }
@@ -291,8 +345,15 @@ tcache_arena_associate(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
 	if (config_stats) {
 		/* Link into list of extant tcaches. */
 		malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
+
 		ql_elm_new(tcache, link);
 		ql_tail_insert(&arena->tcache_ql, tcache, link);
+		cache_bin_array_descriptor_init(
+		    &tcache->cache_bin_array_descriptor, tcache->bins_small,
+		    tcache->bins_large);
+		ql_tail_insert(&arena->cache_bin_array_descriptor_ql,
+		    &tcache->cache_bin_array_descriptor, link);
+
 		malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx);
 	}
 }
@@ -316,6 +377,8 @@ tcache_arena_dissociate(tsdn_t *tsdn, tcache_t *tcache) {
 			assert(in_ql);
 		}
 		ql_remove(&arena->tcache_ql, tcache, link);
+		ql_remove(&arena->cache_bin_array_descriptor_ql,
+		    &tcache->cache_bin_array_descriptor, link);
 		tcache_stats_merge(tsdn, tcache, arena);
 		malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx);
 	}
@@ -354,10 +417,10 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
 
 	size_t stack_offset = 0;
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
-	memset(tcache->tbins_small, 0, sizeof(tcache_bin_t) * NBINS);
-	memset(tcache->tbins_large, 0, sizeof(tcache_bin_t) * (nhbins - NBINS));
+	memset(tcache->bins_small, 0, sizeof(cache_bin_t) * SC_NBINS);
+	memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - SC_NBINS));
 	unsigned i = 0;
-	for (; i < NBINS; i++) {
+	for (; i < SC_NBINS; i++) {
 		tcache->lg_fill_div[i] = 1;
 		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
 		/*
@@ -449,16 +512,16 @@ static void
 tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
 	assert(tcache->arena != NULL);
 
-	for (unsigned i = 0; i < NBINS; i++) {
-		tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
+	for (unsigned i = 0; i < SC_NBINS; i++) {
+		cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
 		tcache_bin_flush_small(tsd, tcache, tbin, i, 0);
 
 		if (config_stats) {
 			assert(tbin->tstats.nrequests == 0);
 		}
 	}
-	for (unsigned i = NBINS; i < nhbins; i++) {
-		tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
+	for (unsigned i = SC_NBINS; i < nhbins; i++) {
+		cache_bin_t *tbin = tcache_large_bin_get(tcache, i);
 		tcache_bin_flush_large(tsd, tbin, i, 0, tcache);
 
 		if (config_stats) {
@@ -482,6 +545,7 @@ tcache_flush(tsd_t *tsd) {
 static void
 tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 	tcache_flush_cache(tsd, tcache);
+	arena_t *arena = tcache->arena;
 	tcache_arena_dissociate(tsd_tsdn(tsd), tcache);
 
 	if (tsd_tcache) {
@@ -494,6 +558,23 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {
 		/* Release both the tcache struct and avail array. */
 		idalloctm(tsd_tsdn(tsd), tcache, NULL, NULL, true, true);
 	}
+
+	/*
+	 * The deallocation and tcache flush above may not trigger decay since
+	 * we are on the tcache shutdown path (potentially with non-nominal
+	 * tsd).  Manually trigger decay to avoid pathological cases.  Also
+	 * include arena 0 because the tcache array is allocated from it.
+	 */
+	arena_decay(tsd_tsdn(tsd), arena_get(tsd_tsdn(tsd), 0, false),
+	    false, false);
+
+	if (arena_nthreads_get(arena, false) == 0 &&
+	    !background_thread_enabled()) {
+		/* Force purging when no threads assigned to the arena anymore. */
+		arena_decay(tsd_tsdn(tsd), arena, false, true);
+	} else {
+		arena_decay(tsd_tsdn(tsd), arena, false, false);
+	}
 }
 
 /* For auto tcache (embedded in TSD) only. */
@@ -523,18 +604,18 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
 	cassert(config_stats);
 
 	/* Merge and reset tcache stats. */
-	for (i = 0; i < NBINS; i++) {
-		arena_bin_t *bin = &arena->bins[i];
-		tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
-		malloc_mutex_lock(tsdn, &bin->lock);
+	for (i = 0; i < SC_NBINS; i++) {
+		cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
+		unsigned binshard;
+		bin_t *bin = arena_bin_choose_lock(tsdn, arena, i, &binshard);
 		bin->stats.nrequests += tbin->tstats.nrequests;
 		malloc_mutex_unlock(tsdn, &bin->lock);
 		tbin->tstats.nrequests = 0;
 	}
 
 	for (; i < nhbins; i++) {
-		tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
-		arena_stats_large_nrequests_add(tsdn, &arena->stats, i,
+		cache_bin_t *tbin = tcache_large_bin_get(tcache, i);
+		arena_stats_large_flush_nrequests_add(tsdn, &arena->stats, i,
 		    tbin->tstats.nrequests);
 		tbin->tstats.nrequests = 0;
 	}
@@ -605,23 +686,32 @@ label_return:
 }
 
 static tcache_t *
-tcaches_elm_remove(tsd_t *tsd, tcaches_t *elm) {
+tcaches_elm_remove(tsd_t *tsd, tcaches_t *elm, bool allow_reinit) {
 	malloc_mutex_assert_owner(tsd_tsdn(tsd), &tcaches_mtx);
 
 	if (elm->tcache == NULL) {
 		return NULL;
 	}
 	tcache_t *tcache = elm->tcache;
-	elm->tcache = NULL;
+	if (allow_reinit) {
+		elm->tcache = TCACHES_ELM_NEED_REINIT;
+	} else {
+		elm->tcache = NULL;
+	}
+
+	if (tcache == TCACHES_ELM_NEED_REINIT) {
+		return NULL;
+	}
 	return tcache;
 }
 
 void
 tcaches_flush(tsd_t *tsd, unsigned ind) {
 	malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx);
-	tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind]);
+	tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind], true);
 	malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx);
 	if (tcache != NULL) {
+		/* Destroy the tcache; recreate in tcaches_get() if needed. */
 		tcache_destroy(tsd, tcache, false);
 	}
 }
@@ -630,7 +720,7 @@ void
 tcaches_destroy(tsd_t *tsd, unsigned ind) {
 	malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx);
 	tcaches_t *elm = &tcaches[ind];
-	tcache_t *tcache = tcaches_elm_remove(tsd, elm);
+	tcache_t *tcache = tcaches_elm_remove(tsd, elm, false);
 	elm->next = tcaches_avail;
 	tcaches_avail = elm;
 	malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx);
@@ -643,8 +733,8 @@ bool
 tcache_boot(tsdn_t *tsdn) {
 	/* If necessary, clamp opt_lg_tcache_max. */
 	if (opt_lg_tcache_max < 0 || (ZU(1) << opt_lg_tcache_max) <
-	    SMALL_MAXCLASS) {
-		tcache_maxclass = SMALL_MAXCLASS;
+	    SC_SMALL_MAXCLASS) {
+		tcache_maxclass = SC_SMALL_MAXCLASS;
 	} else {
 		tcache_maxclass = (ZU(1) << opt_lg_tcache_max);
 	}
@@ -657,21 +747,21 @@ tcache_boot(tsdn_t *tsdn) {
 	nhbins = sz_size2index(tcache_maxclass) + 1;
 
 	/* Initialize tcache_bin_info. */
-	tcache_bin_info = (tcache_bin_info_t *)base_alloc(tsdn, b0get(), nhbins
-	    * sizeof(tcache_bin_info_t), CACHELINE);
+	tcache_bin_info = (cache_bin_info_t *)base_alloc(tsdn, b0get(), nhbins
+	    * sizeof(cache_bin_info_t), CACHELINE);
 	if (tcache_bin_info == NULL) {
 		return true;
 	}
 	stack_nelms = 0;
 	unsigned i;
-	for (i = 0; i < NBINS; i++) {
-		if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
+	for (i = 0; i < SC_NBINS; i++) {
+		if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
 			tcache_bin_info[i].ncached_max =
 			    TCACHE_NSLOTS_SMALL_MIN;
-		} else if ((arena_bin_info[i].nregs << 1) <=
+		} else if ((bin_infos[i].nregs << 1) <=
 		    TCACHE_NSLOTS_SMALL_MAX) {
 			tcache_bin_info[i].ncached_max =
-			    (arena_bin_info[i].nregs << 1);
+			    (bin_infos[i].nregs << 1);
 		} else {
 			tcache_bin_info[i].ncached_max =
 			    TCACHE_NSLOTS_SMALL_MAX;
diff --git a/deps/jemalloc/src/test_hooks.c b/deps/jemalloc/src/test_hooks.c
new file mode 100644
index 0000000000..ace00d9c46
--- /dev/null
+++ b/deps/jemalloc/src/test_hooks.c
@@ -0,0 +1,12 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+/*
+ * The hooks are a little bit screwy -- they're not genuinely exported in the
+ * sense that we want them available to end-users, but we do want them visible
+ * from outside the generated library, so that we can use them in test code.
+ */
+JEMALLOC_EXPORT
+void (*test_hooks_arena_new_hook)() = NULL;
+
+JEMALLOC_EXPORT
+void (*test_hooks_libc_hook)() = NULL;
diff --git a/deps/jemalloc/src/tsd.c b/deps/jemalloc/src/tsd.c
index f968992f2b..a31f6b9698 100644
--- a/deps/jemalloc/src/tsd.c
+++ b/deps/jemalloc/src/tsd.c
@@ -12,12 +12,16 @@
 static unsigned ncleanups;
 static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX];
 
+/* TSD_INITIALIZER triggers "-Wmissing-field-initializers". */
+JEMALLOC_DIAGNOSTIC_PUSH
+JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+
 #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
-__thread tsd_t JEMALLOC_TLS_MODEL tsd_tls = TSD_INITIALIZER;
-__thread bool JEMALLOC_TLS_MODEL tsd_initialized = false;
+JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls = TSD_INITIALIZER;
+JEMALLOC_TSD_TYPE_ATTR(bool) JEMALLOC_TLS_MODEL tsd_initialized = false;
 bool tsd_booted = false;
 #elif (defined(JEMALLOC_TLS))
-__thread tsd_t JEMALLOC_TLS_MODEL tsd_tls = TSD_INITIALIZER;
+JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls = TSD_INITIALIZER;
 pthread_key_t tsd_tsd;
 bool tsd_booted = false;
 #elif (defined(_WIN32))
@@ -41,6 +45,7 @@ tsd_init_head_t	tsd_init_head = {
 	ql_head_initializer(blocks),
 	MALLOC_MUTEX_INITIALIZER
 };
+
 tsd_wrapper_t tsd_boot_wrapper = {
 	false,
 	TSD_INITIALIZER
@@ -48,17 +53,164 @@ tsd_wrapper_t tsd_boot_wrapper = {
 bool tsd_booted = false;
 #endif
 
+JEMALLOC_DIAGNOSTIC_POP
 
 /******************************************************************************/
 
+/* A list of all the tsds in the nominal state. */
+typedef ql_head(tsd_t) tsd_list_t;
+static tsd_list_t tsd_nominal_tsds = ql_head_initializer(tsd_nominal_tsds);
+static malloc_mutex_t tsd_nominal_tsds_lock;
+
+/* How many slow-path-enabling features are turned on. */
+static atomic_u32_t tsd_global_slow_count = ATOMIC_INIT(0);
+
+static bool
+tsd_in_nominal_list(tsd_t *tsd) { /* Is tsd linked into tsd_nominal_tsds? */
+	tsd_t *tsd_list; /* Cursor over the list entries. */
+	bool found = false;
+	/*
+	 * We don't know that tsd is nominal; it might not be safe to get data
+	 * out of it here (presumably why TSDN_NULL is passed to the lock).
+	 */
+	malloc_mutex_lock(TSDN_NULL, &tsd_nominal_tsds_lock);
+	ql_foreach(tsd_list, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) {
+		if (tsd == tsd_list) { /* Pointer identity; tsd is not read. */
+			found = true;
+			break;
+		}
+	}
+	malloc_mutex_unlock(TSDN_NULL, &tsd_nominal_tsds_lock);
+	return found;
+}
+
+static void
+tsd_add_nominal(tsd_t *tsd) { /* Append tsd to tsd_nominal_tsds. */
+	assert(!tsd_in_nominal_list(tsd)); /* Must not already be linked. */
+	assert(tsd_state_get(tsd) <= tsd_state_nominal_max);
+	ql_elm_new(tsd, TSD_MANGLE(tcache).tsd_link); /* Done before locking. */
+	malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+	ql_tail_insert(&tsd_nominal_tsds, tsd, TSD_MANGLE(tcache).tsd_link);
+	malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+}
+
+static void
+tsd_remove_nominal(tsd_t *tsd) { /* Unlink tsd from tsd_nominal_tsds. */
+	assert(tsd_in_nominal_list(tsd)); /* Must currently be linked. */
+	assert(tsd_state_get(tsd) <= tsd_state_nominal_max);
+	malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+	ql_remove(&tsd_nominal_tsds, tsd, TSD_MANGLE(tcache).tsd_link);
+	malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+}
+
+static void
+tsd_force_recompute(tsdn_t *tsdn) { /* Flag every listed tsd for recompute. */
+	/*
+	 * The stores to tsd->state here need to synchronize with the exchange
+	 * in tsd_slow_update (which uses ATOMIC_ACQUIRE), hence this fence.
+	 */
+	atomic_fence(ATOMIC_RELEASE);
+	malloc_mutex_lock(tsdn, &tsd_nominal_tsds_lock);
+	tsd_t *remote_tsd; /* Cursor over tsd_nominal_tsds. */
+	ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) {
+		assert(tsd_atomic_load(&remote_tsd->state, ATOMIC_RELAXED)
+		    <= tsd_state_nominal_max);
+		tsd_atomic_store(&remote_tsd->state, tsd_state_nominal_recompute,
+		    ATOMIC_RELAXED);
+	}
+	malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock);
+}
+
+void
+tsd_global_slow_inc(tsdn_t *tsdn) { /* Bump tsd_global_slow_count by one. */
+	atomic_fetch_add_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED);
+	/*
+	 * We unconditionally force a recompute, even if the global slow count
+	 * was already positive.  If we didn't, then it would be possible for us
+	 * to return to the user, have the user synchronize externally with some
+	 * other thread, and then have that other thread not have picked up the
+	 * update yet (since the original incrementing thread might still be
+	 * making its way through the tsd list).
+	 */
+	tsd_force_recompute(tsdn);
+}
+
+void tsd_global_slow_dec(tsdn_t *tsdn) {
+	atomic_fetch_sub_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED);
+	/* Always recompute; see the note in tsd_global_slow_inc(). */
+	tsd_force_recompute(tsdn);
+}
+
+static bool
+tsd_local_slow(tsd_t *tsd) { /* Per-thread reasons to take the slow path. */
+	bool tcache_off = !tsd_tcache_enabled_get(tsd);
+	return tcache_off || tsd_reentrancy_level_get(tsd) > 0;
+}
+
+bool
+tsd_global_slow(void) { /* True while tsd_global_slow_count is positive. */
+	return atomic_load_u32(&tsd_global_slow_count, ATOMIC_RELAXED) > 0;
+}
+
+/******************************************************************************/
+
+static uint8_t
+tsd_state_compute(tsd_t *tsd) {
+	/* A non-nominal state is reported unchanged. */
+	if (!tsd_nominal(tsd)) {
+		return tsd_state_get(tsd);
+	}
+	/*
+	 * Nominal: the slow variant applies if any slow condition holds.
+	 */
+	bool slow = malloc_slow || tsd_local_slow(tsd) || tsd_global_slow();
+	return slow ? tsd_state_nominal_slow : tsd_state_nominal;
+}
+
 void
 tsd_slow_update(tsd_t *tsd) {
-	if (tsd_nominal(tsd)) {
-		if (malloc_slow || !tsd_tcache_enabled_get(tsd) ||
-		    tsd_reentrancy_level_get(tsd) > 0) {
-			tsd->state = tsd_state_nominal_slow;
+	uint8_t old_state;
+	do {
+		uint8_t new_state = tsd_state_compute(tsd);
+		old_state = tsd_atomic_exchange(&tsd->state, new_state,
+		    ATOMIC_ACQUIRE);
+	} while (old_state == tsd_state_nominal_recompute);
+}
+
+void
+tsd_state_set(tsd_t *tsd, uint8_t new_state) {
+	/* Only the tsd module can change the state *to* recompute. */
+	assert(new_state != tsd_state_nominal_recompute);
+	uint8_t old_state = tsd_atomic_load(&tsd->state, ATOMIC_RELAXED);
+	if (old_state > tsd_state_nominal_max) {
+		/*
+		 * Not currently in the nominal list, but it might need to be
+		 * inserted there.
+		 */
+		assert(!tsd_in_nominal_list(tsd));
+		tsd_atomic_store(&tsd->state, new_state, ATOMIC_RELAXED);
+		if (new_state <= tsd_state_nominal_max) {
+			tsd_add_nominal(tsd);
+		}
+	} else {
+		/*
+		 * We're currently nominal.  If the new state is non-nominal,
+		 * great; we take ourselves off the list and just enter the new
+		 * state.
+		 */
+		assert(tsd_in_nominal_list(tsd));
+		if (new_state > tsd_state_nominal_max) {
+			tsd_remove_nominal(tsd);
+			tsd_atomic_store(&tsd->state, new_state,
+			    ATOMIC_RELAXED);
 		} else {
-			tsd->state = tsd_state_nominal;
+			/*
+			 * This is the tricky case.  We're transitioning from
+			 * one nominal state to another.  The caller can't know
+			 * about any races that are occurring at the same time,
+			 * so we always have to recompute no matter what.
+			 */
+			tsd_slow_update(tsd);
 		}
 	}
 }
@@ -71,12 +223,23 @@ tsd_data_init(tsd_t *tsd) {
 	 */
 	rtree_ctx_data_init(tsd_rtree_ctxp_get_unsafe(tsd));
 
+	/*
+	 * A nondeterministic seed based on the address of tsd reduces
+	 * the likelihood of lockstep non-uniform cache index
+	 * utilization among identical concurrent processes, but at the
+	 * cost of test repeatability.  For debug builds, instead use a
+	 * deterministic seed.
+	 */
+	*tsd_offset_statep_get(tsd) = config_debug ? 0 :
+	    (uint64_t)(uintptr_t)tsd;
+
 	return tsd_tcache_enabled_data_init(tsd);
 }
 
 static void
 assert_tsd_data_cleanup_done(tsd_t *tsd) {
 	assert(!tsd_nominal(tsd));
+	assert(!tsd_in_nominal_list(tsd));
 	assert(*tsd_arenap_get_unsafe(tsd) == NULL);
 	assert(*tsd_iarenap_get_unsafe(tsd) == NULL);
 	assert(*tsd_arenas_tdata_bypassp_get_unsafe(tsd) == true);
@@ -87,8 +250,8 @@ assert_tsd_data_cleanup_done(tsd_t *tsd) {
 
 static bool
 tsd_data_init_nocleanup(tsd_t *tsd) {
-	assert(tsd->state == tsd_state_reincarnated ||
-	    tsd->state == tsd_state_minimal_initialized);
+	assert(tsd_state_get(tsd) == tsd_state_reincarnated ||
+	    tsd_state_get(tsd) == tsd_state_minimal_initialized);
 	/*
 	 * During reincarnation, there is no guarantee that the cleanup function
 	 * will be called (deallocation may happen after all tsd destructors).
@@ -107,27 +270,33 @@ tsd_t *
 tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 	assert(!tsd_fast(tsd));
 
-	if (tsd->state == tsd_state_nominal_slow) {
-		/* On slow path but no work needed. */
-		assert(malloc_slow || !tsd_tcache_enabled_get(tsd) ||
-		    tsd_reentrancy_level_get(tsd) > 0 ||
-		    *tsd_arenas_tdata_bypassp_get(tsd));
-	} else if (tsd->state == tsd_state_uninitialized) {
+	if (tsd_state_get(tsd) == tsd_state_nominal_slow) {
+		/*
+		 * On slow path but no work needed.  Note that we can't
+		 * necessarily *assert* that we're slow, because we might be
+		 * slow because of an asynchronous modification to global state,
+		 * which might be asynchronously modified *back*.
+		 */
+	} else if (tsd_state_get(tsd) == tsd_state_nominal_recompute) {
+		tsd_slow_update(tsd);
+	} else if (tsd_state_get(tsd) == tsd_state_uninitialized) {
 		if (!minimal) {
-			tsd->state = tsd_state_nominal;
-			tsd_slow_update(tsd);
-			/* Trigger cleanup handler registration. */
-			tsd_set(tsd);
-			tsd_data_init(tsd);
+			if (tsd_booted) {
+				tsd_state_set(tsd, tsd_state_nominal);
+				tsd_slow_update(tsd);
+				/* Trigger cleanup handler registration. */
+				tsd_set(tsd);
+				tsd_data_init(tsd);
+			}
 		} else {
-			tsd->state = tsd_state_minimal_initialized;
+			tsd_state_set(tsd, tsd_state_minimal_initialized);
 			tsd_set(tsd);
 			tsd_data_init_nocleanup(tsd);
 		}
-	} else if (tsd->state == tsd_state_minimal_initialized) {
+	} else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) {
 		if (!minimal) {
 			/* Switch to fully initialized. */
-			tsd->state = tsd_state_nominal;
+			tsd_state_set(tsd, tsd_state_nominal);
 			assert(*tsd_reentrancy_levelp_get(tsd) >= 1);
 			(*tsd_reentrancy_levelp_get(tsd))--;
 			tsd_slow_update(tsd);
@@ -135,12 +304,12 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 		} else {
 			assert_tsd_data_cleanup_done(tsd);
 		}
-	} else if (tsd->state == tsd_state_purgatory) {
-		tsd->state = tsd_state_reincarnated;
+	} else if (tsd_state_get(tsd) == tsd_state_purgatory) {
+		tsd_state_set(tsd, tsd_state_reincarnated);
 		tsd_set(tsd);
 		tsd_data_init_nocleanup(tsd);
 	} else {
-		assert(tsd->state == tsd_state_reincarnated);
+		assert(tsd_state_get(tsd) == tsd_state_reincarnated);
 	}
 
 	return tsd;
@@ -204,7 +373,7 @@ void
 tsd_cleanup(void *arg) {
 	tsd_t *tsd = (tsd_t *)arg;
 
-	switch (tsd->state) {
+	switch (tsd_state_get(tsd)) {
 	case tsd_state_uninitialized:
 		/* Do nothing. */
 		break;
@@ -222,7 +391,7 @@ tsd_cleanup(void *arg) {
 	case tsd_state_nominal:
 	case tsd_state_nominal_slow:
 		tsd_do_data_cleanup(tsd);
-		tsd->state = tsd_state_purgatory;
+		tsd_state_set(tsd, tsd_state_purgatory);
 		tsd_set(tsd);
 		break;
 	case tsd_state_purgatory:
@@ -250,6 +419,10 @@ malloc_tsd_boot0(void) {
 	tsd_t *tsd;
 
 	ncleanups = 0;
+	if (malloc_mutex_init(&tsd_nominal_tsds_lock, "tsd_nominal_tsds_lock",
+	    WITNESS_RANK_OMIT, malloc_mutex_rank_exclusive)) {
+		return NULL;
+	}
 	if (tsd_boot0()) {
 		return NULL;
 	}
@@ -300,7 +473,7 @@ _tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) {
 #    pragma comment(linker, "/INCLUDE:_tls_callback")
 #  else
 #    pragma comment(linker, "/INCLUDE:_tls_used")
-#    pragma comment(linker, "/INCLUDE:tls_callback")
+#    pragma comment(linker, "/INCLUDE:" STRINGIFY(tls_callback) )
 #  endif
 #  pragma section(".CRT$XLY",long,read)
 #endif
@@ -339,3 +512,23 @@ tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block) {
 	malloc_mutex_unlock(TSDN_NULL, &head->lock);
 }
 #endif
+
+void
+tsd_prefork(tsd_t *tsd) {
+	malloc_mutex_prefork(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+}
+
+void
+tsd_postfork_parent(tsd_t *tsd) {
+	malloc_mutex_postfork_parent(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+}
+
+void
+tsd_postfork_child(tsd_t *tsd) {
+	malloc_mutex_postfork_child(tsd_tsdn(tsd), &tsd_nominal_tsds_lock);
+	ql_new(&tsd_nominal_tsds);
+
+	if (tsd_state_get(tsd) <= tsd_state_nominal_max) {
+		tsd_add_nominal(tsd);
+	}
+}
diff --git a/deps/jemalloc/src/zone.c b/deps/jemalloc/src/zone.c
index 9d3b7b4952..23dfdd04a9 100644
--- a/deps/jemalloc/src/zone.c
+++ b/deps/jemalloc/src/zone.c
@@ -89,6 +89,7 @@ JEMALLOC_ATTR(weak_import);
 static malloc_zone_t *default_zone, *purgeable_zone;
 static malloc_zone_t jemalloc_zone;
 static struct malloc_introspection_t jemalloc_zone_introspect;
+static pid_t zone_force_lock_pid = -1;
 
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
@@ -270,6 +271,12 @@ zone_log(malloc_zone_t *zone, void *address) {
 static void
 zone_force_lock(malloc_zone_t *zone) {
 	if (isthreaded) {
+		/*
+		 * Record our pid here; the note in zone_force_unlock, below,
+		 * explains why we need it.
+		 */
+		assert(zone_force_lock_pid == -1);
+		zone_force_lock_pid = getpid();
 		jemalloc_prefork();
 	}
 }
@@ -277,14 +284,25 @@ zone_force_lock(malloc_zone_t *zone) {
 static void
 zone_force_unlock(malloc_zone_t *zone) {
 	/*
-	 * Call jemalloc_postfork_child() rather than
-	 * jemalloc_postfork_parent(), because this function is executed by both
-	 * parent and child.  The parent can tolerate having state
-	 * reinitialized, but the child cannot unlock mutexes that were locked
-	 * by the parent.
+	 * zone_force_lock and zone_force_unlock are the entry points to the
+	 * forking machinery on OS X.  The tricky thing is, the child is not
+	 * allowed to unlock mutexes locked in the parent, even if owned by the
+	 * forking thread (and the mutex type we use in OS X will fail an assert
+	 * if we try).  In the child, we can get away with reinitializing all
+	 * the mutexes, which has the effect of unlocking them.  In the parent,
+	 * doing this would mean we wouldn't wake any waiters blocked on the
+	 * mutexes we unlock.  So, we record the pid of the current process in
+	 * zone_force_lock, and use that to detect if we're in the parent or
+	 * child here, to decide which unlock logic we need.
 	 */
 	if (isthreaded) {
-		jemalloc_postfork_child();
+		assert(zone_force_lock_pid != -1);
+		if (getpid() == zone_force_lock_pid) {
+			jemalloc_postfork_parent();
+		} else {
+			jemalloc_postfork_child();
+		}
+		zone_force_lock_pid = -1;
 	}
 }
author	Viste <viste02@gmail.com>	2019-11-14 23:17:38 +0300
committer	Kargatum <dowlandtop@yandex.com>	2019-11-15 03:17:38 +0700
commit	685538b01b27ba38c605448e3a0de225bed4bb29 (patch)
tree	36196f0965c5fc2fccdbc45a86a8155f2c986e4d /deps/jemalloc/src
parent	fae7ae95a373530e0b206814662df557882c8f1a (diff)