DEP: Updated Jemalloc to Version 2.2.5

Signed-off-by: Multivitamin <DasUmba@.(none)>
2026-01-16 07:30:42 +01:00 · 2012-04-13 10:49:09 +02:00
parent 6400c13fcb
commit 126fd13e5d
34 changed files with 2068 additions and 805 deletions
--- a/dep/jemalloc/VERSION
+++ b/dep/jemalloc/VERSION
@@ -1 +1 @@
-2.1.0-0-g1c4b088b08d3bc7617a34387e196ce03716160bf
+2.2.5-0-gfc1bb70e5f0d9a58b39efa39cc549b5af5104760
--- a/dep/jemalloc/include/jemalloc/internal/arena.h
+++ b/dep/jemalloc/include/jemalloc/internal/arena.h
@@ -19,6 +19,7 @@
 #ifdef JEMALLOC_TINY
   /* Smallest size class to support. */
 #  define LG_TINY_MIN		LG_SIZEOF_PTR
+#  define TINY_MIN		(1U << LG_TINY_MIN)
 #endif

 /*
@@ -45,9 +46,10 @@
 * point is implicitly RUN_BFP bits to the left.
 *
 * Note that it is possible to set RUN_MAX_OVRHD low enough that it cannot be
- * honored for some/all object sizes, since there is one bit of header overhead
- * per object (plus a constant).  This constraint is relaxed (ignored) for runs
- * that are so small that the per-region overhead is greater than:
+ * honored for some/all object sizes, since when heap profiling is enabled
+ * there is one pointer of header overhead per object (plus a constant).  This
+ * constraint is relaxed (ignored) for runs that are so small that the
+ * per-region overhead is greater than:
 *
 *   (RUN_MAX_OVRHD / (reg_size << (3+RUN_BFP))
 */
@@ -56,6 +58,10 @@
 #define	RUN_MAX_OVRHD		0x0000003dU
 #define	RUN_MAX_OVRHD_RELAX	0x00001800U

+/* Maximum number of regions in one run. */
+#define	LG_RUN_MAXREGS		11
+#define	RUN_MAXREGS		(1U << LG_RUN_MAXREGS)
+
 /*
 * The minimum ratio of active:dirty pages per arena is computed as:
 *
@@ -69,6 +75,7 @@
 typedef struct arena_chunk_map_s arena_chunk_map_t;
 typedef struct arena_chunk_s arena_chunk_t;
 typedef struct arena_run_s arena_run_t;
+typedef struct arena_bin_info_s arena_bin_info_t;
 typedef struct arena_bin_s arena_bin_t;
 typedef struct arena_s arena_t;

@@ -105,7 +112,7 @@ struct arena_chunk_map_s {
 	 * Run address (or size) and various flags are stored together.  The bit
 	 * layout looks like (assuming 32-bit system):
 	 *
-	 *   ???????? ???????? ????---- ----dzla
+	 *   ???????? ???????? ????---- ----dula
 	 *
 	 * ? : Unallocated: Run address for first/last pages, unset for internal
 	 *                  pages.
@@ -113,7 +120,7 @@ struct arena_chunk_map_s {
 	 *     Large: Run size for first page, unset for trailing pages.
 	 * - : Unused.
 	 * d : dirty?
-	 * z : zeroed?
+	 * u : unzeroed?
 	 * l : large?
 	 * a : allocated?
 	 *
@@ -129,30 +136,30 @@ struct arena_chunk_map_s {
 	 * [dula] : bit unset
 	 *
 	 *   Unallocated (clean):
-	 *     ssssssss ssssssss ssss---- ----du--
+	 *     ssssssss ssssssss ssss---- ----du-a
 	 *     xxxxxxxx xxxxxxxx xxxx---- -----Uxx
-	 *     ssssssss ssssssss ssss---- ----dU--
+	 *     ssssssss ssssssss ssss---- ----dU-a
 	 *
 	 *   Unallocated (dirty):
-	 *     ssssssss ssssssss ssss---- ----D---
+	 *     ssssssss ssssssss ssss---- ----D--a
 	 *     xxxxxxxx xxxxxxxx xxxx---- ----xxxx
-	 *     ssssssss ssssssss ssss---- ----D---
+	 *     ssssssss ssssssss ssss---- ----D--a
 	 *
 	 *   Small:
-	 *     pppppppp pppppppp pppp---- ----d--a
-	 *     pppppppp pppppppp pppp---- -------a
-	 *     pppppppp pppppppp pppp---- ----d--a
+	 *     pppppppp pppppppp pppp---- ----d--A
+	 *     pppppppp pppppppp pppp---- -------A
+	 *     pppppppp pppppppp pppp---- ----d--A
 	 *
 	 *   Large:
-	 *     ssssssss ssssssss ssss---- ----D-la
+	 *     ssssssss ssssssss ssss---- ----D-LA
 	 *     xxxxxxxx xxxxxxxx xxxx---- ----xxxx
-	 *     -------- -------- -------- ----D-la
+	 *     -------- -------- -------- ----D-LA
 	 *
 	 *   Large (sampled, size <= PAGE_SIZE):
-	 *     ssssssss ssssssss sssscccc ccccD-la
+	 *     ssssssss ssssssss sssscccc ccccD-LA
 	 *
 	 *   Large (not sampled, size == PAGE_SIZE):
-	 *     ssssssss ssssssss ssss---- ----D-la
+	 *     ssssssss ssssssss ssss---- ----D-LA
 	 */
 	size_t				bits;
 #ifdef JEMALLOC_PROF
@@ -206,16 +213,52 @@ struct arena_run_s {
 	/* Bin this run is associated with. */
 	arena_bin_t	*bin;

-	/* Stack of available freed regions, or NULL. */
-	void		*avail;
-
-	/* Next region that has never been allocated, or run boundary. */
-	void		*next;
+	/* Index of next region that has never been allocated, or nregs. */
+	uint32_t	nextind;

 	/* Number of free regions in run. */
 	unsigned	nfree;
 };

+/*
+ * Read-only information associated with each element of arena_t's bins array
+ * is stored separately, partly to reduce memory usage (only one copy, rather
+ * than one per arena), but mainly to avoid false cacheline sharing.
+ */
+struct arena_bin_info_s {
+	/* Size of regions in a run for this bin's size class. */
+	size_t		reg_size;
+
+	/* Total size of a run for this bin's size class. */
+	size_t		run_size;
+
+	/* Total number of regions in a run for this bin's size class. */
+	uint32_t	nregs;
+
+	/*
+	 * Offset of first bitmap_t element in a run header for this bin's size
+	 * class.
+	 */
+	uint32_t	bitmap_offset;
+
+	/*
+	 * Metadata used to manipulate bitmaps for runs associated with this
+	 * bin.
+	 */
+	bitmap_info_t	bitmap_info;
+
+#ifdef JEMALLOC_PROF
+	/*
+	 * Offset of first (prof_ctx_t *) in a run header for this bin's size
+	 * class, or 0 if (opt_prof == false).
+	 */
+	uint32_t	ctx0_offset;
+#endif
+
+	/* Offset of first region in a run for this bin's size class. */
+	uint32_t	reg0_offset;
+};
+
 struct arena_bin_s {
 	/*
 	 * All operations on runcur, runs, and stats require that lock be
@@ -240,26 +283,6 @@ struct arena_bin_s {
 	 */
 	arena_run_tree_t runs;

-	/* Size of regions in a run for this bin's size class. */
-	size_t		reg_size;
-
-	/* Total size of a run for this bin's size class. */
-	size_t		run_size;
-
-	/* Total number of regions in a run for this bin's size class. */
-	uint32_t	nregs;
-
-#ifdef JEMALLOC_PROF
-	/*
-	 * Offset of first (prof_ctx_t *) in a run header for this bin's size
-	 * class, or 0 if (opt_prof == false).
-	 */
-	uint32_t	ctx0_offset;
-#endif
-
-	/* Offset of first region in a run for this bin's size class. */
-	uint32_t	reg0_offset;
-
 #ifdef JEMALLOC_STATS
 	/* Bin statistics. */
 	malloc_bin_stats_t stats;
@@ -276,8 +299,18 @@ struct arena_s {
 	unsigned		ind;

 	/*
-	 * All non-bin-related operations on this arena require that lock be
-	 * locked.
+	 * Number of threads currently assigned to this arena.  This field is
+	 * protected by arenas_lock.
+	 */
+	unsigned		nthreads;
+
+	/*
+	 * There are three classes of arena operations from a locking
+	 * perspective:
+	 * 1) Thread assignment (modifies nthreads) is protected by
+	 *    arenas_lock.
+	 * 2) Bin-related operations are protected by bin locks.
+	 * 3) Chunk- and run-related operations are protected by this mutex.
 	 */
 	malloc_mutex_t		lock;

@@ -347,45 +380,35 @@ struct arena_s {

 	/*
 	 * bins is used to store trees of free regions of the following sizes,
-	 * assuming a 16-byte quantum, 4 KiB page size, and default
-	 * JEMALLOC_OPTIONS.
+	 * assuming a 64-bit system with 16-byte quantum, 4 KiB page size, and
+	 * default MALLOC_CONF.
 	 *
 	 *   bins[i] |   size |
 	 *   --------+--------+
-	 *        0  |      2 |
-	 *        1  |      4 |
-	 *        2  |      8 |
+	 *        0  |      8 |
 	 *   --------+--------+
-	 *        3  |     16 |
-	 *        4  |     32 |
-	 *        5  |     48 |
+	 *        1  |     16 |
+	 *        2  |     32 |
+	 *        3  |     48 |
 	 *           :        :
-	 *        8  |     96 |
-	 *        9  |    112 |
-	 *       10  |    128 |
+	 *        6  |     96 |
+	 *        7  |    112 |
+	 *        8  |    128 |
 	 *   --------+--------+
-	 *       11  |    192 |
-	 *       12  |    256 |
-	 *       13  |    320 |
-	 *       14  |    384 |
-	 *       15  |    448 |
-	 *       16  |    512 |
+	 *        9  |    192 |
+	 *       10  |    256 |
+	 *       11  |    320 |
+	 *       12  |    384 |
+	 *       13  |    448 |
+	 *       14  |    512 |
 	 *   --------+--------+
-	 *       17  |    768 |
-	 *       18  |   1024 |
-	 *       19  |   1280 |
+	 *       15  |    768 |
+	 *       16  |   1024 |
+	 *       17  |   1280 |
 	 *           :        :
-	 *       27  |   3328 |
-	 *       28  |   3584 |
-	 *       29  |   3840 |
-	 *   --------+--------+
-	 *       30  |  4 KiB |
-	 *       31  |  6 KiB |
-	 *       33  |  8 KiB |
-	 *           :        :
-	 *       43  | 28 KiB |
-	 *       44  | 30 KiB |
-	 *       45  | 32 KiB |
+	 *       25  |   3328 |
+	 *       26  |   3584 |
+	 *       27  |   3840 |
 	 *   --------+--------+
 	 */
 	arena_bin_t		bins[1]; /* Dynamically sized. */
@@ -397,8 +420,16 @@ struct arena_s {

 extern size_t	opt_lg_qspace_max;
 extern size_t	opt_lg_cspace_max;
-extern ssize_t		opt_lg_dirty_mult;
+extern ssize_t	opt_lg_dirty_mult;
+/*
+ * small_size2bin is a compact lookup table that rounds request sizes up to
+ * size classes.  In order to reduce cache footprint, the table is compressed,
+ * and all accesses are via the SMALL_SIZE2BIN macro.
+ */
 extern uint8_t const	*small_size2bin;
+#define	SMALL_SIZE2BIN(s)	(small_size2bin[((s)-1) >> LG_TINY_MIN])
+
+extern arena_bin_info_t	*arena_bin_info;

 /* Various bin-related settings. */
 #ifdef JEMALLOC_TINY		/* Number of (2^n)-spaced tiny bins. */
@@ -465,8 +496,9 @@ bool	arena_boot(void);
 #ifdef JEMALLOC_H_INLINES

 #ifndef JEMALLOC_ENABLE_INLINE
-unsigned	arena_run_regind(arena_run_t *run, arena_bin_t *bin,
-    const void *ptr, size_t size);
+size_t	arena_bin_index(arena_t *arena, arena_bin_t *bin);
+unsigned	arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info,
+    const void *ptr);
 #  ifdef JEMALLOC_PROF
 prof_ctx_t	*arena_prof_ctx_get(const void *ptr);
 void	arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
@@ -475,21 +507,37 @@ void	arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_))
+JEMALLOC_INLINE size_t
+arena_bin_index(arena_t *arena, arena_bin_t *bin)
+{
+	size_t binind = bin - arena->bins;
+	assert(binind < nbins);
+	return (binind);
+}
+
 JEMALLOC_INLINE unsigned
-arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
-    size_t size)
+arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr)
 {
 	unsigned shift, diff, regind;
+	size_t size;

-	assert(run->magic == ARENA_RUN_MAGIC);
+	dassert(run->magic == ARENA_RUN_MAGIC);
+	/*
+	 * Freeing a pointer lower than region zero can cause assertion
+	 * failure.
+	 */
+	assert((uintptr_t)ptr >= (uintptr_t)run +
+	    (uintptr_t)bin_info->reg0_offset);

 	/*
 	 * Avoid doing division with a variable divisor if possible.  Using
 	 * actual division here can reduce allocator throughput by over 20%!
 	 */
-	diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset);
+	diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run -
+	    bin_info->reg0_offset);

 	/* Rescale (factor powers of 2 out of the numerator and denominator). */
+	size = bin_info->reg_size;
 	shift = ffs(size) - 1;
 	diff >>= shift;
 	size >>= shift;
@@ -512,8 +560,8 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
 		 * divide by 0, and 1 and 2 are both powers of two, which are
 		 * handled above.
 		 */
-#define	SIZE_INV_SHIFT 21
-#define	SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1)
+#define	SIZE_INV_SHIFT	((sizeof(unsigned) << 3) - LG_RUN_MAXREGS)
+#define	SIZE_INV(s)	(((1U << SIZE_INV_SHIFT) / (s)) + 1)
 		static const unsigned size_invs[] = {
 		    SIZE_INV(3),
 		    SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
@@ -533,7 +581,7 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
 #undef SIZE_INV_SHIFT
 	}
 	assert(diff == regind * size);
-	assert(regind < bin->nregs);
+	assert(regind < bin_info->nregs);

 	return (regind);
 }
@@ -560,13 +608,14 @@ arena_prof_ctx_get(const void *ptr)
 			arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
 			    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
 			    PAGE_SHIFT));
-			arena_bin_t *bin = run->bin;
+			size_t binind = arena_bin_index(chunk->arena, run->bin);
+			arena_bin_info_t *bin_info = &arena_bin_info[binind];
 			unsigned regind;

-			assert(run->magic == ARENA_RUN_MAGIC);
-			regind = arena_run_regind(run, bin, ptr, bin->reg_size);
+			dassert(run->magic == ARENA_RUN_MAGIC);
+			regind = arena_run_regind(run, bin_info, ptr);
 			ret = *(prof_ctx_t **)((uintptr_t)run +
-			    bin->ctx0_offset + (regind *
+			    bin_info->ctx0_offset + (regind *
 			    sizeof(prof_ctx_t *)));
 		}
 	} else
@@ -594,12 +643,16 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
 			    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
 			    PAGE_SHIFT));
 			arena_bin_t *bin = run->bin;
+			size_t binind;
+			arena_bin_info_t *bin_info;
 			unsigned regind;

-			assert(run->magic == ARENA_RUN_MAGIC);
-			regind = arena_run_regind(run, bin, ptr, bin->reg_size);
+			dassert(run->magic == ARENA_RUN_MAGIC);
+			binind = arena_bin_index(chunk->arena, bin);
+			bin_info = &arena_bin_info[binind];
+			regind = arena_run_regind(run, bin_info, ptr);

-			*((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset
+			*((prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset
 			    + (regind * sizeof(prof_ctx_t *)))) = ctx;
 		} else
 			assert((uintptr_t)ctx == (uintptr_t)1U);
@@ -615,7 +668,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 	arena_chunk_map_t *mapelm;

 	assert(arena != NULL);
-	assert(arena->magic == ARENA_MAGIC);
+	dassert(arena->magic == ARENA_MAGIC);
 	assert(chunk->arena == arena);
 	assert(ptr != NULL);
 	assert(CHUNK_ADDR2BASE(ptr) != ptr);
@@ -638,11 +691,18 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 			run = (arena_run_t *)((uintptr_t)chunk +
 			    (uintptr_t)((pageind - (mapelm->bits >>
 			    PAGE_SHIFT)) << PAGE_SHIFT));
-			assert(run->magic == ARENA_RUN_MAGIC);
-			assert(((uintptr_t)ptr - ((uintptr_t)run +
-			    (uintptr_t)run->bin->reg0_offset)) %
-			    run->bin->reg_size == 0);
+			dassert(run->magic == ARENA_RUN_MAGIC);
 			bin = run->bin;
+#ifdef JEMALLOC_DEBUG
+			{
+				size_t binind = arena_bin_index(arena, bin);
+				arena_bin_info_t *bin_info =
+				    &arena_bin_info[binind];
+				assert(((uintptr_t)ptr - ((uintptr_t)run +
+				    (uintptr_t)bin_info->reg0_offset)) %
+				    bin_info->reg_size == 0);
+			}
+#endif
 			malloc_mutex_lock(&bin->lock);
 			arena_dalloc_bin(arena, chunk, ptr, mapelm);
 			malloc_mutex_unlock(&bin->lock);
--- a/dep/jemalloc/include/jemalloc/internal/atomic.h
+++ b/dep/jemalloc/include/jemalloc/internal/atomic.h
@@ -0,0 +1,169 @@
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+#define	atomic_read_uint64(p)	atomic_add_uint64(p, 0)
+#define	atomic_read_uint32(p)	atomic_add_uint32(p, 0)
+
+#if (LG_SIZEOF_PTR == 3)
+#  define atomic_read_z(p)						\
+    (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)0)
+#  define atomic_add_z(p, x)						\
+    (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x)
+#  define atomic_sub_z(p, x)						\
+    (size_t)atomic_sub_uint64((uint64_t *)p, (uint64_t)x)
+#elif (LG_SIZEOF_PTR == 2)
+#  define atomic_read_z(p)						\
+    (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)0)
+#  define atomic_add_z(p, x)						\
+    (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x)
+#  define atomic_sub_z(p, x)						\
+    (size_t)atomic_sub_uint32((uint32_t *)p, (uint32_t)x)
+#endif
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#ifndef JEMALLOC_ENABLE_INLINE
+uint64_t	atomic_add_uint64(uint64_t *p, uint64_t x);
+uint64_t	atomic_sub_uint64(uint64_t *p, uint64_t x);
+uint32_t	atomic_add_uint32(uint32_t *p, uint32_t x);
+uint32_t	atomic_sub_uint32(uint32_t *p, uint32_t x);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_))
+/******************************************************************************/
+/* 64-bit operations. */
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+JEMALLOC_INLINE uint64_t
+atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+
+	return (__sync_add_and_fetch(p, x));
+}
+
+JEMALLOC_INLINE uint64_t
+atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+
+	return (__sync_sub_and_fetch(p, x));
+}
+#elif (defined(JEMALLOC_OSATOMIC))
+JEMALLOC_INLINE uint64_t
+atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+
+	return (OSAtomicAdd64((int64_t)x, (int64_t *)p));
+}
+
+JEMALLOC_INLINE uint64_t
+atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+
+	return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p));
+}
+#elif (defined(__amd64__) || defined(__x86_64__))
+JEMALLOC_INLINE uint64_t
+atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+	uint64_t t = x;
+	asm volatile (
+	    "lock; xaddq %0, %1;"
+	    : "+r" (t), "=m" (*p) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    );
+	/* xadd left the old *p in t; return the new value. */
+	return (t + x);
+}
+
+JEMALLOC_INLINE uint64_t
+atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+	/* Subtract by atomically adding the modular negation of x. */
+	uint64_t t = (uint64_t)0 - x;
+	asm volatile (
+	    "lock; xaddq %0, %1;"
+	    : "+r" (t), "=m" (*p) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    );
+	/* xadd left the old *p in t; return the new value. */
+	return (t - x);
+}
+#else
+#  if (LG_SIZEOF_PTR == 3)
+#    error "Missing implementation for 64-bit atomic operations"
+#  endif
+#endif
+
+/******************************************************************************/
+/* 32-bit operations. */
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+JEMALLOC_INLINE uint32_t
+atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+
+	return (__sync_add_and_fetch(p, x));
+}
+
+JEMALLOC_INLINE uint32_t
+atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+
+	return (__sync_sub_and_fetch(p, x));
+}
+#elif (defined(JEMALLOC_OSATOMIC))
+JEMALLOC_INLINE uint32_t
+atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+
+	return (OSAtomicAdd32((int32_t)x, (int32_t *)p));
+}
+
+JEMALLOC_INLINE uint32_t
+atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+
+	return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p));
+}
+#elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
+JEMALLOC_INLINE uint32_t
+atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+	uint32_t t = x;
+	asm volatile (
+	    "lock; xaddl %0, %1;"
+	    : "+r" (t), "=m" (*p) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    );
+	/* xadd left the old *p in t; return the new value. */
+	return (t + x);
+}
+
+JEMALLOC_INLINE uint32_t
+atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+	/* Subtract by atomically adding the modular negation of x. */
+	uint32_t t = (uint32_t)0 - x;
+	asm volatile (
+	    "lock; xaddl %0, %1;"
+	    : "+r" (t), "=m" (*p) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    );
+	/* xadd left the old *p in t; return the new value. */
+	return (t - x);
+}
+#else
+#  error "Missing implementation for 32-bit atomic operations"
+#endif
+#endif
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
--- a/dep/jemalloc/include/jemalloc/internal/bitmap.h
+++ b/dep/jemalloc/include/jemalloc/internal/bitmap.h
@@ -0,0 +1,184 @@
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
+#define	LG_BITMAP_MAXBITS	LG_RUN_MAXREGS
+
+typedef struct bitmap_level_s bitmap_level_t;
+typedef struct bitmap_info_s bitmap_info_t;
+typedef unsigned long bitmap_t;
+#define	LG_SIZEOF_BITMAP	LG_SIZEOF_LONG
+
+/* Number of bits per group. */
+#define	LG_BITMAP_GROUP_NBITS		(LG_SIZEOF_BITMAP + 3)
+#define	BITMAP_GROUP_NBITS		(ZU(1) << LG_BITMAP_GROUP_NBITS)
+#define	BITMAP_GROUP_NBITS_MASK		(BITMAP_GROUP_NBITS-1)
+
+/* Maximum number of levels possible. */
+#define	BITMAP_MAX_LEVELS						\
+    ((LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP)				\
+    + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP))
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+struct bitmap_level_s {
+	/* Offset of this level's groups within the array of groups. */
+	size_t group_offset;
+};
+
+struct bitmap_info_s {
+	/* Logical number of bits in bitmap (stored at bottom level). */
+	size_t nbits;
+
+	/* Number of levels necessary for nbits. */
+	unsigned nlevels;
+
+	/*
+	 * Only the first (nlevels+1) elements are used, and levels are ordered
+	 * bottom to top (e.g. the bottom level is stored in levels[0]).
+	 */
+	bitmap_level_t levels[BITMAP_MAX_LEVELS+1];
+};
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+void	bitmap_info_init(bitmap_info_t *binfo, size_t nbits);
+size_t	bitmap_info_ngroups(const bitmap_info_t *binfo);
+size_t	bitmap_size(size_t nbits);
+void	bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo);
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#ifndef JEMALLOC_ENABLE_INLINE
+bool	bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo);
+bool	bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
+void	bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
+size_t	bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo);
+void	bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_BITMAP_C_))
+JEMALLOC_INLINE bool
+bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo)
+{
+	unsigned rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
+	bitmap_t rg = bitmap[rgoff];
+	/* The bitmap is full iff the root group is 0. */
+	return (rg == 0);
+}
+
+JEMALLOC_INLINE bool
+bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
+{
+	size_t goff;
+	bitmap_t g;
+
+	assert(bit < binfo->nbits);
+	goff = bit >> LG_BITMAP_GROUP_NBITS;
+	g = bitmap[goff];
+	return (!(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))));
+}
+
+JEMALLOC_INLINE void
+bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
+{
+	size_t goff;
+	bitmap_t *gp;
+	bitmap_t g;
+
+	assert(bit < binfo->nbits);
+	assert(bitmap_get(bitmap, binfo, bit) == false);
+	goff = bit >> LG_BITMAP_GROUP_NBITS;
+	gp = &bitmap[goff];
+	g = *gp;
+	assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)));
+	g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
+	*gp = g;
+	assert(bitmap_get(bitmap, binfo, bit));
+	/* Propagate group state transitions up the tree. */
+	if (g == 0) {
+		unsigned i;
+		for (i = 1; i < binfo->nlevels; i++) {
+			bit = goff;
+			goff = bit >> LG_BITMAP_GROUP_NBITS;
+			gp = &bitmap[binfo->levels[i].group_offset + goff];
+			g = *gp;
+			assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)));
+			g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
+			*gp = g;
+			if (g != 0)
+				break;
+		}
+	}
+}
+
+/* sfu: set first unset. */
+JEMALLOC_INLINE size_t
+bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo)
+{
+	size_t bit;
+	bitmap_t g;
+	unsigned i;
+
+	assert(bitmap_full(bitmap, binfo) == false);
+
+	i = binfo->nlevels - 1;
+	g = bitmap[binfo->levels[i].group_offset];
+	bit = ffsl(g) - 1;
+	while (i > 0) {
+		i--;
+		g = bitmap[binfo->levels[i].group_offset + bit];
+		bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1);
+	}
+
+	bitmap_set(bitmap, binfo, bit);
+	return (bit);
+}
+
+JEMALLOC_INLINE void
+bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
+{
+	size_t goff;
+	bitmap_t *gp;
+	bitmap_t g;
+	bool propagate;
+
+	assert(bit < binfo->nbits);
+	assert(bitmap_get(bitmap, binfo, bit));
+	goff = bit >> LG_BITMAP_GROUP_NBITS;
+	gp = &bitmap[goff];
+	g = *gp;
+	propagate = (g == 0);
+	assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0);
+	g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
+	*gp = g;
+	assert(bitmap_get(bitmap, binfo, bit) == false);
+	/* Propagate group state transitions up the tree. */
+	if (propagate) {
+		unsigned i;
+		for (i = 1; i < binfo->nlevels; i++) {
+			bit = goff;
+			goff = bit >> LG_BITMAP_GROUP_NBITS;
+			gp = &bitmap[binfo->levels[i].group_offset + goff];
+			g = *gp;
+			propagate = (g == 0);
+			assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)))
+			    == 0);
+			g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
+			*gp = g;
+			if (propagate == false)
+				break;
+		}
+	}
+}
+
+#endif
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
--- a/dep/jemalloc/include/jemalloc/internal/chunk.h
+++ b/dep/jemalloc/include/jemalloc/internal/chunk.h
@@ -50,7 +50,7 @@ extern size_t		map_bias; /* Number of arena chunk header pages. */
 extern size_t		arena_maxclass; /* Max size class for arenas. */

 void	*chunk_alloc(size_t size, bool base, bool *zero);
-void	chunk_dealloc(void *chunk, size_t size);
+void	chunk_dealloc(void *chunk, size_t size, bool unmap);
 bool	chunk_boot(void);

 #endif /* JEMALLOC_H_EXTERNS */
--- a/dep/jemalloc/include/jemalloc/internal/ckh.h
+++ b/dep/jemalloc/include/jemalloc/internal/ckh.h
@@ -31,7 +31,7 @@ struct ckhc_s {

 struct ckh_s {
 #ifdef JEMALLOC_DEBUG
-#define	CKH_MAGIG	0x3af2489d
+#define	CKH_MAGIC	0x3af2489d
 	uint32_t	magic;
 #endif

--- a/dep/jemalloc/include/jemalloc/internal/ctl.h
+++ b/dep/jemalloc/include/jemalloc/internal/ctl.h
@@ -29,6 +29,7 @@ struct ctl_node_s {

 struct ctl_arena_stats_s {
 	bool			initialized;
+	unsigned		nthreads;
 	size_t			pactive;
 	size_t			pdirty;
 #ifdef JEMALLOC_STATS
--- a/dep/jemalloc/include/jemalloc/internal/hash.h
+++ b/dep/jemalloc/include/jemalloc/internal/hash.h
@@ -17,7 +17,7 @@
 uint64_t	hash(const void *key, size_t len, uint64_t seed);
 #endif

-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(HASH_C_))
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_HASH_C_))
 /*
 * The following hash function is based on MurmurHash64A(), placed into the
 * public domain by Austin Appleby.  See http://murmurhash.googlepages.com/ for
@@ -26,7 +26,7 @@ uint64_t	hash(const void *key, size_t len, uint64_t seed);
 JEMALLOC_INLINE uint64_t
 hash(const void *key, size_t len, uint64_t seed)
 {
-	const uint64_t m = 0xc6a4a7935bd1e995;
+	const uint64_t m = 0xc6a4a7935bd1e995LLU;
 	const int r = 47;
 	uint64_t h = seed ^ (len * m);
 	const uint64_t *data = (const uint64_t *)key;
@@ -62,7 +62,7 @@ hash(const void *key, size_t len, uint64_t seed)
 	h *= m;
 	h ^= h >> r;

-	return h;
+	return (h);
 }
 #endif

--- a/dep/jemalloc/include/jemalloc/internal/jemalloc_internal.h
+++ b/dep/jemalloc/include/jemalloc/internal/jemalloc_internal.h
@@ -33,6 +33,12 @@
 #define	JEMALLOC_MANGLE
 #include "../jemalloc.h"

+#include "jemalloc/internal/private_namespace.h"
+
+#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN))
+#include <libkern/OSAtomic.h>
+#endif
+
 #ifdef JEMALLOC_ZONE
 #include <mach/mach_error.h>
 #include <mach/mach_init.h>
@@ -55,8 +61,9 @@ extern void	(*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 * Define a custom assert() in order to reduce the chances of deadlock during
 * assertion failure.
 */
-#ifdef JEMALLOC_DEBUG
-#  define assert(e) do {						\
+#ifndef assert
+#  ifdef JEMALLOC_DEBUG
+#    define assert(e) do {						\
 	if (!(e)) {							\
 		char line_buf[UMAX2S_BUFSIZE];				\
 		malloc_write("<jemalloc>: ");				\
@@ -70,8 +77,15 @@ extern void	(*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 		abort();						\
 	}								\
 } while (0)
+#  else
+#    define assert(e)
+#  endif
+#endif
+
+#ifdef JEMALLOC_DEBUG
+#  define dassert(e) assert(e)
 #else
-#define assert(e)
+#  define dassert(e)
 #endif

 /*
@@ -146,12 +160,19 @@ extern void	(*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 #define	QUANTUM_CEILING(a)						\
 	(((a) + QUANTUM_MASK) & ~QUANTUM_MASK)

-#define	SIZEOF_PTR		(1U << LG_SIZEOF_PTR)
+#define	LONG			((size_t)(1U << LG_SIZEOF_LONG))
+#define	LONG_MASK		(LONG - 1)

-/* We can't use TLS in non-PIC programs, since TLS relies on loader magic. */
-#if (!defined(PIC) && !defined(NO_TLS))
-#  define NO_TLS
-#endif
+/* Return the smallest long multiple that is >= a. */
+#define	LONG_CEILING(a)						\
+	(((a) + LONG_MASK) & ~LONG_MASK)
+
+#define	SIZEOF_PTR		(1U << LG_SIZEOF_PTR)
+#define	PTR_MASK		(SIZEOF_PTR - 1)
+
+/* Return the smallest (void *) multiple that is >= a. */
+#define	PTR_CEILING(a)						\
+	(((a) + PTR_MASK) & ~PTR_MASK)

 /*
 * Maximum size of L1 cache line.  This is used to avoid cache line aliasing.
@@ -198,6 +219,7 @@ extern void	(*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 #define	PAGE_CEILING(s)							\
 	(((s) + PAGE_MASK) & ~PAGE_MASK)

+#include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/jemprn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
@@ -206,6 +228,7 @@ extern void	(*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/extent.h"
 #include "jemalloc/internal/arena.h"
+#include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/base.h"
 #include "jemalloc/internal/chunk.h"
 #include "jemalloc/internal/huge.h"
@@ -221,12 +244,14 @@ extern void	(*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 /******************************************************************************/
 #define JEMALLOC_H_STRUCTS

+#include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/jemprn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/mb.h"
+#include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/extent.h"
 #include "jemalloc/internal/arena.h"
 #include "jemalloc/internal/base.h"
@@ -240,6 +265,13 @@ extern void	(*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 #endif
 #include "jemalloc/internal/prof.h"

+#ifdef JEMALLOC_STATS
+typedef struct {
+	uint64_t	allocated;
+	uint64_t	deallocated;
+} thread_allocated_t;
+#endif
+
 #undef JEMALLOC_H_STRUCTS
 /******************************************************************************/
 #define JEMALLOC_H_EXTERNS
@@ -269,6 +301,7 @@ extern size_t		lg_pagesize;
 extern unsigned		ncpus;

 extern malloc_mutex_t	arenas_lock; /* Protects arenas initialization. */
+extern pthread_key_t	arenas_tsd;
 #ifndef NO_TLS
 /*
 * Map of pthread_self() --> arenas[???], used for selecting an arena to use
@@ -278,9 +311,9 @@ extern __thread arena_t	*arenas_tls JEMALLOC_ATTR(tls_model("initial-exec"));
 #  define ARENA_GET()	arenas_tls
 #  define ARENA_SET(v)	do {						\
 	arenas_tls = (v);						\
+	pthread_setspecific(arenas_tsd, (void *)(v));			\
 } while (0)
 #else
-extern pthread_key_t	arenas_tsd;
 #  define ARENA_GET()	((arena_t *)pthread_getspecific(arenas_tsd))
 #  define ARENA_SET(v)	do {						\
 	pthread_setspecific(arenas_tsd, (void *)(v));			\
@@ -295,45 +328,28 @@ extern arena_t		**arenas;
 extern unsigned		narenas;

 #ifdef JEMALLOC_STATS
-typedef struct {
-	uint64_t	allocated;
-	uint64_t	deallocated;
-} thread_allocated_t;
 #  ifndef NO_TLS
 extern __thread thread_allocated_t	thread_allocated_tls;
-#    define ALLOCATED_GET() thread_allocated_tls.allocated
-#    define DEALLOCATED_GET() thread_allocated_tls.deallocated
+#    define ALLOCATED_GET() (thread_allocated_tls.allocated)
+#    define ALLOCATEDP_GET() (&thread_allocated_tls.allocated)
+#    define DEALLOCATED_GET() (thread_allocated_tls.deallocated)
+#    define DEALLOCATEDP_GET() (&thread_allocated_tls.deallocated)
 #    define ALLOCATED_ADD(a, d) do {					\
 	thread_allocated_tls.allocated += a;				\
 	thread_allocated_tls.deallocated += d;				\
 } while (0)
 #  else
 extern pthread_key_t	thread_allocated_tsd;
-#    define ALLOCATED_GET()						\
-	(uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL)	\
-	    ? ((thread_allocated_t *)					\
-	    pthread_getspecific(thread_allocated_tsd))->allocated : 0)
-#    define DEALLOCATED_GET()						\
-	(uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL)	\
-	    ? ((thread_allocated_t					\
-	    *)pthread_getspecific(thread_allocated_tsd))->deallocated :	\
-	    0)
+thread_allocated_t	*thread_allocated_get_hard(void);
+
+#    define ALLOCATED_GET() (thread_allocated_get()->allocated)
+#    define ALLOCATEDP_GET() (&thread_allocated_get()->allocated)
+#    define DEALLOCATED_GET() (thread_allocated_get()->deallocated)
+#    define DEALLOCATEDP_GET() (&thread_allocated_get()->deallocated)
 #    define ALLOCATED_ADD(a, d) do {					\
-	thread_allocated_t *thread_allocated = (thread_allocated_t *)	\
-	    pthread_getspecific(thread_allocated_tsd);			\
-	if (thread_allocated != NULL) {					\
-		thread_allocated->allocated += (a);			\
-		thread_allocated->deallocated += (d);			\
-	} else {							\
-		thread_allocated = (thread_allocated_t *)		\
-		    imalloc(sizeof(thread_allocated_t));		\
-		if (thread_allocated != NULL) {				\
-			pthread_setspecific(thread_allocated_tsd,	\
-			    thread_allocated);				\
-			thread_allocated->allocated = (a);		\
-			thread_allocated->deallocated = (d);		\
-		}							\
-	}								\
+	thread_allocated_t *thread_allocated = thread_allocated_get();	\
+	thread_allocated->allocated += (a);				\
+	thread_allocated->deallocated += (d);				\
 } while (0)
 #  endif
 #endif
@@ -344,12 +360,14 @@ int	buferror(int errnum, char *buf, size_t buflen);
 void	jemalloc_prefork(void);
 void	jemalloc_postfork(void);

+#include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/jemprn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/mb.h"
+#include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/extent.h"
 #include "jemalloc/internal/arena.h"
 #include "jemalloc/internal/base.h"
@@ -367,6 +385,7 @@ void	jemalloc_postfork(void);
 /******************************************************************************/
 #define JEMALLOC_H_INLINES

+#include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/jemprn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
@@ -384,6 +403,9 @@ size_t	s2u(size_t size);
 size_t	sa2u(size_t size, size_t alignment, size_t *run_size_p);
 void	malloc_write(const char *s);
 arena_t	*choose_arena(void);
+#  if (defined(JEMALLOC_STATS) && defined(NO_TLS))
+thread_allocated_t	*thread_allocated_get(void);
+#  endif
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
@@ -414,10 +436,10 @@ s2u(size_t size)
 {

 	if (size <= small_maxclass)
-		return arenas[0]->bins[small_size2bin[size]].reg_size;
+		return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size);
 	if (size <= arena_maxclass)
-		return PAGE_CEILING(size);
-	return CHUNK_CEILING(size);
+		return (PAGE_CEILING(size));
+	return (CHUNK_CEILING(size));
 }

 /*
@@ -458,10 +480,8 @@ sa2u(size_t size, size_t alignment, size_t *run_size_p)
 	}

 	if (usize <= arena_maxclass && alignment <= PAGE_SIZE) {
-		if (usize <= small_maxclass) {
-			return
-			    (arenas[0]->bins[small_size2bin[usize]].reg_size);
-		}
+		if (usize <= small_maxclass)
+			return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size);
 		return (PAGE_CEILING(usize));
 	} else {
 		size_t run_size;
@@ -544,8 +564,22 @@ choose_arena(void)

 	return (ret);
 }
+
+#if (defined(JEMALLOC_STATS) && defined(NO_TLS))
+JEMALLOC_INLINE thread_allocated_t *
+thread_allocated_get(void)	/* This thread's TSD counters. */
+{
+	thread_allocated_t *thread_allocated = (thread_allocated_t *)
+	    pthread_getspecific(thread_allocated_tsd);
+
+	if (thread_allocated == NULL)	/* No value stored for this thread. */
+		return (thread_allocated_get_hard());	/* Out-of-line path. */
+	return (thread_allocated);
+}
+#endif
 #endif

+#include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/rtree.h"
 #include "jemalloc/internal/tcache.h"
 #include "jemalloc/internal/arena.h"
@@ -557,7 +591,7 @@ choose_arena(void)
 #ifndef JEMALLOC_ENABLE_INLINE
 void	*imalloc(size_t size);
 void	*icalloc(size_t size);
-void	*ipalloc(size_t size, size_t alignment, bool zero);
+void	*ipalloc(size_t usize, size_t alignment, bool zero);
 size_t	isalloc(const void *ptr);
 #  ifdef JEMALLOC_IVSALLOC
 size_t	ivsalloc(const void *ptr);
@@ -591,28 +625,39 @@ icalloc(size_t size)
 }

 JEMALLOC_INLINE void *
-ipalloc(size_t size, size_t alignment, bool zero)
+ipalloc(size_t usize, size_t alignment, bool zero)
 {
 	void *ret;
-	size_t usize;
-	size_t run_size
-#  ifdef JEMALLOC_CC_SILENCE
-	    = 0
-#  endif
-	    ;

-	usize = sa2u(size, alignment, &run_size);
-	if (usize == 0)
-		return (NULL);
+	assert(usize != 0);
+	assert(usize == sa2u(usize, alignment, NULL));
+
 	if (usize <= arena_maxclass && alignment <= PAGE_SIZE)
 		ret = arena_malloc(usize, zero);
-	else if (run_size <= arena_maxclass) {
-		ret = arena_palloc(choose_arena(), usize, run_size, alignment,
-		    zero);
-	} else if (alignment <= chunksize)
-		ret = huge_malloc(usize, zero);
-	else
-		ret = huge_palloc(usize, alignment, zero);
+	else {
+		size_t run_size
+#ifdef JEMALLOC_CC_SILENCE
+		    = 0
+#endif
+		    ;
+
+		/*
+		 * Ideally we would only ever call sa2u() once per aligned
+		 * allocation request, and the caller of this function has
+		 * already done so once.  However, it's rather burdensome to
+		 * require every caller to pass in run_size, especially given
+		 * that it's only relevant to large allocations.  Therefore,
+		 * just call it again here in order to get run_size.
+		 */
+		sa2u(usize, alignment, &run_size);
+		if (run_size <= arena_maxclass) {
+			ret = arena_palloc(choose_arena(), usize, run_size,
+			    alignment, zero);
+		} else if (alignment <= chunksize)
+			ret = huge_malloc(usize, zero);
+		else
+			ret = huge_palloc(usize, alignment, zero);
+	}

 	assert(((uintptr_t)ret & (alignment - 1)) == 0);
 	return (ret);
@@ -629,7 +674,7 @@ isalloc(const void *ptr)
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (chunk != ptr) {
 		/* Region. */
-		assert(chunk->arena->magic == ARENA_MAGIC);
+		dassert(chunk->arena->magic == ARENA_MAGIC);

 #ifdef JEMALLOC_PROF
 		ret = arena_salloc_demote(ptr);
@@ -683,7 +728,7 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,

 	if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
 	    != 0) {
-		size_t copysize;
+		size_t usize, copysize;

 		/*
 		 * Existing object alignment is inadquate; allocate new space
@@ -691,12 +736,18 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
 		 */
 		if (no_move)
 			return (NULL);
-		ret = ipalloc(size + extra, alignment, zero);
+		usize = sa2u(size + extra, alignment, NULL);
+		if (usize == 0)
+			return (NULL);
+		ret = ipalloc(usize, alignment, zero);
 		if (ret == NULL) {
 			if (extra == 0)
 				return (NULL);
 			/* Try again, without extra this time. */
-			ret = ipalloc(size, alignment, zero);
+			usize = sa2u(size, alignment, NULL);
+			if (usize == 0)
+				return (NULL);
+			ret = ipalloc(usize, alignment, zero);
 			if (ret == NULL)
 				return (NULL);
 		}
--- a/dep/jemalloc/include/jemalloc/internal/mb.h
+++ b/dep/jemalloc/include/jemalloc/internal/mb.h
@@ -17,7 +17,7 @@
 void	mb_write(void);
 #endif

-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(MB_C_))
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_MB_C_))
 #ifdef __i386__
 /*
 * According to the Intel Architecture Software Developer's Manual, current
--- a/dep/jemalloc/include/jemalloc/internal/mutex.h
+++ b/dep/jemalloc/include/jemalloc/internal/mutex.h
@@ -1,7 +1,11 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_TYPES

+#ifdef JEMALLOC_OSSPIN
+typedef OSSpinLock malloc_mutex_t;
+#else
 typedef pthread_mutex_t malloc_mutex_t;
+#endif

 #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
 #  define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
@@ -41,17 +45,26 @@ JEMALLOC_INLINE void
 malloc_mutex_lock(malloc_mutex_t *mutex)
 {

-	if (isthreaded)
+	if (isthreaded) {
+#ifdef JEMALLOC_OSSPIN
+		OSSpinLockLock(mutex);
+#else
 		pthread_mutex_lock(mutex);
+#endif
+	}
 }

 JEMALLOC_INLINE bool
 malloc_mutex_trylock(malloc_mutex_t *mutex)
 {
 
-	if (isthreaded)
+	if (isthreaded) {
+#ifdef JEMALLOC_OSSPIN
+		return (OSSpinLockTry(mutex) == false);	/* True on failure. */
+#else
 		return (pthread_mutex_trylock(mutex) != 0);
-	else
+#endif
+	} else
 		return (false);
 }

@@ -59,8 +72,13 @@ JEMALLOC_INLINE void
 malloc_mutex_unlock(malloc_mutex_t *mutex)
 {

-	if (isthreaded)
+	if (isthreaded) {
+#ifdef JEMALLOC_OSSPIN
+		OSSpinLockUnlock(mutex);
+#else
 		pthread_mutex_unlock(mutex);
+#endif
+	}
 }
 #endif

--- a/dep/jemalloc/include/jemalloc/internal/private_namespace.h
+++ b/dep/jemalloc/include/jemalloc/internal/private_namespace.h
@@ -0,0 +1,195 @@
+#define	arena_bin_index JEMALLOC_N(arena_bin_index)
+#define	arena_boot JEMALLOC_N(arena_boot)
+#define	arena_dalloc JEMALLOC_N(arena_dalloc)
+#define	arena_dalloc_bin JEMALLOC_N(arena_dalloc_bin)
+#define	arena_dalloc_large JEMALLOC_N(arena_dalloc_large)
+#define	arena_malloc JEMALLOC_N(arena_malloc)
+#define	arena_malloc_large JEMALLOC_N(arena_malloc_large)
+#define	arena_malloc_small JEMALLOC_N(arena_malloc_small)
+#define	arena_new JEMALLOC_N(arena_new)
+#define	arena_palloc JEMALLOC_N(arena_palloc)
+#define	arena_prof_accum JEMALLOC_N(arena_prof_accum)
+#define	arena_prof_ctx_get JEMALLOC_N(arena_prof_ctx_get)
+#define	arena_prof_ctx_set JEMALLOC_N(arena_prof_ctx_set)
+#define	arena_prof_promoted JEMALLOC_N(arena_prof_promoted)
+#define	arena_purge_all JEMALLOC_N(arena_purge_all)
+#define	arena_ralloc JEMALLOC_N(arena_ralloc)
+#define	arena_ralloc_no_move JEMALLOC_N(arena_ralloc_no_move)
+#define	arena_run_regind JEMALLOC_N(arena_run_regind)
+#define	arena_salloc JEMALLOC_N(arena_salloc)
+#define	arena_salloc_demote JEMALLOC_N(arena_salloc_demote)
+#define	arena_stats_merge JEMALLOC_N(arena_stats_merge)
+#define	arena_tcache_fill_small JEMALLOC_N(arena_tcache_fill_small)
+#define	arenas_bin_i_index JEMALLOC_N(arenas_bin_i_index)
+#define	arenas_extend JEMALLOC_N(arenas_extend)
+#define	arenas_lrun_i_index JEMALLOC_N(arenas_lrun_i_index)
+#define	atomic_add_uint32 JEMALLOC_N(atomic_add_uint32)
+#define	atomic_add_uint64 JEMALLOC_N(atomic_add_uint64)
+#define	atomic_sub_uint32 JEMALLOC_N(atomic_sub_uint32)
+#define	atomic_sub_uint64 JEMALLOC_N(atomic_sub_uint64)
+#define	base_alloc JEMALLOC_N(base_alloc)
+#define	base_boot JEMALLOC_N(base_boot)
+#define	base_node_alloc JEMALLOC_N(base_node_alloc)
+#define	base_node_dealloc JEMALLOC_N(base_node_dealloc)
+#define	bitmap_full JEMALLOC_N(bitmap_full)
+#define	bitmap_get JEMALLOC_N(bitmap_get)
+#define	bitmap_info_init JEMALLOC_N(bitmap_info_init)
+#define	bitmap_info_ngroups JEMALLOC_N(bitmap_info_ngroups)
+#define	bitmap_init JEMALLOC_N(bitmap_init)
+#define	bitmap_set JEMALLOC_N(bitmap_set)
+#define	bitmap_sfu JEMALLOC_N(bitmap_sfu)
+#define	bitmap_size JEMALLOC_N(bitmap_size)
+#define	bitmap_unset JEMALLOC_N(bitmap_unset)
+#define	bt_init JEMALLOC_N(bt_init)
+#define	buferror JEMALLOC_N(buferror)
+#define	choose_arena JEMALLOC_N(choose_arena)
+#define	choose_arena_hard JEMALLOC_N(choose_arena_hard)
+#define	chunk_alloc JEMALLOC_N(chunk_alloc)
+#define	chunk_alloc_dss JEMALLOC_N(chunk_alloc_dss)
+#define	chunk_alloc_mmap JEMALLOC_N(chunk_alloc_mmap)
+#define	chunk_alloc_mmap_noreserve JEMALLOC_N(chunk_alloc_mmap_noreserve)
+#define	chunk_alloc_swap JEMALLOC_N(chunk_alloc_swap)
+#define	chunk_boot JEMALLOC_N(chunk_boot)
+#define	chunk_dealloc JEMALLOC_N(chunk_dealloc)
+#define	chunk_dealloc_dss JEMALLOC_N(chunk_dealloc_dss)
+#define	chunk_dealloc_mmap JEMALLOC_N(chunk_dealloc_mmap)
+#define	chunk_dealloc_swap JEMALLOC_N(chunk_dealloc_swap)
+#define	chunk_dss_boot JEMALLOC_N(chunk_dss_boot)
+#define	chunk_in_dss JEMALLOC_N(chunk_in_dss)
+#define	chunk_in_swap JEMALLOC_N(chunk_in_swap)
+#define	chunk_mmap_boot JEMALLOC_N(chunk_mmap_boot)
+#define	chunk_swap_boot JEMALLOC_N(chunk_swap_boot)
+#define	chunk_swap_enable JEMALLOC_N(chunk_swap_enable)
+#define	ckh_bucket_search JEMALLOC_N(ckh_bucket_search)
+#define	ckh_count JEMALLOC_N(ckh_count)
+#define	ckh_delete JEMALLOC_N(ckh_delete)
+#define	ckh_evict_reloc_insert JEMALLOC_N(ckh_evict_reloc_insert)
+#define	ckh_insert JEMALLOC_N(ckh_insert)
+#define	ckh_isearch JEMALLOC_N(ckh_isearch)
+#define	ckh_iter JEMALLOC_N(ckh_iter)
+#define	ckh_new JEMALLOC_N(ckh_new)
+#define	ckh_pointer_hash JEMALLOC_N(ckh_pointer_hash)
+#define	ckh_pointer_keycomp JEMALLOC_N(ckh_pointer_keycomp)
+#define	ckh_rebuild JEMALLOC_N(ckh_rebuild)
+#define	ckh_remove JEMALLOC_N(ckh_remove)
+#define	ckh_search JEMALLOC_N(ckh_search)
+#define	ckh_string_hash JEMALLOC_N(ckh_string_hash)
+#define	ckh_string_keycomp JEMALLOC_N(ckh_string_keycomp)
+#define	ckh_try_bucket_insert JEMALLOC_N(ckh_try_bucket_insert)
+#define	ckh_try_insert JEMALLOC_N(ckh_try_insert)
+#define	create_zone JEMALLOC_N(create_zone)
+#define	ctl_boot JEMALLOC_N(ctl_boot)
+#define	ctl_bymib JEMALLOC_N(ctl_bymib)
+#define	ctl_byname JEMALLOC_N(ctl_byname)
+#define	ctl_nametomib JEMALLOC_N(ctl_nametomib)
+#define	extent_tree_ad_first JEMALLOC_N(extent_tree_ad_first)
+#define	extent_tree_ad_insert JEMALLOC_N(extent_tree_ad_insert)
+#define	extent_tree_ad_iter JEMALLOC_N(extent_tree_ad_iter)
+#define	extent_tree_ad_iter_recurse JEMALLOC_N(extent_tree_ad_iter_recurse)
+#define	extent_tree_ad_iter_start JEMALLOC_N(extent_tree_ad_iter_start)
+#define	extent_tree_ad_last JEMALLOC_N(extent_tree_ad_last)
+#define	extent_tree_ad_new JEMALLOC_N(extent_tree_ad_new)
+#define	extent_tree_ad_next JEMALLOC_N(extent_tree_ad_next)
+#define	extent_tree_ad_nsearch JEMALLOC_N(extent_tree_ad_nsearch)
+#define	extent_tree_ad_prev JEMALLOC_N(extent_tree_ad_prev)
+#define	extent_tree_ad_psearch JEMALLOC_N(extent_tree_ad_psearch)
+#define	extent_tree_ad_remove JEMALLOC_N(extent_tree_ad_remove)
+#define	extent_tree_ad_reverse_iter JEMALLOC_N(extent_tree_ad_reverse_iter)
+#define	extent_tree_ad_reverse_iter_recurse JEMALLOC_N(extent_tree_ad_reverse_iter_recurse)
+#define	extent_tree_ad_reverse_iter_start JEMALLOC_N(extent_tree_ad_reverse_iter_start)
+#define	extent_tree_ad_search JEMALLOC_N(extent_tree_ad_search)
+#define	extent_tree_szad_first JEMALLOC_N(extent_tree_szad_first)
+#define	extent_tree_szad_insert JEMALLOC_N(extent_tree_szad_insert)
+#define	extent_tree_szad_iter JEMALLOC_N(extent_tree_szad_iter)
+#define	extent_tree_szad_iter_recurse JEMALLOC_N(extent_tree_szad_iter_recurse)
+#define	extent_tree_szad_iter_start JEMALLOC_N(extent_tree_szad_iter_start)
+#define	extent_tree_szad_last JEMALLOC_N(extent_tree_szad_last)
+#define	extent_tree_szad_new JEMALLOC_N(extent_tree_szad_new)
+#define	extent_tree_szad_next JEMALLOC_N(extent_tree_szad_next)
+#define	extent_tree_szad_nsearch JEMALLOC_N(extent_tree_szad_nsearch)
+#define	extent_tree_szad_prev JEMALLOC_N(extent_tree_szad_prev)
+#define	extent_tree_szad_psearch JEMALLOC_N(extent_tree_szad_psearch)
+#define	extent_tree_szad_remove JEMALLOC_N(extent_tree_szad_remove)
+#define	extent_tree_szad_reverse_iter JEMALLOC_N(extent_tree_szad_reverse_iter)
+#define	extent_tree_szad_reverse_iter_recurse JEMALLOC_N(extent_tree_szad_reverse_iter_recurse)
+#define	extent_tree_szad_reverse_iter_start JEMALLOC_N(extent_tree_szad_reverse_iter_start)
+#define	extent_tree_szad_search JEMALLOC_N(extent_tree_szad_search)
+#define	hash JEMALLOC_N(hash)
+#define	huge_boot JEMALLOC_N(huge_boot)
+#define	huge_dalloc JEMALLOC_N(huge_dalloc)
+#define	huge_malloc JEMALLOC_N(huge_malloc)
+#define	huge_palloc JEMALLOC_N(huge_palloc)
+#define	huge_prof_ctx_get JEMALLOC_N(huge_prof_ctx_get)
+#define	huge_prof_ctx_set JEMALLOC_N(huge_prof_ctx_set)
+#define	huge_ralloc JEMALLOC_N(huge_ralloc)
+#define	huge_ralloc_no_move JEMALLOC_N(huge_ralloc_no_move)
+#define	huge_salloc JEMALLOC_N(huge_salloc)
+#define	iallocm JEMALLOC_N(iallocm)
+#define	icalloc JEMALLOC_N(icalloc)
+#define	idalloc JEMALLOC_N(idalloc)
+#define	imalloc JEMALLOC_N(imalloc)
+#define	ipalloc JEMALLOC_N(ipalloc)
+#define	iralloc JEMALLOC_N(iralloc)
+#define	isalloc JEMALLOC_N(isalloc)
+#define	ivsalloc JEMALLOC_N(ivsalloc)
+#define	jemalloc_darwin_init JEMALLOC_N(jemalloc_darwin_init)
+#define	jemalloc_postfork JEMALLOC_N(jemalloc_postfork)
+#define	jemalloc_prefork JEMALLOC_N(jemalloc_prefork)
+#define	malloc_cprintf JEMALLOC_N(malloc_cprintf)
+#define	malloc_mutex_destroy JEMALLOC_N(malloc_mutex_destroy)
+#define	malloc_mutex_init JEMALLOC_N(malloc_mutex_init)
+#define	malloc_mutex_lock JEMALLOC_N(malloc_mutex_lock)
+#define	malloc_mutex_trylock JEMALLOC_N(malloc_mutex_trylock)
+#define	malloc_mutex_unlock JEMALLOC_N(malloc_mutex_unlock)
+#define	malloc_printf JEMALLOC_N(malloc_printf)
+#define	malloc_write JEMALLOC_N(malloc_write)
+#define	mb_write JEMALLOC_N(mb_write)
+#define	pow2_ceil JEMALLOC_N(pow2_ceil)
+#define	prof_backtrace JEMALLOC_N(prof_backtrace)
+#define	prof_boot0 JEMALLOC_N(prof_boot0)
+#define	prof_boot1 JEMALLOC_N(prof_boot1)
+#define	prof_boot2 JEMALLOC_N(prof_boot2)
+#define	prof_ctx_get JEMALLOC_N(prof_ctx_get)
+#define	prof_ctx_set JEMALLOC_N(prof_ctx_set)
+#define	prof_free JEMALLOC_N(prof_free)
+#define	prof_gdump JEMALLOC_N(prof_gdump)
+#define	prof_idump JEMALLOC_N(prof_idump)
+#define	prof_lookup JEMALLOC_N(prof_lookup)
+#define	prof_malloc JEMALLOC_N(prof_malloc)
+#define	prof_mdump JEMALLOC_N(prof_mdump)
+#define	prof_realloc JEMALLOC_N(prof_realloc)
+#define	prof_sample_accum_update JEMALLOC_N(prof_sample_accum_update)
+#define	prof_sample_threshold_update JEMALLOC_N(prof_sample_threshold_update)
+#define	prof_tdata_init JEMALLOC_N(prof_tdata_init)
+#define	pthread_create JEMALLOC_N(pthread_create)
+#define	rtree_get JEMALLOC_N(rtree_get)
+#define	rtree_get_locked JEMALLOC_N(rtree_get_locked)
+#define	rtree_new JEMALLOC_N(rtree_new)
+#define	rtree_set JEMALLOC_N(rtree_set)
+#define	s2u JEMALLOC_N(s2u)
+#define	sa2u JEMALLOC_N(sa2u)
+#define	stats_arenas_i_bins_j_index JEMALLOC_N(stats_arenas_i_bins_j_index)
+#define	stats_arenas_i_index JEMALLOC_N(stats_arenas_i_index)
+#define	stats_arenas_i_lruns_j_index JEMALLOC_N(stats_arenas_i_lruns_j_index)
+#define	stats_cactive_add JEMALLOC_N(stats_cactive_add)
+#define	stats_cactive_get JEMALLOC_N(stats_cactive_get)
+#define	stats_cactive_sub JEMALLOC_N(stats_cactive_sub)
+#define	stats_print JEMALLOC_N(stats_print)
+#define	szone2ozone JEMALLOC_N(szone2ozone)
+#define	tcache_alloc_easy JEMALLOC_N(tcache_alloc_easy)
+#define	tcache_alloc_large JEMALLOC_N(tcache_alloc_large)
+#define	tcache_alloc_small JEMALLOC_N(tcache_alloc_small)
+#define	tcache_alloc_small_hard JEMALLOC_N(tcache_alloc_small_hard)
+#define	tcache_bin_flush_large JEMALLOC_N(tcache_bin_flush_large)
+#define	tcache_bin_flush_small JEMALLOC_N(tcache_bin_flush_small)
+#define	tcache_boot JEMALLOC_N(tcache_boot)
+#define	tcache_create JEMALLOC_N(tcache_create)
+#define	tcache_dalloc_large JEMALLOC_N(tcache_dalloc_large)
+#define	tcache_dalloc_small JEMALLOC_N(tcache_dalloc_small)
+#define	tcache_destroy JEMALLOC_N(tcache_destroy)
+#define	tcache_event JEMALLOC_N(tcache_event)
+#define	tcache_get JEMALLOC_N(tcache_get)
+#define	tcache_stats_merge JEMALLOC_N(tcache_stats_merge)
+#define	thread_allocated_get JEMALLOC_N(thread_allocated_get)
+#define	thread_allocated_get_hard JEMALLOC_N(thread_allocated_get_hard)
+#define	u2s JEMALLOC_N(u2s)
--- a/dep/jemalloc/include/jemalloc/internal/prof.h
+++ b/dep/jemalloc/include/jemalloc/internal/prof.h
@@ -227,9 +227,60 @@ bool	prof_boot2(void);
 /******************************************************************************/
 #ifdef JEMALLOC_H_INLINES

+#define	PROF_ALLOC_PREP(nignore, size, ret) do {			\
+	prof_tdata_t *prof_tdata;					\
+	prof_bt_t bt;							\
+									\
+	assert(size == s2u(size));					\
+									\
+	prof_tdata = PROF_TCACHE_GET();					\
+	if (prof_tdata == NULL) {					\
+		prof_tdata = prof_tdata_init();				\
+		if (prof_tdata == NULL) {				\
+			ret = NULL;					\
+			break;						\
+		}							\
+	}								\
+									\
+	if (opt_prof_active == false) {					\
+		/* Sampling is currently inactive, so avoid sampling. */\
+		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
+	} else if (opt_lg_prof_sample == 0) {				\
+		/* Don't bother with sampling logic, since sampling   */\
+		/* interval is 1.                                     */\
+		bt_init(&bt, prof_tdata->vec);				\
+		prof_backtrace(&bt, nignore, prof_bt_max);		\
+		ret = prof_lookup(&bt);					\
+	} else {							\
+		if (prof_tdata->threshold == 0) {			\
+			/* Initialize.  Seed the prng differently for */\
+			/* each thread.                               */\
+			prof_tdata->prn_state =				\
+			    (uint64_t)(uintptr_t)&size;			\
+			prof_sample_threshold_update(prof_tdata);	\
+		}							\
+									\
+		/* Determine whether to capture a backtrace based on  */\
+		/* whether size is enough for prof_accum to reach     */\
+		/* prof_tdata->threshold.  However, delay updating    */\
+		/* these variables until prof_{m,re}alloc(), because  */\
+		/* we don't know for sure that the allocation will    */\
+		/* succeed.                                           */\
+		/*                                                    */\
+		/* Use subtraction rather than addition to avoid      */\
+		/* potential integer overflow.                        */\
+		if (size >= prof_tdata->threshold -			\
+		    prof_tdata->accum) {				\
+			bt_init(&bt, prof_tdata->vec);			\
+			prof_backtrace(&bt, nignore, prof_bt_max);	\
+			ret = prof_lookup(&bt);				\
+		} else							\
+			ret = (prof_thr_cnt_t *)(uintptr_t)1U;		\
+	}								\
+} while (0)
+
 #ifndef JEMALLOC_ENABLE_INLINE
 void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);
-prof_thr_cnt_t	*prof_alloc_prep(size_t size);
 prof_ctx_t	*prof_ctx_get(const void *ptr);
 void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
 bool	prof_sample_accum_update(size_t size);
@@ -247,8 +298,22 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata)
 	double u;

 	/*
-	 * Compute prof_sample_threshold as a geometrically distributed random
+	 * Compute sample threshold as a geometrically distributed random
 	 * variable with mean (2^opt_lg_prof_sample).
+	 *
+	 *                         __        __
+	 *                         |  log(u)  |                     1
+	 * prof_tdata->threshold = | -------- |, where p = -------------------
+	 *                         | log(1-p) |             opt_lg_prof_sample
+	 *                                                 2
+	 *
+	 * For more information on the math, see:
+	 *
+	 *   Non-Uniform Random Variate Generation
+	 *   Luc Devroye
+	 *   Springer-Verlag, New York, 1986
+	 *   pp 500
+	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
 	 */
 	prn64(r, 53, prof_tdata->prn_state,
 	    (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
@@ -258,71 +323,6 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata)
 	    + (uint64_t)1U;
 }

-JEMALLOC_INLINE prof_thr_cnt_t *
-prof_alloc_prep(size_t size)
-{
-#ifdef JEMALLOC_ENABLE_INLINE
-   /* This function does not have its own stack frame, because it is inlined. */
-#  define NIGNORE 1
-#else
-#  define NIGNORE 2
-#endif
-	prof_thr_cnt_t *ret;
-	prof_tdata_t *prof_tdata;
-	prof_bt_t bt;
-
-	assert(size == s2u(size));
-
-	prof_tdata = PROF_TCACHE_GET();
-	if (prof_tdata == NULL) {
-		prof_tdata = prof_tdata_init();
-		if (prof_tdata == NULL)
-			return (NULL);
-	}
-
-	if (opt_prof_active == false) {
-		/* Sampling is currently inactive, so avoid sampling. */
-		ret = (prof_thr_cnt_t *)(uintptr_t)1U;
-	} else if (opt_lg_prof_sample == 0) {
-		/*
-		 * Don't bother with sampling logic, since sampling interval is
-		 * 1.
-		 */
-		bt_init(&bt, prof_tdata->vec);
-		prof_backtrace(&bt, NIGNORE, prof_bt_max);
-		ret = prof_lookup(&bt);
-	} else {
-		if (prof_tdata->threshold == 0) {
-			/*
-			 * Initialize.  Seed the prng differently for each
-			 * thread.
-			 */
-			prof_tdata->prn_state = (uint64_t)(uintptr_t)&size;
-			prof_sample_threshold_update(prof_tdata);
-		}
-
-		/*
-		 * Determine whether to capture a backtrace based on whether
-		 * size is enough for prof_accum to reach
-		 * prof_tdata->threshold.  However, delay updating these
-		 * variables until prof_{m,re}alloc(), because we don't know
-		 * for sure that the allocation will succeed.
-		 *
-		 * Use subtraction rather than addition to avoid potential
-		 * integer overflow.
-		 */
-		if (size >= prof_tdata->threshold - prof_tdata->accum) {
-			bt_init(&bt, prof_tdata->vec);
-			prof_backtrace(&bt, NIGNORE, prof_bt_max);
-			ret = prof_lookup(&bt);
-		} else
-			ret = (prof_thr_cnt_t *)(uintptr_t)1U;
-	}
-
-	return (ret);
-#undef NIGNORE
-}
-
 JEMALLOC_INLINE prof_ctx_t *
 prof_ctx_get(const void *ptr)
 {
@@ -334,7 +334,7 @@ prof_ctx_get(const void *ptr)
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (chunk != ptr) {
 		/* Region. */
-		assert(chunk->arena->magic == ARENA_MAGIC);
+		dassert(chunk->arena->magic == ARENA_MAGIC);

 		ret = arena_prof_ctx_get(ptr);
 	} else
@@ -353,7 +353,7 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (chunk != ptr) {
 		/* Region. */
-		assert(chunk->arena->magic == ARENA_MAGIC);
+		dassert(chunk->arena->magic == ARENA_MAGIC);

 		arena_prof_ctx_set(ptr, ctx);
 	} else
@@ -374,7 +374,7 @@ prof_sample_accum_update(size_t size)
 	/* Take care to avoid integer overflow. */
 	if (size >= prof_tdata->threshold - prof_tdata->accum) {
 		prof_tdata->accum -= (prof_tdata->threshold - size);
-		/* Compute new prof_sample_threshold. */
+		/* Compute new sample threshold. */
 		prof_sample_threshold_update(prof_tdata);
 		while (prof_tdata->accum >= prof_tdata->threshold) {
 			prof_tdata->accum -= prof_tdata->threshold;
@@ -401,7 +401,7 @@ prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
 			 * always possible to tell in advance how large an
 			 * object's usable size will be, so there should never
 			 * be a difference between the size passed to
-			 * prof_alloc_prep() and prof_malloc().
+			 * PROF_ALLOC_PREP() and prof_malloc().
 			 */
 			assert((uintptr_t)cnt == (uintptr_t)1U);
 		}
@@ -445,7 +445,7 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
 			if (prof_sample_accum_update(size)) {
 				/*
 				 * Don't sample.  The size passed to
-				 * prof_alloc_prep() was larger than what
+				 * PROF_ALLOC_PREP() was larger than what
 				 * actually got allocated, so a backtrace was
 				 * captured for this allocation, even though
 				 * its actual size was insufficient to cross
--- a/dep/jemalloc/include/jemalloc/internal/rtree.h
+++ b/dep/jemalloc/include/jemalloc/internal/rtree.h
@@ -49,7 +49,7 @@ void	*rtree_get(rtree_t *rtree, uintptr_t key);
 bool	rtree_set(rtree_t *rtree, uintptr_t key, void *val);
 #endif

-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_))
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_))
 #define	RTREE_GET_GENERATE(f)						\
 /* The least significant bits of the key are ignored. */		\
 JEMALLOC_INLINE void *							\
--- a/dep/jemalloc/include/jemalloc/internal/stats.h
+++ b/dep/jemalloc/include/jemalloc/internal/stats.h
@@ -154,6 +154,10 @@ struct chunk_stats_s {

 extern bool	opt_stats_print;

+#ifdef JEMALLOC_STATS
+extern size_t	stats_cactive;
+#endif
+
 char	*u2s(uint64_t x, unsigned base, char *s);
 #ifdef JEMALLOC_STATS
 void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque,
@@ -166,9 +170,38 @@ void	stats_print(void (*write)(void *, const char *), void *cbopaque,

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
-#ifdef JEMALLOC_STATS
 #ifdef JEMALLOC_H_INLINES
+#ifdef JEMALLOC_STATS
+
+#ifndef JEMALLOC_ENABLE_INLINE
+size_t	stats_cactive_get(void);
+void	stats_cactive_add(size_t size);
+void	stats_cactive_sub(size_t size);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_STATS_C_))
+JEMALLOC_INLINE size_t
+stats_cactive_get(void)	/* Read global stats_cactive via atomic_read_z(). */
+{
+
+	return (atomic_read_z(&stats_cactive));
+}
+
+JEMALLOC_INLINE void
+stats_cactive_add(size_t size)	/* Add size to stats_cactive (atomic_add_z). */
+{
+
+	atomic_add_z(&stats_cactive, size);
+}
+
+JEMALLOC_INLINE void
+stats_cactive_sub(size_t size)	/* Subtract size from stats_cactive. */
+{
+
+	atomic_sub_z(&stats_cactive, size);
+}
+#endif

-#endif /* JEMALLOC_H_INLINES */
 #endif /* JEMALLOC_STATS */
+#endif /* JEMALLOC_H_INLINES */
 /******************************************************************************/
--- a/dep/jemalloc/include/jemalloc/internal/tcache.h
+++ b/dep/jemalloc/include/jemalloc/internal/tcache.h
@@ -2,6 +2,7 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_TYPES

+typedef struct tcache_bin_info_s tcache_bin_info_t;
 typedef struct tcache_bin_s tcache_bin_t;
 typedef struct tcache_s tcache_t;

@@ -32,15 +33,22 @@ typedef struct tcache_s tcache_t;
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS

+/*
+ * Read-only information associated with each element of tcache_t's tbins array
+ * is stored separately, mainly to reduce memory usage.
+ */
+struct tcache_bin_info_s {
+	unsigned	ncached_max;	/* Upper limit on ncached. */
+};
+
 struct tcache_bin_s {
 #  ifdef JEMALLOC_STATS
 	tcache_bin_stats_t tstats;
 #  endif
-	unsigned	low_water;	/* Min # cached since last GC. */
-	unsigned	high_water;	/* Max # cached since last GC. */
+	int		low_water;	/* Min # cached since GC; -1 on miss. */
+	unsigned	lg_fill_div;	/* Fill (ncached_max >> lg_fill_div). */
 	unsigned	ncached;	/* # of cached objects. */
-	unsigned	ncached_max;	/* Upper limit on ncached. */
-	void		*avail;		/* Chain of available objects. */
+	void		**avail;	/* Stack of available objects. */
 };

 struct tcache_s {
@@ -54,6 +62,12 @@ struct tcache_s {
 	unsigned	ev_cnt;		/* Event count since incremental GC. */
 	unsigned	next_gc_bin;	/* Next bin to GC. */
 	tcache_bin_t	tbins[1];	/* Dynamically sized. */
+	/*
+	 * The pointer stacks associated with tbins follow as a contiguous
+	 * array.  During tcache initialization, the avail pointer in each
+	 * element of tbins is initialized to point to the proper offset within
+	 * this array.
+	 */
 };

 #endif /* JEMALLOC_H_STRUCTS */
@@ -64,6 +78,8 @@ extern bool	opt_tcache;
 extern ssize_t	opt_lg_tcache_max;
 extern ssize_t	opt_lg_tcache_gc_sweep;

+extern tcache_bin_info_t	*tcache_bin_info;
+
 /* Map of thread-specific caches. */
 #ifndef NO_TLS
 extern __thread tcache_t	*tcache_tls
@@ -110,7 +126,7 @@ void	tcache_destroy(tcache_t *tcache);
 #ifdef JEMALLOC_STATS
 void	tcache_stats_merge(tcache_t *tcache, arena_t *arena);
 #endif
-void	tcache_boot(void);
+bool	tcache_boot(void);

 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
@@ -169,6 +185,7 @@ tcache_event(tcache_t *tcache)
 	if (tcache->ev_cnt == tcache_gc_incr) {
 		size_t binind = tcache->next_gc_bin;
 		tcache_bin_t *tbin = &tcache->tbins[binind];
+		tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];

 		if (tbin->low_water > 0) {
 			/*
@@ -192,9 +209,22 @@ tcache_event(tcache_t *tcache)
 #endif
 				    );
 			}
+			/*
+			 * Reduce fill count by 2X.  Limit lg_fill_div such that
+			 * the fill count is always at least 1.
+			 */
+			if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1))
+			    >= 1)
+				tbin->lg_fill_div++;
+		} else if (tbin->low_water < 0) {
+			/*
+			 * Increase fill count by 2X.  Make sure lg_fill_div
+			 * stays greater than 0.
+			 */
+			if (tbin->lg_fill_div > 1)
+				tbin->lg_fill_div--;
 		}
 		tbin->low_water = tbin->ncached;
-		tbin->high_water = tbin->ncached;

 		tcache->next_gc_bin++;
 		if (tcache->next_gc_bin == nhbins)
@@ -208,13 +238,14 @@ tcache_alloc_easy(tcache_bin_t *tbin)
 {
 	void *ret;

-	if (tbin->ncached == 0)
+	if (tbin->ncached == 0) {
+		tbin->low_water = -1;
 		return (NULL);
+	}
 	tbin->ncached--;
-	if (tbin->ncached < tbin->low_water)
+	if ((int)tbin->ncached < tbin->low_water)
 		tbin->low_water = tbin->ncached;
-	ret = tbin->avail;
-	tbin->avail = *(void **)ret;
+	ret = tbin->avail[tbin->ncached];
 	return (ret);
 }

@@ -225,7 +256,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
 	size_t binind;
 	tcache_bin_t *tbin;

-	binind = small_size2bin[size];
+	binind = SMALL_SIZE2BIN(size);
 	assert(binind < nbins);
 	tbin = &tcache->tbins[binind];
 	ret = tcache_alloc_easy(tbin);
@@ -234,7 +265,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
 		if (ret == NULL)
 			return (NULL);
 	}
-	assert(arena_salloc(ret) == tcache->arena->bins[binind].reg_size);
+	assert(arena_salloc(ret) == arena_bin_info[binind].reg_size);

 	if (zero == false) {
 #ifdef JEMALLOC_FILL
@@ -250,7 +281,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
 	tbin->tstats.nrequests++;
 #endif
 #ifdef JEMALLOC_PROF
-	tcache->prof_accumbytes += tcache->arena->bins[binind].reg_size;
+	tcache->prof_accumbytes += arena_bin_info[binind].reg_size;
 #endif
 	tcache_event(tcache);
 	return (ret);
@@ -314,6 +345,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
 	arena_run_t *run;
 	arena_bin_t *bin;
 	tcache_bin_t *tbin;
+	tcache_bin_info_t *tbin_info;
 	size_t pageind, binind;
 	arena_chunk_map_t *mapelm;

@@ -325,7 +357,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
 	mapelm = &chunk->map[pageind-map_bias];
 	run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
 	    (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT));
-	assert(run->magic == ARENA_RUN_MAGIC);
+	dassert(run->magic == ARENA_RUN_MAGIC);
 	bin = run->bin;
 	binind = ((uintptr_t)bin - (uintptr_t)&arena->bins) /
 	    sizeof(arena_bin_t);
@@ -333,23 +365,22 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)

 #ifdef JEMALLOC_FILL
 	if (opt_junk)
-		memset(ptr, 0x5a, bin->reg_size);
+		memset(ptr, 0x5a, arena_bin_info[binind].reg_size);
 #endif

 	tbin = &tcache->tbins[binind];
-	if (tbin->ncached == tbin->ncached_max) {
-		tcache_bin_flush_small(tbin, binind, (tbin->ncached_max >> 1)
+	tbin_info = &tcache_bin_info[binind];
+	if (tbin->ncached == tbin_info->ncached_max) {
+		tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >>
+		    1)
 #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
 		    , tcache
 #endif
 		    );
 	}
-	assert(tbin->ncached < tbin->ncached_max);
-	*(void **)ptr = tbin->avail;
-	tbin->avail = ptr;
+	assert(tbin->ncached < tbin_info->ncached_max);
+	tbin->avail[tbin->ncached] = ptr;
 	tbin->ncached++;
-	if (tbin->ncached > tbin->high_water)
-		tbin->high_water = tbin->ncached;

 	tcache_event(tcache);
 }
@@ -361,6 +392,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
 	arena_chunk_t *chunk;
 	size_t pageind, binind;
 	tcache_bin_t *tbin;
+	tcache_bin_info_t *tbin_info;

 	assert((size & PAGE_MASK) == 0);
 	assert(arena_salloc(ptr) > small_maxclass);
@@ -377,19 +409,18 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
 #endif

 	tbin = &tcache->tbins[binind];
-	if (tbin->ncached == tbin->ncached_max) {
-		tcache_bin_flush_large(tbin, binind, (tbin->ncached_max >> 1)
+	tbin_info = &tcache_bin_info[binind];
+	if (tbin->ncached == tbin_info->ncached_max) {
+		tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >>
+		    1)
 #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
 		    , tcache
 #endif
 		    );
 	}
-	assert(tbin->ncached < tbin->ncached_max);
-	*(void **)ptr = tbin->avail;
-	tbin->avail = ptr;
+	assert(tbin->ncached < tbin_info->ncached_max);
+	tbin->avail[tbin->ncached] = ptr;
 	tbin->ncached++;
-	if (tbin->ncached > tbin->high_water)
-		tbin->high_water = tbin->ncached;

 	tcache_event(tcache);
 }
--- a/dep/jemalloc/include/jemalloc/jemalloc.h
+++ b/dep/jemalloc/include/jemalloc/jemalloc.h
@@ -7,19 +7,19 @@ extern "C" {
 #include <limits.h>
 #include <strings.h>

-#define	JEMALLOC_VERSION "2.1.0-0-g1c4b088b08d3bc7617a34387e196ce03716160bf"
+#define	JEMALLOC_VERSION "2.2.5-0-gfc1bb70e5f0d9a58b39efa39cc549b5af5104760"
 #define	JEMALLOC_VERSION_MAJOR 2
-#define	JEMALLOC_VERSION_MINOR 1
-#define	JEMALLOC_VERSION_BUGFIX 0
+#define	JEMALLOC_VERSION_MINOR 2
+#define	JEMALLOC_VERSION_BUGFIX 5
 #define	JEMALLOC_VERSION_NREV 0
-#define	JEMALLOC_VERSION_GID "1c4b088b08d3bc7617a34387e196ce03716160bf"
+#define	JEMALLOC_VERSION_GID "fc1bb70e5f0d9a58b39efa39cc549b5af5104760"

 #include "jemalloc_defs.h"
 #ifndef JEMALLOC_P
 #  define JEMALLOC_P(s) s
 #endif

-#define	ALLOCM_LG_ALIGN	((int)0x3f)
+#define	ALLOCM_LG_ALIGN(la)	(la)
 #if LG_SIZEOF_PTR == 2
 #define	ALLOCM_ALIGN(a)	(ffs(a)-1)
 #else
--- a/dep/jemalloc/include/jemalloc/jemalloc_defs.h
+++ b/dep/jemalloc/include/jemalloc/jemalloc_defs.h
@@ -19,12 +19,33 @@
 /* #undef JEMALLOC_P */
 #endif

+/*
+ * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs.
+ * For shared libraries, symbol visibility mechanisms prevent these symbols
+ * from being exported, but for static libraries, naming collisions are a real
+ * possibility.
+ */
+#define JEMALLOC_PRIVATE_NAMESPACE ""
+#define JEMALLOC_N(string_that_no_one_should_want_to_use_as_a_jemalloc_private_namespace_prefix) string_that_no_one_should_want_to_use_as_a_jemalloc_private_namespace_prefix
+
 /*
 * Hyper-threaded CPUs may need a special instruction inside spin loops in
 * order to yield to another virtual CPU.
 */
 #define CPU_SPINWAIT __asm__ volatile("pause")

+/*
+ * Defined if OSAtomic*() functions are available, as provided by Darwin, and
+ * documented in the atomic(3) manual page.
+ */
+/* #undef JEMALLOC_OSATOMIC */
+
+/*
+ * Defined if OSSpin*() functions are available, as provided by Darwin, and
+ * documented in the spinlock(3) manual page.
+ */
+/* #undef JEMALLOC_OSSPIN */
+
 /* Defined if __attribute__((...)) syntax is supported. */
 #define JEMALLOC_HAVE_ATTR 
 #ifdef JEMALLOC_HAVE_ATTR
@@ -54,18 +75,21 @@
 /* Use libgcc for profile backtracing if defined. */
 /* #undef JEMALLOC_PROF_LIBGCC */

+/* Use gcc intrinsics for profile backtracing if defined. */
+/* #undef JEMALLOC_PROF_GCC */
+
 /*
 * JEMALLOC_TINY enables support for tiny objects, which are smaller than one
 * quantum.
 */
-/* #undef JEMALLOC_TINY */
+#define JEMALLOC_TINY 

 /*
 * JEMALLOC_TCACHE enables a thread-specific caching layer for small objects.
 * This makes it possible to allocate/deallocate objects without any locking
 * when the cache is in the steady state.
 */
-/* #undef JEMALLOC_TCACHE */
+#define JEMALLOC_TCACHE 

 /*
 * JEMALLOC_DSS enables use of sbrk(2) to allocate chunks from the data storage
@@ -86,7 +110,7 @@
 /* #undef JEMALLOC_SYSV */

 /* Support lazy locking (avoid locking unless a second thread is launched). */
-/* #undef JEMALLOC_LAZY_LOCK */
+#define JEMALLOC_LAZY_LOCK 

 /* Determine page size at run time if defined. */
 /* #undef DYNAMIC_PAGE_SHIFT */
@@ -133,9 +157,12 @@
 /* #undef JEMALLOC_PURGE_MADVISE_FREE */

 /* sizeof(void *) == 2^LG_SIZEOF_PTR. */
-#define LG_SIZEOF_PTR 2
+#define LG_SIZEOF_PTR 3

 /* sizeof(int) == 2^LG_SIZEOF_INT. */
 #define LG_SIZEOF_INT 2

+/* sizeof(long) == 2^LG_SIZEOF_LONG. */
+#define LG_SIZEOF_LONG 3
+
 #endif /* JEMALLOC_DEFS_H_ */
--- a/dep/jemalloc/src/arena.c
+++ b/dep/jemalloc/src/arena.c
--- a/dep/jemalloc/src/atomic.c
+++ b/dep/jemalloc/src/atomic.c
@@ -0,0 +1,2 @@
+#define	JEMALLOC_ATOMIC_C_
+#include "jemalloc/internal/jemalloc_internal.h"
--- a/dep/jemalloc/src/bitmap.c
+++ b/dep/jemalloc/src/bitmap.c
@@ -0,0 +1,90 @@
+#define JEMALLOC_BITMAP_C_
+#include "jemalloc/internal/jemalloc_internal.h"
+
+/******************************************************************************/
+/* Function prototypes for non-inline static functions. */
+
+static size_t	bits2groups(size_t nbits);
+
+/******************************************************************************/
+
+static size_t
+bits2groups(size_t nbits)
+{
+	/* Full groups, plus one if any bit under the group mask is set. */
+	return ((nbits >> LG_BITMAP_GROUP_NBITS) +
+	    !!(nbits & BITMAP_GROUP_NBITS_MASK));
+}
+
+void
+bitmap_info_init(bitmap_info_t *binfo, size_t nbits)
+{
+	unsigned i;
+	size_t group_count;
+	/* Fill in binfo's per-level group offsets for an nbits-bit bitmap. */
+	assert(nbits > 0);
+	assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS));
+
+	/*
+	 * Compute the number of groups necessary to store nbits bits, and
+	 * progressively work upward through the levels until reaching a level
+	 * that requires only one group.
+	 */
+	binfo->levels[0].group_offset = 0;
+	group_count = bits2groups(nbits);
+	for (i = 1; group_count > 1; i++) {
+		assert(i < BITMAP_MAX_LEVELS);
+		binfo->levels[i].group_offset = binfo->levels[i-1].group_offset
+		    + group_count;
+		group_count = bits2groups(group_count);
+	}
+	binfo->levels[i].group_offset = binfo->levels[i-1].group_offset
+	    + group_count;
+	binfo->nlevels = i;	/* levels[i].group_offset == total groups. */
+	binfo->nbits = nbits;
+}
+
+size_t
+bitmap_info_ngroups(const bitmap_info_t *binfo)
+{
+	/* NOTE(review): total groups << LG_SIZEOF_BITMAP; a byte count? */
+	return (binfo->levels[binfo->nlevels].group_offset << LG_SIZEOF_BITMAP);
+}
+
+size_t
+bitmap_size(size_t nbits)
+{
+	bitmap_info_t binfo;
+	/* Lay out a scratch binfo for nbits and report what it would need. */
+	bitmap_info_init(&binfo, nbits);
+	return (bitmap_info_ngroups(&binfo));
+}
+
+void
+bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo)
+{
+	size_t extra;
+	unsigned i;
+
+	/*
+	 * Bits are actually inverted with regard to the external bitmap
+	 * interface, so the bitmap starts out with all 1 bits, except for
+	 * trailing unused bits (if any).  Note that each group uses bit 0 to
+	 * correspond to the first logical bit in the group, so extra bits
+	 * are the most significant bits of the last group.
+	 */
+	memset(bitmap, 0xffU, binfo->levels[binfo->nlevels].group_offset <<
+	    LG_SIZEOF_BITMAP);
+	extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK))
+	    & BITMAP_GROUP_NBITS_MASK;
+	if (extra != 0)	/* Last level-0 group has unused high bits. */
+		bitmap[binfo->levels[1].group_offset - 1] >>= extra;
+	for (i = 1; i < binfo->nlevels; i++) {
+		size_t group_count = binfo->levels[i].group_offset -
+		    binfo->levels[i-1].group_offset;
+		extra = (BITMAP_GROUP_NBITS - (group_count &
+		    BITMAP_GROUP_NBITS_MASK)) & BITMAP_GROUP_NBITS_MASK;
+		if (extra != 0)	/* Likewise for level i's last group. */
+			bitmap[binfo->levels[i+1].group_offset - 1] >>= extra;
+	}
+}
--- a/dep/jemalloc/src/chunk.c
+++ b/dep/jemalloc/src/chunk.c
@@ -70,7 +70,7 @@ RETURN:
 #ifdef JEMALLOC_IVSALLOC
 	if (base == false && ret != NULL) {
 		if (rtree_set(chunks_rtree, (uintptr_t)ret, ret)) {
-			chunk_dealloc(ret, size);
+			chunk_dealloc(ret, size, true);
 			return (NULL);
 		}
 	}
@@ -108,7 +108,7 @@ RETURN:
 }

 void
-chunk_dealloc(void *chunk, size_t size)
+chunk_dealloc(void *chunk, size_t size, bool unmap)
 {

 	assert(chunk != NULL);
@@ -125,15 +125,17 @@ chunk_dealloc(void *chunk, size_t size)
 	malloc_mutex_unlock(&chunks_mtx);
 #endif

+	if (unmap) {
 #ifdef JEMALLOC_SWAP
-	if (swap_enabled && chunk_dealloc_swap(chunk, size) == false)
-		return;
+		if (swap_enabled && chunk_dealloc_swap(chunk, size) == false)
+			return;
 #endif
 #ifdef JEMALLOC_DSS
-	if (chunk_dealloc_dss(chunk, size) == false)
-		return;
+		if (chunk_dealloc_dss(chunk, size) == false)
+			return;
 #endif
-	chunk_dealloc_mmap(chunk, size);
+		chunk_dealloc_mmap(chunk, size);
+	}
 }

 bool
--- a/dep/jemalloc/src/chunk_mmap.c
+++ b/dep/jemalloc/src/chunk_mmap.c
@@ -206,13 +206,15 @@ chunk_alloc_mmap_internal(size_t size, bool noreserve)
 void *
 chunk_alloc_mmap(size_t size)
 {
-	return chunk_alloc_mmap_internal(size, false);
+	/* Thin wrapper: request the mapping with noreserve == false. */
+	return (chunk_alloc_mmap_internal(size, false));
 }

 void *
 chunk_alloc_mmap_noreserve(size_t size)
 {
-	return chunk_alloc_mmap_internal(size, true);
+	/* Thin wrapper: request the mapping with noreserve == true. */
+	return (chunk_alloc_mmap_internal(size, true));
 }

 void
--- a/dep/jemalloc/src/ckh.c
+++ b/dep/jemalloc/src/ckh.c
@@ -34,7 +34,7 @@
 * respectively.
 *
 ******************************************************************************/
-#define	CKH_C_
+#define	JEMALLOC_CKH_C_
 #include "jemalloc/internal/jemalloc_internal.h"

 /******************************************************************************/
@@ -73,7 +73,7 @@ ckh_isearch(ckh_t *ckh, const void *key)
 	size_t hash1, hash2, bucket, cell;

 	assert(ckh != NULL);
-	assert(ckh->magic = CKH_MAGIG);
+	dassert(ckh->magic == CKH_MAGIC);

 	ckh->hash(key, ckh->lg_curbuckets, &hash1, &hash2);

@@ -262,9 +262,15 @@ ckh_grow(ckh_t *ckh)
 	lg_prevbuckets = ckh->lg_curbuckets;
 	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS;
 	while (true) {
+		size_t usize;
+
 		lg_curcells++;
-		tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells,
-		    ZU(1) << LG_CACHELINE, true);
+		usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE, NULL);
+		if (usize == 0) {
+			ret = true;
+			goto RETURN;
+		}
+		tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
 		if (tab == NULL) {
 			ret = true;
 			goto RETURN;
@@ -295,7 +301,7 @@ static void
 ckh_shrink(ckh_t *ckh)
 {
 	ckhc_t *tab, *ttab;
-	size_t lg_curcells;
+	size_t lg_curcells, usize;
 	unsigned lg_prevbuckets;

 	/*
@@ -304,8 +310,10 @@ ckh_shrink(ckh_t *ckh)
 	 */
 	lg_prevbuckets = ckh->lg_curbuckets;
 	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1;
-	tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells,
-	    ZU(1) << LG_CACHELINE, true);
+	usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE, NULL);
+	if (usize == 0)
+		return;
+	tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
 	if (tab == NULL) {
 		/*
 		 * An OOM error isn't worth propagating, since it doesn't
@@ -340,7 +348,7 @@ bool
 ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
 {
 	bool ret;
-	size_t mincells;
+	size_t mincells, usize;
 	unsigned lg_mincells;

 	assert(minitems > 0);
@@ -375,15 +383,19 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp)
 	ckh->hash = hash;
 	ckh->keycomp = keycomp;

-	ckh->tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_mincells,
-	    (ZU(1) << LG_CACHELINE), true);
+	usize = sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE, NULL);
+	if (usize == 0) {
+		ret = true;
+		goto RETURN;
+	}
+	ckh->tab = (ckhc_t *)ipalloc(usize, CACHELINE, true);
 	if (ckh->tab == NULL) {
 		ret = true;
 		goto RETURN;
 	}

 #ifdef JEMALLOC_DEBUG
-	ckh->magic = CKH_MAGIG;
+	ckh->magic = CKH_MAGIC;
 #endif

 	ret = false;
@@ -396,7 +408,7 @@ ckh_delete(ckh_t *ckh)
 {

 	assert(ckh != NULL);
-	assert(ckh->magic = CKH_MAGIG);
+	dassert(ckh->magic == CKH_MAGIC);

 #ifdef CKH_VERBOSE
 	malloc_printf(
@@ -421,7 +433,7 @@ ckh_count(ckh_t *ckh)
 {

 	assert(ckh != NULL);
-	assert(ckh->magic = CKH_MAGIG);
+	dassert(ckh->magic == CKH_MAGIC);

 	return (ckh->count);
 }
@@ -452,7 +464,7 @@ ckh_insert(ckh_t *ckh, const void *key, const void *data)
 	bool ret;

 	assert(ckh != NULL);
-	assert(ckh->magic = CKH_MAGIG);
+	dassert(ckh->magic == CKH_MAGIC);
 	assert(ckh_search(ckh, key, NULL, NULL));

 #ifdef CKH_COUNT
@@ -477,7 +489,7 @@ ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data)
 	size_t cell;

 	assert(ckh != NULL);
-	assert(ckh->magic = CKH_MAGIG);
+	dassert(ckh->magic == CKH_MAGIC);

 	cell = ckh_isearch(ckh, searchkey);
 	if (cell != SIZE_T_MAX) {
@@ -509,7 +521,7 @@ ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data)
 	size_t cell;

 	assert(ckh != NULL);
-	assert(ckh->magic = CKH_MAGIG);
+	dassert(ckh->magic == CKH_MAGIC);

 	cell = ckh_isearch(ckh, searchkey);
 	if (cell != SIZE_T_MAX) {
@@ -544,7 +556,7 @@ ckh_string_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2)
 	} else {
 		ret1 = h;
 		ret2 = hash(key, strlen((const char *)key),
-		    0x8432a476666bbc13U);
+		    0x8432a476666bbc13LLU);
 	}

 	*hash1 = ret1;
--- a/dep/jemalloc/src/ctl.c
+++ b/dep/jemalloc/src/ctl.c
@@ -182,6 +182,7 @@ CTL_PROTO(stats_arenas_i_lruns_j_highruns)
 CTL_PROTO(stats_arenas_i_lruns_j_curruns)
 INDEX_PROTO(stats_arenas_i_lruns_j)
 #endif
+CTL_PROTO(stats_arenas_i_nthreads)
 CTL_PROTO(stats_arenas_i_pactive)
 CTL_PROTO(stats_arenas_i_pdirty)
 #ifdef JEMALLOC_STATS
@@ -192,6 +193,7 @@ CTL_PROTO(stats_arenas_i_purged)
 #endif
 INDEX_PROTO(stats_arenas_i)
 #ifdef JEMALLOC_STATS
+CTL_PROTO(stats_cactive)
 CTL_PROTO(stats_allocated)
 CTL_PROTO(stats_active)
 CTL_PROTO(stats_mapped)
@@ -434,6 +436,7 @@ static const ctl_node_t stats_arenas_i_lruns_node[] = {
 #endif

 static const ctl_node_t stats_arenas_i_node[] = {
+	{NAME("nthreads"),		CTL(stats_arenas_i_nthreads)},
 	{NAME("pactive"),		CTL(stats_arenas_i_pactive)},
 	{NAME("pdirty"),		CTL(stats_arenas_i_pdirty)}
 #ifdef JEMALLOC_STATS
@@ -458,6 +461,7 @@ static const ctl_node_t stats_arenas_node[] = {

 static const ctl_node_t stats_node[] = {
 #ifdef JEMALLOC_STATS
+	{NAME("cactive"),		CTL(stats_cactive)},
 	{NAME("allocated"),		CTL(stats_allocated)},
 	{NAME("active"),		CTL(stats_active)},
 	{NAME("mapped"),		CTL(stats_mapped)},
@@ -620,6 +624,7 @@ ctl_arena_refresh(arena_t *arena, unsigned i)

 	ctl_arena_clear(astats);

+	sstats->nthreads += astats->nthreads;
 #ifdef JEMALLOC_STATS
 	ctl_arena_stats_amerge(astats, arena);
 	/* Merge into sum stats as well. */
@@ -657,10 +662,17 @@ ctl_refresh(void)
 	 * Clear sum stats, since they will be merged into by
 	 * ctl_arena_refresh().
 	 */
+	ctl_stats.arenas[narenas].nthreads = 0;
 	ctl_arena_clear(&ctl_stats.arenas[narenas]);

 	malloc_mutex_lock(&arenas_lock);
 	memcpy(tarenas, arenas, sizeof(arena_t *) * narenas);
+	for (i = 0; i < narenas; i++) {
+		if (arenas[i] != NULL)
+			ctl_stats.arenas[i].nthreads = arenas[i]->nthreads;
+		else
+			ctl_stats.arenas[i].nthreads = 0;
+	}
 	malloc_mutex_unlock(&arenas_lock);
 	for (i = 0; i < narenas; i++) {
 		bool initialized = (tarenas[i] != NULL);
@@ -1114,8 +1126,8 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 	unsigned newind, oldind;

 	newind = oldind = choose_arena()->ind;
-	WRITE(oldind, unsigned);
-	READ(newind, unsigned);
+	WRITE(newind, unsigned);
+	READ(oldind, unsigned);
 	if (newind != oldind) {
 		arena_t *arena;

@@ -1129,6 +1141,8 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
 		malloc_mutex_lock(&arenas_lock);
 		if ((arena = arenas[newind]) == NULL)
 			arena = arenas_extend(newind);
+		arenas[oldind]->nthreads--;
+		arenas[newind]->nthreads++;
 		malloc_mutex_unlock(&arenas_lock);
 		if (arena == NULL) {
 			ret = EAGAIN;
@@ -1137,6 +1151,13 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,

 		/* Set new arena association. */
 		ARENA_SET(arena);
+#ifdef JEMALLOC_TCACHE
+		{
+			tcache_t *tcache = TCACHE_GET();
+			if (tcache != NULL)
+				tcache->arena = arena;
+		}
+#endif
 	}

 	ret = 0;
@@ -1146,9 +1167,9 @@ RETURN:

 #ifdef JEMALLOC_STATS
 CTL_RO_NL_GEN(thread_allocated, ALLOCATED_GET(), uint64_t);
-CTL_RO_NL_GEN(thread_allocatedp, &ALLOCATED_GET(), uint64_t *);
+CTL_RO_NL_GEN(thread_allocatedp, ALLOCATEDP_GET(), uint64_t *);
 CTL_RO_NL_GEN(thread_deallocated, DEALLOCATED_GET(), uint64_t);
-CTL_RO_NL_GEN(thread_deallocatedp, &DEALLOCATED_GET(), uint64_t *);
+CTL_RO_NL_GEN(thread_deallocatedp, DEALLOCATEDP_GET(), uint64_t *);
 #endif

 /******************************************************************************/
@@ -1284,9 +1305,9 @@ CTL_RO_NL_GEN(opt_overcommit, opt_overcommit, bool)

 /******************************************************************************/

-CTL_RO_NL_GEN(arenas_bin_i_size, arenas[0]->bins[mib[2]].reg_size, size_t)
-CTL_RO_NL_GEN(arenas_bin_i_nregs, arenas[0]->bins[mib[2]].nregs, uint32_t)
-CTL_RO_NL_GEN(arenas_bin_i_run_size, arenas[0]->bins[mib[2]].run_size, size_t)
+CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t)
+CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t)
+CTL_RO_NL_GEN(arenas_bin_i_run_size, arena_bin_info[mib[2]].run_size, size_t)
 const ctl_node_t *
 arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i)
 {
@@ -1531,6 +1552,7 @@ stats_arenas_i_lruns_j_index(const size_t *mib, size_t miblen, size_t j)
 }

 #endif
+CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned)
 CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t)
 CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t)
 #ifdef JEMALLOC_STATS
@@ -1562,6 +1584,7 @@ RETURN:
 }

 #ifdef JEMALLOC_STATS
+CTL_RO_GEN(stats_cactive, &stats_cactive, size_t *)
 CTL_RO_GEN(stats_allocated, ctl_stats.allocated, size_t)
 CTL_RO_GEN(stats_active, ctl_stats.active, size_t)
 CTL_RO_GEN(stats_mapped, ctl_stats.mapped, size_t)
--- a/dep/jemalloc/src/hash.c
+++ b/dep/jemalloc/src/hash.c
@@ -1,2 +1,2 @@
-#define	HASH_C_
+#define	JEMALLOC_HASH_C_
 #include "jemalloc/internal/jemalloc_internal.h"
--- a/dep/jemalloc/src/huge.c
+++ b/dep/jemalloc/src/huge.c
@@ -50,6 +50,7 @@ huge_malloc(size_t size, bool zero)
 	malloc_mutex_lock(&huge_mtx);
 	extent_tree_ad_insert(&huge, node);
 #ifdef JEMALLOC_STATS
+	stats_cactive_add(csize);
 	huge_nmalloc++;
 	huge_allocated += csize;
 #endif
@@ -83,7 +84,7 @@ huge_palloc(size_t size, size_t alignment, bool zero)
 	 * alignment, in order to assure the alignment can be achieved, then
 	 * unmap leading and trailing chunks.
 	 */
-	assert(alignment >= chunksize);
+	assert(alignment > chunksize);

 	chunk_size = CHUNK_CEILING(size);

@@ -109,12 +110,12 @@ huge_palloc(size_t size, size_t alignment, bool zero)
 	if (offset == 0) {
 		/* Trim trailing space. */
 		chunk_dealloc((void *)((uintptr_t)ret + chunk_size), alloc_size
-		    - chunk_size);
+		    - chunk_size, true);
 	} else {
 		size_t trailsize;

 		/* Trim leading space. */
-		chunk_dealloc(ret, alignment - offset);
+		chunk_dealloc(ret, alignment - offset, true);

 		ret = (void *)((uintptr_t)ret + (alignment - offset));

@@ -123,7 +124,7 @@ huge_palloc(size_t size, size_t alignment, bool zero)
 		    /* Trim trailing space. */
 		    assert(trailsize < alloc_size);
 		    chunk_dealloc((void *)((uintptr_t)ret + chunk_size),
-			trailsize);
+			trailsize, true);
 		}
 	}

@@ -134,6 +135,7 @@ huge_palloc(size_t size, size_t alignment, bool zero)
 	malloc_mutex_lock(&huge_mtx);
 	extent_tree_ad_insert(&huge, node);
 #ifdef JEMALLOC_STATS
+	stats_cactive_add(chunk_size);
 	huge_nmalloc++;
 	huge_allocated += chunk_size;
 #endif
@@ -192,7 +194,7 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
 	 * different size class.  In that case, fall back to allocating new
 	 * space and copying.
 	 */
-	if (alignment != 0)
+	if (alignment > chunksize)
 		ret = huge_palloc(size + extra, alignment, zero);
 	else
 		ret = huge_malloc(size + extra, zero);
@@ -201,7 +203,7 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
 		if (extra == 0)
 			return (NULL);
 		/* Try again, this time without extra. */
-		if (alignment != 0)
+		if (alignment > chunksize)
 			ret = huge_palloc(size, alignment, zero);
 		else
 			ret = huge_malloc(size, zero);
@@ -232,6 +234,13 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
 	    ) {
 		size_t newsize = huge_salloc(ret);

+		/*
+		 * Remove ptr from the tree of huge allocations before
+		 * performing the remap operation, in order to avoid the
+		 * possibility of another thread acquiring that mapping before
+		 * this one removes it from the tree.
+		 */
+		huge_dalloc(ptr, false);
 		if (mremap(ptr, oldsize, newsize, MREMAP_MAYMOVE|MREMAP_FIXED,
 		    ret) == MAP_FAILED) {
 			/*
@@ -251,9 +260,8 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
 			if (opt_abort)
 				abort();
 			memcpy(ret, ptr, copysize);
-			idalloc(ptr);
-		} else
-			huge_dalloc(ptr, false);
+			chunk_dealloc_mmap(ptr, oldsize);
+		}
 	} else
 #endif
 	{
@@ -278,6 +286,7 @@ huge_dalloc(void *ptr, bool unmap)
 	extent_tree_ad_remove(&huge, node);

 #ifdef JEMALLOC_STATS
+	stats_cactive_sub(node->size);
 	huge_ndalloc++;
 	huge_allocated -= node->size;
 #endif
@@ -292,9 +301,10 @@ huge_dalloc(void *ptr, bool unmap)
 			memset(node->addr, 0x5a, node->size);
 #endif
 #endif
-		chunk_dealloc(node->addr, node->size);
 	}

+	chunk_dealloc(node->addr, node->size, unmap);
+
 	base_node_dealloc(node);
 }

--- a/dep/jemalloc/src/jemalloc.c
+++ b/dep/jemalloc/src/jemalloc.c
@@ -7,12 +7,10 @@
 malloc_mutex_t		arenas_lock;
 arena_t			**arenas;
 unsigned		narenas;
-static unsigned		next_arena;

+pthread_key_t		arenas_tsd;
 #ifndef NO_TLS
 __thread arena_t	*arenas_tls JEMALLOC_ATTR(tls_model("initial-exec"));
-#else
-pthread_key_t		arenas_tsd;
 #endif

 #ifdef JEMALLOC_STATS
@@ -30,7 +28,13 @@ static bool		malloc_initialized = false;
 static pthread_t	malloc_initializer = (unsigned long)0;

 /* Used to avoid initialization races. */
-static malloc_mutex_t	init_lock = MALLOC_MUTEX_INITIALIZER;
+static malloc_mutex_t	init_lock =
+#ifdef JEMALLOC_OSSPIN
+    0
+#else
+    MALLOC_MUTEX_INITIALIZER
+#endif
+    ;

 #ifdef DYNAMIC_PAGE_SHIFT
 size_t		pagesize;
@@ -70,6 +74,7 @@ size_t	opt_narenas = 0;
 static void	wrtmessage(void *cbopaque, const char *s);
 static void	stats_print_atexit(void);
 static unsigned	malloc_ncpus(void);
+static void	arenas_cleanup(void *arg);
 #if (defined(JEMALLOC_STATS) && defined(NO_TLS))
 static void	thread_allocated_cleanup(void *arg);
 #endif
@@ -79,6 +84,7 @@ static void	malloc_conf_error(const char *msg, const char *k, size_t klen,
    const char *v, size_t vlen);
 static void	malloc_conf_init(void);
 static bool	malloc_init_hard(void);
+static int	imemalign(void **memptr, size_t alignment, size_t size);

 /******************************************************************************/
 /* malloc_message() setup. */
@@ -147,13 +153,53 @@ choose_arena_hard(void)
 	arena_t *ret;

 	if (narenas > 1) {
+		unsigned i, choose, first_null;
+
+		choose = 0;
+		first_null = narenas;
 		malloc_mutex_lock(&arenas_lock);
-		if ((ret = arenas[next_arena]) == NULL)
-			ret = arenas_extend(next_arena);
-		next_arena = (next_arena + 1) % narenas;
+		assert(arenas[0] != NULL);
+		for (i = 1; i < narenas; i++) {
+			if (arenas[i] != NULL) {
+				/*
+				 * Choose the first arena that has the lowest
+				 * number of threads assigned to it.
+				 */
+				if (arenas[i]->nthreads <
+				    arenas[choose]->nthreads)
+					choose = i;
+			} else if (first_null == narenas) {
+				/*
+				 * Record the index of the first uninitialized
+				 * arena, in case all extant arenas are in use.
+				 *
+				 * NB: It is possible for there to be
+				 * discontinuities in terms of initialized
+				 * versus uninitialized arenas, due to the
+				 * "thread.arena" mallctl.
+				 */
+				first_null = i;
+			}
+		}
+
+		if (arenas[choose]->nthreads == 0 || first_null == narenas) {
+			/*
+			 * Use an unloaded arena, or the least loaded arena if
+			 * all arenas are already initialized.
+			 */
+			ret = arenas[choose];
+		} else {
+			/* Initialize a new arena. */
+			ret = arenas_extend(first_null);
+		}
+		ret->nthreads++;
 		malloc_mutex_unlock(&arenas_lock);
-	} else
+	} else {
 		ret = arenas[0];
+		malloc_mutex_lock(&arenas_lock);
+		ret->nthreads++;
+		malloc_mutex_unlock(&arenas_lock);
+	}

 	ARENA_SET(ret);

@@ -213,6 +259,28 @@ stats_print_atexit(void)
 	JEMALLOC_P(malloc_stats_print)(NULL, NULL, NULL);
 }

+#if (defined(JEMALLOC_STATS) && defined(NO_TLS))
+thread_allocated_t *
+thread_allocated_get_hard(void)
+{
+	thread_allocated_t *thread_allocated = (thread_allocated_t *)
+	    imalloc(sizeof(thread_allocated_t));
+	if (thread_allocated == NULL) {	/* Alloc failed: static fallback. */
+		static thread_allocated_t static_thread_allocated = {0, 0};
+		malloc_write("<jemalloc>: Error allocating TSD;"
+		    " mallctl(\"thread.{de,}allocated[p]\", ...)"
+		    " will be inaccurate\n");
+		if (opt_abort)
+			abort();
+		return (&static_thread_allocated);
+	}
+	pthread_setspecific(thread_allocated_tsd, thread_allocated);
+	thread_allocated->allocated = 0;
+	thread_allocated->deallocated = 0;
+	return (thread_allocated);
+}
+#endif /* JEMALLOC_STATS && NO_TLS */
+
 /*
 * End miscellaneous support functions.
 */
@@ -237,6 +305,16 @@ malloc_ncpus(void)
 	return (ret);
 }

+static void
+arenas_cleanup(void *arg)
+{
+	arena_t *arena = (arena_t *)arg;
+	/* arenas_tsd destructor: one fewer thread is bound to this arena. */
+	malloc_mutex_lock(&arenas_lock);
+	arena->nthreads--;
+	malloc_mutex_unlock(&arenas_lock);
+}
+
 #if (defined(JEMALLOC_STATS) && defined(NO_TLS))
 static void
 thread_allocated_cleanup(void *arg)
@@ -421,8 +499,8 @@ malloc_conf_init(void)
 			if ((opts = getenv(envname)) != NULL) {
 				/*
 				 * Do nothing; opts is already initialized to
-				 * the value of the JEMALLOC_OPTIONS
-				 * environment variable.
+				 * the value of the MALLOC_CONF environment
+				 * variable.
 				 */
 			} else {
 				/* No configuration specified. */
@@ -611,7 +689,7 @@ malloc_init_hard(void)

 		result = sysconf(_SC_PAGESIZE);
 		assert(result != -1);
-		pagesize = (unsigned)result;
+		pagesize = (size_t)result;

 		/*
 		 * We assume that pagesize is a power of 2 when calculating
@@ -671,7 +749,10 @@ malloc_init_hard(void)
 	}

 #ifdef JEMALLOC_TCACHE
-	tcache_boot();
+	if (tcache_boot()) {
+		malloc_mutex_unlock(&init_lock);
+		return (true);
+	}
 #endif

 	if (huge_boot()) {
@@ -688,6 +769,14 @@ malloc_init_hard(void)
 	}
 #endif

+	/* Arena bookkeeping: arenas_lock plus the arenas_tsd key. */
+	if (malloc_mutex_init(&arenas_lock) ||
+	    pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) {
+		/* init_lock is held; drop it before reporting failure. */
+		malloc_mutex_unlock(&init_lock);
+		return (true);
+	}
+
 	/*
 	 * Create enough scaffolding to allow recursive allocation in
 	 * malloc_ncpus().
@@ -712,8 +801,7 @@ malloc_init_hard(void)
 	 * threaded mode.
 	 */
 	ARENA_SET(arenas[0]);
-
-	malloc_mutex_init(&arenas_lock);
+	arenas[0]->nthreads++;

 #ifdef JEMALLOC_PROF
 	if (prof_boot2()) {
@@ -753,15 +841,6 @@ malloc_init_hard(void)
 		malloc_write(")\n");
 	}

-	next_arena = (narenas > 0) ? 1 : 0;
-
-#ifdef NO_TLS
-	if (pthread_key_create(&arenas_tsd, NULL) != 0) {
-		malloc_mutex_unlock(&init_lock);
-		return (true);
-	}
-#endif
-
 	/* Allocate and initialize arenas. */
 	arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas);
 	if (arenas == NULL) {
@@ -793,7 +872,6 @@ malloc_init_hard(void)
 	return (false);
 }

-
 #ifdef JEMALLOC_ZONE
 JEMALLOC_ATTR(constructor)
 void
@@ -862,7 +940,8 @@ JEMALLOC_P(malloc)(size_t size)
 #ifdef JEMALLOC_PROF
 	if (opt_prof) {
 		usize = s2u(size);
-		if ((cnt = prof_alloc_prep(usize)) == NULL) {
+		PROF_ALLOC_PREP(1, usize, cnt);
+		if (cnt == NULL) {
 			ret = NULL;
 			goto OOM;
 		}
@@ -911,19 +990,23 @@ RETURN:
 }

 JEMALLOC_ATTR(nonnull(1))
-JEMALLOC_ATTR(visibility("default"))
-int
-JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size)
+#ifdef JEMALLOC_PROF
+/*
+ * Avoid any uncertainty as to how many backtrace frames to ignore in 
+ * PROF_ALLOC_PREP().
+ */
+JEMALLOC_ATTR(noinline)
+#endif
+static int
+imemalign(void **memptr, size_t alignment, size_t size)
 {
 	int ret;
-	void *result;
-#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
 	size_t usize
-#  ifdef JEMALLOC_CC_SILENCE
+#ifdef JEMALLOC_CC_SILENCE
 	    = 0
-#  endif
-	    ;
 #endif
+	    ;
+	void *result;
 #ifdef JEMALLOC_PROF
 	prof_thr_cnt_t *cnt
 #  ifdef JEMALLOC_CC_SILENCE
@@ -973,34 +1056,38 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size)
 			goto RETURN;
 		}

+		usize = sa2u(size, alignment, NULL);
+		if (usize == 0) {
+			result = NULL;
+			ret = ENOMEM;
+			goto RETURN;
+		}
+
 #ifdef JEMALLOC_PROF
 		if (opt_prof) {
-			usize = sa2u(size, alignment, NULL);
-			if ((cnt = prof_alloc_prep(usize)) == NULL) {
+			PROF_ALLOC_PREP(2, usize, cnt);
+			if (cnt == NULL) {
 				result = NULL;
 				ret = EINVAL;
 			} else {
 				if (prof_promote && (uintptr_t)cnt !=
 				    (uintptr_t)1U && usize <= small_maxclass) {
-					result = ipalloc(small_maxclass+1,
-					    alignment, false);
+					assert(sa2u(small_maxclass+1,
+					    alignment, NULL) != 0);
+					result = ipalloc(sa2u(small_maxclass+1,
+					    alignment, NULL), alignment, false);
 					if (result != NULL) {
 						arena_prof_promoted(result,
 						    usize);
 					}
 				} else {
-					result = ipalloc(size, alignment,
+					result = ipalloc(usize, alignment,
 					    false);
 				}
 			}
 		} else
 #endif
-		{
-#ifdef JEMALLOC_STATS
-			usize = sa2u(size, alignment, NULL);
-#endif
-			result = ipalloc(size, alignment, false);
-		}
+			result = ipalloc(usize, alignment, false);
 	}

 	if (result == NULL) {
@@ -1032,6 +1119,15 @@ RETURN:
 	return (ret);
 }

+JEMALLOC_ATTR(nonnull(1))
+JEMALLOC_ATTR(visibility("default"))
+int
+JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size)
+{
+
+	return imemalign(memptr, alignment, size);
+}
+
 JEMALLOC_ATTR(malloc)
 JEMALLOC_ATTR(visibility("default"))
 void *
@@ -1087,7 +1183,8 @@ JEMALLOC_P(calloc)(size_t num, size_t size)
 #ifdef JEMALLOC_PROF
 	if (opt_prof) {
 		usize = s2u(num_size);
-		if ((cnt = prof_alloc_prep(usize)) == NULL) {
+		PROF_ALLOC_PREP(1, usize, cnt);
+		if (cnt == NULL) {
 			ret = NULL;
 			goto RETURN;
 		}
@@ -1200,7 +1297,9 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
 		if (opt_prof) {
 			usize = s2u(size);
 			old_ctx = prof_ctx_get(ptr);
-			if ((cnt = prof_alloc_prep(usize)) == NULL) {
+			PROF_ALLOC_PREP(1, usize, cnt);
+			if (cnt == NULL) {
+				old_ctx = NULL;
 				ret = NULL;
 				goto OOM;
 			}
@@ -1210,8 +1309,13 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
 				    false, false);
 				if (ret != NULL)
 					arena_prof_promoted(ret, usize);
-			} else
+				else
+					old_ctx = NULL;
+			} else {
 				ret = iralloc(ptr, size, 0, 0, false, false);
+				if (ret == NULL)
+					old_ctx = NULL;
+			}
 		} else
 #endif
 		{
@@ -1249,7 +1353,8 @@ OOM:
 #ifdef JEMALLOC_PROF
 			if (opt_prof) {
 				usize = s2u(size);
-				if ((cnt = prof_alloc_prep(usize)) == NULL)
+				PROF_ALLOC_PREP(1, usize, cnt);
+				if (cnt == NULL)
 					ret = NULL;
 				else {
 					if (prof_promote && (uintptr_t)cnt !=
@@ -1354,7 +1459,7 @@ JEMALLOC_P(memalign)(size_t alignment, size_t size)
 #ifdef JEMALLOC_CC_SILENCE
 	int result =
 #endif
-	    JEMALLOC_P(posix_memalign)(&ret, alignment, size);
+	    imemalign(&ret, alignment, size);
 #ifdef JEMALLOC_CC_SILENCE
 	if (result != 0)
 		return (NULL);
@@ -1373,7 +1478,7 @@ JEMALLOC_P(valloc)(size_t size)
 #ifdef JEMALLOC_CC_SILENCE
 	int result =
 #endif
-	    JEMALLOC_P(posix_memalign)(&ret, PAGE_SIZE, size);
+	    imemalign(&ret, PAGE_SIZE, size);
 #ifdef JEMALLOC_CC_SILENCE
 	if (result != 0)
 		return (NULL);
@@ -1454,15 +1559,18 @@ JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp,
 }

 JEMALLOC_INLINE void *
-iallocm(size_t size, size_t alignment, bool zero)
+iallocm(size_t usize, size_t alignment, bool zero)
 {

+	assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize, alignment,
+	    NULL)));
+
 	if (alignment != 0)
-		return (ipalloc(size, alignment, zero));
+		return (ipalloc(usize, alignment, zero));
 	else if (zero)
-		return (icalloc(size));
+		return (icalloc(usize));
 	else
-		return (imalloc(size));
+		return (imalloc(usize));
 }

 JEMALLOC_ATTR(nonnull(1))
@@ -1485,38 +1593,43 @@ JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags)
 	if (malloc_init())
 		goto OOM;

+	usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment, NULL);
+	if (usize == 0)
+		goto OOM;
+
 #ifdef JEMALLOC_PROF
 	if (opt_prof) {
-		usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment,
-		    NULL);
-		if ((cnt = prof_alloc_prep(usize)) == NULL)
+		PROF_ALLOC_PREP(1, usize, cnt);
+		if (cnt == NULL)
 			goto OOM;
 		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <=
 		    small_maxclass) {
-			p = iallocm(small_maxclass+1, alignment, zero);
+			size_t usize_promoted = (alignment == 0) ?
+			    s2u(small_maxclass+1) : sa2u(small_maxclass+1,
+			    alignment, NULL);
+			assert(usize_promoted != 0);
+			p = iallocm(usize_promoted, alignment, zero);
 			if (p == NULL)
 				goto OOM;
 			arena_prof_promoted(p, usize);
 		} else {
-			p = iallocm(size, alignment, zero);
+			p = iallocm(usize, alignment, zero);
 			if (p == NULL)
 				goto OOM;
 		}
-
+		prof_malloc(p, usize, cnt);
 		if (rsize != NULL)
 			*rsize = usize;
 	} else
 #endif
 	{
-		p = iallocm(size, alignment, zero);
+		p = iallocm(usize, alignment, zero);
 		if (p == NULL)
 			goto OOM;
 #ifndef JEMALLOC_STATS
 		if (rsize != NULL)
 #endif
 		{
-			usize = (alignment == 0) ? s2u(size) : sa2u(size,
-			    alignment, NULL);
 #ifdef JEMALLOC_STATS
 			if (rsize != NULL)
 #endif
@@ -1559,7 +1672,6 @@ JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra,
 	bool no_move = flags & ALLOCM_NO_MOVE;
 #ifdef JEMALLOC_PROF
 	prof_thr_cnt_t *cnt;
-	prof_ctx_t *old_ctx;
 #endif

 	assert(ptr != NULL);
@@ -1574,25 +1686,33 @@ JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra,
 		/*
 		 * usize isn't knowable before iralloc() returns when extra is
 		 * non-zero.  Therefore, compute its maximum possible value and
-		 * use that in prof_alloc_prep() to decide whether to capture a
+		 * use that in PROF_ALLOC_PREP() to decide whether to capture a
 		 * backtrace.  prof_realloc() will use the actual usize to
 		 * decide whether to sample.
 		 */
 		size_t max_usize = (alignment == 0) ? s2u(size+extra) :
 		    sa2u(size+extra, alignment, NULL);
+		prof_ctx_t *old_ctx = prof_ctx_get(p);
 		old_size = isalloc(p);
-		old_ctx = prof_ctx_get(p);
-		if ((cnt = prof_alloc_prep(max_usize)) == NULL)
+		PROF_ALLOC_PREP(1, max_usize, cnt);
+		if (cnt == NULL)
 			goto OOM;
-		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && max_usize
-		    <= small_maxclass) {
+		/*
+		 * Use minimum usize to determine whether promotion may happen.
+		 */
+		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U
+		    && ((alignment == 0) ? s2u(size) : sa2u(size,
+		    alignment, NULL)) <= small_maxclass) {
 			q = iralloc(p, small_maxclass+1, (small_maxclass+1 >=
 			    size+extra) ? 0 : size+extra - (small_maxclass+1),
 			    alignment, zero, no_move);
 			if (q == NULL)
 				goto ERR;
-			usize = isalloc(q);
-			arena_prof_promoted(q, usize);
+			if (max_usize < PAGE_SIZE) {
+				usize = max_usize;
+				arena_prof_promoted(q, usize);
+			} else
+				usize = isalloc(q);
 		} else {
 			q = iralloc(p, size, extra, alignment, zero, no_move);
 			if (q == NULL)
@@ -1600,6 +1720,8 @@ JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra,
 			usize = isalloc(q);
 		}
 		prof_realloc(q, usize, cnt, old_size, old_ctx);
+		if (rsize != NULL)
+			*rsize = usize;
 	} else
 #endif
 	{
--- a/dep/jemalloc/src/mb.c
+++ b/dep/jemalloc/src/mb.c
@@ -1,2 +1,2 @@
-#define	MB_C_
+#define	JEMALLOC_MB_C_
 #include "jemalloc/internal/jemalloc_internal.h"
--- a/dep/jemalloc/src/mutex.c
+++ b/dep/jemalloc/src/mutex.c
@@ -55,6 +55,9 @@ pthread_create(pthread_t *__restrict thread,
 bool
 malloc_mutex_init(malloc_mutex_t *mutex)
 {
+#ifdef JEMALLOC_OSSPIN
+	*mutex = 0;
+#else
 	pthread_mutexattr_t attr;

 	if (pthread_mutexattr_init(&attr) != 0)
@@ -70,6 +73,7 @@ malloc_mutex_init(malloc_mutex_t *mutex)
 	}
 	pthread_mutexattr_destroy(&attr);

+#endif
 	return (false);
 }

@@ -77,8 +81,10 @@ void
 malloc_mutex_destroy(malloc_mutex_t *mutex)
 {

+#ifndef JEMALLOC_OSSPIN
 	if (pthread_mutex_destroy(mutex) != 0) {
 		malloc_write("<jemalloc>: Error in pthread_mutex_destroy()\n");
 		abort();
 	}
+#endif
 }
--- a/dep/jemalloc/src/prof.c
+++ b/dep/jemalloc/src/prof.c
@@ -3,15 +3,15 @@
 #ifdef JEMALLOC_PROF
 /******************************************************************************/

-#ifdef JEMALLOC_PROF_LIBGCC
-#include <unwind.h>
-#endif
-
 #ifdef JEMALLOC_PROF_LIBUNWIND
 #define	UNW_LOCAL_ONLY
 #include <libunwind.h>
 #endif

+#ifdef JEMALLOC_PROF_LIBGCC
+#include <unwind.h>
+#endif
+
 /******************************************************************************/
 /* Data. */

@@ -169,39 +169,7 @@ prof_leave(void)
 		prof_gdump();
 }

-#ifdef JEMALLOC_PROF_LIBGCC
-static _Unwind_Reason_Code
-prof_unwind_init_callback(struct _Unwind_Context *context, void *arg)
-{
-
-	return (_URC_NO_REASON);
-}
-
-static _Unwind_Reason_Code
-prof_unwind_callback(struct _Unwind_Context *context, void *arg)
-{
-	prof_unwind_data_t *data = (prof_unwind_data_t *)arg;
-
-	if (data->nignore > 0)
-		data->nignore--;
-	else {
-		data->bt->vec[data->bt->len] = (void *)_Unwind_GetIP(context);
-		data->bt->len++;
-		if (data->bt->len == data->max)
-			return (_URC_END_OF_STACK);
-	}
-
-	return (_URC_NO_REASON);
-}
-
-void
-prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
-{
-	prof_unwind_data_t data = {bt, nignore, max};
-
-	_Unwind_Backtrace(prof_unwind_callback, &data);
-}
-#elif defined(JEMALLOC_PROF_LIBUNWIND)
+#ifdef JEMALLOC_PROF_LIBUNWIND
 void
 prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
 {
@@ -236,7 +204,41 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
 			break;
 	}
 }
-#else
+#endif
+#ifdef JEMALLOC_PROF_LIBGCC
+static _Unwind_Reason_Code
+prof_unwind_init_callback(struct _Unwind_Context *context, void *arg)
+{
+
+	return (_URC_NO_REASON);
+}
+
+static _Unwind_Reason_Code
+prof_unwind_callback(struct _Unwind_Context *context, void *arg)
+{
+	prof_unwind_data_t *data = (prof_unwind_data_t *)arg;
+
+	if (data->nignore > 0)
+		data->nignore--;
+	else {
+		data->bt->vec[data->bt->len] = (void *)_Unwind_GetIP(context);
+		data->bt->len++;
+		if (data->bt->len == data->max)
+			return (_URC_END_OF_STACK);
+	}
+
+	return (_URC_NO_REASON);
+}
+
+void
+prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
+{
+	prof_unwind_data_t data = {bt, nignore, max};
+
+	_Unwind_Backtrace(prof_unwind_callback, &data);
+}
+#endif
+#ifdef JEMALLOC_PROF_GCC
 void
 prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max)
 {
@@ -432,6 +434,7 @@ prof_lookup(prof_bt_t *bt)
 			prof_ctx_t	*p;
 			void		*v;
 		} ctx;
+		bool new_ctx;

 		/*
 		 * This thread's cache lacks bt.  Look for it in the global
@@ -468,12 +471,26 @@ prof_lookup(prof_bt_t *bt)
 				idalloc(ctx.v);
 				return (NULL);
 			}
+			/*
+			 * Artificially raise curobjs, in order to avoid a race
+			 * condition with prof_ctx_merge()/prof_ctx_destroy().
+			 *
+			 * No locking is necessary for ctx here because no other
+			 * threads have had the opportunity to fetch it from
+			 * bt2ctx yet.
+			 */
+			ctx.p->cnt_merged.curobjs++;
+			new_ctx = true;
+		} else {
+			/*
+			 * Artificially raise curobjs, in order to avoid a race
+			 * condition with prof_ctx_merge()/prof_ctx_destroy().
+			 */
+			malloc_mutex_lock(&ctx.p->lock);
+			ctx.p->cnt_merged.curobjs++;
+			malloc_mutex_unlock(&ctx.p->lock);
+			new_ctx = false;
 		}
-		/*
-		 * Acquire ctx's lock before releasing bt2ctx_mtx, in order to
-		 * avoid a race condition with prof_ctx_destroy().
-		 */
-		malloc_mutex_lock(&ctx.p->lock);
 		prof_leave();

 		/* Link a prof_thd_cnt_t into ctx for this thread. */
@@ -486,8 +503,9 @@ prof_lookup(prof_bt_t *bt)
 			 */
 			ret.p = ql_last(&prof_tdata->lru_ql, lru_link);
 			assert(ret.v != NULL);
-			ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt, NULL,
-			    NULL);
+			if (ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt,
+			    NULL, NULL))
+				assert(false);
 			ql_remove(&prof_tdata->lru_ql, ret.p, lru_link);
 			prof_ctx_merge(ret.p->ctx, ret.p);
 			/* ret can now be re-used. */
@@ -498,7 +516,8 @@ prof_lookup(prof_bt_t *bt)
 			/* Allocate and partially initialize a new cnt. */
 			ret.v = imalloc(sizeof(prof_thr_cnt_t));
 			if (ret.p == NULL) {
-				malloc_mutex_unlock(&ctx.p->lock);
+				if (new_ctx)
+					prof_ctx_destroy(ctx.p);
 				return (NULL);
 			}
 			ql_elm_new(ret.p, cnts_link);
@@ -509,12 +528,15 @@ prof_lookup(prof_bt_t *bt)
 		ret.p->epoch = 0;
 		memset(&ret.p->cnts, 0, sizeof(prof_cnt_t));
 		if (ckh_insert(&prof_tdata->bt2cnt, btkey.v, ret.v)) {
-			malloc_mutex_unlock(&ctx.p->lock);
+			if (new_ctx)
+				prof_ctx_destroy(ctx.p);
 			idalloc(ret.v);
 			return (NULL);
 		}
 		ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link);
+		malloc_mutex_lock(&ctx.p->lock);
 		ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link);
+		ctx.p->cnt_merged.curobjs--;
 		malloc_mutex_unlock(&ctx.p->lock);
 	} else {
 		/* Move ret to the front of the LRU. */
@@ -628,11 +650,10 @@ prof_ctx_destroy(prof_ctx_t *ctx)

 	/*
 	 * Check that ctx is still unused by any thread cache before destroying
-	 * it.  prof_lookup() interlocks bt2ctx_mtx and ctx->lock in order to
-	 * avoid a race condition with this function, and prof_ctx_merge()
-	 * artificially raises ctx->cnt_merged.curobjs in order to avoid a race
-	 * between the main body of prof_ctx_merge() and entry into this
-	 * function.
+	 * it.  prof_lookup() artificially raises ctx->cnt_merged.curobjs in
+	 * order to avoid a race condition with this function, as does
+	 * prof_ctx_merge() in order to avoid a race between the main body of
+	 * prof_ctx_merge() and entry into this function.
 	 */
 	prof_enter();
 	malloc_mutex_lock(&ctx->lock);
@@ -641,7 +662,8 @@ prof_ctx_destroy(prof_ctx_t *ctx)
 		assert(ctx->cnt_merged.accumobjs == 0);
 		assert(ctx->cnt_merged.accumbytes == 0);
 		/* Remove ctx from bt2ctx. */
-		ckh_remove(&bt2ctx, ctx->bt, NULL, NULL);
+		if (ckh_remove(&bt2ctx, ctx->bt, NULL, NULL))
+			assert(false);
 		prof_leave();
 		/* Destroy ctx. */
 		malloc_mutex_unlock(&ctx->lock);
@@ -649,7 +671,10 @@ prof_ctx_destroy(prof_ctx_t *ctx)
 		malloc_mutex_destroy(&ctx->lock);
 		idalloc(ctx);
 	} else {
-		/* Compensate for increment in prof_ctx_merge(). */
+		/*
+		 * Compensate for increment in prof_ctx_merge() or
+		 * prof_lookup().
+		 */
 		ctx->cnt_merged.curobjs--;
 		malloc_mutex_unlock(&ctx->lock);
 		prof_leave();
@@ -1056,7 +1081,7 @@ prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2)
 	} else {
 		ret1 = h;
 		ret2 = hash(bt->vec, bt->len * sizeof(void *),
-		    0x8432a476666bbc13U);
+		    0x8432a476666bbc13LLU);
 	}

 	*hash1 = ret1;
@@ -1093,7 +1118,6 @@ prof_tdata_init(void)

 	prof_tdata->vec = imalloc(sizeof(void *) * prof_bt_max);
 	if (prof_tdata->vec == NULL) {
-
 		ckh_delete(&prof_tdata->bt2cnt);
 		idalloc(prof_tdata);
 		return (NULL);
@@ -1111,33 +1135,26 @@ prof_tdata_init(void)
 static void
 prof_tdata_cleanup(void *arg)
 {
-	prof_tdata_t *prof_tdata;
+	prof_thr_cnt_t *cnt;
+	prof_tdata_t *prof_tdata = (prof_tdata_t *)arg;

-	prof_tdata = PROF_TCACHE_GET();
-	if (prof_tdata != NULL) {
-		prof_thr_cnt_t *cnt;
+	/*
+	 * Delete the hash table.  All of its contents can still be iterated
+	 * over via the LRU.
+	 */
+	ckh_delete(&prof_tdata->bt2cnt);

-		/*
-		 * Delete the hash table.  All of its contents can still be
-		 * iterated over via the LRU.
-		 */
-		ckh_delete(&prof_tdata->bt2cnt);
-
-		/*
-		 * Iteratively merge cnt's into the global stats and delete
-		 * them.
-		 */
-		while ((cnt = ql_last(&prof_tdata->lru_ql, lru_link)) != NULL) {
-			prof_ctx_merge(cnt->ctx, cnt);
-			ql_remove(&prof_tdata->lru_ql, cnt, lru_link);
-			idalloc(cnt);
-		}
-
-		idalloc(prof_tdata->vec);
-
-		idalloc(prof_tdata);
-		PROF_TCACHE_SET(NULL);
+	/* Iteratively merge cnt's into the global stats and delete them. */
+	while ((cnt = ql_last(&prof_tdata->lru_ql, lru_link)) != NULL) {
+		ql_remove(&prof_tdata->lru_ql, cnt, lru_link);
+		prof_ctx_merge(cnt->ctx, cnt);
+		idalloc(cnt);
 	}
+
+	idalloc(prof_tdata->vec);
+
+	idalloc(prof_tdata);
+	PROF_TCACHE_SET(NULL);
 }

 void
--- a/dep/jemalloc/src/rtree.c
+++ b/dep/jemalloc/src/rtree.c
@@ -1,4 +1,4 @@
-#define	RTREE_C_
+#define	JEMALLOC_RTREE_C_
 #include "jemalloc/internal/jemalloc_internal.h"

 rtree_t *
@@ -20,7 +20,10 @@ rtree_new(unsigned bits)
 	memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) *
 	    height));

-	malloc_mutex_init(&ret->mutex);
+	if (malloc_mutex_init(&ret->mutex)) {
+		/* Leak the rtree. */
+		return (NULL);
+	}
 	ret->height = height;
 	if (bits_per_level * height > bits)
 		ret->level2bits[0] = bits % bits_per_level;
--- a/dep/jemalloc/src/stats.c
+++ b/dep/jemalloc/src/stats.c
@@ -39,6 +39,10 @@

 bool	opt_stats_print = false;

+#ifdef JEMALLOC_STATS
+size_t	stats_cactive = 0;
+#endif
+
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */

@@ -319,6 +323,7 @@ static void
 stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
    unsigned i)
 {
+	unsigned nthreads;
 	size_t pagesize, pactive, pdirty, mapped;
 	uint64_t npurge, nmadvise, purged;
 	size_t small_allocated;
@@ -328,6 +333,9 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,

 	CTL_GET("arenas.pagesize", &pagesize, size_t);

+	CTL_I_GET("stats.arenas.0.nthreads", &nthreads, unsigned);
+	malloc_cprintf(write_cb, cbopaque,
+	    "assigned threads: %u\n", nthreads);
 	CTL_I_GET("stats.arenas.0.pactive", &pactive, size_t);
 	CTL_I_GET("stats.arenas.0.pdirty", &pdirty, size_t);
 	CTL_I_GET("stats.arenas.0.npurge", &npurge, uint64_t);
@@ -669,21 +677,26 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 #ifdef JEMALLOC_STATS
 	{
 		int err;
-		size_t ssz;
+		size_t sszp, ssz;
+		size_t *cactive;
 		size_t allocated, active, mapped;
 		size_t chunks_current, chunks_high, swap_avail;
 		uint64_t chunks_total;
 		size_t huge_allocated;
 		uint64_t huge_nmalloc, huge_ndalloc;

+		sszp = sizeof(size_t *);
 		ssz = sizeof(size_t);

+		CTL_GET("stats.cactive", &cactive, size_t *);
 		CTL_GET("stats.allocated", &allocated, size_t);
 		CTL_GET("stats.active", &active, size_t);
 		CTL_GET("stats.mapped", &mapped, size_t);
 		malloc_cprintf(write_cb, cbopaque,
-		    "Allocated: %zu, active: %zu, mapped: %zu\n", allocated,
-		    active, mapped);
+		    "Allocated: %zu, active: %zu, mapped: %zu\n",
+		    allocated, active, mapped);
+		malloc_cprintf(write_cb, cbopaque,
+		    "Current active ceiling: %zu\n", atomic_read_z(cactive));

 		/* Print chunk stats. */
 		CTL_GET("stats.chunks.total", &chunks_total, uint64_t);
@@ -735,7 +748,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
 						ninitialized++;
 				}

-				if (ninitialized > 1) {
+				if (ninitialized > 1 || unmerged == false) {
 					/* Print merged arena stats. */
 					malloc_cprintf(write_cb, cbopaque,
 					    "\nMerged arenas stats:\n");
--- a/dep/jemalloc/src/tcache.c
+++ b/dep/jemalloc/src/tcache.c
@@ -8,6 +8,9 @@ bool	opt_tcache = true;
 ssize_t	opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
 ssize_t	opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT;

+tcache_bin_info_t	*tcache_bin_info;
+static unsigned		stack_nelms; /* Total stack elms per tcache. */
+
 /* Map of thread-specific caches. */
 #ifndef NO_TLS
 __thread tcache_t	*tcache_tls JEMALLOC_ATTR(tls_model("initial-exec"));
@@ -55,18 +58,19 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem
 #endif
    )
 {
-	void *flush, *deferred, *ptr;
+	void *ptr;
 	unsigned i, nflush, ndeferred;
-	bool first_pass;
+#ifdef JEMALLOC_STATS
+	bool merged_stats = false;
+#endif

 	assert(binind < nbins);
 	assert(rem <= tbin->ncached);
-	assert(tbin->ncached > 0 || tbin->avail == NULL);

-	for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass =
-	    true; flush != NULL; flush = deferred, nflush = ndeferred) {
+	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena bin associated with the first object. */
-		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush);
+		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
+		    tbin->avail[0]);
 		arena_t *arena = chunk->arena;
 		arena_bin_t *bin = &arena->bins[binind];

@@ -82,17 +86,17 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem
 		malloc_mutex_lock(&bin->lock);
 #ifdef JEMALLOC_STATS
 		if (arena == tcache->arena) {
+			assert(merged_stats == false);
+			merged_stats = true;
 			bin->stats.nflushes++;
 			bin->stats.nrequests += tbin->tstats.nrequests;
 			tbin->tstats.nrequests = 0;
 		}
 #endif
-		deferred = NULL;
 		ndeferred = 0;
 		for (i = 0; i < nflush; i++) {
-			ptr = flush;
+			ptr = tbin->avail[i];
 			assert(ptr != NULL);
-			flush = *(void **)ptr;
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 			if (chunk->arena == arena) {
 				size_t pageind = ((uintptr_t)ptr -
@@ -107,21 +111,31 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem
 				 * locked.  Stash the object, so that it can be
 				 * handled in a future pass.
 				 */
-				*(void **)ptr = deferred;
-				deferred = ptr;
+				tbin->avail[ndeferred] = ptr;
 				ndeferred++;
 			}
 		}
 		malloc_mutex_unlock(&bin->lock);
-
-		if (first_pass) {
-			tbin->avail = flush;
-			first_pass = false;
-		}
 	}
+#ifdef JEMALLOC_STATS
+	if (merged_stats == false) {
+		/*
+		 * The flush loop didn't happen to flush to this thread's
+		 * arena, so the stats didn't get merged.  Manually do so now.
+		 */
+		arena_bin_t *bin = &tcache->arena->bins[binind];
+		malloc_mutex_lock(&bin->lock);
+		bin->stats.nflushes++;
+		bin->stats.nrequests += tbin->tstats.nrequests;
+		tbin->tstats.nrequests = 0;
+		malloc_mutex_unlock(&bin->lock);
+	}
+#endif

+	memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
+	    rem * sizeof(void *));
 	tbin->ncached = rem;
-	if (tbin->ncached < tbin->low_water)
+	if ((int)tbin->ncached < tbin->low_water)
 		tbin->low_water = tbin->ncached;
 }

@@ -132,18 +146,19 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem
 #endif
    )
 {
-	void *flush, *deferred, *ptr;
+	void *ptr;
 	unsigned i, nflush, ndeferred;
-	bool first_pass;
+#ifdef JEMALLOC_STATS
+	bool merged_stats = false;
+#endif

 	assert(binind < nhbins);
 	assert(rem <= tbin->ncached);
-	assert(tbin->ncached > 0 || tbin->avail == NULL);

-	for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass =
-	    true; flush != NULL; flush = deferred, nflush = ndeferred) {
+	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena associated with the first object. */
-		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush);
+		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
+		    tbin->avail[0]);
 		arena_t *arena = chunk->arena;

 		malloc_mutex_lock(&arena->lock);
@@ -155,6 +170,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem
 			tcache->prof_accumbytes = 0;
 #endif
 #ifdef JEMALLOC_STATS
+			merged_stats = true;
 			arena->stats.nrequests_large += tbin->tstats.nrequests;
 			arena->stats.lstats[binind - nbins].nrequests +=
 			    tbin->tstats.nrequests;
@@ -163,12 +179,10 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem
 #if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS))
 		}
 #endif
-		deferred = NULL;
 		ndeferred = 0;
 		for (i = 0; i < nflush; i++) {
-			ptr = flush;
+			ptr = tbin->avail[i];
 			assert(ptr != NULL);
-			flush = *(void **)ptr;
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 			if (chunk->arena == arena)
 				arena_dalloc_large(arena, chunk, ptr);
@@ -179,21 +193,32 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem
 				 * Stash the object, so that it can be handled
 				 * in a future pass.
 				 */
-				*(void **)ptr = deferred;
-				deferred = ptr;
+				tbin->avail[ndeferred] = ptr;
 				ndeferred++;
 			}
 		}
 		malloc_mutex_unlock(&arena->lock);
-
-		if (first_pass) {
-			tbin->avail = flush;
-			first_pass = false;
-		}
 	}
+#ifdef JEMALLOC_STATS
+	if (merged_stats == false) {
+		/*
+		 * The flush loop didn't happen to flush to this thread's
+		 * arena, so the stats didn't get merged.  Manually do so now.
+		 */
+		arena_t *arena = tcache->arena;
+		malloc_mutex_lock(&arena->lock);
+		arena->stats.nrequests_large += tbin->tstats.nrequests;
+		arena->stats.lstats[binind - nbins].nrequests +=
+		    tbin->tstats.nrequests;
+		tbin->tstats.nrequests = 0;
+		malloc_mutex_unlock(&arena->lock);
+	}
+#endif

+	memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
+	    rem * sizeof(void *));
 	tbin->ncached = rem;
-	if (tbin->ncached < tbin->low_water)
+	if ((int)tbin->ncached < tbin->low_water)
 		tbin->low_water = tbin->ncached;
 }

@@ -201,10 +226,14 @@ tcache_t *
 tcache_create(arena_t *arena)
 {
 	tcache_t *tcache;
-	size_t size;
+	size_t size, stack_offset;
 	unsigned i;

 	size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins);
+	/* Naturally align the pointer stacks. */
+	size = PTR_CEILING(size);
+	stack_offset = size;
+	size += stack_nelms * sizeof(void *);
 	/*
 	 * Round up to the nearest multiple of the cacheline size, in order to
 	 * avoid the possibility of false cacheline sharing.
@@ -217,6 +246,8 @@ tcache_create(arena_t *arena)

 	if (size <= small_maxclass)
 		tcache = (tcache_t *)arena_malloc_small(arena, size, true);
+	else if (size <= tcache_maxclass)
+		tcache = (tcache_t *)arena_malloc_large(arena, size, true);
 	else
 		tcache = (tcache_t *)icalloc(size);

@@ -233,15 +264,12 @@ tcache_create(arena_t *arena)

 	tcache->arena = arena;
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
-	for (i = 0; i < nbins; i++) {
-		if ((arena->bins[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) {
-			tcache->tbins[i].ncached_max = (arena->bins[i].nregs <<
-			    1);
-		} else
-			tcache->tbins[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX;
+	for (i = 0; i < nhbins; i++) {
+		tcache->tbins[i].lg_fill_div = 1;
+		tcache->tbins[i].avail = (void **)((uintptr_t)tcache +
+		    (uintptr_t)stack_offset);
+		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
 	}
-	for (; i < nhbins; i++)
-		tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE;

 	TCACHE_SET(tcache);

@@ -252,6 +280,7 @@ void
 tcache_destroy(tcache_t *tcache)
 {
 	unsigned i;
+	size_t tcache_size;

 #ifdef JEMALLOC_STATS
 	/* Unlink from list of extant tcaches. */
@@ -308,7 +337,8 @@ tcache_destroy(tcache_t *tcache)
 	}
 #endif

-	if (arena_salloc(tcache) <= small_maxclass) {
+	tcache_size = arena_salloc(tcache);
+	if (tcache_size <= small_maxclass) {
 		arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
 		arena_t *arena = chunk->arena;
 		size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >>
@@ -322,6 +352,13 @@ tcache_destroy(tcache_t *tcache)
 		malloc_mutex_lock(&bin->lock);
 		arena_dalloc_bin(arena, chunk, tcache, mapelm);
 		malloc_mutex_unlock(&bin->lock);
+	} else if (tcache_size <= tcache_maxclass) {
+		arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache);
+		arena_t *arena = chunk->arena;
+
+		malloc_mutex_lock(&arena->lock);
+		arena_dalloc_large(arena, chunk, tcache);
+		malloc_mutex_unlock(&arena->lock);
 	} else
 		idalloc(tcache);
 }
@@ -378,11 +415,13 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena)
 }
 #endif

-void
+bool
 tcache_boot(void)
 {

 	if (opt_tcache) {
+		unsigned i;
+
 		/*
 		 * If necessary, clamp opt_lg_tcache_max, now that
 		 * small_maxclass and arena_maxclass are known.
@@ -397,6 +436,28 @@ tcache_boot(void)

 		nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT);

+		/* Initialize tcache_bin_info. */
+		tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins *
+		    sizeof(tcache_bin_info_t));
+		if (tcache_bin_info == NULL)
+			return (true);
+		stack_nelms = 0;
+		for (i = 0; i < nbins; i++) {
+			if ((arena_bin_info[i].nregs << 1) <=
+			    TCACHE_NSLOTS_SMALL_MAX) {
+				tcache_bin_info[i].ncached_max =
+				    (arena_bin_info[i].nregs << 1);
+			} else {
+				tcache_bin_info[i].ncached_max =
+				    TCACHE_NSLOTS_SMALL_MAX;
+			}
+			stack_nelms += tcache_bin_info[i].ncached_max;
+		}
+		for (; i < nhbins; i++) {
+			tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
+			stack_nelms += tcache_bin_info[i].ncached_max;
+		}
+
 		/* Compute incremental GC event threshold. */
 		if (opt_lg_tcache_gc_sweep >= 0) {
 			tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) /
@@ -412,6 +473,8 @@ tcache_boot(void)
 			abort();
 		}
 	}
+
+	return (false);
 }
 /******************************************************************************/
 #endif /* JEMALLOC_TCACHE */